/mwdumper/src/org/mediawiki/importer/XmlDumpReader.java
Java | 405 lines | 284 code | 58 blank | 63 comment | 107 complexity | 86192d36ba117e75735486c3f366daeb MD5 | raw file
/*
 * MediaWiki import/export processing tools
 * Copyright 2005 by Brion Vibber
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id$
 */
25
26package org.mediawiki.importer;
27
28import java.io.IOException;
29import java.io.InputStream;
30import java.util.Calendar;
31import java.util.GregorianCalendar;
32import java.util.HashMap;
33import java.util.Map;
34import java.util.TimeZone;
35
36import javax.xml.parsers.ParserConfigurationException;
37import javax.xml.parsers.SAXParser;
38import javax.xml.parsers.SAXParserFactory;
39
40import org.xml.sax.Attributes;
41import org.xml.sax.SAXException;
42import org.xml.sax.helpers.DefaultHandler;
43
44public class XmlDumpReader extends DefaultHandler {
45 InputStream input;
46 DumpWriter writer;
47
48 private char[] buffer;
49 private int len;
50 private boolean hasContent = false;
51 private boolean deleted = false;
52
53 Siteinfo siteinfo;
54 Page page;
55 boolean pageSent;
56 Contributor contrib;
57 Revision rev;
58 int nskey;
59
60 boolean abortFlag;
61
62 /**
63 * Initialize a processor for a MediaWiki XML dump stream.
64 * Events are sent to a single DumpWriter output sink, but you
65 * can chain multiple output processors with a MultiWriter.
66 * @param inputStream Stream to read XML from.
67 * @param writer Output sink to send processed events to.
68 */
69 public XmlDumpReader(InputStream inputStream, DumpWriter writer) {
70 input = inputStream;
71 this.writer = writer;
72 buffer = new char[4096];
73 len = 0;
74 hasContent = false;
75 }
76
77 /**
78 * Reads through the entire XML dump on the input stream, sending
79 * events to the DumpWriter as it goes. May throw exceptions on
80 * invalid input or due to problems with the output.
81 * @throws IOException
82 */
83 public void readDump() throws IOException {
84 try {
85 SAXParserFactory factory = SAXParserFactory.newInstance();
86 SAXParser parser = factory.newSAXParser();
87
88 parser.parse(input, this);
89 } catch (ParserConfigurationException e) {
90 throw (IOException)new IOException(e.getMessage()).initCause(e);
91 } catch (SAXException e) {
92 throw (IOException)new IOException(e.getMessage()).initCause(e);
93 }
94 writer.close();
95 }
96
97 /**
98 * Request that the dump processing be aborted.
99 * At the next element, an exception will be thrown to stop the XML parser.
100 * @fixme Is setting a bool thread-safe? It should be atomic...
101 */
102 public void abort() {
103 abortFlag = true;
104 }
105
106 // --------------------------
107 // SAX handler interface methods:
108
109 private static final Map startElements = new HashMap(64);
110 private static final Map endElements = new HashMap(64);
111 static {
112 startElements.put("revision","revision");
113 startElements.put("contributor","contributor");
114 startElements.put("page","page");
115 startElements.put("mediawiki", "mediawiki");
116 startElements.put("siteinfo","siteinfo");
117 startElements.put("namespaces","namespaces");
118 startElements.put("namespace","namespace");
119
120 endElements.put("ThreadSubject","ThreadSubject");
121 endElements.put("ThreadParent","ThreadParent");
122 endElements.put("ThreadAncestor","ThreadAncestor");
123 endElements.put("ThreadPage","ThreadPage");
124 endElements.put("ThreadID","ThreadID");
125 endElements.put("ThreadSummaryPage","ThreadSummaryPage");
126 endElements.put("ThreadAuthor","ThreadAuthor");
127 endElements.put("ThreadEditStatus","ThreadEditStatus");
128 endElements.put("ThreadType","ThreadType");
129 endElements.put("base","base");
130 endElements.put("case","case");
131 endElements.put("comment","comment");
132 endElements.put("contributor","contributor");
133 endElements.put("generator","generator");
134 endElements.put("id","id");
135 endElements.put("ip","ip");
136 endElements.put("mediawiki", "mediawiki");
137 endElements.put("minor","minor");
138 endElements.put("namespaces","namespaces");
139 endElements.put("namespace","namespace");
140 endElements.put("page","page");
141 endElements.put("restrictions","restrictions");
142 endElements.put("revision","revision");
143 endElements.put("siteinfo","siteinfo");
144 endElements.put("sitename","sitename");
145 endElements.put("text","text");
146 endElements.put("timestamp","timestamp");
147 endElements.put("title","title");
148 endElements.put("username","username");
149 }
150
151 public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException {
152 // Clear the buffer for character data; we'll initialize it
153 // if and when character data arrives -- at that point we
154 // have a length.
155 len = 0;
156 hasContent = false;
157
158 if (abortFlag)
159 throw new SAXException("XmlDumpReader set abort flag.");
160
161 // check for deleted="deleted", and set deleted flag for the current element.
162 String d = attributes.getValue("deleted");
163 deleted = (d!=null && d.equals("deleted"));
164
165 try {
166 qName = (String)startElements.get(qName);
167 if (qName == null)
168 return;
169 // frequent tags:
170 if (qName == "revision") openRevision();
171 else if (qName == "contributor") openContributor();
172 else if (qName == "page") openPage();
173 // rare tags:
174 else if (qName == "mediawiki") openMediaWiki();
175 else if (qName == "siteinfo") openSiteinfo();
176 else if (qName == "namespaces") openNamespaces();
177 else if (qName == "namespace") openNamespace(attributes);
178 } catch (IOException e) {
179 throw new SAXException(e);
180 }
181 }
182
183 public void characters(char[] ch, int start, int length) {
184 if (buffer.length < len + length) {
185 int maxlen = buffer.length * 2;
186 if (maxlen < len + length)
187 maxlen = len + length;
188 char[] tmp = new char[maxlen];
189 System.arraycopy(buffer, 0, tmp, 0, len);
190 buffer = tmp;
191 }
192 System.arraycopy(ch, start, buffer, len, length);
193 len += length;
194 hasContent = true;
195 }
196
197 public void endElement(String uri, String localname, String qName) throws SAXException {
198 try {
199 qName = (String)endElements.get(qName);
200 if (qName == null)
201 return;
202 // frequent tags:
203 if (qName == "id") readId();
204 else if (qName == "revision") closeRevision();
205 else if (qName == "timestamp") readTimestamp();
206 else if (qName == "text") readText();
207 else if (qName == "contributor") closeContributor();
208 else if (qName == "username") readUsername();
209 else if (qName == "ip") readIp();
210 else if (qName == "comment") readComment();
211 else if (qName == "minor") readMinor();
212 else if (qName == "page") closePage();
213 else if (qName == "title") readTitle();
214 else if (qName == "restrictions") readRestrictions();
215 // rare tags:
216 else if (qName.startsWith("Thread")) threadAttribute(qName);
217 else if (qName == "mediawiki") closeMediaWiki();
218 else if (qName == "siteinfo") closeSiteinfo();
219 else if (qName == "sitename") readSitename();
220 else if (qName == "base") readBase();
221 else if (qName == "generator") readGenerator();
222 else if (qName == "case") readCase();
223 else if (qName == "namespaces") closeNamespaces();
224 else if (qName == "namespace") closeNamespace();
225// else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")");
226 } catch (IOException e) {
227 throw (SAXException)new SAXException(e.getMessage()).initCause(e);
228 }
229 }
230
231 // ----------
232
233 void threadAttribute(String attrib) throws IOException {
234 if(attrib.equals("ThreadPage")) // parse title
235 page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces));
236 else
237 page.DiscussionThreadingInfo.put(attrib, bufferContents());
238 }
239
240 void openMediaWiki() throws IOException {
241 siteinfo = null;
242 writer.writeStartWiki();
243 }
244
245 void closeMediaWiki() throws IOException {
246 writer.writeEndWiki();
247 siteinfo = null;
248 }
249
250 // ------------------
251
252 void openSiteinfo() {
253 siteinfo = new Siteinfo();
254 }
255
256 void closeSiteinfo() throws IOException {
257 writer.writeSiteinfo(siteinfo);
258 }
259
260 private String bufferContentsOrNull() {
261 if (!hasContent) return null;
262 else return bufferContents();
263 }
264
265 private String bufferContents() {
266 return len == 0 ? "" : new String(buffer, 0, len);
267 }
268
269 void readSitename() {
270 siteinfo.Sitename = bufferContents();
271 }
272
273 void readBase() {
274 siteinfo.Base = bufferContents();
275 }
276
277 void readGenerator() {
278 siteinfo.Generator = bufferContents();
279 }
280
281 void readCase() {
282 siteinfo.Case = bufferContents();
283 }
284
285 void openNamespaces() {
286 siteinfo.Namespaces = new NamespaceSet();
287 }
288
289 void openNamespace(Attributes attribs) {
290 nskey = Integer.parseInt(attribs.getValue("key"));
291 }
292
293 void closeNamespace() {
294 siteinfo.Namespaces.add(nskey, bufferContents());
295 }
296
297 void closeNamespaces() {
298 // NOP
299 }
300
301 // -----------
302
303 void openPage() {
304 page = new Page();
305 pageSent = false;
306 }
307
308 void closePage() throws IOException {
309 if (pageSent)
310 writer.writeEndPage();
311 page = null;
312 }
313
314 void readTitle() {
315 page.Title = new Title(bufferContents(), siteinfo.Namespaces);
316 }
317
318 void readId() {
319 int id = Integer.parseInt(bufferContents());
320 if (contrib != null)
321 contrib.Id = id;
322 else if (rev != null)
323 rev.Id = id;
324 else if (page != null)
325 page.Id = id;
326 else
327 throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>");
328 }
329
330 void readRestrictions() {
331 page.Restrictions = bufferContents();
332 }
333
334 // ------
335
336 void openRevision() throws IOException {
337 if (!pageSent) {
338 writer.writeStartPage(page);
339 pageSent = true;
340 }
341
342 rev = new Revision();
343 }
344
345 void closeRevision() throws IOException {
346 writer.writeRevision(rev);
347 rev = null;
348 }
349
350 void readTimestamp() {
351 rev.Timestamp = parseUTCTimestamp(bufferContents());
352 }
353
354 void readComment() {
355 rev.Comment = bufferContentsOrNull();
356 if (rev.Comment==null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed
357 }
358
359 void readMinor() {
360 rev.Minor = true;
361 }
362
363 void readText() {
364 rev.Text = bufferContentsOrNull();
365 if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
366 }
367
368 // -----------
369 void openContributor() {
370 //XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted"
371 contrib = new Contributor();
372 }
373
374 void closeContributor() {
375 //NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object
376 rev.Contributor = contrib;
377 contrib = null;
378 }
379
380
381 void readUsername() {
382 contrib.Username = bufferContentsOrNull();
383 }
384
385 void readIp() {
386 contrib.Username = bufferContents();
387 contrib.isIP = true;
388 }
389
390 private static final TimeZone utc = TimeZone.getTimeZone("UTC");
391 private static Calendar parseUTCTimestamp(String text) {
392 // 2003-10-26T04:50:47Z
393 // We're doing this manually for now, though DateFormatter might work...
394 String trimmed = text.trim();
395 GregorianCalendar ts = new GregorianCalendar(utc);
396 ts.set(
397 Integer.parseInt(trimmed.substring(0,0+4)), // year
398 Integer.parseInt(trimmed.substring(5,5+2)) - 1, // month is 0-based!
399 Integer.parseInt(trimmed.substring(8,8+2)), // day
400 Integer.parseInt(trimmed.substring(11,11+2)), // hour
401 Integer.parseInt(trimmed.substring(14,14+2)), // minute
402 Integer.parseInt(trimmed.substring(17,17+2))); // second
403 return ts;
404 }
405}