PageRenderTime 148ms CodeModel.GetById 33ms app.highlight 93ms RepoModel.GetById 10ms app.codeStats 0ms

/mwdumper/src/org/mediawiki/importer/XmlDumpReader.java

https://github.com/ChuguluGames/mediawiki-svn
Java | 405 lines | 284 code | 58 blank | 63 comment | 107 complexity | 86192d36ba117e75735486c3f366daeb MD5 | raw file
  1/*
  2 * MediaWiki import/export processing tools
  3 * Copyright 2005 by Brion Vibber
  4 *
  5 * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 * of this software and associated documentation files (the "Software"), to deal
  7 * in the Software without restriction, including without limitation the rights
  8 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 * copies of the Software, and to permit persons to whom the Software is
 10 * furnished to do so, subject to the following conditions:
 11 *
 12 * The above copyright notice and this permission notice shall be included in
 13 * all copies or substantial portions of the Software.
 14 *
 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 * SOFTWARE.
 22 *
 23 * $Id$
 24 */
 25
 26package org.mediawiki.importer;
 27
 28import java.io.IOException;
 29import java.io.InputStream;
 30import java.util.Calendar;
 31import java.util.GregorianCalendar;
 32import java.util.HashMap;
 33import java.util.Map;
 34import java.util.TimeZone;
 35
 36import javax.xml.parsers.ParserConfigurationException;
 37import javax.xml.parsers.SAXParser;
 38import javax.xml.parsers.SAXParserFactory;
 39
 40import org.xml.sax.Attributes;
 41import org.xml.sax.SAXException;
 42import org.xml.sax.helpers.DefaultHandler;
 43
 44public class XmlDumpReader  extends DefaultHandler {
 45	InputStream input;
 46	DumpWriter writer;
 47	
 48	private char[] buffer;
 49	private int len;
 50	private boolean hasContent = false;
 51	private boolean deleted = false;
 52	
 53	Siteinfo siteinfo;
 54	Page page;
 55	boolean pageSent;
 56	Contributor contrib;
 57	Revision rev;
 58	int nskey;
 59	
 60	boolean abortFlag;
 61	
	/**
	 * Creates a reader that parses a MediaWiki XML dump from the given
	 * stream and forwards the resulting events to one DumpWriter.
	 * To feed several outputs, pass a MultiWriter that fans out to them.
	 * @param inputStream Stream to read XML from.
	 * @param writer Output sink to send processed events to.
	 */
	public XmlDumpReader(InputStream inputStream, DumpWriter writer) {
		this.input = inputStream;
		this.writer = writer;
		// Start with an empty 4096-char text buffer; characters() grows it as needed.
		this.buffer = new char[4096];
		this.len = 0;
		this.hasContent = false;
	}
 76	
 77	/**
 78	 * Reads through the entire XML dump on the input stream, sending
 79	 * events to the DumpWriter as it goes. May throw exceptions on
 80	 * invalid input or due to problems with the output.
 81	 * @throws IOException
 82	 */
 83	public void readDump() throws IOException {
 84		try {
 85			SAXParserFactory factory = SAXParserFactory.newInstance();
 86			SAXParser parser = factory.newSAXParser();
 87	
 88			parser.parse(input, this);
 89		} catch (ParserConfigurationException e) {
 90			throw (IOException)new IOException(e.getMessage()).initCause(e);
 91		} catch (SAXException e) {
 92			throw (IOException)new IOException(e.getMessage()).initCause(e);
 93		}
 94		writer.close();
 95	}
 96	
 97	/**
 98	 * Request that the dump processing be aborted.
 99	 * At the next element, an exception will be thrown to stop the XML parser.
100	 * @fixme Is setting a bool thread-safe? It should be atomic...
101	 */
102	public void abort() {
103		abortFlag = true;
104	}
105	
106	// --------------------------
107	// SAX handler interface methods:
108	
109	private static final Map startElements = new HashMap(64);
110	private static final Map endElements = new HashMap(64);
111	static {
112		startElements.put("revision","revision");
113		startElements.put("contributor","contributor");
114		startElements.put("page","page");
115		startElements.put("mediawiki", "mediawiki");
116		startElements.put("siteinfo","siteinfo");
117		startElements.put("namespaces","namespaces");
118		startElements.put("namespace","namespace");
119
120		endElements.put("ThreadSubject","ThreadSubject");
121		endElements.put("ThreadParent","ThreadParent");
122		endElements.put("ThreadAncestor","ThreadAncestor");
123		endElements.put("ThreadPage","ThreadPage");
124		endElements.put("ThreadID","ThreadID");
125		endElements.put("ThreadSummaryPage","ThreadSummaryPage");
126		endElements.put("ThreadAuthor","ThreadAuthor");
127		endElements.put("ThreadEditStatus","ThreadEditStatus");
128		endElements.put("ThreadType","ThreadType");
129		endElements.put("base","base");
130		endElements.put("case","case");
131		endElements.put("comment","comment");
132		endElements.put("contributor","contributor");
133		endElements.put("generator","generator");
134		endElements.put("id","id");
135		endElements.put("ip","ip");
136		endElements.put("mediawiki", "mediawiki");
137		endElements.put("minor","minor");
138		endElements.put("namespaces","namespaces");
139		endElements.put("namespace","namespace");
140		endElements.put("page","page");
141		endElements.put("restrictions","restrictions");
142		endElements.put("revision","revision");
143		endElements.put("siteinfo","siteinfo");
144		endElements.put("sitename","sitename");
145		endElements.put("text","text");
146		endElements.put("timestamp","timestamp");
147		endElements.put("title","title");
148		endElements.put("username","username");
149	}
150	
151	public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException {
152		// Clear the buffer for character data; we'll initialize it
153		// if and when character data arrives -- at that point we
154		// have a length.
155		len = 0;
156		hasContent = false;
157		
158		if (abortFlag)
159			throw new SAXException("XmlDumpReader set abort flag.");
160
161		// check for deleted="deleted", and set deleted flag for the current element. 
162		String d = attributes.getValue("deleted");
163		deleted = (d!=null && d.equals("deleted")); 
164		
165		try {
166			qName = (String)startElements.get(qName);
167			if (qName == null)
168				return;
169			// frequent tags:
170			if (qName == "revision") openRevision();
171			else if (qName == "contributor") openContributor();
172			else if (qName == "page") openPage();
173			// rare tags:
174			else if (qName == "mediawiki") openMediaWiki();
175			else if (qName == "siteinfo") openSiteinfo();
176			else if (qName == "namespaces") openNamespaces();
177			else if (qName == "namespace") openNamespace(attributes);
178		} catch (IOException e) {
179			throw new SAXException(e);
180		}
181	}
182	
183	public void characters(char[] ch, int start, int length) {
184		if (buffer.length < len + length) {
185			int maxlen = buffer.length * 2;
186			if (maxlen < len + length)
187				maxlen = len + length;
188			char[] tmp = new char[maxlen];
189			System.arraycopy(buffer, 0, tmp, 0, len);
190			buffer = tmp;
191		}
192		System.arraycopy(ch, start, buffer, len, length);
193		len += length;
194		hasContent = true;
195	}
196	
197	public void endElement(String uri, String localname, String qName) throws SAXException {
198		try {
199			qName = (String)endElements.get(qName);
200			if (qName == null)
201				return;
202			// frequent tags:
203			if (qName == "id") readId();
204			else if (qName == "revision") closeRevision();
205			else if (qName == "timestamp") readTimestamp();
206			else if (qName == "text") readText();
207			else if (qName == "contributor") closeContributor();
208			else if (qName == "username") readUsername();
209			else if (qName == "ip") readIp();
210			else if (qName == "comment") readComment();
211			else if (qName == "minor") readMinor();
212			else if (qName == "page") closePage();
213			else if (qName == "title") readTitle();
214			else if (qName == "restrictions") readRestrictions();
215			// rare tags:
216			else if (qName.startsWith("Thread")) threadAttribute(qName);
217			else if (qName == "mediawiki") closeMediaWiki();
218			else if (qName == "siteinfo") closeSiteinfo();
219			else if (qName == "sitename") readSitename();
220			else if (qName == "base") readBase();
221			else if (qName == "generator") readGenerator();
222			else if (qName == "case") readCase();
223			else if (qName == "namespaces") closeNamespaces();
224			else if (qName == "namespace") closeNamespace();
225//			else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")");
226		} catch (IOException e) {
227			throw (SAXException)new SAXException(e.getMessage()).initCause(e);
228		}
229	}
230
231	// ----------
232	
233	void threadAttribute(String attrib) throws IOException {
234		if(attrib.equals("ThreadPage")) // parse title
235			page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces));
236		else
237			page.DiscussionThreadingInfo.put(attrib, bufferContents());
238	}
239	
240	void openMediaWiki() throws IOException {
241		siteinfo = null;
242		writer.writeStartWiki();
243	}
244	
245	void closeMediaWiki() throws IOException {
246		writer.writeEndWiki();
247		siteinfo = null;
248	}
249	
250	// ------------------
251		
252	void openSiteinfo() {
253		siteinfo = new Siteinfo();
254	}
255	
256	void closeSiteinfo() throws IOException {
257		writer.writeSiteinfo(siteinfo);
258	}
259
260	private String bufferContentsOrNull() {
261		if (!hasContent) return null;
262		else return bufferContents();
263	}
264	
265	private String bufferContents() {
266		return len == 0 ? "" : new String(buffer, 0, len);
267	}
268	
269	void readSitename() {
270		siteinfo.Sitename = bufferContents();
271	}
272	
273	void readBase() {
274		siteinfo.Base = bufferContents();
275	}
276	
277	void readGenerator() {
278		siteinfo.Generator = bufferContents();
279	}
280	
281	void readCase() {
282		siteinfo.Case = bufferContents();
283	}
284	
285	void openNamespaces() {
286		siteinfo.Namespaces = new NamespaceSet();
287	}
288	
289	void openNamespace(Attributes attribs) {
290		nskey = Integer.parseInt(attribs.getValue("key"));
291	}
292	
293	void closeNamespace() {
294		siteinfo.Namespaces.add(nskey, bufferContents());
295	}
296
297	void closeNamespaces() {
298		// NOP
299	}
300	
301	// -----------
302	
303	void openPage() {
304		page = new Page();
305		pageSent = false;
306	}
307	
308	void closePage() throws IOException {
309		if (pageSent)
310			writer.writeEndPage();
311		page = null;
312	}
313	
314	void readTitle() {
315		page.Title = new Title(bufferContents(), siteinfo.Namespaces);
316	}
317	
318	void readId() {
319		int id = Integer.parseInt(bufferContents());
320		if (contrib != null) 
321			contrib.Id = id;
322		else if (rev != null)
323			rev.Id = id;
324		else if (page != null)
325			page.Id = id;
326		else
327			throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>");
328	}
329	
330	void readRestrictions() {
331		page.Restrictions = bufferContents();
332	}
333	
334	// ------
335	
336	void openRevision() throws IOException {
337		if (!pageSent) {
338			writer.writeStartPage(page);
339			pageSent = true;
340		}
341		
342		rev = new Revision();
343	}
344	
345	void closeRevision() throws IOException {
346		writer.writeRevision(rev);
347		rev = null;
348	}
349
350	void readTimestamp() {
351		rev.Timestamp = parseUTCTimestamp(bufferContents());
352	}
353
354	void readComment() {
355		rev.Comment = bufferContentsOrNull();
356		if (rev.Comment==null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed
357	}
358
359	void readMinor() {
360		rev.Minor = true;
361	}
362
363	void readText() {
364		rev.Text = bufferContentsOrNull();
365		if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
366	}
367	
368	// -----------
369	void openContributor() {
370		//XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted"
371		contrib =  new Contributor();
372	}
373	
374	void closeContributor() {
375		//NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object
376		rev.Contributor = contrib;
377		contrib = null;
378	}
379
380
381	void readUsername() {
382		contrib.Username = bufferContentsOrNull();
383	}
384	
385	void readIp() {
386		contrib.Username = bufferContents();
387		contrib.isIP = true;
388	}
389	
390	private static final TimeZone utc = TimeZone.getTimeZone("UTC");
391	private static Calendar parseUTCTimestamp(String text) {
392		// 2003-10-26T04:50:47Z
393		// We're doing this manually for now, though DateFormatter might work...
394		String trimmed = text.trim();
395		GregorianCalendar ts = new GregorianCalendar(utc);
396		ts.set(
397			Integer.parseInt(trimmed.substring(0,0+4)),     // year
398			Integer.parseInt(trimmed.substring(5,5+2)) - 1, // month is 0-based!
399			Integer.parseInt(trimmed.substring(8,8+2)),     // day
400			Integer.parseInt(trimmed.substring(11,11+2)),   // hour
401			Integer.parseInt(trimmed.substring(14,14+2)),   // minute
402			Integer.parseInt(trimmed.substring(17,17+2)));  // second
403		return ts;
404	}
405}