/mwdumper/src/org/mediawiki/importer/XmlDumpReader.java

https://github.com/ChuguluGames/mediawiki-svn · Java · 405 lines · 284 code · 58 blank · 63 comment · 107 complexity · 86192d36ba117e75735486c3f366daeb MD5 · raw file

  1. /*
  2. * MediaWiki import/export processing tools
  3. * Copyright 2005 by Brion Vibber
  4. *
  5. * Permission is hereby granted, free of charge, to any person obtaining a copy
  6. * of this software and associated documentation files (the "Software"), to deal
  7. * in the Software without restriction, including without limitation the rights
  8. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. * copies of the Software, and to permit persons to whom the Software is
  10. * furnished to do so, subject to the following conditions:
  11. *
  12. * The above copyright notice and this permission notice shall be included in
  13. * all copies or substantial portions of the Software.
  14. *
  15. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. * SOFTWARE.
  22. *
  23. * $Id$
  24. */
  25. package org.mediawiki.importer;
  26. import java.io.IOException;
  27. import java.io.InputStream;
  28. import java.util.Calendar;
  29. import java.util.GregorianCalendar;
  30. import java.util.HashMap;
  31. import java.util.Map;
  32. import java.util.TimeZone;
  33. import javax.xml.parsers.ParserConfigurationException;
  34. import javax.xml.parsers.SAXParser;
  35. import javax.xml.parsers.SAXParserFactory;
  36. import org.xml.sax.Attributes;
  37. import org.xml.sax.SAXException;
  38. import org.xml.sax.helpers.DefaultHandler;
  39. public class XmlDumpReader extends DefaultHandler {
  40. InputStream input;
  41. DumpWriter writer;
  42. private char[] buffer;
  43. private int len;
  44. private boolean hasContent = false;
  45. private boolean deleted = false;
  46. Siteinfo siteinfo;
  47. Page page;
  48. boolean pageSent;
  49. Contributor contrib;
  50. Revision rev;
  51. int nskey;
  52. boolean abortFlag;
  53. /**
  54. * Initialize a processor for a MediaWiki XML dump stream.
  55. * Events are sent to a single DumpWriter output sink, but you
  56. * can chain multiple output processors with a MultiWriter.
  57. * @param inputStream Stream to read XML from.
  58. * @param writer Output sink to send processed events to.
  59. */
  60. public XmlDumpReader(InputStream inputStream, DumpWriter writer) {
  61. input = inputStream;
  62. this.writer = writer;
  63. buffer = new char[4096];
  64. len = 0;
  65. hasContent = false;
  66. }
  67. /**
  68. * Reads through the entire XML dump on the input stream, sending
  69. * events to the DumpWriter as it goes. May throw exceptions on
  70. * invalid input or due to problems with the output.
  71. * @throws IOException
  72. */
  73. public void readDump() throws IOException {
  74. try {
  75. SAXParserFactory factory = SAXParserFactory.newInstance();
  76. SAXParser parser = factory.newSAXParser();
  77. parser.parse(input, this);
  78. } catch (ParserConfigurationException e) {
  79. throw (IOException)new IOException(e.getMessage()).initCause(e);
  80. } catch (SAXException e) {
  81. throw (IOException)new IOException(e.getMessage()).initCause(e);
  82. }
  83. writer.close();
  84. }
  85. /**
  86. * Request that the dump processing be aborted.
  87. * At the next element, an exception will be thrown to stop the XML parser.
  88. * @fixme Is setting a bool thread-safe? It should be atomic...
  89. */
  90. public void abort() {
  91. abortFlag = true;
  92. }
  93. // --------------------------
  94. // SAX handler interface methods:
  95. private static final Map startElements = new HashMap(64);
  96. private static final Map endElements = new HashMap(64);
  97. static {
  98. startElements.put("revision","revision");
  99. startElements.put("contributor","contributor");
  100. startElements.put("page","page");
  101. startElements.put("mediawiki", "mediawiki");
  102. startElements.put("siteinfo","siteinfo");
  103. startElements.put("namespaces","namespaces");
  104. startElements.put("namespace","namespace");
  105. endElements.put("ThreadSubject","ThreadSubject");
  106. endElements.put("ThreadParent","ThreadParent");
  107. endElements.put("ThreadAncestor","ThreadAncestor");
  108. endElements.put("ThreadPage","ThreadPage");
  109. endElements.put("ThreadID","ThreadID");
  110. endElements.put("ThreadSummaryPage","ThreadSummaryPage");
  111. endElements.put("ThreadAuthor","ThreadAuthor");
  112. endElements.put("ThreadEditStatus","ThreadEditStatus");
  113. endElements.put("ThreadType","ThreadType");
  114. endElements.put("base","base");
  115. endElements.put("case","case");
  116. endElements.put("comment","comment");
  117. endElements.put("contributor","contributor");
  118. endElements.put("generator","generator");
  119. endElements.put("id","id");
  120. endElements.put("ip","ip");
  121. endElements.put("mediawiki", "mediawiki");
  122. endElements.put("minor","minor");
  123. endElements.put("namespaces","namespaces");
  124. endElements.put("namespace","namespace");
  125. endElements.put("page","page");
  126. endElements.put("restrictions","restrictions");
  127. endElements.put("revision","revision");
  128. endElements.put("siteinfo","siteinfo");
  129. endElements.put("sitename","sitename");
  130. endElements.put("text","text");
  131. endElements.put("timestamp","timestamp");
  132. endElements.put("title","title");
  133. endElements.put("username","username");
  134. }
  135. public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException {
  136. // Clear the buffer for character data; we'll initialize it
  137. // if and when character data arrives -- at that point we
  138. // have a length.
  139. len = 0;
  140. hasContent = false;
  141. if (abortFlag)
  142. throw new SAXException("XmlDumpReader set abort flag.");
  143. // check for deleted="deleted", and set deleted flag for the current element.
  144. String d = attributes.getValue("deleted");
  145. deleted = (d!=null && d.equals("deleted"));
  146. try {
  147. qName = (String)startElements.get(qName);
  148. if (qName == null)
  149. return;
  150. // frequent tags:
  151. if (qName == "revision") openRevision();
  152. else if (qName == "contributor") openContributor();
  153. else if (qName == "page") openPage();
  154. // rare tags:
  155. else if (qName == "mediawiki") openMediaWiki();
  156. else if (qName == "siteinfo") openSiteinfo();
  157. else if (qName == "namespaces") openNamespaces();
  158. else if (qName == "namespace") openNamespace(attributes);
  159. } catch (IOException e) {
  160. throw new SAXException(e);
  161. }
  162. }
  163. public void characters(char[] ch, int start, int length) {
  164. if (buffer.length < len + length) {
  165. int maxlen = buffer.length * 2;
  166. if (maxlen < len + length)
  167. maxlen = len + length;
  168. char[] tmp = new char[maxlen];
  169. System.arraycopy(buffer, 0, tmp, 0, len);
  170. buffer = tmp;
  171. }
  172. System.arraycopy(ch, start, buffer, len, length);
  173. len += length;
  174. hasContent = true;
  175. }
  176. public void endElement(String uri, String localname, String qName) throws SAXException {
  177. try {
  178. qName = (String)endElements.get(qName);
  179. if (qName == null)
  180. return;
  181. // frequent tags:
  182. if (qName == "id") readId();
  183. else if (qName == "revision") closeRevision();
  184. else if (qName == "timestamp") readTimestamp();
  185. else if (qName == "text") readText();
  186. else if (qName == "contributor") closeContributor();
  187. else if (qName == "username") readUsername();
  188. else if (qName == "ip") readIp();
  189. else if (qName == "comment") readComment();
  190. else if (qName == "minor") readMinor();
  191. else if (qName == "page") closePage();
  192. else if (qName == "title") readTitle();
  193. else if (qName == "restrictions") readRestrictions();
  194. // rare tags:
  195. else if (qName.startsWith("Thread")) threadAttribute(qName);
  196. else if (qName == "mediawiki") closeMediaWiki();
  197. else if (qName == "siteinfo") closeSiteinfo();
  198. else if (qName == "sitename") readSitename();
  199. else if (qName == "base") readBase();
  200. else if (qName == "generator") readGenerator();
  201. else if (qName == "case") readCase();
  202. else if (qName == "namespaces") closeNamespaces();
  203. else if (qName == "namespace") closeNamespace();
  204. // else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")");
  205. } catch (IOException e) {
  206. throw (SAXException)new SAXException(e.getMessage()).initCause(e);
  207. }
  208. }
  209. // ----------
  210. void threadAttribute(String attrib) throws IOException {
  211. if(attrib.equals("ThreadPage")) // parse title
  212. page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces));
  213. else
  214. page.DiscussionThreadingInfo.put(attrib, bufferContents());
  215. }
  216. void openMediaWiki() throws IOException {
  217. siteinfo = null;
  218. writer.writeStartWiki();
  219. }
  220. void closeMediaWiki() throws IOException {
  221. writer.writeEndWiki();
  222. siteinfo = null;
  223. }
  224. // ------------------
  225. void openSiteinfo() {
  226. siteinfo = new Siteinfo();
  227. }
  228. void closeSiteinfo() throws IOException {
  229. writer.writeSiteinfo(siteinfo);
  230. }
  231. private String bufferContentsOrNull() {
  232. if (!hasContent) return null;
  233. else return bufferContents();
  234. }
  235. private String bufferContents() {
  236. return len == 0 ? "" : new String(buffer, 0, len);
  237. }
  238. void readSitename() {
  239. siteinfo.Sitename = bufferContents();
  240. }
  241. void readBase() {
  242. siteinfo.Base = bufferContents();
  243. }
  244. void readGenerator() {
  245. siteinfo.Generator = bufferContents();
  246. }
  247. void readCase() {
  248. siteinfo.Case = bufferContents();
  249. }
  250. void openNamespaces() {
  251. siteinfo.Namespaces = new NamespaceSet();
  252. }
  253. void openNamespace(Attributes attribs) {
  254. nskey = Integer.parseInt(attribs.getValue("key"));
  255. }
  256. void closeNamespace() {
  257. siteinfo.Namespaces.add(nskey, bufferContents());
  258. }
  259. void closeNamespaces() {
  260. // NOP
  261. }
  262. // -----------
  263. void openPage() {
  264. page = new Page();
  265. pageSent = false;
  266. }
  267. void closePage() throws IOException {
  268. if (pageSent)
  269. writer.writeEndPage();
  270. page = null;
  271. }
  272. void readTitle() {
  273. page.Title = new Title(bufferContents(), siteinfo.Namespaces);
  274. }
  275. void readId() {
  276. int id = Integer.parseInt(bufferContents());
  277. if (contrib != null)
  278. contrib.Id = id;
  279. else if (rev != null)
  280. rev.Id = id;
  281. else if (page != null)
  282. page.Id = id;
  283. else
  284. throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>");
  285. }
  286. void readRestrictions() {
  287. page.Restrictions = bufferContents();
  288. }
  289. // ------
  290. void openRevision() throws IOException {
  291. if (!pageSent) {
  292. writer.writeStartPage(page);
  293. pageSent = true;
  294. }
  295. rev = new Revision();
  296. }
  297. void closeRevision() throws IOException {
  298. writer.writeRevision(rev);
  299. rev = null;
  300. }
  301. void readTimestamp() {
  302. rev.Timestamp = parseUTCTimestamp(bufferContents());
  303. }
  304. void readComment() {
  305. rev.Comment = bufferContentsOrNull();
  306. if (rev.Comment==null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed
  307. }
  308. void readMinor() {
  309. rev.Minor = true;
  310. }
  311. void readText() {
  312. rev.Text = bufferContentsOrNull();
  313. if (rev.Text==null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed
  314. }
  315. // -----------
  316. void openContributor() {
  317. //XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted"
  318. contrib = new Contributor();
  319. }
  320. void closeContributor() {
  321. //NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object
  322. rev.Contributor = contrib;
  323. contrib = null;
  324. }
  325. void readUsername() {
  326. contrib.Username = bufferContentsOrNull();
  327. }
  328. void readIp() {
  329. contrib.Username = bufferContents();
  330. contrib.isIP = true;
  331. }
  332. private static final TimeZone utc = TimeZone.getTimeZone("UTC");
  333. private static Calendar parseUTCTimestamp(String text) {
  334. // 2003-10-26T04:50:47Z
  335. // We're doing this manually for now, though DateFormatter might work...
  336. String trimmed = text.trim();
  337. GregorianCalendar ts = new GregorianCalendar(utc);
  338. ts.set(
  339. Integer.parseInt(trimmed.substring(0,0+4)), // year
  340. Integer.parseInt(trimmed.substring(5,5+2)) - 1, // month is 0-based!
  341. Integer.parseInt(trimmed.substring(8,8+2)), // day
  342. Integer.parseInt(trimmed.substring(11,11+2)), // hour
  343. Integer.parseInt(trimmed.substring(14,14+2)), // minute
  344. Integer.parseInt(trimmed.substring(17,17+2))); // second
  345. return ts;
  346. }
  347. }