PageRenderTime 405ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java

https://github.com/lritter/gnutch
Java | 431 lines | 213 code | 56 blank | 162 comment | 85 complexity | 75c37848bf3d7cb3ed9cca98daec6b70 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.nutch.parse.mspowerpoint;
  18. import java.util.Enumeration;
  19. import java.util.Hashtable;
  20. import java.util.List;
  21. import java.util.Vector;
  22. import org.apache.commons.logging.Log;
  23. import org.apache.commons.logging.LogFactory;
  24. import org.apache.poi.hdf.extractor.Utils;
  25. import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
  26. import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
  27. import org.apache.poi.poifs.filesystem.DocumentInputStream;
  28. import org.apache.poi.util.LittleEndian;
  29. import org.apache.poi.util.StringUtil;
  30. /**
  31. * Listener that reads the content of a PowerPoint file and transfers it to the
  32. * passed <code>StringBuffer</code>.
  33. *
  34. * @author Stephan Strittmatter - http://www.sybit.de
  35. *
  36. * @version 1.0
  37. *
  38. */
  39. class ContentReaderListener implements POIFSReaderListener {
  40. private static final Log LOG = LogFactory.getLog(ContentReaderListener.class);
  41. /** Buffer holding the content of the file */
  42. protected final transient StringBuffer buf;
  /**
   * Creates a listener that writes the text it extracts from a PowerPoint
   * file into the given buffer.
   *
   * @param content
   *          buffer receiving the extracted text; the reference itself is
   *          stored, no copy is made.
   */
  public ContentReaderListener(final StringBuffer content) {
    buf = content;
  }
  52. /**
  53. * Reads the internal PowerPoint document stream.
  54. *
  55. * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
  56. */
  57. public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
  58. if (event == null || event.getName() == null
  59. || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
  60. if (LOG.isWarnEnabled()) {
  61. LOG.warn("Stream not processed. It is not a PowerPoint document: : "
  62. + event.getName());
  63. }
  64. return;
  65. }
  66. try {
  67. final DocumentInputStream dis = event.getStream();
  68. final byte pptdata[] = new byte[dis.available()];
  69. dis.read(pptdata, 0, dis.available());
  70. int offset = 0;
  71. long offsetPD = 0;
  72. /*
  73. * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text
  74. * in all PlaceHolders to hold PPTClientTextBox objects for mapping into
  75. * Slide Objects
  76. */
  77. Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
  78. * <Long,
  79. * TextBox>
  80. */();
  81. // Traverse ByteArray to identiy edit paths of ClientTextBoxes
  82. long n = pptdata.length - 20;
  83. for (long i = 0; i < n; i++) {
  84. final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
  85. // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
  86. if (PPTConstants.PPT_ATOM_USEREDIT == type) {
  87. /*
  88. * Checking the Record Header (UserEditAtom)
  89. */
  90. // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8);
  91. // final long version = LittleEndian.getUInt(pptdata, (int) i + 12);
  92. offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
  93. offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
  94. /*
  95. * Call to extract ClientTextBox text in each UserEditAtom
  96. */
  97. containerTextBox = extractTextBoxes(containerTextBox, offset,
  98. pptdata, offsetPD);
  99. } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
  100. // if (LOG.isTraceEnabled()) {
  101. // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
  102. // }
  103. } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
  104. // if (LOG.isTraceEnabled()) {
  105. // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
  106. // }
  107. } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
  108. // if (LOG.isTraceEnabled()) {
  109. // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
  110. // }
  111. } else {
  112. // no action
  113. // if (LOG.isTraceEnabled()) {
  114. // LOG.trace("type not handled: " + type);
  115. // }
  116. }
  117. }
  118. final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
  119. offsetPD);
  120. if (slides.size() == 0) {
  121. if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); }
  122. } else {
  123. Slide slide = (Slide) slides.get(slides.size() - 1);
  124. for (Enumeration enumeration = containerTextBox.elements(); enumeration
  125. .hasMoreElements();) {
  126. final TextBox textBox = (TextBox) enumeration.nextElement();
  127. slide.addContent(textBox.getContent());
  128. }
  129. /*
  130. * Merging TextBox data with Slide Data Printing the text from Slides
  131. * vector object.
  132. */
  133. List scontent;
  134. for (int i = 0; i < slides.size(); i++) {
  135. slide = (Slide) slides.get(i);
  136. scontent = slide.getContent();
  137. String contentText;
  138. for (int j = 0; j < scontent.size(); j++) {
  139. contentText = scontent.get(j).toString();
  140. this.buf.append(contentText);
  141. // to avoid concatinated words we add a blank additional
  142. if (contentText.length() > 0
  143. && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
  144. this.buf.append(" ");
  145. }
  146. }
  147. }
  148. }
  149. } catch (Throwable ex) {
  150. // because of not killing complete crawling all Throwables are catched.
  151. if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); }
  152. }
  153. }
  154. /**
  155. * Extracts the client text boxes of a slide.
  156. *
  157. * @param containerTextBox
  158. * @param offset
  159. * @param pptdata
  160. * @param offsetPD
  161. * @return Hashtable
  162. * @see TextBox
  163. */
  164. protected Hashtable/* <Long, TextBox> */extractTextBoxes(
  165. final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
  166. final byte[] pptdata, final long offsetPD) {
  167. // To hold temporary data
  168. FilteredStringWriter outStream = new FilteredStringWriter();
  169. TextBox textBox;
  170. // Traversing the bytearray up to Presist directory position
  171. for (int i = offset; i < offsetPD - 20; i++) {
  172. try {
  173. // Record info
  174. // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
  175. // Record Type
  176. final long recordType = LittleEndian.getUShort(pptdata, i + 2);
  177. // Record Size
  178. final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
  179. if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
  180. /*
  181. * Record type is of Drawing Group
  182. */
  183. // Total number of objects
  184. // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
  185. // 8);
  186. // currentID = Group ID+number of objects
  187. long currentID = LittleEndian.getInt(pptdata, i + 12);
  188. currentID = ((int) (currentID / 1024)) * 1024;
  189. if (currentID == PPTConstants.PPT_MASTERSLIDE) {
  190. // Ignore Master Slide objects
  191. if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); }
  192. i++;
  193. continue;
  194. }
  195. // Check for the ClientTextBox GroupID existence
  196. if (containerTextBox.containsKey(new Long(currentID))) {
  197. // If exists get Client Textbox Group
  198. textBox = (TextBox) containerTextBox.get(new Long(currentID));
  199. textBox.setContent("");
  200. } else {
  201. textBox = new TextBox(currentID);
  202. containerTextBox.put(new Long(currentID), textBox);
  203. }
  204. /*
  205. * Iterating the bytearray for TextCharAtoms and TextBytesAtom
  206. */
  207. if ((offsetPD - 20) != recordSize) {
  208. // TODO something wrong? Probably an OLE-Object, which we ignore.
  209. if (LOG.isDebugEnabled()) {
  210. LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
  211. + recordSize);
  212. }
  213. } else {
  214. for (int startPos = i + 8; startPos < offsetPD - 20
  215. && startPos < recordSize; startPos++) { // && startPos <
  216. // recordSize??
  217. try {
  218. // Record info
  219. // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);
  220. // Record Type
  221. final long ntype = LittleEndian
  222. .getUShort(pptdata, startPos + 2);
  223. // Record size
  224. // Note that the size doesn't include the 8 byte atom header
  225. final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);
  226. if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
  227. /*
  228. * Break the loop if next GroupID found
  229. */
  230. i = startPos - 1;
  231. break;
  232. } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
  233. // TextByteAtom record
  234. outStream = new FilteredStringWriter();
  235. long ii = 0;
  236. for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
  237. // For loop to changed to a function
  238. // if ((ii + 2) >= pptdata.length)
  239. // break; // FIXME
  240. outStream.write((char) (pptdata[(int) ii + 2]));
  241. }
  242. // Setting the identified text for Current
  243. // groupID
  244. textBox.setContent(textBox.getContent()
  245. + outStream.toString());
  246. } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
  247. // TextCharAtom record
  248. final String strTempContent = new String(pptdata,
  249. startPos + 6, (int) (nsize) + 2);
  250. final byte bytes[] = strTempContent.getBytes();
  251. if (true) {
  252. outStream = new FilteredStringWriter();
  253. for (int ii = 0; ii < bytes.length - 1; ii += 2) {
  254. // For loop to changed to a function
  255. outStream.write((char) (pptdata[ii + 2]));
  256. }
  257. textBox.setContent(textBox.getContent()
  258. + outStream.toString());
  259. } else {
  260. // this version is used within POI
  261. String text = StringUtil.getFromCompressedUnicode(bytes, 0,
  262. bytes.length);
  263. textBox.setContent(textBox.getContent() + text);
  264. }
  265. } else {
  266. // ignored
  267. // if (LOG.isTraceEnabled()) {
  268. // LOG.trace("Ignored atom type: " + type);
  269. // }
  270. }
  271. } catch (Throwable e) {
  272. if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); }
  273. break;
  274. }
  275. }
  276. }
  277. } else {
  278. // Record type is ignored
  279. // if (LOG.isTraceEnabled()) {
  280. // LOG.trace("Ignored record type: " + type);
  281. // }
  282. }
  283. } catch (Throwable ee) {
  284. if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); }
  285. break;
  286. }
  287. }
  288. return containerTextBox;
  289. }
  290. /**
  291. * Returns the Powerpoint <code>Slide</code> s of document as vector.
  292. *
  293. * @param offset
  294. * @param pptdata
  295. * @param offsetPD
  296. * @return Vector of the powerpoint slides. Contains
  297. * <code>{@link Slide Slide}</code>
  298. * @see Slide
  299. */
  300. protected List /* <Slide> */extractSlides(final long offset,
  301. final byte[] pptdata, final long offsetPD) {
  302. int sNum = 0;
  303. // List of all slides found
  304. final List/* <Slide> */slides = new Vector/* <Slide> */();
  305. // current slide data
  306. Slide currentSlide = null;
  307. // To store data found in TextCharAtoms and TextBytesAtoms
  308. FilteredStringWriter outStream;
  309. for (long i = offset; i < pptdata.length - 20; i++) {
  310. final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
  311. final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
  312. final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
  313. if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
  314. /*
  315. * TextByteAtom record
  316. */
  317. outStream = new FilteredStringWriter();
  318. for (long ii = i + 6; (ii <= i + 6 + atomSize)
  319. && (ii + 2 < pptdata.length); ii++) {
  320. try {
  321. // if(ii+2 >= pptdata.length) break; //FIXME
  322. byte value = pptdata[(int) ii + 2];
  323. outStream.write(value);
  324. } catch (ArrayIndexOutOfBoundsException ex) {
  325. if (LOG.isTraceEnabled()) { LOG.trace("size=" + pptdata.length); }
  326. if (LOG.isErrorEnabled()) { LOG.error("extractSlides", ex); }
  327. }
  328. }
  329. // Setting the identified text for Current Slide
  330. if (currentSlide != null) {
  331. currentSlide.addContent(outStream.toString());
  332. }
  333. } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
  334. /*
  335. * TextCharAtom record
  336. */
  337. outStream = new FilteredStringWriter();
  338. final String strTempContent = new String(pptdata, (int) i + 6,
  339. (int) (atomSize) + 2);
  340. final byte bytes[] = strTempContent.getBytes();
  341. for (int ii = 0; ii < bytes.length - 1; ii += 2) {
  342. outStream.write(Utils.getUnicodeCharacter(bytes, ii));
  343. }
  344. // Setting the identified text for Current Slide
  345. if (currentSlide != null) {
  346. currentSlide.addContent(outStream.toString());
  347. }
  348. } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
  349. /*
  350. * SlidePresistAtom Record
  351. */
  352. if (sNum != 0) {
  353. outStream = new FilteredStringWriter();
  354. final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20);
  355. currentSlide = new Slide(slideID);
  356. // currentSlide.addContent(outStream.toString());
  357. slides.add(currentSlide);
  358. }
  359. sNum++;
  360. } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
  361. /*
  362. * Diagram records are ignored
  363. */
  364. if (LOG.isTraceEnabled()) { LOG.trace("Drawing Groups are ignored."); }
  365. break;
  366. } else {
  367. // ignored
  368. // if (LOG.isTraceEnabled()) {
  369. // LOG.trace("Unhandled atomType: " + atomType);
  370. // }
  371. }
  372. }
  373. return slides;
  374. }
  375. }