PageRenderTime 60ms CodeModel.GetById 17ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java

https://github.com/lritter/gnutch
Java | 230 lines | 151 code | 32 blank | 47 comment | 21 complexity | 40789e266310b897b848347166114477 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /* Copyright 2004 Ryan Ackley
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. package org.apache.nutch.parse.msword;
  16. // JDK imports
  17. import java.io.InputStream;
  18. import java.util.ArrayList;
  19. import java.util.Iterator;
  20. import java.util.List;
  21. // Jakarta POI imports
  22. import org.apache.poi.hwpf.model.CHPBinTable;
  23. import org.apache.poi.hwpf.model.CHPX;
  24. import org.apache.poi.hwpf.model.ComplexFileTable;
  25. import org.apache.poi.hwpf.model.TextPiece;
  26. import org.apache.poi.hwpf.model.TextPieceTable;
  27. import org.apache.poi.hwpf.sprm.SprmIterator;
  28. import org.apache.poi.hwpf.sprm.SprmOperation;
  29. import org.apache.poi.poifs.filesystem.DocumentEntry;
  30. import org.apache.poi.poifs.filesystem.DocumentInputStream;
  31. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  32. import org.apache.poi.util.LittleEndian;
  33. // Nutch imports
  34. import org.apache.nutch.parse.ms.MSExtractor;
  35. /**
  36. * This class extracts the text from a Word 6.0/95/97/2000/XP word doc
  37. *
  38. * @author Ryan Ackley
  39. * @author Andy Hedges
  40. * @author Jérôme Charron
  41. *
  42. */
  43. class WordExtractor extends MSExtractor {
  44. /**
  45. * Gets the text from a Word document.
  46. *
  47. * @param in The InputStream representing the Word file.
  48. */
  49. protected String extractText(InputStream in) throws Exception {
  50. ArrayList text = new ArrayList();
  51. POIFSFileSystem fsys = new POIFSFileSystem(in);
  52. // load our POIFS document streams.
  53. DocumentEntry headerProps =
  54. (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
  55. DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
  56. byte[] header = new byte[headerProps.getSize()];
  57. din.read(header);
  58. din.close();
  59. int info = LittleEndian.getShort(header, 0xa);
  60. if ((info & 0x4) != 0)
  61. {
  62. throw new FastSavedException("Fast-saved files are unsupported at this time");
  63. }
  64. if ((info & 0x100) != 0)
  65. {
  66. throw new PasswordProtectedException("This document is password protected");
  67. }
  68. // determine the version of Word this document came from.
  69. int nFib = LittleEndian.getShort(header, 0x2);
  70. switch (nFib)
  71. {
  72. case 101:
  73. case 102:
  74. case 103:
  75. case 104:
  76. // this is a Word 6.0 doc send it to the extractor for that version.
  77. Word6Extractor oldExtractor = new Word6Extractor();
  78. return oldExtractor.extractText(header);
  79. }
  80. //Get the information we need from the header
  81. boolean useTable1 = (info & 0x200) != 0;
  82. //get the location of the piece table
  83. int complexOffset = LittleEndian.getInt(header, 0x1a2);
  84. // determine which table stream we must use.
  85. String tableName = null;
  86. if (useTable1)
  87. {
  88. tableName = "1Table";
  89. }
  90. else
  91. {
  92. tableName = "0Table";
  93. }
  94. DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
  95. byte[] tableStream = new byte[table.getSize()];
  96. din = fsys.createDocumentInputStream(tableName);
  97. din.read(tableStream);
  98. din.close();
  99. int chpOffset = LittleEndian.getInt(header, 0xfa);
  100. int chpSize = LittleEndian.getInt(header, 0xfe);
  101. int fcMin = LittleEndian.getInt(header, 0x18);
  102. CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
  103. // load our text pieces and our character runs
  104. ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
  105. TextPieceTable tpt = cft.getTextPieceTable();
  106. List textPieces = tpt.getTextPieces();
  107. // make the POIFS objects available for garbage collection
  108. din = null;
  109. fsys = null;
  110. table = null;
  111. headerProps = null;
  112. List textRuns = cbt.getTextRuns();
  113. Iterator runIt = textRuns.iterator();
  114. Iterator textIt = textPieces.iterator();
  115. TextPiece currentPiece = (TextPiece)textIt.next();
  116. int currentTextStart = currentPiece.getStart();
  117. int currentTextEnd = currentPiece.getEnd();
  118. WordTextBuffer finalTextBuf = new WordTextBuffer();
  119. // iterate through all text runs extract the text only if they haven't been
  120. // deleted
  121. while (runIt.hasNext())
  122. {
  123. CHPX chpx = (CHPX)runIt.next();
  124. boolean deleted = isDeleted(chpx.getGrpprl());
  125. if (deleted)
  126. {
  127. continue;
  128. }
  129. int runStart = chpx.getStart();
  130. int runEnd = chpx.getEnd();
  131. while (runStart >= currentTextEnd)
  132. {
  133. currentPiece = (TextPiece) textIt.next ();
  134. currentTextStart = currentPiece.getStart ();
  135. currentTextEnd = currentPiece.getEnd ();
  136. }
  137. if (runEnd < currentTextEnd)
  138. {
  139. String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
  140. finalTextBuf.append(str);
  141. }
  142. else if (runEnd > currentTextEnd)
  143. {
  144. while (runEnd > currentTextEnd)
  145. {
  146. String str = currentPiece.substring(runStart - currentTextStart,
  147. currentTextEnd - currentTextStart);
  148. finalTextBuf.append(str);
  149. if (textIt.hasNext())
  150. {
  151. currentPiece = (TextPiece) textIt.next ();
  152. currentTextStart = currentPiece.getStart ();
  153. runStart = currentTextStart;
  154. currentTextEnd = currentPiece.getEnd ();
  155. }
  156. else
  157. {
  158. return finalTextBuf.toString();
  159. }
  160. }
  161. String str = currentPiece.substring(0, runEnd - currentTextStart);
  162. finalTextBuf.append(str);
  163. }
  164. else
  165. {
  166. String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
  167. if (textIt.hasNext())
  168. {
  169. currentPiece = (TextPiece) textIt.next();
  170. currentTextStart = currentPiece.getStart();
  171. currentTextEnd = currentPiece.getEnd();
  172. }
  173. finalTextBuf.append(str);
  174. }
  175. }
  176. return finalTextBuf.toString();
  177. }
  178. /**
  179. * Used to determine if a run of text has been deleted.
  180. *
  181. * @param grpprl The list of sprms for a particular run of text.
  182. * @return true if this run of text has been deleted.
  183. */
  184. private boolean isDeleted(byte[] grpprl)
  185. {
  186. SprmIterator iterator = new SprmIterator(grpprl,0);
  187. while (iterator.hasNext())
  188. {
  189. SprmOperation op = iterator.next();
  190. // 0 is the operation that signals a FDelRMark operation
  191. if (op.getOperation() == 0 && op.getOperand() != 0)
  192. {
  193. return true;
  194. }
  195. }
  196. return false;
  197. }
  198. }