PageRenderTime 61ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java

https://github.com/lritter/gnutch
Java | 229 lines | 179 code | 14 blank | 36 comment | 5 complexity | 89c0e58ea2ef4f5257dee3754fce8523 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /* Copyright 2004 Ryan Ackley
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. package org.apache.nutch.parse.msword;
  16. import org.apache.nutch.parse.msword.chp.*;
  17. import org.apache.poi.util.LittleEndian;
  18. import org.apache.poi.hwpf.model.*;
  19. import java.util.*;
  20. /**
  21. * This class is used to extract text from Word 6 documents only. It should
  22. * only be called from the org.textmining.text.extraction.WordExtractor because
  23. * it will automatically determine the version.
  24. *
  25. * @author Ryan Ackley
  26. */
  27. class Word6Extractor
  28. {
  29. public Word6Extractor()
  30. {
  31. }
  32. /**
  33. * Extracts the text
  34. *
  35. * @param mainStream The POIFS document stream entitled "WordDocument".
  36. *
  37. * @return The text from the document
  38. * @throws Exception If there are any unexpected exceptions.
  39. */
  40. public String extractText(byte[] mainStream) throws Exception
  41. {
  42. int fcMin = LittleEndian.getInt(mainStream, 0x18);
  43. int fcMax = LittleEndian.getInt(mainStream, 0x1C);
  44. int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
  45. int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
  46. // get a list of character properties
  47. Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
  48. chpTableSize, fcMin);
  49. List textRuns = chpTable.getTextRuns();
  50. // iterate through the
  51. WordTextBuffer finalTextBuf = new WordTextBuffer();
  52. Iterator runsIt = textRuns.iterator();
  53. while(runsIt.hasNext())
  54. {
  55. CHPX chpx = (CHPX)runsIt.next();
  56. int runStart = chpx.getStart() + fcMin;
  57. int runEnd = chpx.getEnd() + fcMin;
  58. if (!isDeleted(chpx.getGrpprl()))
  59. {
  60. String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
  61. finalTextBuf.append(s);
  62. if (runEnd >= fcMax)
  63. {
  64. break;
  65. }
  66. }
  67. }
  68. return finalTextBuf.toString();
  69. }
  70. /**
  71. * Used to determine if a run of text has been deleted.
  72. * @param grpprl The list of sprms for this run of text.
  73. * @return
  74. */
  75. private boolean isDeleted(byte[] grpprl)
  76. {
  77. int offset = 0;
  78. boolean deleted = false;
  79. while (offset < grpprl.length)
  80. {
  81. switch (LittleEndian.getUnsignedByte(grpprl, offset++))
  82. {
  83. case 65:
  84. deleted = grpprl[offset++] != 0;
  85. break;
  86. case 66:
  87. offset++;
  88. break;
  89. case 67:
  90. offset++;
  91. break;
  92. case 68:
  93. offset += grpprl[offset];
  94. break;
  95. case 69:
  96. offset += 2;
  97. break;
  98. case 70:
  99. offset += 4;
  100. break;
  101. case 71:
  102. offset++;
  103. break;
  104. case 72:
  105. offset += 2;
  106. break;
  107. case 73:
  108. offset += 3;
  109. break;
  110. case 74:
  111. offset += grpprl[offset];
  112. break;
  113. case 75:
  114. offset++;
  115. break;
  116. case 80:
  117. offset += 2;
  118. break;
  119. case 81:
  120. offset += grpprl[offset];
  121. break;
  122. case 82:
  123. offset += grpprl[offset];
  124. break;
  125. case 83:
  126. break;
  127. case 85:
  128. offset++;
  129. break;
  130. case 86:
  131. offset++;
  132. break;
  133. case 87:
  134. offset++;
  135. break;
  136. case 88:
  137. offset++;
  138. break;
  139. case 89:
  140. offset++;
  141. break;
  142. case 90:
  143. offset++;
  144. break;
  145. case 91:
  146. offset++;
  147. break;
  148. case 92:
  149. offset++;
  150. break;
  151. case 93:
  152. offset += 2;
  153. break;
  154. case 94:
  155. offset++;
  156. break;
  157. case 95:
  158. offset += 3;
  159. break;
  160. case 96:
  161. offset += 2;
  162. break;
  163. case 97:
  164. offset += 2;
  165. break;
  166. case 98:
  167. offset++;
  168. break;
  169. case 99:
  170. offset++;
  171. break;
  172. case 100:
  173. offset++;
  174. break;
  175. case 101:
  176. offset++;
  177. break;
  178. case 102:
  179. offset++;
  180. break;
  181. case 103:
  182. offset += grpprl[offset];
  183. break;
  184. case 104:
  185. offset++;
  186. break;
  187. case 105:
  188. offset += grpprl[offset];
  189. break;
  190. case 106:
  191. offset += grpprl[offset];
  192. break;
  193. case 107:
  194. offset += 2;
  195. break;
  196. case 108:
  197. offset += grpprl[offset];
  198. break;
  199. case 109:
  200. offset += 2;
  201. break;
  202. case 110:
  203. offset += 2;
  204. break;
  205. case 117:
  206. offset++;
  207. break;
  208. case 118:
  209. offset++;
  210. break;
  211. }
  212. }
  213. return deleted;
  214. }
  215. }