PageRenderTime 51ms CodeModel.GetById 20ms RepoModel.GetById 1ms app.codeStats 0ms

/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java

https://github.com/apache/tika
Java | 384 lines | 279 code | 67 blank | 38 comment | 29 complexity | e87a6708a4ccb1c1224d20be76800ce2 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft.ooxml.xwpf;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.util.Date;
  21. import java.util.HashMap;
  22. import java.util.List;
  23. import java.util.Map;
  24. import javax.xml.parsers.ParserConfigurationException;
  25. import org.apache.commons.io.input.CloseShieldInputStream;
  26. import org.apache.poi.ooxml.POIXMLDocument;
  27. import org.apache.poi.ooxml.POIXMLProperties;
  28. import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
  29. import org.apache.poi.ooxml.util.SAXHelper;
  30. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  31. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  32. import org.apache.poi.openxml4j.opc.OPCPackage;
  33. import org.apache.poi.openxml4j.opc.PackageAccess;
  34. import org.apache.poi.openxml4j.opc.PackagePart;
  35. import org.apache.poi.openxml4j.opc.PackageRelationship;
  36. import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
  37. import org.apache.poi.xwpf.usermodel.XWPFNumbering;
  38. import org.apache.poi.xwpf.usermodel.XWPFRelation;
  39. import org.apache.xmlbeans.XmlException;
  40. import org.slf4j.Logger;
  41. import org.slf4j.LoggerFactory;
  42. import org.xml.sax.InputSource;
  43. import org.xml.sax.SAXException;
  44. import org.xml.sax.XMLReader;
  45. import org.apache.tika.exception.RuntimeSAXException;
  46. import org.apache.tika.exception.WriteLimitReachedException;
  47. import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
  48. import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
  49. import org.apache.tika.parser.microsoft.ooxml.RunProperties;
  50. import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
  51. //TODO: move this into POI?
  52. /**
  53. * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
  54. */
  55. public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
  56. private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
  57. private OPCPackage container;
  58. private POIXMLProperties properties;
  59. public XWPFEventBasedWordExtractor(String path)
  60. throws XmlException, OpenXML4JException, IOException {
  61. this(OPCPackage.open(path, PackageAccess.READ));
  62. }
  63. public XWPFEventBasedWordExtractor(OPCPackage container)
  64. throws XmlException, OpenXML4JException, IOException {
  65. super((POIXMLDocument) null);
  66. this.container = container;
  67. this.properties = new POIXMLProperties(container);
  68. }
  69. public static void main(String[] args) throws Exception {
  70. if (args.length < 1) {
  71. System.err.println("Use:");
  72. System.err.println(" XWPFEventBasedWordExtractor <filename.xlsx>");
  73. System.exit(1);
  74. }
  75. XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
  76. System.out.println(extractor.getText());
  77. extractor.close();
  78. }
  79. public OPCPackage getPackage() {
  80. return this.container;
  81. }
  82. public POIXMLProperties.CoreProperties getCoreProperties() {
  83. return this.properties.getCoreProperties();
  84. }
  85. public POIXMLProperties.ExtendedProperties getExtendedProperties() {
  86. return this.properties.getExtendedProperties();
  87. }
  88. public POIXMLProperties.CustomProperties getCustomProperties() {
  89. return this.properties.getCustomProperties();
  90. }
  91. @Override
  92. public String getText() {
  93. StringBuilder sb = new StringBuilder();
  94. //handle main document
  95. List<PackagePart> pps =
  96. container.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
  97. if (pps != null) {
  98. for (PackagePart pp : pps) {
  99. //likely only one, but why not...
  100. try {
  101. handleDocumentPart(pp, sb);
  102. } catch (IOException e) {
  103. LOG.warn("IOException handling document part", e);
  104. } catch (SAXException e) {
  105. if (WriteLimitReachedException.isWriteLimitReached(e)) {
  106. throw new RuntimeSAXException(e);
  107. }
  108. //swallow this because we don't actually call it
  109. LOG.warn("SAXException handling document part", e);
  110. }
  111. }
  112. }
  113. //handle glossary document
  114. pps = container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
  115. if (pps != null) {
  116. for (PackagePart pp : pps) {
  117. //likely only one, but why not...
  118. try {
  119. handleDocumentPart(pp, sb);
  120. } catch (IOException e) {
  121. LOG.warn("IOException handling glossary document part", e);
  122. } catch (SAXException e) {
  123. if (WriteLimitReachedException.isWriteLimitReached(e)) {
  124. throw new RuntimeSAXException(e);
  125. }
  126. //swallow this because we don't actually call it
  127. LOG.warn("SAXException handling glossary document part", e);
  128. }
  129. }
  130. }
  131. return sb.toString();
  132. }
  133. private void handleDocumentPart(PackagePart documentPart, StringBuilder sb)
  134. throws IOException, SAXException {
  135. //load the numbering/list manager and styles from the main document part
  136. XWPFNumbering numbering = loadNumbering(documentPart);
  137. XWPFListManager xwpfListManager = new XWPFListManager(numbering);
  138. //TODO: XWPFStyles styles = loadStyles(documentPart);
  139. //headers
  140. try {
  141. PackageRelationshipCollection headersPRC =
  142. documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
  143. if (headersPRC != null) {
  144. for (int i = 0; i < headersPRC.size(); i++) {
  145. PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i));
  146. handlePart(header, xwpfListManager, sb);
  147. }
  148. }
  149. } catch (InvalidFormatException e) {
  150. LOG.warn("Invalid format", e);
  151. }
  152. //main document
  153. handlePart(documentPart, xwpfListManager, sb);
  154. //for now, just dump other components at end
  155. for (XWPFRelation rel : new XWPFRelation[]{XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT,
  156. XWPFRelation.FOOTER, XWPFRelation.ENDNOTE}) {
  157. try {
  158. PackageRelationshipCollection prc =
  159. documentPart.getRelationshipsByType(rel.getRelation());
  160. if (prc != null) {
  161. for (int i = 0; i < prc.size(); i++) {
  162. PackagePart packagePart =
  163. documentPart.getRelatedPart(prc.getRelationship(i));
  164. handlePart(packagePart, xwpfListManager, sb);
  165. }
  166. }
  167. } catch (InvalidFormatException e) {
  168. LOG.warn("Invalid format", e);
  169. }
  170. }
  171. }
  172. private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager,
  173. StringBuilder buffer) throws IOException, SAXException {
  174. Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
  175. try (InputStream stream = packagePart.getInputStream()) {
  176. XMLReader reader = SAXHelper.newXMLReader();
  177. reader.setContentHandler(
  178. new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer),
  179. hyperlinks));
  180. reader.parse(new InputSource(new CloseShieldInputStream(stream)));
  181. } catch (ParserConfigurationException e) {
  182. LOG.warn("Can't configure XMLReader", e);
  183. }
  184. }
  185. private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
  186. Map<String, String> hyperlinks = new HashMap<>();
  187. try {
  188. PackageRelationshipCollection prc =
  189. bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
  190. for (int i = 0; i < prc.size(); i++) {
  191. PackageRelationship pr = prc.getRelationship(i);
  192. if (pr == null) {
  193. continue;
  194. }
  195. String id = pr.getId();
  196. String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
  197. if (id != null && url != null) {
  198. hyperlinks.put(id, url);
  199. }
  200. }
  201. } catch (InvalidFormatException e) {
  202. LOG.warn("Invalid format", e);
  203. }
  204. return hyperlinks;
  205. }
  206. private XWPFNumbering loadNumbering(PackagePart packagePart) throws IOException {
  207. try {
  208. PackageRelationshipCollection numberingParts =
  209. packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
  210. if (numberingParts.size() > 0) {
  211. PackageRelationship numberingRelationShip = numberingParts.getRelationship(0);
  212. if (numberingRelationShip == null) {
  213. return null;
  214. }
  215. PackagePart numberingPart = container.getPart(numberingRelationShip);
  216. if (numberingPart == null) {
  217. return null;
  218. }
  219. return new XWPFNumbering(numberingPart);
  220. }
  221. } catch (OpenXML4JException e) {
  222. LOG.warn("Couldn't load numbering", e);
  223. }
  224. return null;
  225. }
  226. private static class XWPFToTextContentHandler
  227. implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
  228. private final StringBuilder buffer;
  229. public XWPFToTextContentHandler(StringBuilder buffer) {
  230. this.buffer = buffer;
  231. }
  232. @Override
  233. public void run(RunProperties runProperties, String contents) {
  234. buffer.append(contents);
  235. }
  236. @Override
  237. public void hyperlinkStart(String link) {
  238. //no-op
  239. }
  240. @Override
  241. public void hyperlinkEnd() {
  242. //no-op
  243. }
  244. @Override
  245. public void startParagraph(ParagraphProperties paragraphProperties) {
  246. //no-op
  247. }
  248. @Override
  249. public void endParagraph() {
  250. buffer.append("\n");
  251. }
  252. @Override
  253. public void startTable() {
  254. }
  255. @Override
  256. public void endTable() {
  257. }
  258. @Override
  259. public void startTableRow() {
  260. }
  261. @Override
  262. public void endTableRow() {
  263. buffer.append("\n");
  264. }
  265. @Override
  266. public void startTableCell() {
  267. }
  268. @Override
  269. public void endTableCell() {
  270. buffer.append("\t");
  271. }
  272. @Override
  273. public void startSDT() {
  274. }
  275. @Override
  276. public void endSDT() {
  277. buffer.append("\n");
  278. }
  279. @Override
  280. public void startEditedSection(String editor, Date date,
  281. OOXMLWordAndPowerPointTextHandler.EditType editType) {
  282. }
  283. @Override
  284. public void endEditedSection() {
  285. }
  286. @Override
  287. public boolean isIncludeDeletedText() {
  288. return true;
  289. }
  290. @Override
  291. public void footnoteReference(String id) {
  292. }
  293. @Override
  294. public void endnoteReference(String id) {
  295. }
  296. @Override
  297. public boolean isIncludeMoveFromText() {
  298. return false;
  299. }
  300. @Override
  301. public void embeddedOLERef(String refId) {
  302. //no-op
  303. }
  304. @Override
  305. public void embeddedPicRef(String picFileName, String picDescription) {
  306. //no-op
  307. }
  308. @Override
  309. public void startBookmark(String id, String name) {
  310. //no-op
  311. }
  312. @Override
  313. public void endBookmark(String id) {
  314. //no-op
  315. }
  316. }
  317. }