PageRenderTime 117ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java

https://github.com/solsson/tika
Java | 183 lines | 124 code | 25 blank | 34 comment | 24 complexity | 182b92c20f0ceb47e5952f2b30e98cd2 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft.ooxml;
  18. import java.io.IOException;
  19. import java.util.ArrayList;
  20. import java.util.Iterator;
  21. import java.util.List;
  22. import org.apache.poi.openxml4j.opc.PackagePart;
  23. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  24. import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
  25. import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
  26. import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
  27. import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
  28. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  29. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
  30. import org.apache.tika.sax.XHTMLContentHandler;
  31. import org.apache.xmlbeans.XmlException;
  32. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
  33. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
  34. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
  35. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
  36. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
  37. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
  38. import org.xml.sax.SAXException;
  39. public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
  40. public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
  41. super(extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
  42. }
  43. /**
  44. * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
  45. */
  46. @Override
  47. protected void buildXHTML(XHTMLContentHandler xhtml)
  48. throws SAXException, XmlException, IOException {
  49. XWPFDocument document = (XWPFDocument) extractor.getDocument();
  50. XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
  51. // headers
  52. extractHeaders(xhtml, hfPolicy);
  53. // first all paragraphs
  54. Iterator<XWPFParagraph> i = document.getParagraphsIterator();
  55. while (i.hasNext()) {
  56. XWPFParagraph paragraph = i.next();
  57. CTSectPr ctSectPr = null;
  58. if (paragraph.getCTP().getPPr() != null) {
  59. ctSectPr = paragraph.getCTP().getPPr().getSectPr();
  60. }
  61. XWPFHeaderFooterPolicy headerFooterPolicy = null;
  62. if (ctSectPr != null) {
  63. headerFooterPolicy =
  64. new XWPFHeaderFooterPolicy(document, ctSectPr);
  65. extractHeaders(xhtml, headerFooterPolicy);
  66. }
  67. XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
  68. new XWPFHyperlinkDecorator(paragraph, null, true));
  69. CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
  70. for (CTBookmark bookmark : bookmarks) {
  71. xhtml.element("p", bookmark.getName());
  72. }
  73. xhtml.element("p", decorator.getText());
  74. if (ctSectPr != null) {
  75. extractFooters(xhtml, headerFooterPolicy);
  76. }
  77. }
  78. // then all document tables
  79. extractTableContent(document, xhtml);
  80. extractFooters(xhtml, hfPolicy);
  81. }
  82. private void extractFooters(
  83. XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
  84. throws SAXException {
  85. // footers
  86. if (hfPolicy.getFirstPageFooter() != null) {
  87. xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
  88. }
  89. if (hfPolicy.getEvenPageFooter() != null) {
  90. xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
  91. }
  92. if (hfPolicy.getDefaultFooter() != null) {
  93. xhtml.element("p", hfPolicy.getDefaultFooter().getText());
  94. }
  95. }
  96. private void extractHeaders(
  97. XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
  98. throws SAXException {
  99. if (hfPolicy.getFirstPageHeader() != null) {
  100. xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
  101. }
  102. if (hfPolicy.getEvenPageHeader() != null) {
  103. xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
  104. }
  105. if (hfPolicy.getDefaultHeader() != null) {
  106. xhtml.element("p", hfPolicy.getDefaultHeader().getText());
  107. }
  108. }
  109. /**
  110. * Low level structured parsing of document tables.
  111. */
  112. private void extractTableContent(XWPFDocument doc, XHTMLContentHandler xhtml)
  113. throws SAXException {
  114. for (CTTbl table : doc.getDocument().getBody().getTblArray()) {
  115. xhtml.startElement("table");
  116. xhtml.startElement("tbody");
  117. CTRow[] rows = table.getTrArray();
  118. for (CTRow row : rows) {
  119. xhtml.startElement("tr");
  120. CTTc[] cells = row.getTcArray();
  121. for (CTTc tc : cells) {
  122. xhtml.startElement("td");
  123. CTP[] content = tc.getPArray();
  124. for (CTP ctp : content) {
  125. XWPFParagraph p = new MyXWPFParagraph(ctp, doc);
  126. XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
  127. new XWPFHyperlinkDecorator(p, null, true));
  128. xhtml.element("p", decorator.getText());
  129. }
  130. xhtml.endElement("td");
  131. }
  132. xhtml.endElement("tr");
  133. }
  134. xhtml.endElement("tbody");
  135. xhtml.endElement("table");
  136. }
  137. }
  138. /**
  139. * Word documents are simple, they only have the one
  140. * main part
  141. */
  142. @Override
  143. protected List<PackagePart> getMainDocumentParts() {
  144. XWPFDocument document = (XWPFDocument) extractor.getDocument();
  145. List<PackagePart> parts = new ArrayList<PackagePart>();
  146. parts.add( document.getPackagePart() );
  147. return parts;
  148. }
  149. /**
  150. * Private wrapper class that makes the protected {@link XWPFParagraph}
  151. * constructor available.
  152. */
  153. private static class MyXWPFParagraph extends XWPFParagraph {
  154. private MyXWPFParagraph(CTP ctp, XWPFDocument xwpfDocument) {
  155. super(ctp, xwpfDocument);
  156. }
  157. }
  158. }