PageRenderTime 5915ms CodeModel.GetById 28ms RepoModel.GetById 9ms app.codeStats 0ms

/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java

https://bitbucket.org/mg/olat
Java | 156 lines | 109 code | 18 blank | 29 comment | 24 complexity | dfe530c94dc566561a0e060d649eb7c5 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-3.0, 0BSD, MPL-2.0-no-copyleft-exception, AGPL-3.0, Apache-2.0
  1. /**
  2. * OLAT - Online Learning and Training<br>
  3. * http://www.olat.org
  4. * <p>
  5. * Licensed under the Apache License, Version 2.0 (the "License"); <br>
  6. * you may not use this file except in compliance with the License.<br>
  7. * You may obtain a copy of the License at
  8. * <p>
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. * <p>
  11. * Unless required by applicable law or agreed to in writing,<br>
  12. * software distributed under the License is distributed on an "AS IS" BASIS, <br>
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
  14. * See the License for the specific language governing permissions and <br>
  15. * limitations under the License.
  16. * <p>
  17. * Copyright (c) frentix GmbH<br>
  18. * http://www.frentix.com<br>
  19. * <p>
  20. */
  21. package org.olat.search.service.document.file;
  22. import java.io.BufferedInputStream;
  23. import java.io.IOException;
  24. import java.util.Iterator;
  25. import org.apache.lucene.document.Document;
  26. import org.apache.poi.POIXMLDocument;
  27. import org.apache.poi.POIXMLTextExtractor;
  28. import org.apache.poi.extractor.ExtractorFactory;
  29. import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
  30. import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
  31. import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
  32. import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
  33. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  34. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
  35. import org.apache.xmlbeans.XmlException;
  36. import org.olat.core.logging.OLog;
  37. import org.olat.core.logging.Tracing;
  38. import org.olat.core.util.vfs.VFSLeaf;
  39. import org.olat.search.service.SearchResourceContext;
  40. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
  41. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
  42. /**
  43. * Description:<br>
  44. * Parse the Word XML document (.docx) with Apache POI
  45. * <P>
  46. * Initial Date: 14 dec. 2009 <br>
  47. *
  48. * @author srosse, stephane.rosse@frentix.com
  49. */
  50. public class WordOOXMLDocument extends FileDocument {
  51. private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class);
  52. public final static String FILE_TYPE = "type.file.word";
  53. public WordOOXMLDocument() {
  54. super();
  55. }
  56. public static Document createDocument(final SearchResourceContext leafResourceContext, final VFSLeaf leaf) throws IOException, DocumentException,
  57. DocumentAccessException {
  58. final WordOOXMLDocument wordDocument = new WordOOXMLDocument();
  59. wordDocument.init(leafResourceContext, leaf);
  60. wordDocument.setFileType(FILE_TYPE);
  61. wordDocument.setCssIcon("b_filetype_doc");
  62. if (log.isDebug()) {
  63. log.debug(wordDocument.toString());
  64. }
  65. return wordDocument.getLuceneDocument();
  66. }
  67. @Override
  68. protected String readContent(final VFSLeaf leaf) throws IOException, DocumentException {
  69. BufferedInputStream bis = null;
  70. final StringBuilder buffy = new StringBuilder();
  71. try {
  72. bis = new BufferedInputStream(leaf.getInputStream());
  73. final POIXMLTextExtractor extractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(bis);
  74. final POIXMLDocument document = extractor.getDocument();
  75. if (document instanceof XWPFDocument) {
  76. final XWPFDocument xDocument = (XWPFDocument) document;
  77. final XWPFHeaderFooterPolicy hfPolicy = xDocument.getHeaderFooterPolicy();
  78. extractHeaders(buffy, hfPolicy);
  79. extractContent(buffy, xDocument);
  80. extractFooters(buffy, hfPolicy);
  81. }
  82. return buffy.toString();
  83. } catch (final Exception e) {
  84. throw new DocumentException(e.getMessage());
  85. } finally {
  86. if (bis != null) {
  87. bis.close();
  88. }
  89. }
  90. }
  91. private void extractContent(final StringBuilder buffy, final XWPFDocument document) throws IOException, XmlException {
  92. // first all paragraphs
  93. final Iterator<XWPFParagraph> i = document.getParagraphsIterator();
  94. while (i.hasNext()) {
  95. final XWPFParagraph paragraph = i.next();
  96. CTSectPr ctSectPr = null;
  97. if (paragraph.getCTP().getPPr() != null) {
  98. ctSectPr = paragraph.getCTP().getPPr().getSectPr();
  99. }
  100. XWPFHeaderFooterPolicy headerFooterPolicy = null;
  101. if (ctSectPr != null) {
  102. headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
  103. extractHeaders(buffy, headerFooterPolicy);
  104. }
  105. final XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(new XWPFHyperlinkDecorator(paragraph, null, true));
  106. final CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
  107. for (final CTBookmark bookmark : bookmarks) {
  108. buffy.append(bookmark.getName()).append(' ');
  109. }
  110. buffy.append(decorator.getText()).append(' ');
  111. if (ctSectPr != null) {
  112. extractFooters(buffy, headerFooterPolicy);
  113. }
  114. }
  115. }
  116. private void extractFooters(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
  117. if (hfPolicy.getFirstPageFooter() != null) {
  118. buffy.append(hfPolicy.getFirstPageFooter().getText()).append(' ');
  119. }
  120. if (hfPolicy.getEvenPageFooter() != null) {
  121. buffy.append(hfPolicy.getEvenPageFooter().getText()).append(' ');
  122. }
  123. if (hfPolicy.getDefaultFooter() != null) {
  124. buffy.append(hfPolicy.getDefaultFooter().getText()).append(' ');
  125. }
  126. }
  127. private void extractHeaders(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
  128. if (hfPolicy.getFirstPageHeader() != null) {
  129. buffy.append(hfPolicy.getFirstPageHeader().getText()).append(' ');
  130. }
  131. if (hfPolicy.getEvenPageHeader() != null) {
  132. buffy.append(hfPolicy.getEvenPageHeader().getText()).append(' ');
  133. }
  134. if (hfPolicy.getDefaultHeader() != null) {
  135. buffy.append(hfPolicy.getDefaultHeader().getText()).append(' ');
  136. }
  137. }
  138. }