/src/main/java/org/olat/search/service/document/file/WordOOXMLDocument.java
Java | 156 lines | 109 code | 18 blank | 29 comment | 24 complexity | dfe530c94dc566561a0e060d649eb7c5 MD5 | raw file
Possible License(s): LGPL-2.1, GPL-3.0, 0BSD, MPL-2.0-no-copyleft-exception, AGPL-3.0, Apache-2.0
- /**
- * OLAT - Online Learning and Training<br>
- * http://www.olat.org
- * <p>
- * Licensed under the Apache License, Version 2.0 (the "License"); <br>
- * you may not use this file except in compliance with the License.<br>
- * You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing,<br>
- * software distributed under the License is distributed on an "AS IS" BASIS, <br>
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
- * See the License for the specific language governing permissions and <br>
- * limitations under the License.
- * <p>
- * Copyright (c) frentix GmbH<br>
- * http://www.frentix.com<br>
- * <p>
- */
- package org.olat.search.service.document.file;
- import java.io.BufferedInputStream;
- import java.io.IOException;
- import java.util.Iterator;
- import org.apache.lucene.document.Document;
- import org.apache.poi.POIXMLDocument;
- import org.apache.poi.POIXMLTextExtractor;
- import org.apache.poi.extractor.ExtractorFactory;
- import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
- import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
- import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
- import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.apache.poi.xwpf.usermodel.XWPFParagraph;
- import org.apache.xmlbeans.XmlException;
- import org.olat.core.logging.OLog;
- import org.olat.core.logging.Tracing;
- import org.olat.core.util.vfs.VFSLeaf;
- import org.olat.search.service.SearchResourceContext;
- import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
- import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
- /**
- * Description:<br>
- * Parse the Word XML document (.docx) with Apache POI
- * <P>
- * Initial Date: 14 dec. 2009 <br>
- *
- * @author srosse, stephane.rosse@frentix.com
- */
- public class WordOOXMLDocument extends FileDocument {
- private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class);
- public final static String FILE_TYPE = "type.file.word";
- public WordOOXMLDocument() {
- super();
- }
- public static Document createDocument(final SearchResourceContext leafResourceContext, final VFSLeaf leaf) throws IOException, DocumentException,
- DocumentAccessException {
- final WordOOXMLDocument wordDocument = new WordOOXMLDocument();
- wordDocument.init(leafResourceContext, leaf);
- wordDocument.setFileType(FILE_TYPE);
- wordDocument.setCssIcon("b_filetype_doc");
- if (log.isDebug()) {
- log.debug(wordDocument.toString());
- }
- return wordDocument.getLuceneDocument();
- }
- @Override
- protected String readContent(final VFSLeaf leaf) throws IOException, DocumentException {
- BufferedInputStream bis = null;
- final StringBuilder buffy = new StringBuilder();
- try {
- bis = new BufferedInputStream(leaf.getInputStream());
- final POIXMLTextExtractor extractor = (POIXMLTextExtractor) ExtractorFactory.createExtractor(bis);
- final POIXMLDocument document = extractor.getDocument();
- if (document instanceof XWPFDocument) {
- final XWPFDocument xDocument = (XWPFDocument) document;
- final XWPFHeaderFooterPolicy hfPolicy = xDocument.getHeaderFooterPolicy();
- extractHeaders(buffy, hfPolicy);
- extractContent(buffy, xDocument);
- extractFooters(buffy, hfPolicy);
- }
- return buffy.toString();
- } catch (final Exception e) {
- throw new DocumentException(e.getMessage());
- } finally {
- if (bis != null) {
- bis.close();
- }
- }
- }
- private void extractContent(final StringBuilder buffy, final XWPFDocument document) throws IOException, XmlException {
- // first all paragraphs
- final Iterator<XWPFParagraph> i = document.getParagraphsIterator();
- while (i.hasNext()) {
- final XWPFParagraph paragraph = i.next();
- CTSectPr ctSectPr = null;
- if (paragraph.getCTP().getPPr() != null) {
- ctSectPr = paragraph.getCTP().getPPr().getSectPr();
- }
- XWPFHeaderFooterPolicy headerFooterPolicy = null;
- if (ctSectPr != null) {
- headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
- extractHeaders(buffy, headerFooterPolicy);
- }
- final XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(new XWPFHyperlinkDecorator(paragraph, null, true));
- final CTBookmark[] bookmarks = paragraph.getCTP().getBookmarkStartArray();
- for (final CTBookmark bookmark : bookmarks) {
- buffy.append(bookmark.getName()).append(' ');
- }
- buffy.append(decorator.getText()).append(' ');
- if (ctSectPr != null) {
- extractFooters(buffy, headerFooterPolicy);
- }
- }
- }
- private void extractFooters(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
- if (hfPolicy.getFirstPageFooter() != null) {
- buffy.append(hfPolicy.getFirstPageFooter().getText()).append(' ');
- }
- if (hfPolicy.getEvenPageFooter() != null) {
- buffy.append(hfPolicy.getEvenPageFooter().getText()).append(' ');
- }
- if (hfPolicy.getDefaultFooter() != null) {
- buffy.append(hfPolicy.getDefaultFooter().getText()).append(' ');
- }
- }
- private void extractHeaders(final StringBuilder buffy, final XWPFHeaderFooterPolicy hfPolicy) {
- if (hfPolicy.getFirstPageHeader() != null) {
- buffy.append(hfPolicy.getFirstPageHeader().getText()).append(' ');
- }
- if (hfPolicy.getEvenPageHeader() != null) {
- buffy.append(hfPolicy.getEvenPageHeader().getText()).append(' ');
- }
- if (hfPolicy.getDefaultHeader() != null) {
- buffy.append(hfPolicy.getDefaultHeader().getText()).append(' ');
- }
- }
- }