/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
Java | 384 lines | 279 code | 67 blank | 38 comment | 29 complexity | e87a6708a4ccb1c1224d20be76800ce2 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.tika.parser.microsoft.ooxml.xwpf;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import javax.xml.parsers.ParserConfigurationException;
- import org.apache.commons.io.input.CloseShieldInputStream;
- import org.apache.poi.ooxml.POIXMLDocument;
- import org.apache.poi.ooxml.POIXMLProperties;
- import org.apache.poi.ooxml.extractor.POIXMLTextExtractor;
- import org.apache.poi.ooxml.util.SAXHelper;
- import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
- import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.openxml4j.opc.PackageAccess;
- import org.apache.poi.openxml4j.opc.PackagePart;
- import org.apache.poi.openxml4j.opc.PackageRelationship;
- import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
- import org.apache.poi.xwpf.usermodel.XWPFNumbering;
- import org.apache.poi.xwpf.usermodel.XWPFRelation;
- import org.apache.xmlbeans.XmlException;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.xml.sax.InputSource;
- import org.xml.sax.SAXException;
- import org.xml.sax.XMLReader;
- import org.apache.tika.exception.RuntimeSAXException;
- import org.apache.tika.exception.WriteLimitReachedException;
- import org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
- import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
- import org.apache.tika.parser.microsoft.ooxml.RunProperties;
- import org.apache.tika.parser.microsoft.ooxml.XWPFListManager;
- //TODO: move this into POI?
- /**
- * Experimental class that is based on POI's XSSFEventBasedExcelExtractor
- */
- public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
- private static final Logger LOG = LoggerFactory.getLogger(XWPFEventBasedWordExtractor.class);
- private OPCPackage container;
- private POIXMLProperties properties;
- public XWPFEventBasedWordExtractor(String path)
- throws XmlException, OpenXML4JException, IOException {
- this(OPCPackage.open(path, PackageAccess.READ));
- }
- public XWPFEventBasedWordExtractor(OPCPackage container)
- throws XmlException, OpenXML4JException, IOException {
- super((POIXMLDocument) null);
- this.container = container;
- this.properties = new POIXMLProperties(container);
- }
- public static void main(String[] args) throws Exception {
- if (args.length < 1) {
- System.err.println("Use:");
- System.err.println(" XWPFEventBasedWordExtractor <filename.xlsx>");
- System.exit(1);
- }
- XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0]);
- System.out.println(extractor.getText());
- extractor.close();
- }
- public OPCPackage getPackage() {
- return this.container;
- }
- public POIXMLProperties.CoreProperties getCoreProperties() {
- return this.properties.getCoreProperties();
- }
- public POIXMLProperties.ExtendedProperties getExtendedProperties() {
- return this.properties.getExtendedProperties();
- }
- public POIXMLProperties.CustomProperties getCustomProperties() {
- return this.properties.getCustomProperties();
- }
- @Override
- public String getText() {
- StringBuilder sb = new StringBuilder();
- //handle main document
- List<PackagePart> pps =
- container.getPartsByContentType(XWPFRelation.DOCUMENT.getContentType());
- if (pps != null) {
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- try {
- handleDocumentPart(pp, sb);
- } catch (IOException e) {
- LOG.warn("IOException handling document part", e);
- } catch (SAXException e) {
- if (WriteLimitReachedException.isWriteLimitReached(e)) {
- throw new RuntimeSAXException(e);
- }
- //swallow this because we don't actually call it
- LOG.warn("SAXException handling document part", e);
- }
- }
- }
- //handle glossary document
- pps = container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
- if (pps != null) {
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- try {
- handleDocumentPart(pp, sb);
- } catch (IOException e) {
- LOG.warn("IOException handling glossary document part", e);
- } catch (SAXException e) {
- if (WriteLimitReachedException.isWriteLimitReached(e)) {
- throw new RuntimeSAXException(e);
- }
- //swallow this because we don't actually call it
- LOG.warn("SAXException handling glossary document part", e);
- }
- }
- }
- return sb.toString();
- }
- private void handleDocumentPart(PackagePart documentPart, StringBuilder sb)
- throws IOException, SAXException {
- //load the numbering/list manager and styles from the main document part
- XWPFNumbering numbering = loadNumbering(documentPart);
- XWPFListManager xwpfListManager = new XWPFListManager(numbering);
- //TODO: XWPFStyles styles = loadStyles(documentPart);
- //headers
- try {
- PackageRelationshipCollection headersPRC =
- documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
- if (headersPRC != null) {
- for (int i = 0; i < headersPRC.size(); i++) {
- PackagePart header = documentPart.getRelatedPart(headersPRC.getRelationship(i));
- handlePart(header, xwpfListManager, sb);
- }
- }
- } catch (InvalidFormatException e) {
- LOG.warn("Invalid format", e);
- }
- //main document
- handlePart(documentPart, xwpfListManager, sb);
- //for now, just dump other components at end
- for (XWPFRelation rel : new XWPFRelation[]{XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT,
- XWPFRelation.FOOTER, XWPFRelation.ENDNOTE}) {
- try {
- PackageRelationshipCollection prc =
- documentPart.getRelationshipsByType(rel.getRelation());
- if (prc != null) {
- for (int i = 0; i < prc.size(); i++) {
- PackagePart packagePart =
- documentPart.getRelatedPart(prc.getRelationship(i));
- handlePart(packagePart, xwpfListManager, sb);
- }
- }
- } catch (InvalidFormatException e) {
- LOG.warn("Invalid format", e);
- }
- }
- }
- private void handlePart(PackagePart packagePart, XWPFListManager xwpfListManager,
- StringBuilder buffer) throws IOException, SAXException {
- Map<String, String> hyperlinks = loadHyperlinkRelationships(packagePart);
- try (InputStream stream = packagePart.getInputStream()) {
- XMLReader reader = SAXHelper.newXMLReader();
- reader.setContentHandler(
- new OOXMLWordAndPowerPointTextHandler(new XWPFToTextContentHandler(buffer),
- hyperlinks));
- reader.parse(new InputSource(new CloseShieldInputStream(stream)));
- } catch (ParserConfigurationException e) {
- LOG.warn("Can't configure XMLReader", e);
- }
- }
- private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
- Map<String, String> hyperlinks = new HashMap<>();
- try {
- PackageRelationshipCollection prc =
- bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
- for (int i = 0; i < prc.size(); i++) {
- PackageRelationship pr = prc.getRelationship(i);
- if (pr == null) {
- continue;
- }
- String id = pr.getId();
- String url = (pr.getTargetURI() == null) ? null : pr.getTargetURI().toString();
- if (id != null && url != null) {
- hyperlinks.put(id, url);
- }
- }
- } catch (InvalidFormatException e) {
- LOG.warn("Invalid format", e);
- }
- return hyperlinks;
- }
- private XWPFNumbering loadNumbering(PackagePart packagePart) throws IOException {
- try {
- PackageRelationshipCollection numberingParts =
- packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
- if (numberingParts.size() > 0) {
- PackageRelationship numberingRelationShip = numberingParts.getRelationship(0);
- if (numberingRelationShip == null) {
- return null;
- }
- PackagePart numberingPart = container.getPart(numberingRelationShip);
- if (numberingPart == null) {
- return null;
- }
- return new XWPFNumbering(numberingPart);
- }
- } catch (OpenXML4JException e) {
- LOG.warn("Couldn't load numbering", e);
- }
- return null;
- }
- private static class XWPFToTextContentHandler
- implements OOXMLWordAndPowerPointTextHandler.XWPFBodyContentsHandler {
- private final StringBuilder buffer;
- public XWPFToTextContentHandler(StringBuilder buffer) {
- this.buffer = buffer;
- }
- @Override
- public void run(RunProperties runProperties, String contents) {
- buffer.append(contents);
- }
- @Override
- public void hyperlinkStart(String link) {
- //no-op
- }
- @Override
- public void hyperlinkEnd() {
- //no-op
- }
- @Override
- public void startParagraph(ParagraphProperties paragraphProperties) {
- //no-op
- }
- @Override
- public void endParagraph() {
- buffer.append("\n");
- }
- @Override
- public void startTable() {
- }
- @Override
- public void endTable() {
- }
- @Override
- public void startTableRow() {
- }
- @Override
- public void endTableRow() {
- buffer.append("\n");
- }
- @Override
- public void startTableCell() {
- }
- @Override
- public void endTableCell() {
- buffer.append("\t");
- }
- @Override
- public void startSDT() {
- }
- @Override
- public void endSDT() {
- buffer.append("\n");
- }
- @Override
- public void startEditedSection(String editor, Date date,
- OOXMLWordAndPowerPointTextHandler.EditType editType) {
- }
- @Override
- public void endEditedSection() {
- }
- @Override
- public boolean isIncludeDeletedText() {
- return true;
- }
- @Override
- public void footnoteReference(String id) {
- }
- @Override
- public void endnoteReference(String id) {
- }
- @Override
- public boolean isIncludeMoveFromText() {
- return false;
- }
- @Override
- public void embeddedOLERef(String refId) {
- //no-op
- }
- @Override
- public void embeddedPicRef(String picFileName, String picDescription) {
- //no-op
- }
- @Override
- public void startBookmark(String id, String name) {
- //no-op
- }
- @Override
- public void endBookmark(String id) {
- //no-op
- }
- }
- }