PageRenderTime 69ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java

https://github.com/solsson/tika
Java | 164 lines | 96 code | 17 blank | 51 comment | 6 complexity | e792b324ce485597dfa73d012dd27d56 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft.ooxml;
  18. import java.io.IOException;
  19. import java.util.List;
  20. import org.apache.poi.POIXMLDocument;
  21. import org.apache.poi.POIXMLTextExtractor;
  22. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  23. import org.apache.poi.openxml4j.opc.PackagePart;
  24. import org.apache.poi.openxml4j.opc.PackagePartName;
  25. import org.apache.poi.openxml4j.opc.PackageRelationship;
  26. import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
  27. import org.apache.poi.openxml4j.opc.PackagingURIHelper;
  28. import org.apache.poi.openxml4j.opc.TargetMode;
  29. import org.apache.tika.exception.TikaException;
  30. import org.apache.tika.io.TikaInputStream;
  31. import org.apache.tika.metadata.Metadata;
  32. import org.apache.tika.parser.EmptyParser;
  33. import org.apache.tika.parser.ParseContext;
  34. import org.apache.tika.parser.Parser;
  35. import org.apache.tika.sax.EmbeddedContentHandler;
  36. import org.apache.tika.sax.XHTMLContentHandler;
  37. import org.apache.xmlbeans.XmlException;
  38. import org.xml.sax.ContentHandler;
  39. import org.xml.sax.SAXException;
  40. /**
  41. * Base class for all Tika OOXML extractors.
  42. *
  43. * Tika extractors decorate POI extractors so that the parsed content of
  44. * documents is returned as a sequence of XHTML SAX events. Subclasses must
  45. * implement the buildXHTML method {@link #buildXHTML(XHTMLContentHandler)} that
  46. * populates the {@link XHTMLContentHandler} object received as parameter.
  47. */
  48. public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
  49. static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
  50. static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
  51. static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
  52. static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
  53. protected POIXMLTextExtractor extractor;
  54. private final String type;
  55. public AbstractOOXMLExtractor(POIXMLTextExtractor extractor, String type) {
  56. this.extractor = extractor;
  57. this.type = type;
  58. }
  59. /**
  60. * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
  61. */
  62. public POIXMLDocument getDocument() {
  63. return extractor.getDocument();
  64. }
  65. /**
  66. * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
  67. */
  68. public MetadataExtractor getMetadataExtractor() {
  69. return new MetadataExtractor(extractor, type);
  70. }
  71. /**
  72. * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
  73. * org.apache.tika.metadata.Metadata)
  74. */
  75. public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
  76. throws SAXException, XmlException, IOException, TikaException {
  77. XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
  78. xhtml.startDocument();
  79. buildXHTML(xhtml);
  80. xhtml.endDocument();
  81. // Now do any embedded parts
  82. List<PackagePart> mainParts = getMainDocumentParts();
  83. for(PackagePart part : mainParts) {
  84. PackageRelationshipCollection rels;
  85. try {
  86. rels = part.getRelationships();
  87. } catch(InvalidFormatException e) {
  88. throw new TikaException("Corrupt OOXML file", e);
  89. }
  90. for(PackageRelationship rel : rels) {
  91. // Is it an embedded type (not part of the document)
  92. if( rel.getRelationshipType().equals(RELATION_AUDIO) ||
  93. rel.getRelationshipType().equals(RELATION_IMAGE) ||
  94. rel.getRelationshipType().equals(RELATION_OLE_OBJECT) ||
  95. rel.getRelationshipType().equals(RELATION_PACKAGE) ) {
  96. if(rel.getTargetMode() == TargetMode.INTERNAL) {
  97. PackagePartName relName;
  98. try {
  99. relName = PackagingURIHelper.createPartName(rel.getTargetURI());
  100. } catch(InvalidFormatException e) {
  101. throw new TikaException("Broken OOXML file", e);
  102. }
  103. PackagePart relPart = rel.getPackage().getPart(relName);
  104. handleEmbedded(rel, relPart, handler, context);
  105. }
  106. }
  107. }
  108. }
  109. }
  110. /**
  111. * Handles an embedded resource in the file
  112. */
  113. protected void handleEmbedded(PackageRelationship rel, PackagePart part,
  114. ContentHandler handler, ParseContext context)
  115. throws SAXException, XmlException, IOException, TikaException {
  116. // Get the name
  117. String name = rel.getTargetURI().toString();
  118. if(name.indexOf('/') > -1) {
  119. name = name.substring(name.lastIndexOf('/')+1);
  120. }
  121. // Get the content type
  122. String type = part.getContentType();
  123. // Call the recursing handler
  124. Metadata metadata = new Metadata();
  125. metadata.set(Metadata.TIKA_MIME_FILE, name);
  126. metadata.set(Metadata.CONTENT_TYPE, type);
  127. Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);
  128. parser.parse(
  129. TikaInputStream.get(part.getInputStream()),
  130. new EmbeddedContentHandler(handler),
  131. metadata, context
  132. );
  133. }
  134. /**
  135. * Populates the {@link XHTMLContentHandler} object received as parameter.
  136. */
  137. protected abstract void buildXHTML(XHTMLContentHandler xhtml)
  138. throws SAXException, XmlException, IOException;
  139. /**
  140. * Return a list of the main parts of the document, used
  141. * when searching for embedded resources.
  142. * This should be all the parts of the document that end
  143. * up with things embedded into them.
  144. */
  145. protected abstract List<PackagePart> getMainDocumentParts()
  146. throws TikaException;
  147. }