PageRenderTime 4002ms CodeModel.GetById 7ms RepoModel.GetById 5ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java

https://github.com/solsson/tika
Java | 139 lines | 97 code | 14 blank | 28 comment | 9 complexity | 00cd5fb17c44cf06140281a60e81832a MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft;
  18. import java.io.File;
  19. import java.io.FileNotFoundException;
  20. import java.io.FileOutputStream;
  21. import java.io.IOException;
  22. import java.io.InputStream;
  23. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  24. import org.apache.poi.poifs.filesystem.DocumentEntry;
  25. import org.apache.poi.poifs.filesystem.DocumentInputStream;
  26. import org.apache.poi.poifs.filesystem.Entry;
  27. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  28. import org.apache.tika.detect.ZipContainerDetector;
  29. import org.apache.tika.exception.TikaException;
  30. import org.apache.tika.extractor.EmbeddedDocumentExtractor;
  31. import org.apache.tika.io.TikaInputStream;
  32. import org.apache.tika.metadata.Metadata;
  33. import org.apache.tika.mime.MediaType;
  34. import org.apache.tika.parser.EmptyParser;
  35. import org.apache.tika.parser.ParseContext;
  36. import org.apache.tika.parser.Parser;
  37. import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
  38. import org.apache.tika.sax.EmbeddedContentHandler;
  39. import org.apache.tika.sax.XHTMLContentHandler;
  40. import org.xml.sax.SAXException;
  41. abstract class AbstractPOIFSExtractor {
  42. private final EmbeddedDocumentExtractor extractor;
  43. protected AbstractPOIFSExtractor(ParseContext context) {
  44. this.extractor = new EmbeddedDocumentExtractor(context);
  45. }
  46. protected void handleEmbeddedResource(TikaInputStream resource,
  47. String filename, String mediaType, XHTMLContentHandler xhtml)
  48. throws IOException, SAXException, TikaException {
  49. try {
  50. Metadata metadata = new Metadata();
  51. if(filename != null) {
  52. metadata.set(Metadata.TIKA_MIME_FILE, filename);
  53. metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
  54. }
  55. if(mediaType != null) {
  56. metadata.set(Metadata.CONTENT_TYPE, mediaType);
  57. }
  58. if (extractor.shouldParseEmbedded(metadata)) {
  59. extractor.parseEmbedded(resource, xhtml, metadata);
  60. }
  61. } finally {
  62. resource.close();
  63. }
  64. }
  65. /**
  66. * Handle an office document that's embedded at the POIFS level
  67. */
  68. protected void handleEmbededOfficeDoc(
  69. DirectoryEntry dir, XHTMLContentHandler xhtml)
  70. throws IOException, SAXException, TikaException {
  71. // Is it an embedded OLE2 document, or an embedded OOXML document?
  72. try {
  73. Entry ooxml = dir.getEntry("Package");
  74. // It's OOXML
  75. TikaInputStream ooxmlStream = TikaInputStream.get(
  76. new DocumentInputStream((DocumentEntry)ooxml)
  77. );
  78. ZipContainerDetector detector = new ZipContainerDetector();
  79. MediaType type = detector.detect(ooxmlStream, new Metadata());
  80. handleEmbeddedResource(ooxmlStream, null, type.toString(), xhtml);
  81. return;
  82. } catch(FileNotFoundException e) {
  83. // It's regular OLE2
  84. }
  85. // Need to dump the directory out to a new temp file, so
  86. // it's stand along
  87. POIFSFileSystem newFS = new POIFSFileSystem();
  88. copy(dir, newFS.getRoot());
  89. File tmpFile = File.createTempFile("tika", ".ole2");
  90. try {
  91. FileOutputStream out = new FileOutputStream(tmpFile);
  92. newFS.writeFilesystem(out);
  93. out.close();
  94. // What kind of document is it?
  95. Metadata metadata = new Metadata();
  96. POIFSDocumentType type = POIFSDocumentType.detectType(dir);
  97. metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
  98. // Trigger for the document itself
  99. TikaInputStream embedded = TikaInputStream.get(tmpFile);
  100. try {
  101. if (extractor.shouldParseEmbedded(metadata)) {
  102. extractor.parseEmbedded(embedded, xhtml, metadata);
  103. }
  104. } finally {
  105. embedded.close();
  106. }
  107. } finally {
  108. tmpFile.delete();
  109. }
  110. }
  111. protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
  112. throws IOException {
  113. for (Entry entry : sourceDir) {
  114. if (entry instanceof DirectoryEntry) {
  115. // Need to recurse
  116. DirectoryEntry newDir = destDir.createDirectory(entry.getName());
  117. copy((DirectoryEntry)entry, newDir);
  118. } else {
  119. // Copy entry
  120. InputStream contents = new DocumentInputStream((DocumentEntry)entry);
  121. destDir.createDocument(entry.getName(), contents);
  122. }
  123. }
  124. }
  125. }