PageRenderTime 36ms CodeModel.GetById 1ms RepoModel.GetById 1ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java

https://github.com/solsson/tika
Java | 251 lines | 194 code | 30 blank | 27 comment | 21 complexity | 3a2a0badbeb8b43e9095f9f8eb316b46 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.security.GeneralSecurityException;
  21. import java.util.Arrays;
  22. import java.util.Collections;
  23. import java.util.HashSet;
  24. import java.util.Locale;
  25. import java.util.Set;
  26. import org.apache.poi.hdgf.extractor.VisioTextExtractor;
  27. import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
  28. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  29. import org.apache.poi.poifs.crypt.Decryptor;
  30. import org.apache.poi.poifs.crypt.EncryptionInfo;
  31. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  32. import org.apache.poi.poifs.filesystem.Entry;
  33. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  34. import org.apache.tika.exception.TikaException;
  35. import org.apache.tika.io.TikaInputStream;
  36. import org.apache.tika.metadata.Metadata;
  37. import org.apache.tika.mime.MediaType;
  38. import org.apache.tika.parser.ParseContext;
  39. import org.apache.tika.parser.Parser;
  40. import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
  41. import org.apache.tika.sax.BodyContentHandler;
  42. import org.apache.tika.sax.EmbeddedContentHandler;
  43. import org.apache.tika.sax.XHTMLContentHandler;
  44. import org.xml.sax.ContentHandler;
  45. import org.xml.sax.SAXException;
  46. /**
  47. * Defines a Microsoft document content extractor.
  48. */
  49. public class OfficeParser implements Parser {
  50. private static final long serialVersionUID = 7393462244028653479L;
  51. private static final Set<MediaType> SUPPORTED_TYPES =
  52. Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
  53. POIFSDocumentType.WORKBOOK.type,
  54. POIFSDocumentType.OLE10_NATIVE.type,
  55. POIFSDocumentType.WORDDOCUMENT.type,
  56. POIFSDocumentType.UNKNOWN.type,
  57. POIFSDocumentType.ENCRYPTED.type,
  58. POIFSDocumentType.POWERPOINT.type,
  59. POIFSDocumentType.PUBLISHER.type,
  60. POIFSDocumentType.VISIO.type,
  61. POIFSDocumentType.OUTLOOK.type,
  62. MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12")
  63. )));
  64. public enum POIFSDocumentType {
  65. WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
  66. OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice")),
  67. WORDDOCUMENT("doc", MediaType.application("msword")),
  68. UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
  69. ENCRYPTED("ole", MediaType.application("x-tika-msoffice")),
  70. POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
  71. PUBLISHER("pub", MediaType.application("x-mspublisher")),
  72. VISIO("vsd", MediaType.application("vnd.visio")),
  73. WORKS("wps", MediaType.application("vnd.ms-works")),
  74. OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
  75. private final String extension;
  76. private final MediaType type;
  77. POIFSDocumentType(String extension, MediaType type) {
  78. this.extension = extension;
  79. this.type = type;
  80. }
  81. public String getExtension() {
  82. return extension;
  83. }
  84. public MediaType getType() {
  85. return type;
  86. }
  87. public static POIFSDocumentType detectType(POIFSFileSystem fs) {
  88. return detectType(fs.getRoot());
  89. }
  90. public static POIFSDocumentType detectType(DirectoryEntry node) {
  91. for (Entry entry : node) {
  92. POIFSDocumentType type = detectType(entry);
  93. if (type!=UNKNOWN) {
  94. return type;
  95. }
  96. }
  97. return UNKNOWN;
  98. }
  99. public static POIFSDocumentType detectType(Entry entry) {
  100. String name = entry.getName();
  101. if ("Workbook".equals(name)) {
  102. return WORKBOOK;
  103. }
  104. if ("EncryptedPackage".equals(name)) {
  105. return ENCRYPTED;
  106. }
  107. if ("WordDocument".equals(name)) {
  108. return WORDDOCUMENT;
  109. }
  110. if ("Quill".equals(name)) {
  111. return PUBLISHER;
  112. }
  113. if ("PowerPoint Document".equals(entry.getName())) {
  114. return POWERPOINT;
  115. }
  116. if ("VisioDocument".equals(entry.getName())) {
  117. return VISIO;
  118. }
  119. if ("CONTENTS".equals(entry.getName())) {
  120. return WORKS;
  121. }
  122. if (entry.getName().startsWith("__substg1.0_")) {
  123. return OUTLOOK;
  124. }
  125. if ("\u0001Ole10Native".equals(name)) {
  126. return POIFSDocumentType.OLE10_NATIVE;
  127. }
  128. return UNKNOWN;
  129. }
  130. }
  131. public Set<MediaType> getSupportedTypes(ParseContext context) {
  132. return SUPPORTED_TYPES;
  133. }
  134. /**
  135. * Extracts properties and text from an MS Document input stream
  136. */
  137. public void parse(
  138. InputStream stream, ContentHandler handler,
  139. Metadata metadata, ParseContext context)
  140. throws IOException, SAXException, TikaException {
  141. XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
  142. xhtml.startDocument();
  143. POIFSFileSystem filesystem;
  144. if(stream instanceof TikaInputStream &&
  145. ((TikaInputStream)stream).getOpenContainer() != null) {
  146. filesystem = (POIFSFileSystem)((TikaInputStream)stream).getOpenContainer();
  147. } else {
  148. filesystem = new POIFSFileSystem(stream);
  149. }
  150. // Parse summary entries first, to make metadata available early
  151. new SummaryExtractor(metadata).parseSummaries(filesystem);
  152. // Parse remaining document entries
  153. boolean outlookExtracted = false;
  154. for (Entry entry : filesystem.getRoot()) {
  155. POIFSDocumentType type = POIFSDocumentType.detectType(entry);
  156. if (type!=POIFSDocumentType.UNKNOWN) {
  157. setType(metadata, type.getType());
  158. }
  159. switch (type) {
  160. case PUBLISHER:
  161. PublisherTextExtractor publisherTextExtractor =
  162. new PublisherTextExtractor(filesystem);
  163. xhtml.element("p", publisherTextExtractor.getText());
  164. break;
  165. case WORDDOCUMENT:
  166. new WordExtractor(context).parse(filesystem, xhtml);
  167. break;
  168. case POWERPOINT:
  169. PowerPointExtractor powerPointExtractor =
  170. new PowerPointExtractor(filesystem);
  171. xhtml.element("p", powerPointExtractor.getText(true, true));
  172. break;
  173. case WORKBOOK:
  174. Locale locale = context.get(Locale.class, Locale.getDefault());
  175. new ExcelExtractor(context).parse(filesystem, xhtml, locale);
  176. break;
  177. case VISIO:
  178. VisioTextExtractor visioTextExtractor =
  179. new VisioTextExtractor(filesystem);
  180. for (String text : visioTextExtractor.getAllText()) {
  181. xhtml.element("p", text);
  182. }
  183. break;
  184. case OUTLOOK:
  185. if (!outlookExtracted) {
  186. outlookExtracted = true;
  187. OutlookExtractor extractor =
  188. new OutlookExtractor(filesystem, context);
  189. extractor.parse(xhtml, metadata);
  190. }
  191. break;
  192. case ENCRYPTED:
  193. EncryptionInfo info = new EncryptionInfo(filesystem);
  194. Decryptor d = new Decryptor(info);
  195. try {
  196. if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
  197. throw new TikaException("Unable to process: document is encrypted");
  198. }
  199. OOXMLParser parser = new OOXMLParser();
  200. parser.parse(d.getDataStream(filesystem), new EmbeddedContentHandler(
  201. new BodyContentHandler(xhtml)),
  202. metadata, context);
  203. } catch (GeneralSecurityException ex) {
  204. throw new TikaException("Unable to process encrypted document", ex);
  205. }
  206. }
  207. }
  208. xhtml.endDocument();
  209. }
  210. /**
  211. * @deprecated This method will be removed in Apache Tika 1.0.
  212. */
  213. public void parse(
  214. InputStream stream, ContentHandler handler, Metadata metadata)
  215. throws IOException, SAXException, TikaException {
  216. parse(stream, handler, metadata, new ParseContext());
  217. }
  218. private void setType(Metadata metadata, MediaType type) {
  219. metadata.set(Metadata.CONTENT_TYPE, type.toString());
  220. }
  221. }