PageRenderTime 5715ms CodeModel.GetById 21ms RepoModel.GetById 11ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/detect/ZipContainerDetector.java

https://github.com/solsson/tika
Java | 110 lines | 72 code | 11 blank | 27 comment | 14 complexity | 7527ef2d855a73e56a539cc1c000a1c9 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.detect;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.util.Collections;
  21. import java.util.zip.ZipEntry;
  22. import java.util.zip.ZipFile;
  23. import org.apache.poi.extractor.ExtractorFactory;
  24. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  25. import org.apache.poi.openxml4j.opc.OPCPackage;
  26. import org.apache.poi.openxml4j.opc.PackagePart;
  27. import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
  28. import org.apache.tika.io.IOUtils;
  29. import org.apache.tika.io.TikaInputStream;
  30. import org.apache.tika.metadata.Metadata;
  31. import org.apache.tika.mime.MediaType;
  32. /**
  33. * A detector that works on a Zip document
  34. * to figure out exactly what the file is
  35. */
  36. public class ZipContainerDetector implements ContainerDetector {
  37. public MediaType getDefault() {
  38. return MediaType.APPLICATION_ZIP;
  39. }
  40. public MediaType detect(InputStream input, Metadata metadata)
  41. throws IOException {
  42. if (TikaInputStream.isTikaInputStream(input)) {
  43. return detect(TikaInputStream.get(input), metadata);
  44. } else {
  45. return MediaType.APPLICATION_ZIP;
  46. }
  47. }
  48. public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
  49. ZipFile zip = new ZipFile(input.getFile());
  50. for (ZipEntry entry : Collections.list(zip.entries())) {
  51. // Is it an Open Document file?
  52. if (entry.getName().equals("mimetype")) {
  53. InputStream stream = zip.getInputStream(entry);
  54. try {
  55. return fromString(IOUtils.toString(stream, "UTF-8"));
  56. } finally {
  57. stream.close();
  58. }
  59. } else if (entry.getName().equals("_rels/.rels") ||
  60. entry.getName().equals("[Content_Types].xml")) {
  61. // Office Open XML File
  62. // As POI to open and investigate it for us
  63. try {
  64. OPCPackage pkg = OPCPackage.open(input.getFile().toString());
  65. input.setOpenContainer(pkg);
  66. PackageRelationshipCollection core =
  67. pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
  68. if(core.size() != 1) {
  69. throw new IOException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
  70. }
  71. // Get the type of the core document part
  72. PackagePart corePart = pkg.getPart(core.getRelationship(0));
  73. String coreType = corePart.getContentType();
  74. // Turn that into the type of the overall document
  75. String docType = coreType.substring(0, coreType.lastIndexOf('.'));
  76. return fromString(docType);
  77. } catch(InvalidFormatException e) {
  78. throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage());
  79. }
  80. } else if(entry.getName().equals("buildVersionHistory.plist")) {
  81. // TODO - iWork
  82. } else if(entry.getName().equals("META-INF/")) {
  83. // Java Jar
  84. return MediaType.application("java-archive");
  85. }
  86. }
  87. return MediaType.APPLICATION_ZIP;
  88. }
  89. private static MediaType fromString(String type) {
  90. int splitAt = type.indexOf('/');
  91. if(splitAt > -1) {
  92. return new MediaType(
  93. type.substring(0,splitAt),
  94. type.substring(splitAt+1)
  95. );
  96. }
  97. return MediaType.APPLICATION_ZIP;
  98. }
  99. }