PageRenderTime 52ms CodeModel.GetById 8ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/su/msk/jet/tikaserver/UnpackerResource.java

https://github.com/maxcom/tikaserver-ex
Java | 164 lines | 132 code | 32 blank | 0 comment | 16 complexity | 6dc3574dca462f0e9a4b22e8173f374d MD5 | raw file
  1. package su.msk.jet.tikaserver;
  2. import org.apache.commons.lang.mutable.MutableInt;
  3. import org.apache.commons.logging.Log;
  4. import org.apache.commons.logging.LogFactory;
  5. import org.apache.poi.poifs.filesystem.Ole10Native;
  6. import org.apache.poi.poifs.filesystem.Ole10NativeException;
  7. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  8. import org.apache.poi.util.IOUtils;
  9. import org.apache.tika.config.TikaConfig;
  10. import org.apache.tika.extractor.EmbeddedDocumentExtractor;
  11. import org.apache.tika.metadata.Metadata;
  12. import org.apache.tika.metadata.TikaMetadataKeys;
  13. import org.apache.tika.mime.MediaType;
  14. import org.apache.tika.mime.MimeTypeException;
  15. import org.apache.tika.parser.ParseContext;
  16. import org.apache.tika.parser.Parser;
  17. import org.apache.tika.parser.microsoft.OfficeParser;
  18. import org.xml.sax.ContentHandler;
  19. import org.xml.sax.SAXException;
  20. import org.xml.sax.helpers.DefaultHandler;
  21. import javax.ws.rs.PUT;
  22. import javax.ws.rs.Path;
  23. import javax.ws.rs.Produces;
  24. import javax.ws.rs.WebApplicationException;
  25. import javax.ws.rs.core.Context;
  26. import javax.ws.rs.core.HttpHeaders;
  27. import javax.ws.rs.core.Response;
  28. import javax.ws.rs.core.StreamingOutput;
  29. import java.io.*;
  30. import java.util.Collections;
  31. import java.util.zip.ZipOutputStream;
  32. @Path("/unpacker")
  33. public class UnpackerResource {
  34. private static final Log logger = LogFactory.getLog(UnpackerResource.class);
  35. private final TikaConfig tikaConfig;
  36. public UnpackerResource() {
  37. tikaConfig = TikaConfig.getDefaultConfig();
  38. }
  39. @PUT
  40. @Produces("application/zip")
  41. public StreamingOutput getText(
  42. InputStream is,
  43. @Context HttpHeaders httpHeaders
  44. ) throws Exception {
  45. if (!is.markSupported()) {
  46. is = new BufferedInputStream(is);
  47. }
  48. Parser parser;
  49. javax.ws.rs.core.MediaType mediaType = httpHeaders.getMediaType();
  50. if (mediaType !=null && !mediaType.equals(javax.ws.rs.core.MediaType.APPLICATION_OCTET_STREAM_TYPE)) {
  51. parser = tikaConfig.getParser(new MediaType(httpHeaders.getMediaType().getType(), httpHeaders.getMediaType().getSubtype()));
  52. } else {
  53. MediaType type = tikaConfig.getMimeRepository().detect(is, new Metadata());
  54. parser = tikaConfig.getParser(type);
  55. }
  56. if (parser==null) {
  57. throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
  58. }
  59. ContentHandler ch = new DefaultHandler();
  60. ParseContext pc = new ParseContext();
  61. ZipOutput zout = new ZipOutput();
  62. MutableInt count = new MutableInt();
  63. pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, zout));
  64. parser.parse(is, ch, new Metadata(), pc);
  65. if (count.intValue()==0) {
  66. throw new WebApplicationException(Response.Status.NO_CONTENT);
  67. }
  68. return zout;
  69. }
  70. private class MyEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
  71. private final MutableInt count;
  72. private final ZipOutput zout;
  73. MyEmbeddedDocumentExtractor(MutableInt count, ZipOutput zout) {
  74. this.count = count;
  75. this.zout = zout;
  76. }
  77. @Override
  78. public boolean shouldParseEmbedded(Metadata metadata) {
  79. return true;
  80. }
  81. @Override
  82. public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException {
  83. ByteArrayOutputStream bos = new ByteArrayOutputStream();
  84. IOUtils.copy(inputStream, bos);
  85. byte[] data = bos.toByteArray();
  86. String name = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
  87. String contentType = metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
  88. if (name == null) {
  89. name = Integer.toString(count.intValue());
  90. }
  91. if (!name.contains(".")) {
  92. try {
  93. String ext = tikaConfig.getMimeRepository().forName(contentType).getExtension();
  94. if (ext!=null) {
  95. name += ext;
  96. }
  97. } catch (MimeTypeException e) {
  98. logger.warn("Unexpected MimeTypeException", e);
  99. }
  100. }
  101. if ("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) {
  102. POIFSFileSystem poifs = new POIFSFileSystem(new ByteArrayInputStream(data));
  103. OfficeParser.POIFSDocumentType type = OfficeParser.POIFSDocumentType.detectType(poifs);
  104. if (type == OfficeParser.POIFSDocumentType.OLE10_NATIVE) {
  105. try {
  106. Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs);
  107. if (ole.getDataSize()>0) {
  108. String label = ole.getLabel();
  109. if (label.startsWith("ole-")) {
  110. label = Integer.toString(count.intValue()) + '-' + label;
  111. }
  112. name = label;
  113. data = ole.getDataBuffer();
  114. }
  115. } catch (Ole10NativeException ex) {
  116. logger.warn("Skipping invalid part", ex);
  117. }
  118. } else {
  119. name += '.' + type.getExtension();
  120. }
  121. }
  122. final String finalName = name;
  123. zout.put(new PartExtractor<byte[]>() {
  124. @Override
  125. public void extract(byte[] part, ZipOutputStream output) throws IOException {
  126. ZipUtils.zipStoreBuffer(output, finalName, part);
  127. }
  128. }, Collections.singletonList(data));
  129. count.increment();
  130. }
  131. }
  132. }