PageRenderTime 43ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 0ms

/portal-impl/src/com/liferay/portal/metadata/TikaRawMetadataProcessor.java

http://github.com/liferay/liferay-portal
Java | 255 lines | 181 code | 55 blank | 19 comment | 14 complexity | ed379e48c9033136d2cb6d6d44d76a36 MD5 | raw file
Possible License(s): LGPL-2.0
  1. /**
  2. * Copyright (c) 2000-present Liferay, Inc. All rights reserved.
  3. *
  4. * This library is free software; you can redistribute it and/or modify it under
  5. * the terms of the GNU Lesser General Public License as published by the Free
  6. * Software Foundation; either version 2.1 of the License, or (at your option)
  7. * any later version.
  8. *
  9. * This library is distributed in the hope that it will be useful, but WITHOUT
  10. * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11. * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
  12. * details.
  13. */
  14. package com.liferay.portal.metadata;
  15. import com.liferay.petra.process.ProcessCallable;
  16. import com.liferay.petra.process.ProcessChannel;
  17. import com.liferay.petra.process.ProcessException;
  18. import com.liferay.petra.process.ProcessExecutor;
  19. import com.liferay.portal.fabric.InputResource;
  20. import com.liferay.portal.kernel.exception.SystemException;
  21. import com.liferay.portal.kernel.io.DummyWriter;
  22. import com.liferay.portal.kernel.log.Log;
  23. import com.liferay.portal.kernel.log.LogFactoryUtil;
  24. import com.liferay.portal.kernel.util.ArrayUtil;
  25. import com.liferay.portal.kernel.util.ContentTypes;
  26. import com.liferay.portal.kernel.util.FileUtil;
  27. import com.liferay.portal.kernel.util.ServiceProxyFactory;
  28. import com.liferay.portal.kernel.util.StringUtil;
  29. import com.liferay.portal.util.PortalClassPathUtil;
  30. import com.liferay.portal.util.PropsValues;
  31. import java.io.File;
  32. import java.io.FileInputStream;
  33. import java.io.IOException;
  34. import java.io.InputStream;
  35. import java.util.concurrent.Future;
  36. import java.util.logging.Level;
  37. import java.util.logging.Logger;
  38. import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
  39. import org.apache.commons.lang.exception.ExceptionUtils;
  40. import org.apache.poi.EncryptedDocumentException;
  41. import org.apache.tika.exception.TikaException;
  42. import org.apache.tika.metadata.Metadata;
  43. import org.apache.tika.metadata.XMPDM;
  44. import org.apache.tika.parser.ParseContext;
  45. import org.apache.tika.parser.Parser;
  46. import org.apache.tika.sax.WriteOutContentHandler;
  47. import org.xml.sax.ContentHandler;
  48. /**
  49. * @author Miguel Pastor
  50. * @author Alexander Chow
  51. * @author Shuyang Zhou
  52. */
  53. public class TikaRawMetadataProcessor extends XugglerRawMetadataProcessor {
  54. public void setParser(Parser parser) {
  55. _parser = parser;
  56. }
  57. @Override
  58. protected Metadata extractMetadata(
  59. String extension, String mimeType, File file) {
  60. Metadata metadata = super.extractMetadata(extension, mimeType, file);
  61. boolean forkProcess = false;
  62. if (PropsValues.TEXT_EXTRACTION_FORK_PROCESS_ENABLED &&
  63. ArrayUtil.contains(
  64. PropsValues.TEXT_EXTRACTION_FORK_PROCESS_MIME_TYPES,
  65. mimeType)) {
  66. forkProcess = true;
  67. }
  68. if (forkProcess) {
  69. ExtractMetadataProcessCallable extractMetadataProcessCallable =
  70. new ExtractMetadataProcessCallable(file, metadata, _parser);
  71. try {
  72. ProcessChannel<Metadata> processChannel =
  73. _processExecutor.execute(
  74. PortalClassPathUtil.getPortalProcessConfig(),
  75. extractMetadataProcessCallable);
  76. Future<Metadata> future =
  77. processChannel.getProcessNoticeableFuture();
  78. return _postProcessMetadata(mimeType, future.get());
  79. }
  80. catch (Exception exception) {
  81. throw new SystemException(exception);
  82. }
  83. }
  84. try {
  85. return _postProcessMetadata(
  86. mimeType,
  87. ExtractMetadataProcessCallable.extractMetadata(
  88. file, metadata, _parser));
  89. }
  90. catch (IOException ioException) {
  91. throw new SystemException(ioException);
  92. }
  93. }
  94. @Override
  95. protected Metadata extractMetadata(
  96. String extension, String mimeType, InputStream inputStream) {
  97. File file = FileUtil.createTempFile();
  98. try {
  99. FileUtil.write(file, inputStream);
  100. return extractMetadata(extension, mimeType, file);
  101. }
  102. catch (Exception exception) {
  103. throw new SystemException(exception);
  104. }
  105. finally {
  106. file.delete();
  107. }
  108. }
  109. private Metadata _postProcessMetadata(String mimeType, Metadata metadata) {
  110. if (!mimeType.equals(ContentTypes.IMAGE_SVG_XML)) {
  111. return metadata;
  112. }
  113. String contentType = metadata.get("Content-Type");
  114. if (contentType.startsWith(ContentTypes.TEXT_PLAIN)) {
  115. metadata.set(
  116. "Content-Type",
  117. StringUtil.replace(
  118. mimeType, ContentTypes.TEXT_PLAIN,
  119. ContentTypes.IMAGE_SVG_XML));
  120. }
  121. return metadata;
  122. }
  123. private static final Log _log = LogFactoryUtil.getLog(
  124. TikaRawMetadataProcessor.class);
  125. private static volatile ProcessExecutor _processExecutor =
  126. ServiceProxyFactory.newServiceTrackedInstance(
  127. ProcessExecutor.class, TikaRawMetadataProcessor.class,
  128. "_processExecutor", true);
  129. private Parser _parser;
  130. private static class ExtractMetadataProcessCallable
  131. implements ProcessCallable<Metadata> {
  132. public ExtractMetadataProcessCallable(
  133. File file, Metadata metadata, Parser parser) {
  134. _file = file;
  135. _metadata = metadata;
  136. _parser = parser;
  137. }
  138. @Override
  139. public Metadata call() throws ProcessException {
  140. Logger logger = Logger.getLogger(
  141. "org.apache.tika.parser.SQLite3Parser");
  142. logger.setLevel(Level.SEVERE);
  143. logger = Logger.getLogger("org.apache.tika.parsers.PDFParser");
  144. logger.setLevel(Level.SEVERE);
  145. try {
  146. return extractMetadata(_file, _metadata, _parser);
  147. }
  148. catch (IOException ioException) {
  149. throw new ProcessException(ioException);
  150. }
  151. }
  152. protected static Metadata extractMetadata(
  153. File file, Metadata metadata, Parser parser)
  154. throws IOException {
  155. if (metadata == null) {
  156. metadata = new Metadata();
  157. }
  158. if (file.length() == 0) {
  159. return metadata;
  160. }
  161. ParseContext parseContext = new ParseContext();
  162. parseContext.set(Parser.class, parser);
  163. ContentHandler contentHandler = new WriteOutContentHandler(
  164. new DummyWriter());
  165. try (InputStream inputStream = new FileInputStream(file)) {
  166. parser.parse(
  167. inputStream, contentHandler, metadata, parseContext);
  168. }
  169. catch (Exception exception) {
  170. Throwable throwable = ExceptionUtils.getRootCause(exception);
  171. if (throwable instanceof EncryptedDocumentException ||
  172. throwable instanceof UnsupportedZipFeatureException) {
  173. if (_log.isWarnEnabled()) {
  174. _log.warn(
  175. "Unable to extract metadata from an encrypted " +
  176. "file");
  177. }
  178. }
  179. else if (exception instanceof TikaException) {
  180. if (_log.isWarnEnabled()) {
  181. _log.warn("Unable to extract metadata");
  182. }
  183. }
  184. else {
  185. _log.error(exception, exception);
  186. }
  187. throw new IOException(exception);
  188. }
  189. // Remove potential security risks
  190. metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
  191. metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());
  192. return metadata;
  193. }
  194. private static final long serialVersionUID = 1L;
  195. @InputResource
  196. private final File _file;
  197. private final Metadata _metadata;
  198. private final Parser _parser;
  199. }
  200. }