PageRenderTime 43ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/nutchindexing/nutch-1.2/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java

https://bitbucket.org/AlexeyD/hibench
Java | 199 lines | 120 code | 36 blank | 43 comment | 13 complexity | 6da808cb8bd38959fdb78963ffbdbba3 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. /**
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.nutch.parse.ms;
  18. // JDK imports
  19. import java.io.InputStream;
  20. import java.util.Date;
  21. import java.util.Properties;
  22. // Commons Logging imports
  23. import org.apache.commons.logging.Log;
  24. import org.apache.commons.logging.LogFactory;
  25. // Nutch imports
  26. import org.apache.nutch.metadata.DublinCore;
  27. import org.apache.nutch.metadata.HttpHeaders;
  28. import org.apache.nutch.metadata.Metadata;
  29. import org.apache.nutch.metadata.Office;
  30. import org.apache.nutch.net.protocols.HttpDateFormat;
  31. import org.apache.nutch.util.StringUtil;
  32. // Jakarta POI imports
  33. import org.apache.poi.hpsf.PropertySetFactory;
  34. import org.apache.poi.hpsf.SummaryInformation;
  35. import org.apache.poi.poifs.eventfilesystem.POIFSReader;
  36. import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
  37. import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
  38. /**
  39. * Defines a Microsoft document content extractor.
  40. *
  41. * @author Jérôme Charron
  42. */
  43. public abstract class MSExtractor {
  44. protected final static Log LOG = LogFactory.getLog(MSExtractor.class);
  45. private String text = null;
  46. private POIFSReader reader = null;
  47. private PropertiesBroker properties = null;
  48. /** Constructs a new Microsoft document extractor. */
  49. protected MSExtractor() { }
  50. /**
  51. * Extracts properties and text from an MS Document input stream
  52. */
  53. protected void extract(InputStream input) throws Exception {
  54. // First, extract properties
  55. this.reader = new POIFSReader();
  56. this.properties = new PropertiesBroker();
  57. this.reader.registerListener(
  58. new PropertiesReaderListener(this.properties),
  59. SummaryInformation.DEFAULT_STREAM_NAME);
  60. input.reset();
  61. if (input.available() > 0) {
  62. reader.read(input);
  63. }
  64. // Then, extract text
  65. input.reset();
  66. this.text = extractText(input);
  67. }
  68. /**
  69. * Extracts the text content from a Microsoft document input stream.
  70. */
  71. protected abstract String extractText(InputStream input) throws Exception;
  72. /**
  73. * Get the content text of the Microsoft document.
  74. * @return the content text of the document
  75. */
  76. protected String getText() {
  77. return this.text;
  78. }
  79. /**
  80. * Get the <code>Properties</code> of the Microsoft document.
  81. * @return the properties of the document
  82. */
  83. protected Properties getProperties() {
  84. return properties.getProperties();
  85. }
  86. private final static class PropertiesBroker {
  87. private final static int TIMEOUT = 2 * 1000;
  88. private Properties properties = null;
  89. public synchronized Properties getProperties() {
  90. final long start = new Date().getTime();
  91. long now = start;
  92. while (this.properties == null && now - start < TIMEOUT) {
  93. try {
  94. wait(TIMEOUT / 10);
  95. } catch (InterruptedException e) {
  96. }
  97. now = new Date().getTime();
  98. }
  99. notifyAll();
  100. return this.properties;
  101. }
  102. public synchronized void setProperties(Properties properties) {
  103. this.properties = properties;
  104. notifyAll();
  105. }
  106. }
  107. private class PropertiesReaderListener implements POIFSReaderListener {
  108. private PropertiesBroker propertiesBroker;
  109. private Properties metadata = new Properties();
  110. PropertiesReaderListener(PropertiesBroker propertiesBroker) {
  111. this.propertiesBroker = propertiesBroker;
  112. }
  113. public void processPOIFSReaderEvent(POIFSReaderEvent event) {
  114. if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
  115. return;
  116. }
  117. try {
  118. SummaryInformation si = (SummaryInformation)
  119. PropertySetFactory.create(event.getStream());
  120. setProperty(DublinCore.TITLE, si.getTitle());
  121. setProperty(Office.APPLICATION_NAME, si.getApplicationName());
  122. setProperty(Office.AUTHOR, si.getAuthor());
  123. setProperty(Office.CHARACTER_COUNT, si.getCharCount());
  124. setProperty(Office.COMMENTS, si.getComments());
  125. setProperty(DublinCore.DATE, si.getCreateDateTime());
  126. // setProperty(Office.EDIT_TIME, si.getEditTime());
  127. setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
  128. setProperty(Office.KEYWORDS, si.getKeywords());
  129. setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
  130. setProperty(Office.LAST_PRINTED, si.getLastPrinted());
  131. setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
  132. setProperty(Office.PAGE_COUNT, si.getPageCount());
  133. setProperty(Office.REVISION_NUMBER, si.getRevNumber());
  134. setProperty(DublinCore.RIGHTS, si.getSecurity());
  135. setProperty(DublinCore.SUBJECT, si.getSubject());
  136. setProperty(Office.TEMPLATE, si.getTemplate());
  137. setProperty(Office.WORD_COUNT, si.getWordCount());
  138. } catch (Exception ex) {
  139. }
  140. propertiesBroker.setProperties(metadata);
  141. }
  142. private final void setProperty(String name, String value) {
  143. if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) {
  144. metadata.setProperty(name, value);
  145. }
  146. }
  147. private final void setProperty(String name, int value) {
  148. if (value != 0) {
  149. setProperty(name, String.valueOf(value));
  150. }
  151. }
  152. private final void setProperty(String name, long value) {
  153. if (value != 0) {
  154. setProperty(name, String.valueOf(value));
  155. }
  156. }
  157. private final void setProperty(String name, Date date) {
  158. if (date != null) {
  159. setProperty(name, HttpDateFormat.toString(date));
  160. }
  161. }
  162. }
  163. }