/nutchindexing/nutch-1.2/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSExtractor.java
Java | 199 lines | 120 code | 36 blank | 43 comment | 13 complexity | 6da808cb8bd38959fdb78963ffbdbba3 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.nutch.parse.ms;
- // JDK imports
- import java.io.InputStream;
- import java.util.Date;
- import java.util.Properties;
- // Commons Logging imports
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- // Nutch imports
- import org.apache.nutch.metadata.DublinCore;
- import org.apache.nutch.metadata.HttpHeaders;
- import org.apache.nutch.metadata.Metadata;
- import org.apache.nutch.metadata.Office;
- import org.apache.nutch.net.protocols.HttpDateFormat;
- import org.apache.nutch.util.StringUtil;
- // Jakarta POI imports
- import org.apache.poi.hpsf.PropertySetFactory;
- import org.apache.poi.hpsf.SummaryInformation;
- import org.apache.poi.poifs.eventfilesystem.POIFSReader;
- import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
- import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
- /**
- * Defines a Microsoft document content extractor.
- *
- * @author Jérôme Charron
- */
- public abstract class MSExtractor {
-
- protected final static Log LOG = LogFactory.getLog(MSExtractor.class);
- private String text = null;
- private POIFSReader reader = null;
- private PropertiesBroker properties = null;
-
- /** Constructs a new Microsoft document extractor. */
- protected MSExtractor() { }
-
- /**
- * Extracts properties and text from an MS Document input stream
- */
- protected void extract(InputStream input) throws Exception {
- // First, extract properties
- this.reader = new POIFSReader();
- this.properties = new PropertiesBroker();
- this.reader.registerListener(
- new PropertiesReaderListener(this.properties),
- SummaryInformation.DEFAULT_STREAM_NAME);
- input.reset();
- if (input.available() > 0) {
- reader.read(input);
- }
- // Then, extract text
- input.reset();
- this.text = extractText(input);
- }
- /**
- * Extracts the text content from a Microsoft document input stream.
- */
- protected abstract String extractText(InputStream input) throws Exception;
-
-
- /**
- * Get the content text of the Microsoft document.
- * @return the content text of the document
- */
- protected String getText() {
- return this.text;
- }
-
- /**
- * Get the <code>Properties</code> of the Microsoft document.
- * @return the properties of the document
- */
- protected Properties getProperties() {
- return properties.getProperties();
- }
-
- private final static class PropertiesBroker {
- private final static int TIMEOUT = 2 * 1000;
- private Properties properties = null;
- public synchronized Properties getProperties() {
- final long start = new Date().getTime();
- long now = start;
- while (this.properties == null && now - start < TIMEOUT) {
- try {
- wait(TIMEOUT / 10);
- } catch (InterruptedException e) {
- }
- now = new Date().getTime();
- }
- notifyAll();
- return this.properties;
- }
- public synchronized void setProperties(Properties properties) {
- this.properties = properties;
- notifyAll();
- }
- }
-
-
- private class PropertiesReaderListener implements POIFSReaderListener {
-
- private PropertiesBroker propertiesBroker;
- private Properties metadata = new Properties();
-
- PropertiesReaderListener(PropertiesBroker propertiesBroker) {
- this.propertiesBroker = propertiesBroker;
- }
-
- public void processPOIFSReaderEvent(POIFSReaderEvent event) {
- if (!event.getName().startsWith(SummaryInformation.DEFAULT_STREAM_NAME)) {
- return;
- }
-
- try {
- SummaryInformation si = (SummaryInformation)
- PropertySetFactory.create(event.getStream());
- setProperty(DublinCore.TITLE, si.getTitle());
- setProperty(Office.APPLICATION_NAME, si.getApplicationName());
- setProperty(Office.AUTHOR, si.getAuthor());
- setProperty(Office.CHARACTER_COUNT, si.getCharCount());
- setProperty(Office.COMMENTS, si.getComments());
- setProperty(DublinCore.DATE, si.getCreateDateTime());
- // setProperty(Office.EDIT_TIME, si.getEditTime());
- setProperty(HttpHeaders.LAST_MODIFIED, si.getLastSaveDateTime());
- setProperty(Office.KEYWORDS, si.getKeywords());
- setProperty(Office.LAST_AUTHOR, si.getLastAuthor());
- setProperty(Office.LAST_PRINTED, si.getLastPrinted());
- setProperty(Office.LAST_SAVED, si.getLastSaveDateTime());
- setProperty(Office.PAGE_COUNT, si.getPageCount());
- setProperty(Office.REVISION_NUMBER, si.getRevNumber());
- setProperty(DublinCore.RIGHTS, si.getSecurity());
- setProperty(DublinCore.SUBJECT, si.getSubject());
- setProperty(Office.TEMPLATE, si.getTemplate());
- setProperty(Office.WORD_COUNT, si.getWordCount());
- } catch (Exception ex) {
- }
- propertiesBroker.setProperties(metadata);
- }
-
- private final void setProperty(String name, String value) {
- if (!StringUtil.isEmpty(name) && !StringUtil.isEmpty(value)) {
- metadata.setProperty(name, value);
- }
- }
- private final void setProperty(String name, int value) {
- if (value != 0) {
- setProperty(name, String.valueOf(value));
- }
- }
- private final void setProperty(String name, long value) {
- if (value != 0) {
- setProperty(name, String.valueOf(value));
- }
- }
- private final void setProperty(String name, Date date) {
- if (date != null) {
- setProperty(name, HttpDateFormat.toString(date));
- }
- }
- }
-
- }