/nutchindexing/nutch-1.2/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java
Java | 431 lines | 213 code | 56 blank | 162 comment | 85 complexity | 75c37848bf3d7cb3ed9cca98daec6b70 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
- /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.nutch.parse.mspowerpoint;
- import java.util.Enumeration;
- import java.util.Hashtable;
- import java.util.List;
- import java.util.Vector;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.poi.hdf.extractor.Utils;
- import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
- import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
- import org.apache.poi.poifs.filesystem.DocumentInputStream;
- import org.apache.poi.util.LittleEndian;
- import org.apache.poi.util.StringUtil;
- /**
- * Listener to read the content of PowerPoint file and transfere it to the
- * passed <code>StringBuffer</code>.
- *
- * @author Stephan Strittmatter - http://www.sybit.de
- *
- * @version 1.0
- *
- */
- class ContentReaderListener implements POIFSReaderListener {
- private static final Log LOG = LogFactory.getLog(ContentReaderListener.class);
- /** Buffer holding the content of the file */
- protected final transient StringBuffer buf;
- /**
- * Constructs Listener to get content of PowerPoint file.
- *
- * @param content
- * StringBuffer refereing the content of the PowerPoint file.
- */
- public ContentReaderListener(final StringBuffer content) {
- this.buf = content;
- }
- /**
- * Reads the internal PowerPoint document stream.
- *
- * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
- */
- public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
- if (event == null || event.getName() == null
- || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Stream not processed. It is not a PowerPoint document: : "
- + event.getName());
- }
- return;
- }
- try {
- final DocumentInputStream dis = event.getStream();
- final byte pptdata[] = new byte[dis.available()];
- dis.read(pptdata, 0, dis.available());
- int offset = 0;
- long offsetPD = 0;
- /*
- * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text
- * in all PlaceHolders to hold PPTClientTextBox objects for mapping into
- * Slide Objects
- */
- Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
- * <Long,
- * TextBox>
- */();
- // Traverse ByteArray to identiy edit paths of ClientTextBoxes
- long n = pptdata.length - 20;
- for (long i = 0; i < n; i++) {
- final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
- // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
- if (PPTConstants.PPT_ATOM_USEREDIT == type) {
- /*
- * Checking the Record Header (UserEditAtom)
- */
- // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8);
- // final long version = LittleEndian.getUInt(pptdata, (int) i + 12);
- offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
- offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
- /*
- * Call to extract ClientTextBox text in each UserEditAtom
- */
- containerTextBox = extractTextBoxes(containerTextBox, offset,
- pptdata, offsetPD);
- } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
- // }
- } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
- // }
- } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
- // }
- } else {
- // no action
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("type not handled: " + type);
- // }
- }
- }
- final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
- offsetPD);
- if (slides.size() == 0) {
- if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); }
- } else {
- Slide slide = (Slide) slides.get(slides.size() - 1);
- for (Enumeration enumeration = containerTextBox.elements(); enumeration
- .hasMoreElements();) {
- final TextBox textBox = (TextBox) enumeration.nextElement();
- slide.addContent(textBox.getContent());
- }
- /*
- * Merging TextBox data with Slide Data Printing the text from Slides
- * vector object.
- */
- List scontent;
- for (int i = 0; i < slides.size(); i++) {
- slide = (Slide) slides.get(i);
- scontent = slide.getContent();
- String contentText;
- for (int j = 0; j < scontent.size(); j++) {
- contentText = scontent.get(j).toString();
- this.buf.append(contentText);
- // to avoid concatinated words we add a blank additional
- if (contentText.length() > 0
- && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
- this.buf.append(" ");
- }
- }
- }
- }
- } catch (Throwable ex) {
- // because of not killing complete crawling all Throwables are catched.
- if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); }
- }
- }
- /**
- * Extracts the client text boxes of a slide.
- *
- * @param containerTextBox
- * @param offset
- * @param pptdata
- * @param offsetPD
- * @return Hashtable
- * @see TextBox
- */
- protected Hashtable/* <Long, TextBox> */extractTextBoxes(
- final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
- final byte[] pptdata, final long offsetPD) {
- // To hold temporary data
- FilteredStringWriter outStream = new FilteredStringWriter();
- TextBox textBox;
- // Traversing the bytearray up to Presist directory position
- for (int i = offset; i < offsetPD - 20; i++) {
- try {
- // Record info
- // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
- // Record Type
- final long recordType = LittleEndian.getUShort(pptdata, i + 2);
- // Record Size
- final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
- if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
- /*
- * Record type is of Drawing Group
- */
- // Total number of objects
- // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
- // 8);
- // currentID = Group ID+number of objects
- long currentID = LittleEndian.getInt(pptdata, i + 12);
- currentID = ((int) (currentID / 1024)) * 1024;
- if (currentID == PPTConstants.PPT_MASTERSLIDE) {
- // Ignore Master Slide objects
- if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); }
- i++;
- continue;
- }
- // Check for the ClientTextBox GroupID existence
- if (containerTextBox.containsKey(new Long(currentID))) {
- // If exists get Client Textbox Group
- textBox = (TextBox) containerTextBox.get(new Long(currentID));
- textBox.setContent("");
- } else {
- textBox = new TextBox(currentID);
- containerTextBox.put(new Long(currentID), textBox);
- }
- /*
- * Iterating the bytearray for TextCharAtoms and TextBytesAtom
- */
- if ((offsetPD - 20) != recordSize) {
- // TODO something wrong? Probably an OLE-Object, which we ignore.
- if (LOG.isDebugEnabled()) {
- LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
- + recordSize);
- }
- } else {
- for (int startPos = i + 8; startPos < offsetPD - 20
- && startPos < recordSize; startPos++) { // && startPos <
- // recordSize??
- try {
- // Record info
- // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);
- // Record Type
- final long ntype = LittleEndian
- .getUShort(pptdata, startPos + 2);
- // Record size
- // Note that the size doesn't include the 8 byte atom header
- final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);
- if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
- /*
- * Break the loop if next GroupID found
- */
- i = startPos - 1;
- break;
- } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
- // TextByteAtom record
- outStream = new FilteredStringWriter();
- long ii = 0;
- for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
- // For loop to changed to a function
- // if ((ii + 2) >= pptdata.length)
- // break; // FIXME
- outStream.write((char) (pptdata[(int) ii + 2]));
- }
- // Setting the identified text for Current
- // groupID
- textBox.setContent(textBox.getContent()
- + outStream.toString());
- } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
- // TextCharAtom record
- final String strTempContent = new String(pptdata,
- startPos + 6, (int) (nsize) + 2);
- final byte bytes[] = strTempContent.getBytes();
- if (true) {
- outStream = new FilteredStringWriter();
- for (int ii = 0; ii < bytes.length - 1; ii += 2) {
- // For loop to changed to a function
- outStream.write((char) (pptdata[ii + 2]));
- }
- textBox.setContent(textBox.getContent()
- + outStream.toString());
- } else {
- // this version is used within POI
- String text = StringUtil.getFromCompressedUnicode(bytes, 0,
- bytes.length);
- textBox.setContent(textBox.getContent() + text);
- }
- } else {
- // ignored
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("Ignored atom type: " + type);
- // }
- }
- } catch (Throwable e) {
- if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); }
- break;
- }
- }
- }
- } else {
- // Record type is ignored
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("Ignored record type: " + type);
- // }
- }
- } catch (Throwable ee) {
- if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); }
- break;
- }
- }
- return containerTextBox;
- }
- /**
- * Returns the Powerpoint <code>Slide</code> s of document as vector.
- *
- * @param offset
- * @param pptdata
- * @param offsetPD
- * @return Vector of the powerpoint slides. Contains
- * <code>{@link Slide Slide}</code>
- * @see Slide
- */
- protected List /* <Slide> */extractSlides(final long offset,
- final byte[] pptdata, final long offsetPD) {
- int sNum = 0;
- // List of all slides found
- final List/* <Slide> */slides = new Vector/* <Slide> */();
- // current slide data
- Slide currentSlide = null;
- // To store data found in TextCharAtoms and TextBytesAtoms
- FilteredStringWriter outStream;
- for (long i = offset; i < pptdata.length - 20; i++) {
- final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
- final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
- final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
- if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
- /*
- * TextByteAtom record
- */
- outStream = new FilteredStringWriter();
- for (long ii = i + 6; (ii <= i + 6 + atomSize)
- && (ii + 2 < pptdata.length); ii++) {
- try {
- // if(ii+2 >= pptdata.length) break; //FIXME
- byte value = pptdata[(int) ii + 2];
- outStream.write(value);
- } catch (ArrayIndexOutOfBoundsException ex) {
- if (LOG.isTraceEnabled()) { LOG.trace("size=" + pptdata.length); }
- if (LOG.isErrorEnabled()) { LOG.error("extractSlides", ex); }
- }
- }
- // Setting the identified text for Current Slide
- if (currentSlide != null) {
- currentSlide.addContent(outStream.toString());
- }
- } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
- /*
- * TextCharAtom record
- */
- outStream = new FilteredStringWriter();
- final String strTempContent = new String(pptdata, (int) i + 6,
- (int) (atomSize) + 2);
- final byte bytes[] = strTempContent.getBytes();
- for (int ii = 0; ii < bytes.length - 1; ii += 2) {
- outStream.write(Utils.getUnicodeCharacter(bytes, ii));
- }
- // Setting the identified text for Current Slide
- if (currentSlide != null) {
- currentSlide.addContent(outStream.toString());
- }
- } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
- /*
- * SlidePresistAtom Record
- */
- if (sNum != 0) {
- outStream = new FilteredStringWriter();
- final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20);
- currentSlide = new Slide(slideID);
- // currentSlide.addContent(outStream.toString());
- slides.add(currentSlide);
- }
- sNum++;
- } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
- /*
- * Diagram records are ignored
- */
- if (LOG.isTraceEnabled()) { LOG.trace("Drawing Groups are ignored."); }
- break;
- } else {
- // ignored
- // if (LOG.isTraceEnabled()) {
- // LOG.trace("Unhandled atomType: " + atomType);
- // }
- }
- }
- return slides;
- }
- }