ContentReaderListener.java

/nutchindexing/nutch-1.2/src/plugin/parse-mspowerpoint/src/java/org/apache/nutch/parse/mspowerpoint/ContentReaderListener.java

https://bitbucket.org/AlexeyD/hibench
Java | 431 lines | 213 code | 56 blank | 162 comment | 85 complexity | 75c37848bf3d7cb3ed9cca98daec6b70 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.parse.mspowerpoint;

import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;

/**
 * Listener that reads the content of a PowerPoint file and transfers it to
 * the passed <code>StringBuffer</code>.
 * 
 * @author Stephan Strittmatter - http://www.sybit.de
 * 
 * @version 1.0
 * 
 */
class ContentReaderListener implements POIFSReaderListener {

  private static final Log LOG = LogFactory.getLog(ContentReaderListener.class);

  /** Buffer holding the content of the file */
  protected final transient StringBuffer buf;

  /**
   * Constructs Listener to get content of PowerPoint file.
   * 
   * @param content
   *          StringBuffer refereing the content of the PowerPoint file.
   */
  public ContentReaderListener(final StringBuffer content) {
    this.buf = content;
  }

  /**
   * Reads the internal PowerPoint document stream.
   * 
   * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
   */
  public void processPOIFSReaderEvent(final POIFSReaderEvent event) {

    if (event == null || event.getName() == null
        || !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Stream not processed. It is not a PowerPoint document: : "
                 + event.getName());
      }
      return;
    }

    try {
      final DocumentInputStream dis = event.getStream();
      final byte pptdata[] = new byte[dis.available()];
      dis.read(pptdata, 0, dis.available());
      int offset = 0;
      long offsetPD = 0;

      /*
       * Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text
       * in all PlaceHolders to hold PPTClientTextBox objects for mapping into
       * Slide Objects
       */
      Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
                                                                     * <Long,
                                                                     * TextBox>
                                                                     */();
      // Traverse ByteArray to identiy edit paths of ClientTextBoxes
      long n = pptdata.length - 20;
      for (long i = 0; i < n; i++) {

        final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
        // final long size = LittleEndian.getUInt(pptdata, (int) i + 4);

        if (PPTConstants.PPT_ATOM_USEREDIT == type) {
          /*
           * Checking the Record Header (UserEditAtom)
           */
          // final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8);
          // final long version = LittleEndian.getUInt(pptdata, (int) i + 12);
          offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
          offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);

          /*
           * Call to extract ClientTextBox text in each UserEditAtom
           */
          containerTextBox = extractTextBoxes(containerTextBox, offset,
              pptdata, offsetPD);
        } else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
          // if (LOG.isTraceEnabled()) {
          //   LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
          // }
        } else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
          // if (LOG.isTraceEnabled()) {
          //   LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
          // }
        } else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
          // if (LOG.isTraceEnabled()) {
          //   LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
          // }
        } else {
          // no action
          // if (LOG.isTraceEnabled()) {
          //   LOG.trace("type not handled: " + type);
          // }
        }
      }

      final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
          offsetPD);

      if (slides.size() == 0) {
        if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); }

      } else {
        Slide slide = (Slide) slides.get(slides.size() - 1);

        for (Enumeration enumeration = containerTextBox.elements(); enumeration
            .hasMoreElements();) {
          final TextBox textBox = (TextBox) enumeration.nextElement();
          slide.addContent(textBox.getContent());
        }

        /*
         * Merging TextBox data with Slide Data Printing the text from Slides
         * vector object.
         */
        List scontent;
        for (int i = 0; i < slides.size(); i++) {
          slide = (Slide) slides.get(i);
          scontent = slide.getContent();
          String contentText;

          for (int j = 0; j < scontent.size(); j++) {
            contentText = scontent.get(j).toString();
            this.buf.append(contentText);

            // to avoid concatinated words we add a blank additional
            if (contentText.length() > 0
                && !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
              this.buf.append(" ");
            }
          }
        }
      }
    } catch (Throwable ex) {
      // because of not killing complete crawling all Throwables are catched.
      if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); }
    }
  }

  /**
   * Extracts the client text boxes of a slide.
   * 
   * @param containerTextBox
   * @param offset
   * @param pptdata
   * @param offsetPD
   * @return Hashtable
   * @see TextBox
   */
  protected Hashtable/* <Long, TextBox> */extractTextBoxes(
      final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
      final byte[] pptdata, final long offsetPD) {

    // To hold temporary data
    FilteredStringWriter outStream = new FilteredStringWriter();

    TextBox textBox;

    // Traversing the bytearray up to Presist directory position
    for (int i = offset; i < offsetPD - 20; i++) {
      try {
        // Record info
        // final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
        // Record Type
        final long recordType = LittleEndian.getUShort(pptdata, i + 2);
        // Record Size
        final long recordSize = LittleEndian.getUInt(pptdata, i + 4);

        if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
          /*
           * Record type is of Drawing Group
           */

          // Total number of objects
          // final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
          // 8);
          // currentID = Group ID+number of objects
          long currentID = LittleEndian.getInt(pptdata, i + 12);
          currentID = ((int) (currentID / 1024)) * 1024;

          if (currentID == PPTConstants.PPT_MASTERSLIDE) {
            // Ignore Master Slide objects
            if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); }
            i++;
            continue;
          }

          // Check for the ClientTextBox GroupID existence
          if (containerTextBox.containsKey(new Long(currentID))) {
            // If exists get Client Textbox Group
            textBox = (TextBox) containerTextBox.get(new Long(currentID));
            textBox.setContent("");

          } else {
            textBox = new TextBox(currentID);
            containerTextBox.put(new Long(currentID), textBox);
          }

          /*
           * Iterating the bytearray for TextCharAtoms and TextBytesAtom
           */
          if ((offsetPD - 20) != recordSize) {
            // TODO something wrong? Probably an OLE-Object, which we ignore.
            if (LOG.isDebugEnabled()) {
              LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
                        + recordSize);
            }
          } else {
            for (int startPos = i + 8; startPos < offsetPD - 20
                && startPos < recordSize; startPos++) { // && startPos <
              // recordSize??
              try {

                // Record info
                // final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);

                // Record Type
                final long ntype = LittleEndian
                    .getUShort(pptdata, startPos + 2);

                // Record size
                // Note that the size doesn't include the 8 byte atom header
                final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);

                if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
                  /*
                   * Break the loop if next GroupID found
                   */
                  i = startPos - 1;
                  break;
                } else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
                  // TextByteAtom record
                  outStream = new FilteredStringWriter();
                  long ii = 0;
                  for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
                    // For loop to changed to a function
                    // if ((ii + 2) >= pptdata.length)
                    // break; // FIXME
                    outStream.write((char) (pptdata[(int) ii + 2]));
                  }

                  // Setting the identified text for Current
                  // groupID
                  textBox.setContent(textBox.getContent()
                      + outStream.toString());

                } else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
                  // TextCharAtom record

                  final String strTempContent = new String(pptdata,
                      startPos + 6, (int) (nsize) + 2);
                  final byte bytes[] = strTempContent.getBytes();
                  if (true) {
                    outStream = new FilteredStringWriter();
                    for (int ii = 0; ii < bytes.length - 1; ii += 2) {
                      // For loop to changed to a function
                      outStream.write((char) (pptdata[ii + 2]));
                    }
                    textBox.setContent(textBox.getContent()
                        + outStream.toString());
                  } else {
                    // this version is used within POI
                    String text = StringUtil.getFromCompressedUnicode(bytes, 0,
                        bytes.length);
                    textBox.setContent(textBox.getContent() + text);
                  }

                } else {
                  // ignored
                  // if (LOG.isTraceEnabled()) {
                  //   LOG.trace("Ignored atom type: " + type);
                  // }
                }
              } catch (Throwable e) {
                if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); }
                break;
              }
            }
          }
        } else {
          // Record type is ignored
          // if (LOG.isTraceEnabled()) {
          //   LOG.trace("Ignored record type: " + type);
          // }
        }
      } catch (Throwable ee) {
        if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); }
        break;
      }
    }
    return containerTextBox;
  }

  /**
   * Returns the Powerpoint <code>Slide</code> s of document as vector.
   * 
   * @param offset
   * @param pptdata
   * @param offsetPD
   * @return Vector of the powerpoint slides. Contains
   *         <code>{@link Slide Slide}</code>
   * @see Slide
   */
  protected List /* <Slide> */extractSlides(final long offset,
      final byte[] pptdata, final long offsetPD) {

    int sNum = 0;

    // List of all slides found
    final List/* <Slide> */slides = new Vector/* <Slide> */();

    // current slide data
    Slide currentSlide = null;

    // To store data found in TextCharAtoms and TextBytesAtoms
    FilteredStringWriter outStream;

    for (long i = offset; i < pptdata.length - 20; i++) {

      final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
      final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
      final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);

      if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
        /*
         * TextByteAtom record
         */
        outStream = new FilteredStringWriter();

        for (long ii = i + 6; (ii <= i + 6 + atomSize)
            && (ii + 2 < pptdata.length); ii++) {
          try {
            // if(ii+2 >= pptdata.length) break; //FIXME
            byte value = pptdata[(int) ii + 2];
            outStream.write(value);
          } catch (ArrayIndexOutOfBoundsException ex) {
            if (LOG.isTraceEnabled()) { LOG.trace("size=" + pptdata.length); }
            if (LOG.isErrorEnabled()) { LOG.error("extractSlides", ex); }
          }
        }

        // Setting the identified text for Current Slide
        if (currentSlide != null) {
          currentSlide.addContent(outStream.toString());
        }

      } else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
        /*
         * TextCharAtom record
         */
        outStream = new FilteredStringWriter();
        final String strTempContent = new String(pptdata, (int) i + 6,
            (int) (atomSize) + 2);
        final byte bytes[] = strTempContent.getBytes();

        for (int ii = 0; ii < bytes.length - 1; ii += 2) {
          outStream.write(Utils.getUnicodeCharacter(bytes, ii));
        }

        // Setting the identified text for Current Slide
        if (currentSlide != null) {
          currentSlide.addContent(outStream.toString());
        }

      } else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
        /*
         * SlidePresistAtom Record
         */
        if (sNum != 0) {
          outStream = new FilteredStringWriter();

          final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20);

          currentSlide = new Slide(slideID);
          // currentSlide.addContent(outStream.toString());
          slides.add(currentSlide);
        }
        sNum++;
      } else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
        /*
         * Diagram records are ignored
         */
        if (LOG.isTraceEnabled()) { LOG.trace("Drawing Groups are ignored."); }
        break;
      } else {
        // ignored
        // if (LOG.isTraceEnabled()) {
        //   LOG.trace("Unhandled atomType: " + atomType);
        // }
      }
    }

    return slides;
  }
}