/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/WordExtractor.java
Java | 230 lines | 151 code | 32 blank | 47 comment | 21 complexity | 40789e266310b897b848347166114477 MD5 | raw file
Possible License(s): BSD-3-Clause
- /* Copyright 2004 Ryan Ackley
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.nutch.parse.msword;
- // JDK imports
- import java.io.InputStream;
- import java.util.ArrayList;
- import java.util.Iterator;
- import java.util.List;
- // Jakarta POI imports
- import org.apache.poi.hwpf.model.CHPBinTable;
- import org.apache.poi.hwpf.model.CHPX;
- import org.apache.poi.hwpf.model.ComplexFileTable;
- import org.apache.poi.hwpf.model.TextPiece;
- import org.apache.poi.hwpf.model.TextPieceTable;
- import org.apache.poi.hwpf.sprm.SprmIterator;
- import org.apache.poi.hwpf.sprm.SprmOperation;
- import org.apache.poi.poifs.filesystem.DocumentEntry;
- import org.apache.poi.poifs.filesystem.DocumentInputStream;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.LittleEndian;
- // Nutch imports
- import org.apache.nutch.parse.ms.MSExtractor;
- /**
- * Extracts the plain text from a Microsoft Word 6.0/95/97/2000/XP document.
- *
- * @author Ryan Ackley
- * @author Andy Hedges
- * @author Jérôme Charron
- *
- */
- class WordExtractor extends MSExtractor {
- /**
- * Gets the text from a Word document.
- *
- * @param in The InputStream representing the Word file.
- */
- protected String extractText(InputStream in) throws Exception {
- ArrayList text = new ArrayList();
- POIFSFileSystem fsys = new POIFSFileSystem(in);
- // load our POIFS document streams.
- DocumentEntry headerProps =
- (DocumentEntry)fsys.getRoot().getEntry("WordDocument");
- DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
- byte[] header = new byte[headerProps.getSize()];
- din.read(header);
- din.close();
- int info = LittleEndian.getShort(header, 0xa);
- if ((info & 0x4) != 0)
- {
- throw new FastSavedException("Fast-saved files are unsupported at this time");
- }
- if ((info & 0x100) != 0)
- {
- throw new PasswordProtectedException("This document is password protected");
- }
- // determine the version of Word this document came from.
- int nFib = LittleEndian.getShort(header, 0x2);
- switch (nFib)
- {
- case 101:
- case 102:
- case 103:
- case 104:
- // this is a Word 6.0 doc send it to the extractor for that version.
- Word6Extractor oldExtractor = new Word6Extractor();
- return oldExtractor.extractText(header);
- }
- //Get the information we need from the header
- boolean useTable1 = (info & 0x200) != 0;
- //get the location of the piece table
- int complexOffset = LittleEndian.getInt(header, 0x1a2);
- // determine which table stream we must use.
- String tableName = null;
- if (useTable1)
- {
- tableName = "1Table";
- }
- else
- {
- tableName = "0Table";
- }
- DocumentEntry table = (DocumentEntry)fsys.getRoot().getEntry(tableName);
- byte[] tableStream = new byte[table.getSize()];
- din = fsys.createDocumentInputStream(tableName);
- din.read(tableStream);
- din.close();
- int chpOffset = LittleEndian.getInt(header, 0xfa);
- int chpSize = LittleEndian.getInt(header, 0xfe);
- int fcMin = LittleEndian.getInt(header, 0x18);
- CHPBinTable cbt = new CHPBinTable(header, tableStream, chpOffset, chpSize, fcMin);
- // load our text pieces and our character runs
- ComplexFileTable cft = new ComplexFileTable(header, tableStream, complexOffset, fcMin);
- TextPieceTable tpt = cft.getTextPieceTable();
- List textPieces = tpt.getTextPieces();
- // make the POIFS objects available for garbage collection
- din = null;
- fsys = null;
- table = null;
- headerProps = null;
- List textRuns = cbt.getTextRuns();
- Iterator runIt = textRuns.iterator();
- Iterator textIt = textPieces.iterator();
- TextPiece currentPiece = (TextPiece)textIt.next();
- int currentTextStart = currentPiece.getStart();
- int currentTextEnd = currentPiece.getEnd();
- WordTextBuffer finalTextBuf = new WordTextBuffer();
- // iterate through all text runs extract the text only if they haven't been
- // deleted
- while (runIt.hasNext())
- {
- CHPX chpx = (CHPX)runIt.next();
- boolean deleted = isDeleted(chpx.getGrpprl());
- if (deleted)
- {
- continue;
- }
- int runStart = chpx.getStart();
- int runEnd = chpx.getEnd();
- while (runStart >= currentTextEnd)
- {
- currentPiece = (TextPiece) textIt.next ();
- currentTextStart = currentPiece.getStart ();
- currentTextEnd = currentPiece.getEnd ();
- }
- if (runEnd < currentTextEnd)
- {
- String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
- finalTextBuf.append(str);
- }
- else if (runEnd > currentTextEnd)
- {
- while (runEnd > currentTextEnd)
- {
- String str = currentPiece.substring(runStart - currentTextStart,
- currentTextEnd - currentTextStart);
- finalTextBuf.append(str);
- if (textIt.hasNext())
- {
- currentPiece = (TextPiece) textIt.next ();
- currentTextStart = currentPiece.getStart ();
- runStart = currentTextStart;
- currentTextEnd = currentPiece.getEnd ();
- }
- else
- {
- return finalTextBuf.toString();
- }
- }
- String str = currentPiece.substring(0, runEnd - currentTextStart);
- finalTextBuf.append(str);
- }
- else
- {
- String str = currentPiece.substring(runStart - currentTextStart, runEnd - currentTextStart);
- if (textIt.hasNext())
- {
- currentPiece = (TextPiece) textIt.next();
- currentTextStart = currentPiece.getStart();
- currentTextEnd = currentPiece.getEnd();
- }
- finalTextBuf.append(str);
- }
- }
- return finalTextBuf.toString();
- }
- /**
- * Used to determine if a run of text has been deleted.
- *
- * @param grpprl The list of sprms for a particular run of text.
- * @return true if this run of text has been deleted.
- */
- private boolean isDeleted(byte[] grpprl)
- {
- SprmIterator iterator = new SprmIterator(grpprl,0);
- while (iterator.hasNext())
- {
- SprmOperation op = iterator.next();
- // 0 is the operation that signals a FDelRMark operation
- if (op.getOperation() == 0 && op.getOperand() != 0)
- {
- return true;
- }
- }
- return false;
- }
- }