/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java
Java | 229 lines | 179 code | 14 blank | 36 comment | 5 complexity | 89c0e58ea2ef4f5257dee3754fce8523 MD5 | raw file
Possible License(s): BSD-3-Clause
- /* Copyright 2004 Ryan Ackley
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- package org.apache.nutch.parse.msword;
- import org.apache.nutch.parse.msword.chp.*;
- import org.apache.poi.util.LittleEndian;
- import org.apache.poi.hwpf.model.*;
- import java.util.*;
- /**
- * This class is used to extract text from Word 6 documents only. It should
- * only be called from the org.textmining.text.extraction.WordExtractor because
- * it will automatically determine the version.
- *
- * @author Ryan Ackley
- */
- class Word6Extractor
- {
- public Word6Extractor()
- {
- }
- /**
- * Extracts the text
- *
- * @param mainStream The POIFS document stream entitled "WordDocument".
- *
- * @return The text from the document
- * @throws Exception If there are any unexpected exceptions.
- */
- public String extractText(byte[] mainStream) throws Exception
- {
- int fcMin = LittleEndian.getInt(mainStream, 0x18);
- int fcMax = LittleEndian.getInt(mainStream, 0x1C);
- int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8);
- int chpTableSize = LittleEndian.getInt(mainStream, 0xbc);
- // get a list of character properties
- Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
- chpTableSize, fcMin);
- List textRuns = chpTable.getTextRuns();
- // iterate through the
- WordTextBuffer finalTextBuf = new WordTextBuffer();
- Iterator runsIt = textRuns.iterator();
- while(runsIt.hasNext())
- {
- CHPX chpx = (CHPX)runsIt.next();
- int runStart = chpx.getStart() + fcMin;
- int runEnd = chpx.getEnd() + fcMin;
- if (!isDeleted(chpx.getGrpprl()))
- {
- String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252");
- finalTextBuf.append(s);
- if (runEnd >= fcMax)
- {
- break;
- }
- }
- }
- return finalTextBuf.toString();
- }
- /**
- * Used to determine if a run of text has been deleted.
- * @param grpprl The list of sprms for this run of text.
- * @return
- */
- private boolean isDeleted(byte[] grpprl)
- {
- int offset = 0;
- boolean deleted = false;
- while (offset < grpprl.length)
- {
- switch (LittleEndian.getUnsignedByte(grpprl, offset++))
- {
- case 65:
- deleted = grpprl[offset++] != 0;
- break;
- case 66:
- offset++;
- break;
- case 67:
- offset++;
- break;
- case 68:
- offset += grpprl[offset];
- break;
- case 69:
- offset += 2;
- break;
- case 70:
- offset += 4;
- break;
- case 71:
- offset++;
- break;
- case 72:
- offset += 2;
- break;
- case 73:
- offset += 3;
- break;
- case 74:
- offset += grpprl[offset];
- break;
- case 75:
- offset++;
- break;
- case 80:
- offset += 2;
- break;
- case 81:
- offset += grpprl[offset];
- break;
- case 82:
- offset += grpprl[offset];
- break;
- case 83:
- break;
- case 85:
- offset++;
- break;
- case 86:
- offset++;
- break;
- case 87:
- offset++;
- break;
- case 88:
- offset++;
- break;
- case 89:
- offset++;
- break;
- case 90:
- offset++;
- break;
- case 91:
- offset++;
- break;
- case 92:
- offset++;
- break;
- case 93:
- offset += 2;
- break;
- case 94:
- offset++;
- break;
- case 95:
- offset += 3;
- break;
- case 96:
- offset += 2;
- break;
- case 97:
- offset += 2;
- break;
- case 98:
- offset++;
- break;
- case 99:
- offset++;
- break;
- case 100:
- offset++;
- break;
- case 101:
- offset++;
- break;
- case 102:
- offset++;
- break;
- case 103:
- offset += grpprl[offset];
- break;
- case 104:
- offset++;
- break;
- case 105:
- offset += grpprl[offset];
- break;
- case 106:
- offset += grpprl[offset];
- break;
- case 107:
- offset += 2;
- break;
- case 108:
- offset += grpprl[offset];
- break;
- case 109:
- offset += 2;
- break;
- case 110:
- offset += 2;
- break;
- case 117:
- offset++;
- break;
- case 118:
- offset++;
- break;
- }
- }
- return deleted;
- }
- }