Word6Extractor.java | searchcode

/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/Word6Extractor.java

https://github.com/lritter/gnutch · Java · 229 lines · 179 code · 14 blank · 36 comment · 5 complexity · 89c0e58ea2ef4f5257dee3754fce8523 MD5 · raw file


/*  Copyright 2004 Ryan Ackley
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.apache.nutch.parse.msword;

import org.apache.nutch.parse.msword.chp.*;

import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.model.*;

import java.util.*;

/**
 * This class is used to extract text from Word 6 documents only. It should
 * only be called from the org.textmining.text.extraction.WordExtractor, because
 * that class automatically determines the document version.
 *
 * @author Ryan Ackley
 */
class Word6Extractor
{
  /** Byte offset in the main stream from which {@code fcMin} is read. */
  private static final int FC_MIN_OFFSET = 0x18;

  /** Byte offset in the main stream from which {@code fcMax} is read. */
  private static final int FC_MAX_OFFSET = 0x1C;

  /** Byte offset in the main stream of the CHP bin table's offset field. */
  private static final int CHP_TABLE_OFFSET_OFFSET = 0xb8;

  /** Byte offset in the main stream of the CHP bin table's size field. */
  private static final int CHP_TABLE_SIZE_OFFSET = 0xbc;

  /** Charset name used to decode the bytes of each text run. */
  private static final String TEXT_ENCODING = "Cp1252";

  /** Opcode whose single operand byte carries the "deleted" flag of a run. */
  private static final int SPRM_DELETED = 65;

  public Word6Extractor()
  {
  }

  /**
   * Extracts the text.
   *
   * <p>The text runs listed by the CHP bin table are visited in order. Each
   * run that is not flagged as deleted is decoded as Cp1252 and appended to
   * the result. A run is clipped at {@code fcMax}, and iteration stops at the
   * first run that reaches {@code fcMax}, whether or not that run is deleted.
   *
   * @param mainStream The POIFS document stream entitled "WordDocument".
   *
   * @return The text from the document
   * @throws Exception If there are any unexpected exceptions.
   */
  public String extractText(byte[] mainStream) throws Exception
  {
    int fcMin = LittleEndian.getInt(mainStream, FC_MIN_OFFSET);
    int fcMax = LittleEndian.getInt(mainStream, FC_MAX_OFFSET);

    int chpTableOffset = LittleEndian.getInt(mainStream, CHP_TABLE_OFFSET_OFFSET);
    int chpTableSize = LittleEndian.getInt(mainStream, CHP_TABLE_SIZE_OFFSET);

    // get a list of character properties
    Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset,
      chpTableSize, fcMin);
    List textRuns = chpTable.getTextRuns();

    // iterate through the text runs, keeping only the ones not marked deleted
    WordTextBuffer finalTextBuf = new WordTextBuffer();
    Iterator runsIt = textRuns.iterator();
    while (runsIt.hasNext())
    {
      CHPX chpx = (CHPX)runsIt.next();
      // run boundaries are shifted by fcMin before indexing into mainStream
      int runStart = chpx.getStart() + fcMin;
      int runEnd = chpx.getEnd() + fcMin;
      int clippedEnd = Math.min(runEnd, fcMax);

      // A run that starts beyond fcMax would yield a negative length, so it
      // is skipped instead of being handed to the String constructor.
      if (clippedEnd >= runStart && !isDeleted(chpx.getGrpprl()))
      {
        String s = new String(mainStream, runStart, clippedEnd - runStart, TEXT_ENCODING);
        finalTextBuf.append(s);
      }

      // Stop at the first run reaching fcMax even when that run is deleted;
      // otherwise the following run would start past the end of the text.
      if (runEnd >= fcMax)
      {
        break;
      }
    }

    return finalTextBuf.toString();
  }

  /**
   * Used to determine if a run of text has been deleted.
   *
   * <p>Walks the grpprl one sprm at a time: reads the one-byte opcode, then
   * skips that opcode's operand. The operand of opcode 65 is taken as the
   * deleted flag; if it occurs more than once, the last occurrence wins.
   * Opcodes that are not listed are treated as having no operand.
   *
   * <p>Length bytes of variable-length sprms are read as unsigned values so
   * that the scan can never move backwards. If the grpprl ends where an
   * operand or length byte is expected, scanning stops and the flag found so
   * far is returned.
   *
   * @param grpprl The list of sprms for this run of text.
   * @return {@code true} if the run is flagged as deleted, {@code false}
   *         otherwise.
   */
  private boolean isDeleted(byte[] grpprl)
  {
    int offset = 0;
    boolean deleted = false;
    while (offset < grpprl.length)
    {
      int opcode = LittleEndian.getUnsignedByte(grpprl, offset++);
      switch (opcode)
      {
        case SPRM_DELETED:
          if (offset >= grpprl.length)
          {
            // truncated: the flag byte is missing
            return deleted;
          }
          deleted = grpprl[offset++] != 0;
          break;

        // variable-length operand: advance by the value of the length byte
        case 68:
        case 74:
        case 81:
        case 82:
        case 103:
        case 105:
        case 106:
        case 108:
          if (offset >= grpprl.length)
          {
            // truncated: the length byte is missing
            return deleted;
          }
          // unsigned read: a signed byte >= 0x80 would move offset backwards
          offset += LittleEndian.getUnsignedByte(grpprl, offset);
          break;

        // no operand
        case 83:
          break;

        // one-byte operand
        case 66:
        case 67:
        case 71:
        case 75:
        case 85:
        case 86:
        case 87:
        case 88:
        case 89:
        case 90:
        case 91:
        case 92:
        case 94:
        case 98:
        case 99:
        case 100:
        case 101:
        case 102:
        case 104:
        case 117:
        case 118:
          offset++;
          break;

        // two-byte operand
        case 69:
        case 72:
        case 80:
        case 93:
        case 96:
        case 97:
        case 107:
        case 109:
        case 110:
          offset += 2;
          break;

        // three-byte operand
        case 73:
        case 95:
          offset += 3;
          break;

        // four-byte operand
        case 70:
          offset += 4;
          break;

        default:
          // unlisted opcode: only the opcode byte itself is consumed
          break;
      }
    }
    return deleted;
  }
}

Tech Fingerprint

Alerts (49)

'import' Maintainability Info: Wildcard imports (e.g., `import java.util.*;`) can obscure the origin of classes and lead to namespace collisions. Prefer importing specific classes explicitly.
18 21 23
'throws Exception' Declaring 'throws Exception' is too broad. Declare specific checked exceptions that the method might throw, allowing callers to handle them appropriately.
47
'List' Raw collection type used. Specify generic type arguments (e.g., List<String>, Map<Integer, Client>) for type safety and clarity. Avoid raw types unless interacting with legacy code.
58
'switch (' Ensure switch statements on enums or non-trivial types cover all cases or include a 'default:' label to handle unexpected values.
94
'case' Maintainability Info: Avoid using unnamed 'magic' numbers directly in comparisons or assignments. Use named constants (static final variables) instead to improve readability and maintainability.
96 99 102 105 108 111 114 117 120 123 126 129 132 135 138 140 143 146 149 152 155 158 161 164 167 170 173 176 179 182 185 188 191 194 197 200 203 206 209 212 215 218 221