PageRenderTime 4225ms CodeModel.GetById 50ms RepoModel.GetById 4ms app.codeStats 0ms

/nutchindexing/nutch-1.2/src/plugin/parse-msword/src/java/org/apache/nutch/parse/msword/chp/Word6CHPBinTable.java

https://bitbucket.org/AlexeyD/hibench
Java | 77 lines | 36 code | 12 blank | 29 comment | 2 complexity | 335f3012a4cdb326e87a971e5f16db15 MD5 | raw file
Possible License(s): Apache-2.0, BSD-3-Clause
  1. /* Copyright 2004 Ryan Ackley
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. package org.apache.nutch.parse.msword.chp;
  16. import java.util.List;
  17. import java.util.ArrayList;
  18. import java.io.OutputStream;
  19. import java.io.IOException;
  20. import org.apache.poi.poifs.common.POIFSConstants;
  21. import org.apache.poi.util.LittleEndian;
  22. import org.apache.poi.hwpf.model.io.*;
  23. import org.apache.poi.hwpf.model.*;
  24. /**
  25. * This class holds all of the character formatting properties from a Word
  26. * 6.0/95 document.
  27. *
  28. * @author Ryan Ackley
  29. */
  30. public class Word6CHPBinTable
  31. {
  32. /** List of character properties.*/
  33. ArrayList _textRuns = new ArrayList();
  34. /**
  35. * Constructor used to read a binTable in from a Word document.
  36. *
  37. * @param documentStream The POIFS "WordDocument" stream from a Word document
  38. * @param offset The offset of the Chp bin table in the main stream.
  39. * @param size The size of the Chp bin table in the main stream.
  40. * @param fcMin The start of text in the main stream.
  41. */
  42. public Word6CHPBinTable(byte[] documentStream, int offset,
  43. int size, int fcMin, TextPieceTable tpt)
  44. {
  45. PlexOfCps binTable = new PlexOfCps(documentStream, offset, size, 2);
  46. int length = binTable.length();
  47. for (int x = 0; x < length; x++)
  48. {
  49. GenericPropertyNode node = binTable.getProperty(x);
  50. int pageNum = LittleEndian.getShort((byte[])node.getBytes());
  51. int pageOffset = POIFSConstants.BIG_BLOCK_SIZE * pageNum;
  52. CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(documentStream,
  53. pageOffset, fcMin, tpt);
  54. int fkpSize = cfkp.size();
  55. for (int y = 0; y < fkpSize; y++)
  56. {
  57. _textRuns.add(cfkp.getCHPX(y));
  58. }
  59. }
  60. }
  61. public List getTextRuns()
  62. {
  63. return _textRuns;
  64. }
  65. }