PageRenderTime 7248ms CodeModel.GetById 8ms RepoModel.GetById 1ms app.codeStats 0ms

/src/org/apache/poi/hwpf/HWPFDocument.java

https://github.com/minstrelsy/SimpleAndroidDocView
Java | 1042 lines | 553 code | 126 blank | 363 comment | 64 complexity | 677c332698819342ff2ece2baf93a4ed MD5 | raw file
Possible License(s): Apache-2.0
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf;
  16. import java.io.ByteArrayInputStream;
  17. import java.io.FileNotFoundException;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.io.OutputStream;
  21. import java.util.Iterator;
  22. import org.apache.poi.hpsf.DocumentSummaryInformation;
  23. import org.apache.poi.hpsf.SummaryInformation;
  24. import org.apache.poi.hwpf.model.BookmarksTables;
  25. import org.apache.poi.hwpf.model.CHPBinTable;
  26. import org.apache.poi.hwpf.model.ComplexFileTable;
  27. import org.apache.poi.hwpf.model.DocumentProperties;
  28. import org.apache.poi.hwpf.model.EscherRecordHolder;
  29. import org.apache.poi.hwpf.model.FSPADocumentPart;
  30. import org.apache.poi.hwpf.model.FSPATable;
  31. import org.apache.poi.hwpf.model.FieldsTables;
  32. import org.apache.poi.hwpf.model.FontTable;
  33. import org.apache.poi.hwpf.model.ListTables;
  34. import org.apache.poi.hwpf.model.NoteType;
  35. import org.apache.poi.hwpf.model.NotesTables;
  36. import org.apache.poi.hwpf.model.PAPBinTable;
  37. import org.apache.poi.hwpf.model.PicturesTable;
  38. import org.apache.poi.hwpf.model.RevisionMarkAuthorTable;
  39. import org.apache.poi.hwpf.model.SavedByTable;
  40. import org.apache.poi.hwpf.model.SectionTable;
  41. import org.apache.poi.hwpf.model.ShapesTable;
  42. import org.apache.poi.hwpf.model.SinglentonTextPiece;
  43. import org.apache.poi.hwpf.model.StyleSheet;
  44. import org.apache.poi.hwpf.model.SubdocumentType;
  45. import org.apache.poi.hwpf.model.TextPiece;
  46. import org.apache.poi.hwpf.model.TextPieceTable;
  47. import org.apache.poi.hwpf.model.io.HWPFFileSystem;
  48. import org.apache.poi.hwpf.model.io.HWPFOutputStream;
  49. import org.apache.poi.hwpf.usermodel.Bookmarks;
  50. import org.apache.poi.hwpf.usermodel.BookmarksImpl;
  51. import org.apache.poi.hwpf.usermodel.Field;
  52. import org.apache.poi.hwpf.usermodel.Fields;
  53. import org.apache.poi.hwpf.usermodel.FieldsImpl;
  54. import org.apache.poi.hwpf.usermodel.HWPFList;
  55. import org.apache.poi.hwpf.usermodel.Notes;
  56. import org.apache.poi.hwpf.usermodel.NotesImpl;
  57. import org.apache.poi.hwpf.usermodel.OfficeDrawings;
  58. import org.apache.poi.hwpf.usermodel.OfficeDrawingsImpl;
  59. import org.apache.poi.hwpf.usermodel.Range;
  60. import org.apache.poi.poifs.common.POIFSConstants;
  61. import org.apache.poi.poifs.filesystem.DirectoryNode;
  62. import org.apache.poi.poifs.filesystem.DocumentEntry;
  63. import org.apache.poi.poifs.filesystem.Entry;
  64. import org.apache.poi.poifs.filesystem.EntryUtils;
  65. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  66. import org.apache.poi.util.Internal;
  67. /**
  68. *
  69. * This class acts as the bucket that we throw all of the Word data structures
  70. * into.
  71. *
  72. * @author Ryan Ackley
  73. */
  74. @SuppressWarnings("deprecation")
  75. public final class HWPFDocument extends HWPFDocumentCore
  76. {
  77. static final String PROPERTY_PRESERVE_BIN_TABLES = "org.apache.poi.hwpf.preserveBinTables"; // system property; when "true" the DirectoryNode constructor skips rebuilding the CHPX/PAPX bin tables
  78. private static final String PROPERTY_PRESERVE_TEXT_TABLE = "org.apache.poi.hwpf.preserveTextTable"; // system property; when "true" the DirectoryNode constructor keeps the text piece table read from the file
  79. private static final String STREAM_DATA = "Data"; // name of the data stream entry
  80. private static final String STREAM_TABLE_0 = "0Table"; // table stream name used when the FIB's fWhichTblStm flag is false
  81. private static final String STREAM_TABLE_1 = "1Table"; // table stream name used when fWhichTblStm is true; write() always emits this one
  82. /** Table stream buffer. */
  83. protected byte[] _tableStream;
  84. /** Data stream buffer (an empty array when the source has no "Data" entry). */
  85. protected byte[] _dataStream;
  86. /** Document-wide properties. */
  87. protected DocumentProperties _dop;
  88. /** Contains the text of the document wrapped in an obfuscated Word data
  89. * structure. */
  90. protected ComplexFileTable _cft;
  91. /** Contains the text buffer linked directly to the single-piece document text piece. */
  92. protected StringBuilder _text;
  93. /** Holds the save history for this document. */
  94. protected SavedByTable _sbt;
  95. /** Holds the revision mark authors for this document. */
  96. protected RevisionMarkAuthorTable _rmat;
  97. /** Holds FSPA (shape) information for the header part. */
  98. private FSPATable _fspaHeaders;
  99. /** Holds FSPA (shape) information for the main part. */
  100. private FSPATable _fspaMain;
  101. /** Escher Drawing Group information. */
  102. protected EscherRecordHolder _escherRecordHolder;
  103. /** Holds the pictures table. */
  104. protected PicturesTable _pictures;
  105. /** Holds Office Art objects (deprecated shapes table). */
  106. @Deprecated
  107. protected ShapesTable _officeArts;
  108. /** Holds Office Art objects built from the header FSPA table. */
  109. protected OfficeDrawingsImpl _officeDrawingsHeaders;
  110. /** Holds Office Art objects built from the main FSPA table. */
  111. protected OfficeDrawingsImpl _officeDrawingsMain;
  112. /** Holds the bookmarks tables. */
  113. protected BookmarksTables _bookmarksTables;
  114. /** Holds the bookmarks. */
  115. protected Bookmarks _bookmarks;
  116. /** Holds the endnotes tables. */
  117. protected NotesTables _endnotesTables = new NotesTables( NoteType.ENDNOTE );
  118. /** Holds the endnotes. */
  119. protected Notes _endnotes = new NotesImpl( _endnotesTables );
  120. /** Holds the footnotes tables. */
  121. protected NotesTables _footnotesTables = new NotesTables( NoteType.FOOTNOTE );
  122. /** Holds the footnotes. */
  123. protected Notes _footnotes = new NotesImpl( _footnotesTables );
  124. /** Holds the fields PLCFs. */
  125. protected FieldsTables _fieldsTables;
  126. /** Holds the fields. */
  127. protected Fields _fields;
  /**
   * Creates a document without loading anything from a file system; the text
   * buffer starts out holding a single carriage return.
   */
  protected HWPFDocument() {
    // The no-argument superclass constructor is invoked implicitly.
    _text = new StringBuilder("\r");
  }
  /**
   * Loads a Word document from an input stream.
   *
   * <p>The stream is first converted by {@code verifyAndBuildPOIFS}; the
   * result is then handed to another constructor of this class.
   *
   * @param istream the stream that contains the Word document
   * @throws IOException if an unexpected I/O error occurs while reading the
   *         supplied stream
   */
  public HWPFDocument(InputStream istream) throws IOException {
    // OLE2 handling happens inside verifyAndBuildPOIFS
    this(verifyAndBuildPOIFS(istream));
  }
  /**
   * Loads a Word document from the root of a POIFS file system.
   *
   * @param pfilesystem the file system that contains the Word document
   * @throws IOException if an unexpected I/O error occurs while reading from
   *         the supplied file system
   */
  public HWPFDocument(POIFSFileSystem pfilesystem) throws IOException {
    this(pfilesystem.getRoot());
  }
  /**
   * Loads a Word document from a specific directory of a POIFS file system,
   * which is not necessarily the root. Typically used to open embedded
   * documents.
   *
   * @param directory the directory that contains the Word document
   * @param pfilesystem not used by this constructor; only {@code directory}
   *        is forwarded
   * @throws IOException if an unexpected I/O error occurs while reading the
   *         document
   * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
   */
  @Deprecated
  public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem)
      throws IOException {
    this(directory);
  }
  /**
   * Loads a Word document from a specific directory of a POIFS file system,
   * which is not necessarily the root. Typically used to open embedded
   * documents.
   *
   * <p>The superclass constructor loads the main stream and the FIB. This
   * constructor then reads the table stream selected by the FIB and the
   * optional "Data" stream, and builds the document structures from them.
   *
   * <p>Two system properties change how the text structures are loaded:
   * <ul>
   * <li>{@code org.apache.poi.hwpf.preserveBinTables} - when {@code true} the
   * PAPX/CHPX structure from the file is preserved, so text may be missing
   * from output and text order may be corrupted;</li>
   * <li>{@code org.apache.poi.hwpf.preserveTextTable} - when {@code true} the
   * text table is not rebuilt; changing the text in this mode leads to
   * unpredictable behavior.</li>
   * </ul>
   *
   * @param directory the directory that contains the Word document
   * @throws IOException if an unexpected I/O error occurs while reading the
   *         document streams
   * @throws OldWordFileFormatException if the FIB reports an nFib below 106
   *         (Word 95 or older)
   * @throws IllegalStateException if the table stream named by the FIB is
   *         missing from {@code directory}
   */
  public HWPFDocument(DirectoryNode directory) throws IOException {
    // Load the main stream and FIB; also handles HPSF bits
    super(directory);

    // Is this document too old for us?
    if (_fib.getFibBase().getNFib() < 106) {
      throw new OldWordFileFormatException("The document is too old - Word 95 or older. Try HWPFOldDocument instead?");
    }

    // Use the FIB to determine the name of the table stream.
    String name = STREAM_TABLE_0;
    if (_fib.getFibBase().isFWhichTblStm()) {
      name = STREAM_TABLE_1;
    }

    // Grab the table stream entry; keep the original exception as the cause.
    DocumentEntry tableProps;
    try {
      tableProps = (DocumentEntry) directory.getEntry(name);
    } catch (FileNotFoundException fnfe) {
      throw new IllegalStateException("Table Stream '" + name + "' wasn't found - Either the document is corrupt, or is Word95 (or earlier)", fnfe);
    }

    // Read in the table stream.
    _tableStream = readDocumentStreamFully(directory, name, tableProps.getSize());

    _fib.fillVariableFields(_mainStream, _tableStream);

    // Read in the data stream; a missing stream yields an empty buffer.
    try {
      DocumentEntry dataProps = (DocumentEntry) directory.getEntry(STREAM_DATA);
      _dataStream = readDocumentStreamFully(directory, STREAM_DATA, dataProps.getSize());
    } catch (FileNotFoundException e) {
      _dataStream = new byte[0];
    }

    // Get the cp of the start of text in the main stream.
    // The latest spec doc says this is always zero!
    int fcMin = 0;

    // Start to load up our standard structures.
    _dop = new DocumentProperties(_tableStream, _fib.getFcDop(), _fib.getLcbDop());
    _cft = new ComplexFileTable(_mainStream, _tableStream, _fib.getFcClx(), fcMin);
    TextPieceTable textPieceTable = _cft.getTextPieceTable();

    // Now load the rest of the properties, which need to be adjusted
    // for where text really begins.
    _cbt = new CHPBinTable(_mainStream, _tableStream, _fib.getFcPlcfbteChpx(), _fib.getLcbPlcfbteChpx(), textPieceTable);
    _pbt = new PAPBinTable(_mainStream, _tableStream, _dataStream, _fib.getFcPlcfbtePapx(), _fib.getLcbPlcfbtePapx(), textPieceTable);

    _text = textPieceTable.getText();

    // Unless asked to preserve the PAPX/CHPX structure from the file,
    // rebuild both bin tables.
    if (!isSystemPropertySet(PROPERTY_PRESERVE_BIN_TABLES)) {
      _cbt.rebuild(_cft);
      _pbt.rebuild(_text, _cft);
    }

    // Unless asked to preserve the text table, replace it with a table
    // holding one text piece that wraps the whole text.
    if (!isSystemPropertySet(PROPERTY_PRESERVE_TEXT_TABLE)) {
      _cft = new ComplexFileTable();
      textPieceTable = _cft.getTextPieceTable();
      final TextPiece textPiece = new SinglentonTextPiece(_text);
      textPieceTable.add(textPiece);
      _text = textPiece.getStringBuilder();
    }

    // Read FSPA and Escher information.
    _fspaHeaders = new FSPATable(_tableStream, _fib, FSPADocumentPart.HEADER);
    _fspaMain = new FSPATable(_tableStream, _fib, FSPADocumentPart.MAIN);

    if (_fib.getFcDggInfo() != 0) {
      _escherRecordHolder = new EscherRecordHolder(_tableStream, _fib.getFcDggInfo(), _fib.getLcbDggInfo());
    } else {
      _escherRecordHolder = new EscherRecordHolder();
    }

    // Read in the pictures stream.
    _pictures = new PicturesTable(this, _dataStream, _mainStream, _fspaMain, _escherRecordHolder);
    // And the art shapes stream.
    _officeArts = new ShapesTable(_tableStream, _fib);

    // And escher pictures.
    _officeDrawingsHeaders = new OfficeDrawingsImpl(_fspaHeaders, _escherRecordHolder, _mainStream);
    _officeDrawingsMain = new OfficeDrawingsImpl(_fspaMain, _escherRecordHolder, _mainStream);

    _st = new SectionTable(_mainStream, _tableStream, _fib.getFcPlcfsed(), _fib.getLcbPlcfsed(), fcMin, textPieceTable, _fib.getSubdocumentTextStreamLength(SubdocumentType.MAIN));
    _ss = new StyleSheet(_tableStream, _fib.getFcStshf());
    _ft = new FontTable(_tableStream, _fib.getFcSttbfffn(), _fib.getLcbSttbfffn());

    // List tables are only loaded when the FIB records a non-empty PlfLst.
    int listOffset = _fib.getFcPlfLst();
    int lfoOffset = _fib.getFcPlfLfo();
    if (listOffset != 0 && _fib.getLcbPlfLst() != 0) {
      _lt = new ListTables(_tableStream, listOffset, lfoOffset, _fib.getLcbPlfLfo());
    }

    int sbtOffset = _fib.getFcSttbSavedBy();
    int sbtLength = _fib.getLcbSttbSavedBy();
    if (sbtOffset != 0 && sbtLength != 0) {
      _sbt = new SavedByTable(_tableStream, sbtOffset, sbtLength);
    }

    int rmarkOffset = _fib.getFcSttbfRMark();
    int rmarkLength = _fib.getLcbSttbfRMark();
    if (rmarkOffset != 0 && rmarkLength != 0) {
      _rmat = new RevisionMarkAuthorTable(_tableStream, rmarkOffset, rmarkLength);
    }

    _bookmarksTables = new BookmarksTables(_tableStream, _fib);
    _bookmarks = new BookmarksImpl(_bookmarksTables);

    _endnotesTables = new NotesTables(NoteType.ENDNOTE, _tableStream, _fib);
    _endnotes = new NotesImpl(_endnotesTables);
    _footnotesTables = new NotesTables(NoteType.FOOTNOTE, _tableStream, _fib);
    _footnotes = new NotesImpl(_footnotesTables);

    _fieldsTables = new FieldsTables(_tableStream, _fib);
    _fields = new FieldsImpl(_fieldsTables);
  }

  /**
   * Reads a named document stream from {@code directory} into a new buffer of
   * {@code size} bytes and closes the stream afterwards. Reading repeats until
   * the buffer is full or the stream ends; if the stream ends early the rest
   * of the buffer stays zero-filled.
   */
  private static byte[] readDocumentStreamFully(DirectoryNode directory, String name, int size)
      throws IOException {
    byte[] buffer = new byte[size];
    InputStream in = directory.createDocumentInputStream(name);
    try {
      int filled = 0;
      while (filled < size) {
        int count = in.read(buffer, filled, size - filled);
        if (count < 0) {
          break;
        }
        filled += count;
      }
    } finally {
      in.close();
    }
    return buffer;
  }

  /**
   * Returns whether the named system property parses as {@code true};
   * any failure to read the property is treated as {@code false}.
   */
  private static boolean isSystemPropertySet(String key) {
    try {
      return Boolean.parseBoolean(System.getProperty(key));
    } catch (Exception ignored) {
      // Best effort: if the property cannot be read, fall back to the default.
      return false;
    }
  }
  324. @Internal
  325. public TextPieceTable getTextTable()
  326. {
  327. return _cft.getTextPieceTable();
  328. }
  329. @Internal
  330. @Override
  331. public StringBuilder getText()
  332. {
  333. return _text;
  334. }
  335. public DocumentProperties getDocProperties()
  336. {
  337. return _dop;
  338. }
  339. public Range getOverallRange() {
  340. return new Range(0, _text.length(), this);
  341. }
  342. /**
  343. * Returns the range which covers the whole of the document, but excludes
  344. * any headers and footers.
  345. */
  346. public Range getRange()
  347. {
  348. // // First up, trigger a full-recalculate
  349. // // Needed in case of deletes etc
  350. // getOverallRange();
  351. //
  352. // if ( getFileInformationBlock().isFComplex() )
  353. // {
  354. // /*
  355. // * Page 31:
  356. // *
  357. // * main document must be found by examining the piece table entries
  358. // * from the 0th piece table entry from the piece table entry that
  359. // * describes cp=fib.ccpText.
  360. // */
  361. // // TODO: review
  362. // return new Range( _cpSplit.getMainDocumentStart(),
  363. // _cpSplit.getMainDocumentEnd(), this );
  364. // }
  365. //
  366. // /*
  367. // * Page 31:
  368. // *
  369. // * "In a non-complex file, this means text of the: main document
  370. // begins
  371. // * at fib.fcMin in the file and continues through
  372. // * fib.fcMin+fib.ccpText."
  373. // */
  374. // int bytesStart = getFileInformationBlock().getFcMin();
  375. //
  376. // int charsStart = getTextTable().getCharIndex( bytesStart );
  377. // int charsEnd = charsStart
  378. // + getFileInformationBlock().getSubdocumentTextStreamLength(
  379. // SubdocumentType.MAIN );
  380. // it seems much simpler -- sergey
  381. return getRange(SubdocumentType.MAIN);
  382. }
  383. private Range getRange( SubdocumentType subdocument )
  384. {
  385. int startCp = 0;
  386. for ( SubdocumentType previos : SubdocumentType.ORDERED )
  387. {
  388. int length = getFileInformationBlock()
  389. .getSubdocumentTextStreamLength( previos );
  390. if ( subdocument == previos )
  391. return new Range( startCp, startCp + length, this );
  392. startCp += length;
  393. }
  394. throw new UnsupportedOperationException(
  395. "Subdocument type not supported: " + subdocument );
  396. }
  397. /**
  398. * Returns the {@link Range} which covers all the Footnotes.
  399. *
  400. * @return the {@link Range} which covers all the Footnotes.
  401. */
  402. public Range getFootnoteRange()
  403. {
  404. return getRange( SubdocumentType.FOOTNOTE );
  405. }
  406. /**
  407. * Returns the {@link Range} which covers all endnotes.
  408. *
  409. * @return the {@link Range} which covers all endnotes.
  410. */
  411. public Range getEndnoteRange()
  412. {
  413. return getRange( SubdocumentType.ENDNOTE );
  414. }
  415. /**
  416. * Returns the {@link Range} which covers all annotations.
  417. *
  418. * @return the {@link Range} which covers all annotations.
  419. */
  420. public Range getCommentsRange()
  421. {
  422. return getRange( SubdocumentType.ANNOTATION );
  423. }
  424. /**
  425. * Returns the {@link Range} which covers all textboxes.
  426. *
  427. * @return the {@link Range} which covers all textboxes.
  428. */
  429. public Range getMainTextboxRange()
  430. {
  431. return getRange( SubdocumentType.TEXTBOX );
  432. }
  433. /**
  434. * Returns the range which covers all "Header Stories".
  435. * A header story contains a header, footer, end note
  436. * separators and footnote separators.
  437. */
  438. public Range getHeaderStoryRange() {
  439. return getRange( SubdocumentType.HEADER );
  440. }
  441. /**
  442. * Returns the character length of a document.
  443. * @return the character length of a document
  444. */
  445. public int characterLength()
  446. {
  447. return _text.length();
  448. }
  449. /**
  450. * Gets a reference to the saved -by table, which holds the save history for the document.
  451. *
  452. * @return the saved-by table.
  453. */
  454. @Internal
  455. public SavedByTable getSavedByTable()
  456. {
  457. return _sbt;
  458. }
  459. /**
  460. * Gets a reference to the revision mark author table, which holds the revision mark authors for the document.
  461. *
  462. * @return the saved-by table.
  463. */
  464. @Internal
  465. public RevisionMarkAuthorTable getRevisionMarkAuthorTable()
  466. {
  467. return _rmat;
  468. }
  469. /**
  470. * @return PicturesTable object, that is able to extract images from this document
  471. */
  472. public PicturesTable getPicturesTable() {
  473. return _pictures;
  474. }
  475. @Internal
  476. public EscherRecordHolder getEscherRecordHolder() {
  477. return _escherRecordHolder;
  478. }
  479. /**
  480. * @return ShapesTable object, that is able to extract office are shapes
  481. * from this document
  482. * @deprecated use {@link #getOfficeDrawingsMain()} instead
  483. */
  484. @Deprecated
  485. @Internal
  486. public ShapesTable getShapesTable()
  487. {
  488. return _officeArts;
  489. }
  490. public OfficeDrawings getOfficeDrawingsHeaders()
  491. {
  492. return _officeDrawingsHeaders;
  493. }
  494. public OfficeDrawings getOfficeDrawingsMain()
  495. {
  496. return _officeDrawingsMain;
  497. }
  498. /**
  499. * @return user-friendly interface to access document bookmarks
  500. */
  501. public Bookmarks getBookmarks()
  502. {
  503. return _bookmarks;
  504. }
  505. /**
  506. * @return user-friendly interface to access document endnotes
  507. */
  508. public Notes getEndnotes()
  509. {
  510. return _endnotes;
  511. }
  512. /**
  513. * @return user-friendly interface to access document footnotes
  514. */
  515. public Notes getFootnotes()
  516. {
  517. return _footnotes;
  518. }
  519. /**
  520. * @return FieldsTables object, that is able to extract fields descriptors from this document
  521. * @deprecated
  522. */
  523. @Deprecated
  524. @Internal
  525. public FieldsTables getFieldsTables() {
  526. return _fieldsTables;
  527. }
  528. /**
  529. * Returns user-friendly interface to access document {@link Field}s
  530. *
  531. * @return user-friendly interface to access document {@link Field}s
  532. */
  533. public Fields getFields()
  534. {
  535. return _fields;
  536. }
  537. /**
  538. * Writes out the word file that is represented by an instance of this class.
  539. *
  540. * @param out The OutputStream to write to.
  541. * @throws IOException If there is an unexpected IOException from the passed
  542. * in OutputStream.
  543. */
  544. public void write(OutputStream out)
  545. throws IOException
  546. {
  547. // initialize our streams for writing.
  548. HWPFFileSystem docSys = new HWPFFileSystem();
  549. HWPFOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT);
  550. HWPFOutputStream tableStream = docSys.getStream(STREAM_TABLE_1);
  551. //HWPFOutputStream dataStream = docSys.getStream("Data");
  552. int tableOffset = 0;
  553. // FileInformationBlock fib = (FileInformationBlock)_fib.clone();
  554. // clear the offsets and sizes in our FileInformationBlock.
  555. _fib.clearOffsetsSizes();
  556. // determine the FileInformationBLock size
  557. int fibSize = _fib.getSize();
  558. fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE -
  559. (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
  560. // preserve space for the FileInformationBlock because we will be writing
  561. // it after we write everything else.
  562. byte[] placeHolder = new byte[fibSize];
  563. wordDocumentStream.write(placeHolder);
  564. int mainOffset = wordDocumentStream.getOffset();
  565. // write out the StyleSheet.
  566. _fib.setFcStshf(tableOffset);
  567. _ss.writeTo(tableStream);
  568. _fib.setLcbStshf(tableStream.getOffset() - tableOffset);
  569. tableOffset = tableStream.getOffset();
  570. // get fcMin and fcMac because we will be writing the actual text with the
  571. // complex table.
  572. int fcMin = mainOffset;
  573. /*
  574. * clx (encoding of the sprm lists for a complex file and piece table
  575. * for a any file) Written immediately after the end of the previously
  576. * recorded structure. This is recorded in all Word documents
  577. *
  578. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  579. * Specification; Page 23 of 210
  580. */
  581. // write out the Complex table, includes text.
  582. _fib.setFcClx(tableOffset);
  583. _cft.writeTo(wordDocumentStream, tableStream);
  584. _fib.setLcbClx(tableStream.getOffset() - tableOffset);
  585. tableOffset = tableStream.getOffset();
  586. int fcMac = wordDocumentStream.getOffset();
  587. /*
  588. * dop (document properties record) Written immediately after the end of
  589. * the previously recorded structure. This is recorded in all Word
  590. * documents
  591. *
  592. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  593. * Specification; Page 23 of 210
  594. */
  595. // write out the DocumentProperties.
  596. _fib.setFcDop(tableOffset);
  597. _dop.writeTo(tableStream);
  598. _fib.setLcbDop(tableStream.getOffset() - tableOffset);
  599. tableOffset = tableStream.getOffset();
  600. /*
  601. * plcfBkmkf (table recording beginning CPs of bookmarks) Written
  602. * immediately after the sttbfBkmk, if the document contains bookmarks.
  603. *
  604. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  605. * Specification; Page 24 of 210
  606. */
  607. if ( _bookmarksTables != null )
  608. {
  609. _bookmarksTables.writePlcfBkmkf( _fib, tableStream );
  610. tableOffset = tableStream.getOffset();
  611. }
  612. /*
  613. * plcfBkmkl (table recording limit CPs of bookmarks) Written
  614. * immediately after the plcfBkmkf, if the document contains bookmarks.
  615. *
  616. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  617. * Specification; Page 24 of 210
  618. */
  619. if ( _bookmarksTables != null )
  620. {
  621. _bookmarksTables.writePlcfBkmkl( _fib, tableStream );
  622. tableOffset = tableStream.getOffset();
  623. }
  624. /*
  625. * plcfbteChpx (bin table for CHP FKPs) Written immediately after the
  626. * previously recorded table. This is recorded in all Word documents.
  627. *
  628. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  629. * Specification; Page 24 of 210
  630. */
  631. // write out the CHPBinTable.
  632. _fib.setFcPlcfbteChpx(tableOffset);
  633. _cbt.writeTo(wordDocumentStream, tableStream, fcMin, _cft.getTextPieceTable());
  634. _fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset);
  635. tableOffset = tableStream.getOffset();
  636. /*
  637. * plcfbtePapx (bin table for PAP FKPs) Written immediately after the
  638. * plcfbteChpx. This is recorded in all Word documents.
  639. *
  640. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  641. * Specification; Page 24 of 210
  642. */
  643. // write out the PAPBinTable.
  644. _fib.setFcPlcfbtePapx(tableOffset);
  645. _pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable());
  646. _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
  647. tableOffset = tableStream.getOffset();
  648. /*
  649. * plcfendRef (endnote reference position table) Written immediately
  650. * after the previously recorded table if the document contains endnotes
  651. *
  652. * plcfendTxt (endnote text position table) Written immediately after
  653. * the plcfendRef if the document contains endnotes
  654. *
  655. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  656. * Specification; Page 24 of 210
  657. */
  658. _endnotesTables.writeRef( _fib, tableStream );
  659. _endnotesTables.writeTxt( _fib, tableStream );
  660. tableOffset = tableStream.getOffset();
  661. /*
  662. * plcffld*** (table of field positions and statuses for annotation
  663. * subdocument) Written immediately after the previously recorded table,
  664. * if the ******* subdocument contains fields.
  665. *
  666. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  667. * Specification; Page 24 of 210
  668. */
  669. if ( _fieldsTables != null )
  670. {
  671. _fieldsTables.write( _fib, tableStream );
  672. tableOffset = tableStream.getOffset();
  673. }
  674. /*
  675. * plcffndRef (footnote reference position table) Written immediately
  676. * after the stsh if the document contains footnotes
  677. *
  678. * plcffndTxt (footnote text position table) Written immediately after
  679. * the plcffndRef if the document contains footnotes
  680. *
  681. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  682. * Specification; Page 24 of 210
  683. */
  684. _footnotesTables.writeRef( _fib, tableStream );
  685. _footnotesTables.writeTxt( _fib, tableStream );
  686. tableOffset = tableStream.getOffset();
  687. /*
  688. * plcfsed (section table) Written immediately after the previously
  689. * recorded table. Recorded in all Word documents
  690. *
  691. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  692. * Specification; Page 25 of 210
  693. */
  694. // write out the SectionTable.
  695. _fib.setFcPlcfsed(tableOffset);
  696. _st.writeTo(wordDocumentStream, tableStream);
  697. _fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset);
  698. tableOffset = tableStream.getOffset();
  699. // write out the list tables
  700. if ( _lt != null )
  701. {
  702. /*
  703. * plcflst (list formats) Written immediately after the end of the
  704. * previously recorded, if there are any lists defined in the
  705. * document. This begins with a short count of LSTF structures
  706. * followed by those LSTF structures. This is immediately followed
  707. * by the allocated data hanging off the LSTFs. This data consists
  708. * of the array of LVLs for each LSTF. (Each LVL consists of an LVLF
  709. * followed by two grpprls and an XST.)
  710. *
  711. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  712. * Specification; Page 25 of 210
  713. */
  714. _lt.writeListDataTo( _fib, tableStream );
  715. tableOffset = tableStream.getOffset();
  716. /*
  717. * plflfo (more list formats) Written immediately after the end of
  718. * the plcflst and its accompanying data, if there are any lists
  719. * defined in the document. This consists first of a PL of LFO
  720. * records, followed by the allocated data (if any) hanging off the
  721. * LFOs. The allocated data consists of the array of LFOLVLFs for
  722. * each LFO (and each LFOLVLF is immediately followed by some LVLs).
  723. *
  724. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  725. * Specification; Page 26 of 210
  726. */
  727. _lt.writeListOverridesTo( _fib, tableStream );
  728. tableOffset = tableStream.getOffset();
  729. }
  730. /*
  731. * sttbfBkmk (table of bookmark name strings) Written immediately after
  732. * the previously recorded table, if the document contains bookmarks.
  733. *
  734. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  735. * Specification; Page 27 of 210
  736. */
  737. if ( _bookmarksTables != null )
  738. {
  739. _bookmarksTables.writeSttbfBkmk( _fib, tableStream );
  740. tableOffset = tableStream.getOffset();
  741. }
  742. /*
  743. * sttbSavedBy (last saved by string table) Written immediately after
  744. * the previously recorded table.
  745. *
  746. * Microsoft Office Word 97-2007 Binary File Format (.doc)
  747. * Specification; Page 27 of 210
  748. */
  749. // write out the saved-by table.
  750. if (_sbt != null)
  751. {
  752. _fib.setFcSttbSavedBy(tableOffset);
  753. _sbt.writeTo(tableStream);
  754. _fib.setLcbSttbSavedBy(tableStream.getOffset() - tableOffset);
  755. tableOffset = tableStream.getOffset();
  756. }
  757. // write out the revision mark authors table.
  758. if (_rmat != null)
  759. {
  760. _fib.setFcSttbfRMark(tableOffset);
  761. _rmat.writeTo(tableStream);
  762. _fib.setLcbSttbfRMark(tableStream.getOffset() - tableOffset);
  763. tableOffset = tableStream.getOffset();
  764. }
  765. // write out the FontTable.
  766. _fib.setFcSttbfffn(tableOffset);
  767. _ft.writeTo(tableStream);
  768. _fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset);
  769. tableOffset = tableStream.getOffset();
  770. // set some variables in the FileInformationBlock.
  771. _fib.getFibBase().setFcMin(fcMin);
  772. _fib.getFibBase().setFcMac(fcMac);
  773. _fib.setCbMac(wordDocumentStream.getOffset());
  774. // make sure that the table, doc and data streams use big blocks.
  775. byte[] mainBuf = wordDocumentStream.toByteArray();
  776. if (mainBuf.length < 4096)
  777. {
  778. byte[] tempBuf = new byte[4096];
  779. System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length);
  780. mainBuf = tempBuf;
  781. }
  782. // Table1 stream will be used
  783. _fib.getFibBase().setFWhichTblStm( true );
  784. // write out the FileInformationBlock.
  785. //_fib.serialize(mainBuf, 0);
  786. _fib.writeTo(mainBuf, tableStream);
  787. byte[] tableBuf = tableStream.toByteArray();
  788. if (tableBuf.length < 4096)
  789. {
  790. byte[] tempBuf = new byte[4096];
  791. System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length);
  792. tableBuf = tempBuf;
  793. }
  794. byte[] dataBuf = _dataStream;
  795. if (dataBuf == null)
  796. {
  797. dataBuf = new byte[4096];
  798. }
  799. if (dataBuf.length < 4096)
  800. {
  801. byte[] tempBuf = new byte[4096];
  802. System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length);
  803. dataBuf = tempBuf;
  804. }
  805. // create new document preserving order of entries
  806. POIFSFileSystem pfs = new POIFSFileSystem();
  807. boolean docWritten = false;
  808. boolean dataWritten = false;
  809. boolean objectPoolWritten = false;
  810. boolean tableWritten = false;
  811. boolean propertiesWritten = false;
  812. for ( Iterator<Entry> iter = directory.getEntries(); iter.hasNext(); )
  813. {
  814. Entry entry = iter.next();
  815. if ( entry.getName().equals( STREAM_WORD_DOCUMENT ) )
  816. {
  817. if ( !docWritten )
  818. {
  819. pfs.createDocument( new ByteArrayInputStream( mainBuf ),
  820. STREAM_WORD_DOCUMENT );
  821. docWritten = true;
  822. }
  823. }
  824. else if ( entry.getName().equals( STREAM_OBJECT_POOL ) )
  825. {
  826. if ( !objectPoolWritten )
  827. {
  828. _objectPool.writeTo( pfs.getRoot() );
  829. objectPoolWritten = true;
  830. }
  831. }
  832. else if ( entry.getName().equals( STREAM_TABLE_0 )
  833. || entry.getName().equals( STREAM_TABLE_1 ) )
  834. {
  835. if ( !tableWritten )
  836. {
  837. pfs.createDocument( new ByteArrayInputStream( tableBuf ),
  838. STREAM_TABLE_1 );
  839. tableWritten = true;
  840. }
  841. }
  842. else if ( entry.getName().equals(
  843. SummaryInformation.DEFAULT_STREAM_NAME )
  844. || entry.getName().equals(
  845. DocumentSummaryInformation.DEFAULT_STREAM_NAME ) )
  846. {
  847. if ( !propertiesWritten )
  848. {
  849. writeProperties( pfs );
  850. propertiesWritten = true;
  851. }
  852. }
  853. else if ( entry.getName().equals( STREAM_DATA ) )
  854. {
  855. if ( !dataWritten )
  856. {
  857. pfs.createDocument( new ByteArrayInputStream( dataBuf ),
  858. STREAM_DATA );
  859. dataWritten = true;
  860. }
  861. }
  862. else
  863. {
  864. EntryUtils.copyNodeRecursively( entry, pfs.getRoot() );
  865. }
  866. }
  867. if ( !docWritten )
  868. pfs.createDocument( new ByteArrayInputStream( mainBuf ),
  869. STREAM_WORD_DOCUMENT );
  870. if ( !tableWritten )
  871. pfs.createDocument( new ByteArrayInputStream( tableBuf ),
  872. STREAM_TABLE_1 );
  873. if ( !propertiesWritten )
  874. writeProperties( pfs );
  875. if ( !dataWritten )
  876. pfs.createDocument( new ByteArrayInputStream( dataBuf ),
  877. STREAM_DATA );
  878. if ( !objectPoolWritten )
  879. _objectPool.writeTo( pfs.getRoot() );
  880. pfs.writeFilesystem( out );
  881. this.directory = pfs.getRoot();
  882. /*
  883. * since we updated all references in FIB and etc, using new arrays to
  884. * access data
  885. */
  886. this.directory = pfs.getRoot();
  887. this._tableStream = tableStream.toByteArray();
  888. this._dataStream = dataBuf;
  889. }
  890. @Internal
  891. public byte[] getDataStream()
  892. {
  893. return _dataStream;
  894. }
  895. @Internal
  896. public byte[] getTableStream()
  897. {
  898. return _tableStream;
  899. }
  900. public int registerList( HWPFList list )
  901. {
  902. if ( _lt == null )
  903. {
  904. _lt = new ListTables();
  905. }
  906. return _lt.addList( list.getListData(), list.getLFO(),
  907. list.getLFOData() );
  908. }
  909. public void delete(int start, int length)
  910. {
  911. Range r = new Range(start, start + length, this);
  912. r.delete();
  913. }
  914. }