/src/org/apache/poi/hwpf/converter/AbstractWordConverter.java
Java | 1218 lines | 1012 code | 158 blank | 48 comment | 192 complexity | a222c0be0731737a174a7721c959c7e2 MD5 | raw file
Possible License(s): Apache-2.0
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.hwpf.converter;
- import java.util.ArrayList;
- import java.util.Collections;
- import java.util.Iterator;
- import java.util.LinkedHashSet;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.poi.hpsf.SummaryInformation;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.hwpf.HWPFDocumentCore;
- import org.apache.poi.hwpf.converter.AbstractWordUtils.NumberingState;
- import org.apache.poi.hwpf.converter.FontReplacer.Triplet;
- import org.apache.poi.hwpf.model.FieldsDocumentPart;
- import org.apache.poi.hwpf.usermodel.Bookmark;
- import org.apache.poi.hwpf.usermodel.CharacterRun;
- import org.apache.poi.hwpf.usermodel.Field;
- import org.apache.poi.hwpf.usermodel.HWPFList;
- import org.apache.poi.hwpf.usermodel.Notes;
- import org.apache.poi.hwpf.usermodel.OfficeDrawing;
- import org.apache.poi.hwpf.usermodel.Paragraph;
- import org.apache.poi.hwpf.usermodel.Picture;
- import org.apache.poi.hwpf.usermodel.PictureType;
- import org.apache.poi.hwpf.usermodel.Range;
- import org.apache.poi.hwpf.usermodel.Section;
- import org.apache.poi.hwpf.usermodel.Table;
- import org.apache.poi.hwpf.usermodel.TableCell;
- import org.apache.poi.hwpf.usermodel.TableRow;
- import org.apache.poi.poifs.filesystem.Entry;
- import org.apache.poi.util.Beta;
- import org.apache.poi.util.Internal;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.w3c.dom.Document;
- import org.w3c.dom.Element;
- @Beta
- public abstract class AbstractWordConverter
- {
- private static class DeadFieldBoundaries
- {
- final int beginMark;
- final int endMark;
- final int separatorMark;
- public DeadFieldBoundaries( int beginMark, int separatorMark,
- int endMark )
- {
- this.beginMark = beginMark;
- this.separatorMark = separatorMark;
- this.endMark = endMark;
- }
- }
- private static final class Structure implements Comparable<Structure>
- {
- final int end;
- final int start;
- final Object structure;
- Structure( Bookmark bookmark )
- {
- this.start = bookmark.getStart();
- this.end = bookmark.getEnd();
- this.structure = bookmark;
- }
- Structure( DeadFieldBoundaries deadFieldBoundaries, int start, int end )
- {
- this.start = start;
- this.end = end;
- this.structure = deadFieldBoundaries;
- }
- Structure( Field field )
- {
- this.start = field.getFieldStartOffset();
- this.end = field.getFieldEndOffset();
- this.structure = field;
- }
- public int compareTo( Structure o )
- {
- return start < o.start ? -1 : start == o.start ? 0 : 1;
- }
- @Override
- public String toString()
- {
- return "Structure [" + start + "; " + end + "): "
- + structure.toString();
- }
- }
- private static final byte BEL_MARK = 7;
- private static final byte FIELD_BEGIN_MARK = 19;
- private static final byte FIELD_END_MARK = 21;
- private static final byte FIELD_SEPARATOR_MARK = 20;
- private static final POILogger logger = POILogFactory
- .getLogger( AbstractWordConverter.class );
- private static final Pattern PATTERN_HYPERLINK_EXTERNAL = Pattern
- .compile( "^[ \\t\\r\\n]*HYPERLINK \"(.*)\".*$" );
- private static final Pattern PATTERN_HYPERLINK_LOCAL = Pattern
- .compile( "^[ \\t\\r\\n]*HYPERLINK \\\\l \"(.*)\"[ ](.*)$" );
- private static final Pattern PATTERN_PAGEREF = Pattern
- .compile( "^[ \\t\\r\\n]*PAGEREF ([^ ]*)[ \\t\\r\\n]*\\\\h.*$" );
- private static final byte SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE = 2;
- private static final byte SPECCHAR_DRAWN_OBJECT = 8;
- protected static final char UNICODECHAR_NO_BREAK_SPACE = '\u00a0';
- protected static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
- protected static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
- private static void addToStructures( List<Structure> structures,
- Structure structure )
- {
- for ( Iterator<Structure> iterator = structures.iterator(); iterator
- .hasNext(); )
- {
- Structure another = iterator.next();
- if ( another.start <= structure.start
- && another.end >= structure.start )
- {
- return;
- }
- if ( ( structure.start < another.start && another.start < structure.end )
- || ( structure.start < another.start && another.end <= structure.end )
- || ( structure.start <= another.start && another.end < structure.end ) )
- {
- iterator.remove();
- continue;
- }
- }
- structures.add( structure );
- }
- private final Set<Bookmark> bookmarkStack = new LinkedHashSet<Bookmark>();
- private FontReplacer fontReplacer = new DefaultFontReplacer();
- private POILogger log = POILogFactory.getLogger( getClass() );
- private NumberingState numberingState = new NumberingState();
- private PicturesManager picturesManager;
- /**
- * Special actions that need to be called after processing complete, like
- * updating stylesheets or building document notes list. Usually they are
- * called once, but it's okay to call them several times.
- */
- protected void afterProcess()
- {
- // by default no such actions needed
- }
- protected Triplet getCharacterRunTriplet( CharacterRun characterRun )
- {
- Triplet original = new Triplet();
- original.bold = characterRun.isBold();
- original.italic = characterRun.isItalic();
- original.fontName = characterRun.getFontName();
- Triplet updated = getFontReplacer().update( original );
- return updated;
- }
- public abstract Document getDocument();
- public FontReplacer getFontReplacer()
- {
- return fontReplacer;
- }
- protected int getNumberColumnsSpanned( int[] tableCellEdges,
- int currentEdgeIndex, TableCell tableCell )
- {
- int nextEdgeIndex = currentEdgeIndex;
- int colSpan = 0;
- int cellRightEdge = tableCell.getLeftEdge() + tableCell.getWidth();
- while ( tableCellEdges[nextEdgeIndex] < cellRightEdge )
- {
- colSpan++;
- nextEdgeIndex++;
- }
- return colSpan;
- }
- protected int getNumberRowsSpanned( Table table,
- final int[] tableCellEdges, int currentRowIndex,
- int currentColumnIndex, TableCell tableCell )
- {
- if ( !tableCell.isFirstVerticallyMerged() )
- return 1;
- final int numRows = table.numRows();
- int count = 1;
- for ( int r1 = currentRowIndex + 1; r1 < numRows; r1++ )
- {
- TableRow nextRow = table.getRow( r1 );
- if ( currentColumnIndex >= nextRow.numCells() )
- break;
- // we need to skip row if he don't have cells at all
- boolean hasCells = false;
- int currentEdgeIndex = 0;
- for ( int c = 0; c < nextRow.numCells(); c++ )
- {
- TableCell nextTableCell = nextRow.getCell( c );
- if ( !nextTableCell.isVerticallyMerged()
- || nextTableCell.isFirstVerticallyMerged() )
- {
- int colSpan = getNumberColumnsSpanned( tableCellEdges,
- currentEdgeIndex, nextTableCell );
- currentEdgeIndex += colSpan;
- if ( colSpan != 0 )
- {
- hasCells = true;
- break;
- }
- }
- else
- {
- currentEdgeIndex += getNumberColumnsSpanned(
- tableCellEdges, currentEdgeIndex, nextTableCell );
- }
- }
- if ( !hasCells )
- continue;
- TableCell nextCell = nextRow.getCell( currentColumnIndex );
- if ( !nextCell.isVerticallyMerged()
- || nextCell.isFirstVerticallyMerged() )
- break;
- count++;
- }
- return count;
- }
- public PicturesManager getPicturesManager()
- {
- return picturesManager;
- }
- protected abstract void outputCharacters( Element block,
- CharacterRun characterRun, String text );
- /**
- * Wrap range into bookmark(s) and process it. All bookmarks have starts
- * equal to range start and ends equal to range end. Usually it's only one
- * bookmark.
- */
- protected abstract void processBookmarks( HWPFDocumentCore wordDocument,
- Element currentBlock, Range range, int currentTableLevel,
- List<Bookmark> rangeBookmarks );
- protected boolean processCharacters( final HWPFDocumentCore wordDocument,
- final int currentTableLevel, final Range range, final Element block )
- {
- if ( range == null )
- return false;
- boolean haveAnyText = false;
- /*
- * In text there can be fields, bookmarks, may be other structures (code
- * below allows extension). Those structures can overlaps, so either we
- * should process char-by-char (slow) or find a correct way to
- * reconstruct the structure of range -- sergey
- */
- List<Structure> structures = new LinkedList<Structure>();
- if ( wordDocument instanceof HWPFDocument )
- {
- final HWPFDocument doc = (HWPFDocument) wordDocument;
- Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks()
- .getBookmarksStartedBetween( range.getStartOffset(),
- range.getEndOffset() );
- if ( rangeBookmarks != null )
- {
- for ( List<Bookmark> lists : rangeBookmarks.values() )
- {
- for ( Bookmark bookmark : lists )
- {
- if ( !bookmarkStack.contains( bookmark ) )
- addToStructures( structures, new Structure(
- bookmark ) );
- }
- }
- }
- // TODO: dead fields?
- int skipUntil = -1;
- for ( int c = 0; c < range.numCharacterRuns(); c++ )
- {
- CharacterRun characterRun = range.getCharacterRun( c );
- if ( characterRun == null )
- throw new AssertionError();
- if ( characterRun.getStartOffset() < skipUntil )
- continue;
- String text = characterRun.text();
- if ( text == null || text.length() == 0
- || text.charAt( 0 ) != FIELD_BEGIN_MARK )
- continue;
- Field aliveField = ( (HWPFDocument) wordDocument ).getFields()
- .getFieldByStartOffset( FieldsDocumentPart.MAIN,
- characterRun.getStartOffset() );
- if ( aliveField != null )
- {
- addToStructures( structures, new Structure( aliveField ) );
- }
- else
- {
- int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
- wordDocument, range, c );
- if ( separatorEnd != null )
- {
- addToStructures(
- structures,
- new Structure( new DeadFieldBoundaries( c,
- separatorEnd[0], separatorEnd[1] ),
- characterRun.getStartOffset(), range
- .getCharacterRun(
- separatorEnd[1] )
- .getEndOffset() ) );
- c = separatorEnd[1];
- }
- }
- }
- }
- structures = new ArrayList<Structure>( structures );
- Collections.sort( structures );
- int previous = range.getStartOffset();
- for ( Structure structure : structures )
- {
- if ( structure.start != previous )
- {
- Range subrange = new Range( previous, structure.start, range )
- {
- @Override
- public String toString()
- {
- return "BetweenStructuresSubrange " + super.toString();
- }
- };
- processCharacters( wordDocument, currentTableLevel, subrange,
- block );
- }
- if ( structure.structure instanceof Bookmark )
- {
- // other bookmarks with same boundaries
- List<Bookmark> bookmarks = new LinkedList<Bookmark>();
- for ( Bookmark bookmark : ( (HWPFDocument) wordDocument )
- .getBookmarks()
- .getBookmarksStartedBetween( structure.start,
- structure.start + 1 ).values().iterator()
- .next() )
- {
- if ( bookmark.getStart() == structure.start
- && bookmark.getEnd() == structure.end )
- {
- bookmarks.add( bookmark );
- }
- }
- bookmarkStack.addAll( bookmarks );
- try
- {
- int end = Math.min( range.getEndOffset(), structure.end );
- Range subrange = new Range( structure.start, end, range )
- {
- @Override
- public String toString()
- {
- return "BookmarksSubrange " + super.toString();
- }
- };
- processBookmarks( wordDocument, block, subrange,
- currentTableLevel, bookmarks );
- }
- finally
- {
- bookmarkStack.removeAll( bookmarks );
- }
- }
- else if ( structure.structure instanceof Field )
- {
- Field field = (Field) structure.structure;
- processField( (HWPFDocument) wordDocument, range,
- currentTableLevel, field, block );
- }
- else if ( structure.structure instanceof DeadFieldBoundaries )
- {
- DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
- processDeadField( wordDocument, block, range,
- currentTableLevel, boundaries.beginMark,
- boundaries.separatorMark, boundaries.endMark );
- }
- else
- {
- throw new UnsupportedOperationException( "NYI: "
- + structure.structure.getClass() );
- }
- previous = Math.min( range.getEndOffset(), structure.end );
- }
- if ( previous != range.getStartOffset() )
- {
- if ( previous > range.getEndOffset() )
- {
- logger.log( POILogger.WARN, "Latest structure in ", range,
- " ended at #" + previous, " after range boundaries [",
- range.getStartOffset() + "; " + range.getEndOffset(),
- ")" );
- return true;
- }
- if ( previous < range.getEndOffset() )
- {
- Range subrange = new Range( previous, range.getEndOffset(),
- range )
- {
- @Override
- public String toString()
- {
- return "AfterStructureSubrange " + super.toString();
- }
- };
- processCharacters( wordDocument, currentTableLevel, subrange,
- block );
- }
- return true;
- }
- for ( int c = 0; c < range.numCharacterRuns(); c++ )
- {
- CharacterRun characterRun = range.getCharacterRun( c );
- if ( characterRun == null )
- throw new AssertionError();
- if ( wordDocument instanceof HWPFDocument
- && ( (HWPFDocument) wordDocument ).getPicturesTable()
- .hasPicture( characterRun ) )
- {
- HWPFDocument newFormat = (HWPFDocument) wordDocument;
- Picture picture = newFormat.getPicturesTable().extractPicture(
- characterRun, true );
- processImage( block, characterRun.text().charAt( 0 ) == 0x01,
- picture );
- continue;
- }
- String text = characterRun.text();
- if ( text.getBytes().length == 0 )
- continue;
- if ( characterRun.isSpecialCharacter() )
- {
- if ( text.charAt( 0 ) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE
- && ( wordDocument instanceof HWPFDocument ) )
- {
- HWPFDocument doc = (HWPFDocument) wordDocument;
- processNoteAnchor( doc, characterRun, block );
- continue;
- }
- if ( text.charAt( 0 ) == SPECCHAR_DRAWN_OBJECT
- && ( wordDocument instanceof HWPFDocument ) )
- {
- HWPFDocument doc = (HWPFDocument) wordDocument;
- processDrawnObject( doc, characterRun, block );
- continue;
- }
- if ( characterRun.isOle2()
- && ( wordDocument instanceof HWPFDocument ) )
- {
- HWPFDocument doc = (HWPFDocument) wordDocument;
- processOle2( doc, characterRun, block );
- continue;
- }
- if ( characterRun.isSymbol()
- && ( wordDocument instanceof HWPFDocument ) )
- {
- HWPFDocument doc = (HWPFDocument) wordDocument;
- processSymbol( doc, characterRun, block );
- continue;
- }
- }
- if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
- {
- if ( wordDocument instanceof HWPFDocument )
- {
- Field aliveField = ( (HWPFDocument) wordDocument )
- .getFields().getFieldByStartOffset(
- FieldsDocumentPart.MAIN,
- characterRun.getStartOffset() );
- if ( aliveField != null )
- {
- processField( ( (HWPFDocument) wordDocument ), range,
- currentTableLevel, aliveField, block );
- int continueAfter = aliveField.getFieldEndOffset();
- while ( c < range.numCharacterRuns()
- && range.getCharacterRun( c ).getEndOffset() <= continueAfter )
- c++;
- if ( c < range.numCharacterRuns() )
- c--;
- continue;
- }
- }
- int skipTo = tryDeadField( wordDocument, range,
- currentTableLevel, c, block );
- if ( skipTo != c )
- {
- c = skipTo;
- continue;
- }
- continue;
- }
- if ( text.getBytes()[0] == FIELD_SEPARATOR_MARK )
- {
- // shall not appear without FIELD_BEGIN_MARK
- continue;
- }
- if ( text.getBytes()[0] == FIELD_END_MARK )
- {
- // shall not appear without FIELD_BEGIN_MARK
- continue;
- }
- if ( characterRun.isSpecialCharacter() || characterRun.isObj()
- || characterRun.isOle2() )
- {
- continue;
- }
- if ( text.endsWith( "\r" )
- || ( text.charAt( text.length() - 1 ) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE ) )
- text = text.substring( 0, text.length() - 1 );
- {
- // line breaks
- StringBuilder stringBuilder = new StringBuilder();
- for ( char charChar : text.toCharArray() )
- {
- if ( charChar == 11 )
- {
- if ( stringBuilder.length() > 0 )
- {
- outputCharacters( block, characterRun,
- stringBuilder.toString() );
- stringBuilder.setLength( 0 );
- }
- processLineBreak( block, characterRun );
- }
- else if ( charChar == 30 )
- {
- // Non-breaking hyphens are stored as ASCII 30
- stringBuilder.append( UNICODECHAR_NONBREAKING_HYPHEN );
- }
- else if ( charChar == 31 )
- {
- // Non-required hyphens to zero-width space
- stringBuilder.append( UNICODECHAR_ZERO_WIDTH_SPACE );
- }
- else if ( charChar >= 0x20 || charChar == 0x09
- || charChar == 0x0A || charChar == 0x0D )
- {
- stringBuilder.append( charChar );
- }
- }
- if ( stringBuilder.length() > 0 )
- {
- outputCharacters( block, characterRun,
- stringBuilder.toString() );
- stringBuilder.setLength( 0 );
- }
- }
- haveAnyText |= text.trim().length() != 0;
- }
- return haveAnyText;
- }
- protected void processDeadField( HWPFDocumentCore wordDocument,
- Element currentBlock, Range range, int currentTableLevel,
- int beginMark, int separatorMark, int endMark )
- {
- if ( beginMark + 1 < separatorMark && separatorMark + 1 < endMark )
- {
- Range formulaRange = new Range( range.getCharacterRun(
- beginMark + 1 ).getStartOffset(), range.getCharacterRun(
- separatorMark - 1 ).getEndOffset(), range )
- {
- @Override
- public String toString()
- {
- return "Dead field formula subrange: " + super.toString();
- }
- };
- Range valueRange = new Range( range.getCharacterRun(
- separatorMark + 1 ).getStartOffset(), range
- .getCharacterRun( endMark - 1 ).getEndOffset(), range )
- {
- @Override
- public String toString()
- {
- return "Dead field value subrange: " + super.toString();
- }
- };
- String formula = formulaRange.text();
- final Matcher matcher = PATTERN_HYPERLINK_LOCAL.matcher( formula );
- if ( matcher.matches() )
- {
- String localref = matcher.group( 1 );
- processPageref( wordDocument, currentBlock, valueRange,
- currentTableLevel, localref );
- return;
- }
- }
- StringBuilder debug = new StringBuilder( "Unsupported field type: \n" );
- for ( int i = beginMark; i <= endMark; i++ )
- {
- debug.append( "\t" );
- debug.append( range.getCharacterRun( i ) );
- debug.append( "\n" );
- }
- logger.log( POILogger.WARN, debug );
- Range deadFieldValueSubrage = new Range( range.getCharacterRun(
- separatorMark ).getStartOffset() + 1, range.getCharacterRun(
- endMark ).getStartOffset(), range )
- {
- @Override
- public String toString()
- {
- return "DeadFieldValueSubrange (" + super.toString() + ")";
- }
- };
- // just output field value
- if ( separatorMark + 1 < endMark )
- processCharacters( wordDocument, currentTableLevel,
- deadFieldValueSubrage, currentBlock );
- return;
- }
- public void processDocument( HWPFDocumentCore wordDocument )
- {
- try
- {
- final SummaryInformation summaryInformation = wordDocument
- .getSummaryInformation();
- if ( summaryInformation != null )
- {
- processDocumentInformation( summaryInformation );
- }
- }
- catch ( Exception exc )
- {
- logger.log( POILogger.WARN,
- "Unable to process document summary information: ", exc,
- exc );
- }
- final Range docRange = wordDocument.getRange();
- if ( docRange.numSections() == 1 )
- {
- processSingleSection( wordDocument, docRange.getSection( 0 ) );
- afterProcess();
- return;
- }
- processDocumentPart( wordDocument, docRange );
- afterProcess();
- }
- protected abstract void processDocumentInformation(
- SummaryInformation summaryInformation );
- protected void processDocumentPart( HWPFDocumentCore wordDocument,
- final Range range )
- {
- for ( int s = 0; s < range.numSections(); s++ )
- {
- processSection( wordDocument, range.getSection( s ), s );
- }
- }
- protected void processDrawnObject( HWPFDocument doc,
- CharacterRun characterRun, Element block )
- {
- if ( getPicturesManager() == null )
- return;
- // TODO: support headers
- OfficeDrawing officeDrawing = doc.getOfficeDrawingsMain()
- .getOfficeDrawingAt( characterRun.getStartOffset() );
- if ( officeDrawing == null )
- {
- logger.log( POILogger.WARN, "Characters #" + characterRun
- + " references missing drawn object" );
- return;
- }
- byte[] pictureData = officeDrawing.getPictureData();
- if ( pictureData == null )
- // usual shape?
- return;
- float width = ( officeDrawing.getRectangleRight() - officeDrawing
- .getRectangleLeft() ) / AbstractWordUtils.TWIPS_PER_INCH;
- float height = ( officeDrawing.getRectangleBottom() - officeDrawing
- .getRectangleTop() ) / AbstractWordUtils.TWIPS_PER_INCH;
- final PictureType type = PictureType.findMatchingType( pictureData );
- String path = getPicturesManager()
- .savePicture( pictureData, type,
- "s" + characterRun.getStartOffset() + "." + type,
- width, height );
- processDrawnObject( doc, characterRun, officeDrawing, path, block );
- }
- protected abstract void processDrawnObject( HWPFDocument doc,
- CharacterRun characterRun, OfficeDrawing officeDrawing,
- String path, Element block );
- protected void processDropDownList( Element block,
- CharacterRun characterRun, String[] values, int defaultIndex )
- {
- outputCharacters( block, characterRun, values[defaultIndex] );
- }
- protected abstract void processEndnoteAutonumbered(
- HWPFDocument wordDocument, int noteIndex, Element block,
- Range endnoteTextRange );
- protected void processField( HWPFDocument wordDocument, Range parentRange,
- int currentTableLevel, Field field, Element currentBlock )
- {
- switch ( field.getType() )
- {
- case 37: // page reference
- {
- final Range firstSubrange = field.firstSubrange( parentRange );
- if ( firstSubrange != null )
- {
- String formula = firstSubrange.text();
- Matcher matcher = PATTERN_PAGEREF.matcher( formula );
- if ( matcher.find() )
- {
- String pageref = matcher.group( 1 );
- processPageref( wordDocument, currentBlock,
- field.secondSubrange( parentRange ),
- currentTableLevel, pageref );
- return;
- }
- }
- break;
- }
- case 58: // Embedded Object
- {
- if ( !field.hasSeparator() )
- {
- logger.log( POILogger.WARN, parentRange + " contains " + field
- + " with 'Embedded Object' but without separator mark" );
- return;
- }
- CharacterRun separator = field
- .getMarkSeparatorCharacterRun( parentRange );
- if ( separator.isOle2() )
- {
- // the only supported so far
- boolean processed = processOle2( wordDocument, separator,
- currentBlock );
- // if we didn't output OLE - output field value
- if ( !processed )
- {
- processCharacters( wordDocument, currentTableLevel,
- field.secondSubrange( parentRange ), currentBlock );
- }
- return;
- }
- break;
- }
- case 83: // drop down
- {
- Range fieldContent = field.firstSubrange( parentRange );
- CharacterRun cr = fieldContent.getCharacterRun( fieldContent
- .numCharacterRuns() - 1 );
- String[] values = cr.getDropDownListValues();
- Integer defIndex = cr.getDropDownListDefaultItemIndex();
- if ( values != null )
- {
- processDropDownList( currentBlock, cr, values,
- defIndex == null ? -1 : defIndex.intValue() );
- return;
- }
- break;
- }
- case 88: // hyperlink
- {
- final Range firstSubrange = field.firstSubrange( parentRange );
- if ( firstSubrange != null )
- {
- String formula = firstSubrange.text();
- Matcher matcher = PATTERN_HYPERLINK_EXTERNAL.matcher( formula );
- if ( matcher.matches() )
- {
- String hyperlink = matcher.group( 1 );
- processHyperlink( wordDocument, currentBlock,
- field.secondSubrange( parentRange ),
- currentTableLevel, hyperlink );
- return;
- }
- matcher.usePattern( PATTERN_HYPERLINK_LOCAL );
- if ( matcher.matches() )
- {
- String hyperlink = matcher.group( 1 );
- Range textRange = null;
- String text = matcher.group( 2 );
- if ( AbstractWordUtils.isNotEmpty( text ) )
- {
- textRange = new Range( firstSubrange.getStartOffset()
- + matcher.start( 2 ),
- firstSubrange.getStartOffset()
- + matcher.end( 2 ), firstSubrange )
- {
- @Override
- public String toString()
- {
- return "Local hyperlink text";
- }
- };
- }
- processPageref( wordDocument, currentBlock, textRange,
- currentTableLevel, hyperlink );
- return;
- }
- }
- break;
- }
- }
- logger.log( POILogger.WARN, parentRange + " contains " + field
- + " with unsupported type or format" );
- processCharacters( wordDocument, currentTableLevel,
- field.secondSubrange( parentRange ), currentBlock );
- }
- protected abstract void processFootnoteAutonumbered(
- HWPFDocument wordDocument, int noteIndex, Element block,
- Range footnoteTextRange );
- protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
- Element currentBlock, Range textRange, int currentTableLevel,
- String hyperlink );
- protected void processImage( Element currentBlock, boolean inlined,
- Picture picture )
- {
- PicturesManager fileManager = getPicturesManager();
- if ( fileManager != null )
- {
- final int aspectRatioX = picture.getHorizontalScalingFactor();
- final int aspectRatioY = picture.getVerticalScalingFactor();
- final float imageWidth = aspectRatioX > 0 ? picture.getDxaGoal()
- * aspectRatioX / 1000 / AbstractWordUtils.TWIPS_PER_INCH
- : picture.getDxaGoal() / AbstractWordUtils.TWIPS_PER_INCH;
- final float imageHeight = aspectRatioY > 0 ? picture.getDyaGoal()
- * aspectRatioY / 1000 / AbstractWordUtils.TWIPS_PER_INCH
- : picture.getDyaGoal() / AbstractWordUtils.TWIPS_PER_INCH;
- String url = fileManager.savePicture( picture.getContent(),
- picture.suggestPictureType(),
- picture.suggestFullFileName(), imageWidth, imageHeight );
- if ( WordToFoUtils.isNotEmpty( url ) )
- {
- processImage( currentBlock, inlined, picture, url );
- return;
- }
- }
- processImageWithoutPicturesManager( currentBlock, inlined, picture );
- }
- protected abstract void processImage( Element currentBlock,
- boolean inlined, Picture picture, String url );
- @Internal
- protected abstract void processImageWithoutPicturesManager(
- Element currentBlock, boolean inlined, Picture picture );
- protected abstract void processLineBreak( Element block,
- CharacterRun characterRun );
- protected void processNoteAnchor( HWPFDocument doc,
- CharacterRun characterRun, final Element block )
- {
- {
- Notes footnotes = doc.getFootnotes();
- int noteIndex = footnotes
- .getNoteIndexByAnchorPosition( characterRun
- .getStartOffset() );
- if ( noteIndex != -1 )
- {
- Range footnoteRange = doc.getFootnoteRange();
- int rangeStartOffset = footnoteRange.getStartOffset();
- int noteTextStartOffset = footnotes
- .getNoteTextStartOffset( noteIndex );
- int noteTextEndOffset = footnotes
- .getNoteTextEndOffset( noteIndex );
- Range noteTextRange = new Range( rangeStartOffset
- + noteTextStartOffset, rangeStartOffset
- + noteTextEndOffset, doc );
- processFootnoteAutonumbered( doc, noteIndex, block,
- noteTextRange );
- return;
- }
- }
- {
- Notes endnotes = doc.getEndnotes();
- int noteIndex = endnotes.getNoteIndexByAnchorPosition( characterRun
- .getStartOffset() );
- if ( noteIndex != -1 )
- {
- Range endnoteRange = doc.getEndnoteRange();
- int rangeStartOffset = endnoteRange.getStartOffset();
- int noteTextStartOffset = endnotes
- .getNoteTextStartOffset( noteIndex );
- int noteTextEndOffset = endnotes
- .getNoteTextEndOffset( noteIndex );
- Range noteTextRange = new Range( rangeStartOffset
- + noteTextStartOffset, rangeStartOffset
- + noteTextEndOffset, doc );
- processEndnoteAutonumbered( doc, noteIndex, block,
- noteTextRange );
- return;
- }
- }
- }
- private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
- Element block )
- {
- Entry entry = doc.getObjectsPool().getObjectById(
- "_" + characterRun.getPicOffset() );
- if ( entry == null )
- {
- logger.log( POILogger.WARN, "Referenced OLE2 object '",
- Integer.valueOf( characterRun.getPicOffset() ),
- "' not found in ObjectPool" );
- return false;
- }
- try
- {
- return processOle2( doc, block, entry );
- }
- catch ( Exception exc )
- {
- logger.log( POILogger.WARN,
- "Unable to convert internal OLE2 object '",
- Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
- exc );
- return false;
- }
- }
- @SuppressWarnings( "unused" )
- protected boolean processOle2( HWPFDocument wordDocument, Element block,
- Entry entry ) throws Exception
- {
- return false;
- }
- protected abstract void processPageBreak( HWPFDocumentCore wordDocument,
- Element flow );
- protected abstract void processPageref( HWPFDocumentCore wordDocument,
- Element currentBlock, Range textRange, int currentTableLevel,
- String pageref );
- protected abstract void processParagraph( HWPFDocumentCore wordDocument,
- Element parentElement, int currentTableLevel, Paragraph paragraph,
- String bulletText );
- protected void processParagraphes( HWPFDocumentCore wordDocument,
- Element flow, Range range, int currentTableLevel )
- {
- final int paragraphs = range.numParagraphs();
- for ( int p = 0; p < paragraphs; p++ )
- {
- Paragraph paragraph = range.getParagraph( p );
- if ( paragraph.isInTable()
- && paragraph.getTableLevel() != currentTableLevel )
- {
- if ( paragraph.getTableLevel() < currentTableLevel )
- throw new IllegalStateException(
- "Trying to process table cell with higher level ("
- + paragraph.getTableLevel()
- + ") than current table level ("
- + currentTableLevel
- + ") as inner table part" );
- Table table = range.getTable( paragraph );
- processTable( wordDocument, flow, table );
- p += table.numParagraphs();
- p--;
- continue;
- }
- if ( paragraph.text().equals( "\u000c" ) )
- {
- processPageBreak( wordDocument, flow );
- }
- boolean processed = false;
- if ( paragraph.isInList() )
- {
- try
- {
- HWPFList hwpfList = paragraph.getList();
- String label = AbstractWordUtils.getBulletText(
- numberingState, hwpfList,
- (char) paragraph.getIlvl() );
- processParagraph( wordDocument, flow, currentTableLevel,
- paragraph, label );
- processed = true;
- }
- catch ( Exception exc )
- {
- log.log(
- POILogger.WARN,
- "Can't process paragraph as list entry, will be processed without list information",
- exc );
- }
- }
- if ( processed == false )
- {
- processParagraph( wordDocument, flow, currentTableLevel,
- paragraph, AbstractWordUtils.EMPTY );
- }
- }
- }
- protected abstract void processSection( HWPFDocumentCore wordDocument,
- Section section, int s );
- protected void processSingleSection( HWPFDocumentCore wordDocument,
- Section section )
- {
- processSection( wordDocument, section, 0 );
- }
- protected void processSymbol( HWPFDocument doc, CharacterRun characterRun,
- Element block )
- {
- }
- protected abstract void processTable( HWPFDocumentCore wordDocument,
- Element flow, Table table );
- public void setFontReplacer( FontReplacer fontReplacer )
- {
- this.fontReplacer = fontReplacer;
- }
- public void setPicturesManager( PicturesManager fileManager )
- {
- this.picturesManager = fileManager;
- }
- protected int tryDeadField( HWPFDocumentCore wordDocument, Range range,
- int currentTableLevel, int beginMark, Element currentBlock )
- {
- int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(
- wordDocument, range, beginMark );
- if ( separatorEnd == null )
- return beginMark;
- processDeadField( wordDocument, currentBlock, range, currentTableLevel,
- beginMark, separatorEnd[0], separatorEnd[1] );
- return separatorEnd[1];
- }
- private int[] tryDeadField_lookupFieldSeparatorEnd(
- HWPFDocumentCore wordDocument, Range range, int beginMark )
- {
- int separatorMark = -1;
- int endMark = -1;
- for ( int c = beginMark + 1; c < range.numCharacterRuns(); c++ )
- {
- CharacterRun characterRun = range.getCharacterRun( c );
- String text = characterRun.text();
- if ( text.getBytes().length == 0 )
- continue;
- final byte firstByte = text.getBytes()[0];
- if ( firstByte == FIELD_BEGIN_MARK )
- {
- int[] nested = tryDeadField_lookupFieldSeparatorEnd(
- wordDocument, range, c );
- if ( nested != null )
- {
- c = nested[1];
- }
- continue;
- }
- if ( firstByte == FIELD_SEPARATOR_MARK )
- {
- if ( separatorMark != -1 )
- {
- // double; incorrect format
- return null;
- }
- separatorMark = c;
- continue;
- }
- if ( text.getBytes()[0] == FIELD_END_MARK )
- {
- if ( endMark != -1 )
- {
- // double;
- return null;
- }
- endMark = c;
- break;
- }
- }
- if ( separatorMark == -1 || endMark == -1 )
- return null;
- return new int[] { separatorMark, endMark };
- }
- }