WordXMLReader.java - Copyright (C) 2010 Novabit Information…

/nuclos-server/src/main/java/org/nuclos/server/common/ooxml/WordXMLReader.java

https://bitbucket.org/nuclos/nuclos · Java · 321 lines · 208 code · 35 blank · 78 comment · 58 complexity · b0149ee3f1d64724e7fbee233b264acb MD5 · raw file

//Copyright (C) 2010  Novabit Informationssysteme GmbH
//
//This file is part of Nuclos.
//
//Nuclos is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//
//Nuclos is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//GNU Affero General Public License for more details.
//
//You should have received a copy of the GNU Affero General Public License
//along with Nuclos.  If not, see <http://www.gnu.org/licenses/>.

package org.nuclos.server.common.ooxml;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;


/**
 * Helper class providing simple read-only access for examining OOXML wordprocessing documents.
 *
 * <p>Implementation Note: This API is based on Apache POI 3.6 which only covers a very small subset
 * for "common use cases". Especially, structured document tags are not supported by POI 3.6 directly.
 * However, it is possible to access the underlying XML structure directly using precompiled XMLBeans
 * (packages starting with org.openxmlformats.schemas contain the XML Schema Definition (XSD) compiled
 * as XMLBeans). For details about the XML structure, see the ECMA-376 specification (in particular
 * [ECMA-376,2nd], part 1, 17.5.2).
 *
 * <p>The structured document tags are extracted on first access and cached for subsequent calls.
 */
public class WordXMLReader {

	private static final Logger LOG = LoggerFactory.getLogger(WordXMLReader.class);

	/** WordprocessingML namespace URI. */
	private static final String WORDPROCESSINGML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

	/** Namespace declaration for XMLBeans path selection. */
	private static final String DECLARE_NS_PREFIX = "declare namespace w='" + WORDPROCESSINGML_NS + "' ";

	/**
	 * Path expressions (relative to {@code <w:sdtPr>}) of the type elements that are not supported
	 * by this reader. The {@code w:} prefix is required: the elements live in the WordprocessingML
	 * namespace and an unprefixed name would only match elements without any namespace.
	 */
	private static final String[] UNSUPPORTED_TYPE_PATHS = {
		"w:equation", "w:docPartObj", "w:docPartList", "w:picture",
		"w:citation", "w:group", "w:bibliography"
	};

	private final XWPFDocument document;

	/** Cache of the extracted tags; {@code null} until the first extraction completed successfully. */
	private List<StructuredDocumentTag> structuredDocumentTags;

	/**
	 * Creates a reader for the wordprocessing document read from the given stream.
	 *
	 * @param is stream providing the OOXML package
	 * @throws IOException if the document cannot be read
	 */
	public WordXMLReader(InputStream is) throws IOException {
		this(new XWPFDocument(is));
	}

	/**
	 * Creates a reader for an already opened document.
	 *
	 * @param document the document to examine
	 */
	public WordXMLReader(XWPFDocument document) {
		this.document = document;
	}

	/**
	 * Returns the plain text of the document as extracted by POI's {@link XWPFWordExtractor}.
	 */
	public String getText() {
		// Note: in POI 3.6, text extraction does not always work correctly
		return new XWPFWordExtractor(document).getText();
	}

	/**
	 * Returns a map with the text content of the structured document tags
	 * contained in this document (cf. [ECMA-376,2nd], 17.5.2) with their
	 * tag name (17.5.2.42) as key.  Supported tags are comboBox, date,
	 * dropDownList, richText and text.
	 * <p>
	 * If the structured document tag is not filled, i.e. marked as
	 * placeholder (17.5.2.25), the text content is {@code null}.
	 * If the structured document tag does not provide a tag name, the
	 * alias (aka friendly name, 17.5.2.1) is used as key instead. If both
	 * are omitted, the structured document tag is skipped. If several tags
	 * share the same key, the first one found wins.
	 */
	public Map<String, String> getStructuredDocumentTagTexts() {
		Map<String, String> tags = new HashMap<String, String>();
		for (StructuredDocumentTag sdt : structuredDocumentTags()) {
			String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
			if (name != null && !tags.containsKey(name)) {
				tags.put(name, sdt.text);
			}
		}
		return tags;
	}

	/**
	 * Similar to {@link #getStructuredDocumentTagTexts()}, but returns
	 * prepared values.
	 * <p>
	 * For combobox (17.5.2.5) and drop-down (17.5.2.15) elements,
	 * the text content (=display text) is resolved against the given
	 * list items (17.5.2.21/22) and replaced with its associated value.
	 * For date elements (17.5.2.7), a {@link java.util.Date} object based
	 * on the cached full-date is returned ({@code toString} returns the
	 * original string). If no full-date is stored or it cannot be parsed,
	 * the original string object is returned.
	 * For placeholders, the value is {@code null}.
	 */
	public Map<String, Object> getStructuredDocumentTagValues() {
		Map<String, Object> tags = new HashMap<String, Object>();
		for (StructuredDocumentTag sdt : structuredDocumentTags()) {
			String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
			if (name != null && !tags.containsKey(name)) {
				tags.put(name, sdt.value);
			}
		}
		return tags;
	}

	/**
	 * Returns the cached list of structured document tags, extracting them from the
	 * paragraphs and tables of the document on first use.
	 */
	private List<StructuredDocumentTag> structuredDocumentTags() {
		if (structuredDocumentTags == null) {
			// Collect into a local list and publish it only after the extraction succeeded,
			// so that a failure does not leave a partially filled cache behind.
			List<StructuredDocumentTag> collected = new ArrayList<StructuredDocumentTag>();
			// TODO: preferably iterate over getBodyElements() instead (see JavaDoc)
			for (XWPFParagraph p : document.getParagraphs()) {
				extractStructuredDocumentTags(p.getCTP().getSdtArray(), collected);
			}

			Iterator<XWPFTable> tableIter = document.getTablesIterator();
			while (tableIter.hasNext()) {
				extractStructuredDocumentTags(tableIter.next(), collected);
			}
			structuredDocumentTags = collected;
		}
		return structuredDocumentTags;
	}

	/**
	 * Extracts the tags of all cells of the given table: block-level tags directly inside
	 * a cell as well as run-level tags inside the cell's paragraphs.
	 */
	private static void extractStructuredDocumentTags(XWPFTable t, List<StructuredDocumentTag> target) {
		CTTbl table = t.getCTTbl();
		for (CTRow row : table.getTrArray()) {
			for (CTTc cell : row.getTcArray()) {
				extractStructuredDocumentTags(cell.getSdtArray(), target);
				for (CTP ctp : cell.getPArray()) {
					extractStructuredDocumentTags(ctp.getSdtArray(), target);
				}
			}
		}
	}

	/** Extracts run-level (inline) structured document tags. */
	private static void extractStructuredDocumentTags(CTSdtRun[] sdtRuns, List<StructuredDocumentTag> target) {
		for (CTSdtRun sdtRun : sdtRuns) {
			CTSdtContentRun sdtContent = sdtRun.getSdtContent();
			// A tag without a content element is treated as having empty text
			String text = (sdtContent != null) ? getRText(sdtContent.getRArray()) : "";
			createStructuredDocumentTag(sdtRun.getSdtPr(), text, target);
		}
	}

	/** Extracts block-level structured document tags. */
	private static void extractStructuredDocumentTags(CTSdtBlock[] sdtBlocks, List<StructuredDocumentTag> target) {
		for (CTSdtBlock sdtBlock : sdtBlocks) {
			CTSdtContentBlock sdtContent = sdtBlock.getSdtContent();
			// A tag without a content element is treated as having empty text
			String text = (sdtContent != null) ? getPText(sdtContent.getPArray()) : "";
			createStructuredDocumentTag(sdtBlock.getSdtPr(), text, target);
		}
	}

	/**
	 * Creates the internal representation of one structured document tag and adds it to
	 * {@code target}, unless the tag has no properties or is of an unsupported type.
	 * <p>
	 * A {@code <w:sdt>} element contains 2 child elements: {@code <w:sdtPr>} for the properties,
	 * and {@code <w:sdtContent>} for content (here text).
	 *
	 * @param sdtPr the properties of the tag, may be {@code null}
	 * @param text the extracted text content of the tag, never {@code null}
	 * @param target the list receiving the created tag
	 */
	private static void createStructuredDocumentTag(CTSdtPr sdtPr, String text, List<StructuredDocumentTag> target) {
		if (sdtPr == null) {
			// Without properties there is neither a tag name nor an alias, hence the
			// element could never be reported by name.
			return;
		}

		// The properties contain (among others) aliases (<w:alias>), tag names (w:tag)
		// and a flag (<w:showingPlcHdr>) whether the content is placeholder or real content.
		String alias = getCTStringVal(getFirst(sdtPr.getAliasArray()));
		String tagName = getCTStringVal(getFirst(sdtPr.getTagArray()));
		boolean isPlaceholder = sdtPr.getShowingPlcHdrArray().length > 0;

		if (isPlaceholder) {
			// If placeholder is set, the element is not filled by the user: the extracted
			// text is only the placeholder prompt, so both text and value are reported as null.
			target.add(new StructuredDocumentTag(alias, tagName, null, null));
			return;
		}

		Object value = text;

		// The following child element can occur and determine the type of the structured
		// document tag: equation, comboBox (*), date (*), docPartObj, docPartList,
		// dropDownList (*), picture, richText (*), text (*), citation, group, bibliography.

		// Note that we can't use the typed method (e.g sdtPr.getComboBoxArray()) here
		// because in the small (poi-)ooxml-schemas.jar bundled with POI, the specialized
		// classes (e.g. CTStdComboBox) are missing. Trying to use these methods will fail
		// with a NoClassDefFoundError exception (cf. POI FAQ).
		// But we can work with the plain XmlObjects or DOM nodes, if we extract them by
		// a generic path expression.
		Element sdtType;
		if (getFirstAsDomElement(sdtPr, "w:text") != null
			|| getFirstAsDomElement(sdtPr, "w:richText") != null) {
			// Value is the text (in the case of richText without formatting)
		} else if ((sdtType = getFirstAsDomElement(sdtPr, "w:date")) != null) {
			value = toDateValue(sdtType, text);
		} else if ((sdtType = getFirstAsDomElement(sdtPr, "w:comboBox")) != null
			|| (sdtType = getFirstAsDomElement(sdtPr, "w:dropDownList")) != null) {
			value = resolveListItemValue(sdtType, text);
		} else if (isUnsupportedType(sdtPr)) {
			// ignore (unsupported type)
			return;
		}
		// else: type is unspecified, treat as text

		target.add(new StructuredDocumentTag(alias, tagName, value, text));
	}

	/**
	 * Converts the content of a date element (17.5.2.7) into a {@link Date} whose
	 * {@code toString} yields the displayed text.
	 *
	 * @return the date, or {@code text} if no (parseable) full-date is stored
	 */
	private static Object toDateValue(Element dateElement, String text) {
		// 17.5.2.7: fullDate contains the "full date and time last entered"
		// in XML Schema DateTime syntax
		String fullDate = dateElement.getAttributeNS(WORDPROCESSINGML_NS, "fullDate");
		// DOM returns the empty string (not null) for a missing attribute
		if (fullDate == null || fullDate.length() == 0) {
			return text;
		}
		try {
			XMLGregorianCalendar calendar = DatatypeFactory.newInstance().newXMLGregorianCalendar(fullDate);
			long timeMillis = calendar.toGregorianCalendar(null, null, null).getTimeInMillis();
			return new DisplayTextDate(timeMillis, text);
		} catch (DatatypeConfigurationException e) {
			LOG.warn("createStructuredDocumentTag failed: ", e);
		} catch (IllegalArgumentException e) {
			// thrown by newXMLGregorianCalendar for a lexically invalid value
			LOG.warn("createStructuredDocumentTag failed: invalid fullDate '" + fullDate + "'", e);
		}
		return text;
	}

	/**
	 * Resolves the display text of a comboBox (17.5.2.5) or dropDownList (17.5.2.15)
	 * against its list items.
	 *
	 * @return the value of the first list item whose display text equals {@code text},
	 *         or {@code text} itself if there is no such item
	 */
	private static Object resolveListItemValue(Element listElement, String text) {
		NodeList listItems = listElement.getElementsByTagNameNS(WORDPROCESSINGML_NS, "listItem");
		for (int i = 0, n = listItems.getLength(); i < n; i++) {
			Element listItem = (Element) listItems.item(i);
			String displayText = listItem.getAttributeNS(WORDPROCESSINGML_NS, "displayText");
			if (text.equals(displayText)) {
				return listItem.getAttributeNS(WORDPROCESSINGML_NS, "value");
			}
		}
		return text;
	}

	/** Tests whether the properties declare one of the types listed in {@link #UNSUPPORTED_TYPE_PATHS}. */
	private static boolean isUnsupportedType(CTSdtPr sdtPr) {
		for (String path : UNSUPPORTED_TYPE_PATHS) {
			if (getFirstAsDomElement(sdtPr, path) != null) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Returns the first node selected by the given path (evaluated relative to
	 * {@code xmlObject} with the prefix {@code w} bound to the WordprocessingML
	 * namespace) as DOM element, or {@code null} if nothing is selected.
	 */
	private static Element getFirstAsDomElement(XmlObject xmlObject, String path) {
		XmlObject[] children = xmlObject.selectPath(DECLARE_NS_PREFIX + path);
		if (children.length >= 1) {
			return (Element) children[0].getDomNode();
		}
		return null;
	}

	/** Returns the concatenated run text of the given paragraphs (without any separator). */
	private static String getPText(CTP...ps) {
		StringBuilder sb = new StringBuilder();
		for (CTP p : ps) {
			sb.append(getRText(p.getRArray()));
		}
		return sb.toString();
	}

	/**
	 * Returns the concatenated text of the given runs; {@link CTPTab} children are
	 * rendered as tab character.
	 */
	private static String getRText(CTR[] rs) {
		// This method is inspired by the text extraction algorithm in the XWPFParagraph constructor
		StringBuilder sb = new StringBuilder();
		for (CTR r : rs) {
			XmlCursor c = r.newCursor();
			try {
				c.selectPath("./*");
				while (c.toNextSelection()) {
					XmlObject o = c.getObject();
					if (o instanceof CTText) {
						sb.append(((CTText) o).getStringValue());
					}
					if (o instanceof CTPTab) {
						sb.append("\t");
					}
				}
			} finally {
				// cursors hold resources of the underlying XML store and must be released
				c.dispose();
			}
		}
		return sb.toString();
	}

	/** Null-safe access to the value of a {@link CTString}. */
	private static String getCTStringVal(CTString cts) {
		return (cts != null) ? cts.getVal() : null;
	}

	/** Returns the first element of the array, or {@code null} if the array is empty. */
	private static <T> T getFirst(T[] array) {
		return array.length >= 1 ? array[0] : null;
	}

	/**
	 * A {@link Date} which keeps the text displayed in the document and returns it from
	 * {@link #toString()}. Declared as static nested class (instead of an anonymous class)
	 * so that the instances handed out to callers do not reference the reader and its document.
	 */
	private static final class DisplayTextDate extends Date {

		private static final long serialVersionUID = 1L;

		private final String displayText;

		DisplayTextDate(long timeMillis, String displayText) {
			super(timeMillis);
			this.displayText = displayText;
		}

		@Override
		public String toString() {
			return displayText;
		}
	}

	/** Immutable holder for the extracted data of one structured document tag. */
	private static class StructuredDocumentTag {

		/** Friendly name ({@code <w:alias>}), may be {@code null}. */
		final String alias;
		/** Tag name ({@code <w:tag>}), may be {@code null}. */
		final String tagName;
		/** Prepared value; {@code null} for placeholders. */
		final Object value;
		/** Text content; {@code null} for placeholders. */
		final String text;

		StructuredDocumentTag(String alias, String tagName, Object value, String text) {
			this.alias = alias;
			this.tagName = tagName;
			this.value = value;
			this.text = text;
		}
	}
}
Tech Fingerprint

Alerts (8)

'java.util.Date' Maintainability Info: Prefer using the modern Java Time API (java.time.* classes like LocalDate, ZonedDateTime, Instant) introduced in Java 8 over the legacy java.util.Date and Calendar classes for better API design and thread safety.
23
Complexity hotspot; lines 247 to 253 (total complexity: 15)
247 248 249 250 251 252 253