PageRenderTime 3285ms CodeModel.GetById 43ms RepoModel.GetById 0ms app.codeStats 0ms

/nuclos-server/src/main/java/org/nuclos/server/common/ooxml/WordXMLReader.java

https://bitbucket.org/nuclos/nuclos
Java | 321 lines | 208 code | 35 blank | 78 comment | 58 complexity | b0149ee3f1d64724e7fbee233b264acb MD5 | raw file
Possible License(s): Apache-2.0
  1. //Copyright (C) 2010 Novabit Informationssysteme GmbH
  2. //
  3. //This file is part of Nuclos.
  4. //
  5. //Nuclos is free software: you can redistribute it and/or modify
  6. //it under the terms of the GNU Affero General Public License as published by
  7. //the Free Software Foundation, either version 3 of the License, or
  8. //(at your option) any later version.
  9. //
  10. //Nuclos is distributed in the hope that it will be useful,
  11. //but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. //GNU Affero General Public License for more details.
  14. //
  15. //You should have received a copy of the GNU Affero General Public License
  16. //along with Nuclos. If not, see <http://www.gnu.org/licenses/>.
  17. package org.nuclos.server.common.ooxml;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.util.ArrayList;
  21. import java.util.Date;
  22. import java.util.HashMap;
  23. import java.util.Iterator;
  24. import java.util.List;
  25. import java.util.Map;
  26. import javax.xml.datatype.DatatypeConfigurationException;
  27. import javax.xml.datatype.DatatypeFactory;
  28. import javax.xml.datatype.XMLGregorianCalendar;
  29. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  30. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  31. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
  32. import org.apache.poi.xwpf.usermodel.XWPFTable;
  33. import org.apache.xmlbeans.XmlCursor;
  34. import org.apache.xmlbeans.XmlObject;
  35. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
  36. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPTab;
  37. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
  38. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
  39. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock;
  40. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentBlock;
  41. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtContentRun;
  42. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtPr;
  43. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtRun;
  44. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTString;
  45. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
  46. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
  47. import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
  48. import org.slf4j.Logger;
  49. import org.slf4j.LoggerFactory;
  50. import org.w3c.dom.Element;
  51. import org.w3c.dom.NodeList;
  52. /**
  53. * Helper class providing simple read-only access for examining OOXML wordprocessing documents.
  54. *
  55. * <p>Implementation Note: This API is based on Apache POI 3.6 which only covers a very small subset
  56. * for "common use cases". In particular, structured document tags are not supported by POI 3.6 directly.
  57. * However, it is possible to access the underlying XML structure directly using precompiled XMLBeans
  58. * (packages starting with org.openxmlformats.schemas contain the XML Schema Definition (XSD) compiled
  59. * as XMLBeans). For details about the XML structure, see the ECMA-376 specification (in particular
  60. * [ECMA-376,2nd], part 1, 17.5.2).
  61. */
  62. public class WordXMLReader {
  63. private static final Logger LOG = LoggerFactory.getLogger(WordXMLReader.class);
  64. /** WordprocessingML namespace URI. */
  65. private static final String WORDPROCESSINGML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
  66. /** Namespace declaration for XMLBeans path selection. */
  67. private static final String DECLARE_NS_PREFIX = "declare namespace w='" + WORDPROCESSINGML_NS + "' ";
  68. private final XWPFDocument document;
  69. private List<StructuredDocumentTag> structuredDocumentTags;
  /**
   * Creates a reader for the wordprocessing document supplied by the given stream.
   * The stream is handed to the {@link XWPFDocument} constructor as is.
   *
   * @param is stream providing the OOXML document
   * @throws IOException if the {@link XWPFDocument} constructor reports one
   */
  public WordXMLReader(InputStream is) throws IOException {
    this(new XWPFDocument(is));
  }

  /**
   * Creates a reader on top of an already loaded document.
   *
   * @param document the POI document to examine
   */
  public WordXMLReader(XWPFDocument document) {
    this.document = document;
  }
  76. public String getText() {
  77. // Note: in POI 3.6, text extraction does not always work correctly
  78. return new XWPFWordExtractor(document).getText();
  79. }
  80. /**
  81. * Returns a map with the text content of the structured document tags
  82. * contained in this document (cf. [ECMA-376,2nd], 17.5.2) with their
  83. * tag name (17.5.2.42) as key. Supported tags are comboBox, date,
  84. * dropDownList, richText and text.
  85. * <p>
  86. * If the structured document tag is not filled, i.e. marked as
  87. * placeholder (17.5.2.25), the text content is {@code null}.
  88. * If the structured document tag does not provide a tag name, the
  89. * alias (aka friendly name, 17.5.2.1). If both are omitted, the
  90. * structured document tag is skipped.
  91. */
  92. public Map<String, String> getStructuredDocumentTagTexts() {
  93. Map<String, String> tags = new HashMap<String, String>();
  94. for (StructuredDocumentTag sdt : structuredDocumentTags()) {
  95. String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
  96. if (name != null && !tags.containsKey(name))
  97. tags.put(name, sdt.text);
  98. }
  99. return tags;
  100. }
  101. /**
  102. * Similar to {@link #getStructuredDocumentTagTexts()}, but returns
  103. * prepared values.
  104. * <p>
  105. * For combobox (17.5.2.5) and drop-down (17.5.2.15) elements,
  106. * the text content (=display text) is resolved against the given
  107. * list items (17.5.2.21/22) and replaced with its associated value.
  108. * For date elements (17.5.2.7), a {@link java.util.Date} object based
  109. * the cached full-date is returned ({@code toString} returns the
  110. * original string). If no full-date is stored, the original string
  111. * object is returned.
  112. */
  113. public Map<String, Object> getStructuredDocumentTagValues() {
  114. Map<String, Object> tags = new HashMap<String, Object>();
  115. for (StructuredDocumentTag sdt : structuredDocumentTags()) {
  116. String name = (sdt.tagName != null) ? sdt.tagName : sdt.alias;
  117. if (name != null && !tags.containsKey(name))
  118. tags.put(name, sdt.value);
  119. }
  120. return tags;
  121. }
  122. private List<StructuredDocumentTag> structuredDocumentTags() {
  123. if (structuredDocumentTags == null) {
  124. structuredDocumentTags = new ArrayList<StructuredDocumentTag>();
  125. // TODO: tables, am besten getBodyElements(), siehe JavaDoc...
  126. for (XWPFParagraph p : document.getParagraphs()) {
  127. extractStructuredDocumentTags(p.getCTP().getSdtArray());
  128. }
  129. Iterator<XWPFTable> tableIter = document.getTablesIterator();
  130. while(tableIter.hasNext()) {
  131. extractStructuredDocumentTags(tableIter.next());
  132. }
  133. }
  134. return structuredDocumentTags;
  135. }
  136. private void extractStructuredDocumentTags(XWPFTable t) {
  137. CTTbl table = t.getCTTbl();
  138. for (CTRow row : table.getTrArray()) {
  139. for (CTTc cell : row.getTcArray()) {
  140. extractStructuredDocumentTags(cell.getSdtArray());
  141. for (CTP ctp : cell.getPArray()) {
  142. extractStructuredDocumentTags(ctp.getSdtArray());
  143. }
  144. }
  145. }
  146. }
  147. private void extractStructuredDocumentTags(CTSdtRun[] sdtRuns) {
  148. for (CTSdtRun sdtRun : sdtRuns) {
  149. CTSdtPr sdtPr = sdtRun.getSdtPr();
  150. CTSdtContentRun sdtContent = sdtRun.getSdtContent();
  151. createStructuredDocumentTag(sdtPr, getRText(sdtContent.getRArray()));
  152. }
  153. }
  154. private void extractStructuredDocumentTags(CTSdtBlock[] sdtBlocks) {
  155. for (CTSdtBlock sdtRun : sdtBlocks) {
  156. CTSdtPr sdtPr = sdtRun.getSdtPr();
  157. CTSdtContentBlock sdtContent = sdtRun.getSdtContent();
  158. createStructuredDocumentTag(sdtPr, getPText(sdtContent.getPArray()));
  159. }
  160. }
  161. /**
  162. * A <w:sdt> element contains 2 child elements: <w:sdtPr> for the properties,
  163. * and <w:sdtContent> for content (here text).
  164. */
  165. private void createStructuredDocumentTag(CTSdtPr sdtPr, String text) {
  166. // The properties contain (among others) aliases (<w:alias>), tag names (w:tag)
  167. // and a flag (<w:showingPlcHdr>) whether the content is placeholder or real content.
  168. String alias = getCTStringVal(getFirst(sdtPr.getAliasArray()));
  169. String tagName = getCTStringVal(getFirst(sdtPr.getTagArray()));
  170. boolean isPlaceholder = sdtPr.getShowingPlcHdrArray().length > 0;
  171. Object value = null;
  172. // If placeholder is set, the element is not filled by the user
  173. if (!isPlaceholder) {
  174. value = text;
  175. // The following child element can occur and determine the type of the structured
  176. // document tag: equation, comboBox (*), date (*), docPartObj, docPartList,
  177. // dropDownList (*), picture, richText (*), text (*), citation, group, bibliography.
  178. // Note that we can't use the typed method (e.g sdtPr.getComboBoxArray()) here
  179. // because in the small (poi-)ooxml-schemas.jar bundled with POI, the specialized
  180. // classes (e.g. CTStdComboBox) are missing. Trying to use these methods will fail
  181. // with a NoClassDefFoundError exception (cf. POI FAQ).
  182. // But we can work with the plain XmlObjects or DOM nodes, if we extract them by
  183. // a generic path expression.
  184. Element sdtType;
  185. if ((sdtType = getFirstAsDomElement(sdtPr, "w:text")) != null
  186. || (sdtType = getFirstAsDomElement(sdtPr, "w:richText")) != null) {
  187. // Value is the text (in the case of richText without formatting)
  188. } else if ((sdtType = getFirstAsDomElement(sdtPr, "w:date")) != null) {
  189. // 17.5.2.7: fullDate contains the "full date and time last entered"
  190. // in XML Schema DateTime syntax
  191. String fullDate = sdtType.getAttributeNS(WORDPROCESSINGML_NS, "fullDate");
  192. if (fullDate != null) {
  193. XMLGregorianCalendar calendar;
  194. try {
  195. calendar = DatatypeFactory.newInstance().newXMLGregorianCalendar(fullDate);
  196. long timeMillis = calendar.toGregorianCalendar(null, null, null).getTimeInMillis();
  197. final String dateText = text;
  198. value = new Date(timeMillis) {
  199. @Override
  200. public String toString() {
  201. return dateText;
  202. };
  203. };
  204. } catch(DatatypeConfigurationException e) {
  205. LOG.warn("createStructuredDocumentTag failed: ", e);
  206. }
  207. }
  208. } else if ((sdtType = getFirstAsDomElement(sdtPr, "w:comboBox")) != null
  209. || (sdtType = getFirstAsDomElement(sdtPr, "w:dropDownList")) != null) {
  210. // 17.5.2.5 (comboBox), 17.5.2.15 (dropDownList)
  211. // Try to find the associated value with the extract text (if possible)
  212. NodeList listItems = sdtType.getElementsByTagNameNS(WORDPROCESSINGML_NS, "listItem");
  213. for (int i = 0, n = listItems.getLength(); i < n; i++) {
  214. Element listItem = (Element) listItems.item(i);
  215. String displayText = listItem.getAttributeNS(WORDPROCESSINGML_NS, "displayText");
  216. if (text.equals(displayText)) {
  217. value = listItem.getAttributeNS(WORDPROCESSINGML_NS, "value");
  218. break;
  219. }
  220. }
  221. } else if ((getFirstAsDomElement(sdtPr, "equation") != null)
  222. || (getFirstAsDomElement(sdtPr, "docPartObj") != null)
  223. || (getFirstAsDomElement(sdtPr, "docPartList") != null)
  224. || (getFirstAsDomElement(sdtPr, "picture") != null)
  225. || (getFirstAsDomElement(sdtPr, "citation") != null)
  226. || (getFirstAsDomElement(sdtPr, "group") != null)
  227. || (getFirstAsDomElement(sdtPr, "bibliography") != null)) {
  228. // ignore (unsupported type)
  229. return;
  230. } else {
  231. // type is unspecified, treat as text
  232. }
  233. }
  234. StructuredDocumentTag sdt = new StructuredDocumentTag(alias, tagName, value, text);
  235. structuredDocumentTags.add(sdt);
  236. }
  237. private static Element getFirstAsDomElement(XmlObject xmlObject, String path) {
  238. XmlObject[] children = xmlObject.selectPath(DECLARE_NS_PREFIX + path);
  239. if (children.length >= 1)
  240. return (Element) children[0].getDomNode();
  241. return null;
  242. }
  243. private static String getPText(CTP...ps) {
  244. StringBuilder sb = new StringBuilder();
  245. for (CTP p : ps) {
  246. sb.append(getRText(p.getRArray()));
  247. }
  248. return sb.toString();
  249. }
  250. private static String getRText(CTR[] rs) {
  251. // This method is inspired by the text extraction algorithm in the XWPFParagraph constructor
  252. StringBuilder sb = new StringBuilder();
  253. for (CTR r : rs) {
  254. XmlCursor c = r.newCursor();
  255. c.selectPath("./*");
  256. while (c.toNextSelection()) {
  257. XmlObject o = c.getObject();
  258. if (o instanceof CTText) {
  259. sb.append(((CTText) o).getStringValue());
  260. }
  261. if (o instanceof CTPTab) {
  262. sb.append("\t");
  263. }
  264. }
  265. }
  266. return sb.toString();
  267. }
  268. private static String getCTStringVal(CTString cts) {
  269. return (cts != null) ? cts.getVal() : null;
  270. }
  271. private static <T> T getFirst(T[] array) {
  272. return array.length >= 1 ? array[0] : null;
  273. }
  274. private static class StructuredDocumentTag {
  275. final String alias;
  276. final String tagName;
  277. final Object value;
  278. final String text;
  279. StructuredDocumentTag(String alias, String tagName, Object value, String text) {
  280. this.alias = alias;
  281. this.tagName = tagName;
  282. this.value = value;
  283. this.text = text;
  284. }
  285. }
  286. }