PageRenderTime 7539ms CodeModel.GetById 144ms RepoModel.GetById 7ms app.codeStats 0ms

/src/main/java/org/docx4j/convert/in/Doc.java

http://github.com/plutext/docx4j
Java | 350 lines | 173 code | 93 blank | 84 comment | 32 complexity | 7e4ab85cefb0bb7ea0f70affee45a7e6 MD5 | raw file
Possible License(s): Apache-2.0
  1. /*
  2. * Copyright 2007-2008, Plutext Pty Ltd.
  3. *
  4. * This file is part of docx4j.
  5. docx4j is licensed under the Apache License, Version 2.0 (the "License");
  6. you may not use this file except in compliance with the License.
  7. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. */
  15. package org.docx4j.convert.in;
  16. import java.io.FileInputStream;
  17. import org.apache.log4j.Logger;
  18. import org.apache.poi.hwpf.HWPFDocument;
  19. import org.apache.poi.hwpf.usermodel.CharacterRun;
  20. import org.apache.poi.hwpf.usermodel.Paragraph;
  21. import org.apache.poi.hwpf.usermodel.Range;
  22. import org.apache.poi.hwpf.usermodel.Section;
  23. import org.apache.poi.hwpf.usermodel.Table;
  24. import org.apache.poi.hwpf.usermodel.TableCell;
  25. import org.apache.poi.hwpf.usermodel.TableRow;
  26. import org.docx4j.openpackaging.io.SaveToZipFile;
  27. import org.docx4j.openpackaging.packages.WordprocessingMLPackage;
  28. import org.docx4j.openpackaging.parts.WordprocessingML.MainDocumentPart;
  29. /**
  30. * @author jason
  31. *
  32. */
  33. public class Doc {
  34. private static Logger log = Logger.getLogger(Doc.class);
  35. /**
  36. * @param in doc file
  37. * @return new WordprocessingMLPackage containing the results of
  38. * the conversion
  39. * @throws Exception
  40. */
  41. public static WordprocessingMLPackage convert(FileInputStream in)
  42. throws Exception {
  43. HWPFDocument doc = new HWPFDocument(in);
  44. WordprocessingMLPackage out = WordprocessingMLPackage.createPackage();
  45. convert(doc, out);
  46. return out;
  47. }
  48. // public static boolean convert(FileInputStream in, WordprocessingMLPackage out) throws Exception {
  49. // HWPFDocument doc = new HWPFDocument(in);
  50. // return convert(doc, out);
  51. // }
  52. // public static boolean convert(org.apache.commons.vfs.FileObject in,
  53. // WordprocessingMLPackage out) throws Exception {
  54. /**
  55. * This method is private, since the fact that conversion is (currently)
  56. * performed using POI's HWPF should be encapsulated.
  57. *
  58. * @param doc
  59. * @param wordMLPackage
  60. * @return success or failure
  61. */
  62. private static void convert(HWPFDocument doc,
  63. WordprocessingMLPackage wordMLPackage) throws Exception {
  64. // Convert styles
  65. org.apache.poi.hwpf.model.StyleSheet stylesheet = doc.getStyleSheet();
  66. // TODO - higher priority
  67. // At present, a default set of styles are defined in the output
  68. // document.
  69. // Convert lists
  70. org.apache.poi.hwpf.model.ListTables listTables = doc.getListTables();
  71. // TODO
  72. // Convert document properties
  73. org.apache.poi.hwpf.model.DocumentProperties docProps = doc.getDocProperties();
  74. // TODO
  75. // Convert main document part
  76. MainDocumentPart documentPart = wordMLPackage.getMainDocumentPart();
  77. org.docx4j.wml.ObjectFactory factory = new org.docx4j.wml.ObjectFactory();
  78. Range r = doc.getRange();
  79. for (int x = 0; x < r.numSections(); x++) {
  80. Section s = r.getSection(x);
  81. // TODO - convert section
  82. for (int y = 0; y < s.numParagraphs(); y++) {
  83. Paragraph p = s.getParagraph(y);
  84. if (p.isInTable()) {
  85. Table t = s.getTable(p);
  86. int cl = numCol(t);
  87. log.info("Found " + t.numRows() + "x" + cl
  88. + " table - TODO - convert");
  89. handleTable(t, stylesheet, documentPart, factory);
  90. // addTODO(factory, wmlP, "[TABLE " + + t.numRows() + "x" + cl
  91. // + " - can't convert tables yet]");
  92. y += t.numParagraphs() - 1;
  93. } else {
  94. org.docx4j.wml.P paraToAdd = handleP(p, stylesheet,
  95. documentPart, factory);
  96. documentPart.addObject(paraToAdd);
  97. }
  98. }
  99. }
  100. }
  101. private static org.docx4j.wml.P handleP(Paragraph p,
  102. org.apache.poi.hwpf.model.StyleSheet stylesheet,
  103. MainDocumentPart documentPart,
  104. org.docx4j.wml.ObjectFactory factory) {
  105. org.docx4j.wml.P wmlP = null;
  106. if (p.getStyleIndex() > 0) {
  107. log.debug("Styled paragraph, with index: " + p.getStyleIndex());
  108. String styleName = stylesheet
  109. .getStyleDescription(p.getStyleIndex()).getName();
  110. log.debug(styleName);
  111. wmlP = documentPart.createStyledParagraphOfText( stripSpace(styleName), null);
  112. } else {
  113. wmlP = documentPart.createParagraphOfText(null);
  114. }
  115. for (int z = 0; z < p.numCharacterRuns(); z++) {
  116. // character run
  117. CharacterRun run = p.getCharacterRun(z);
  118. // No character styles defined in there??
  119. org.docx4j.wml.RPr rPr = null;
  120. if (run.isBold()) {
  121. // TODO - HIGH PRIORITY- handle other run properties
  122. // esp underline, font size
  123. if (rPr == null) {
  124. rPr = factory.createRPr();
  125. }
  126. org.docx4j.wml.BooleanDefaultTrue boldOn = factory.createBooleanDefaultTrue();
  127. boldOn.setVal( Boolean.TRUE);
  128. rPr.setB(boldOn);
  129. }
  130. // character run text
  131. String text = run.text();
  132. // show us the text
  133. log.debug("Processing: " + text);
  134. String cleansed = stripNonValidXMLCharacters(text);
  135. // Necessary to avoid org.xml.sax.SAXParseException: An invalid XML character
  136. // (Unicode: 0xb) was found in the element content of the document.
  137. // when trying to open the resulting docx.
  138. // ie JAXB happily writes (marshals) it, but doesn't want to
  139. // unmarshall.
  140. if (!text.equals(cleansed)) {
  141. log.warn("Cleansed..");
  142. }
  143. org.docx4j.wml.Text t = factory.createText();
  144. t.setValue(cleansed);
  145. org.docx4j.wml.R wmlRun = factory.createR();
  146. if (rPr!=null) {
  147. wmlRun.setRPr(rPr);
  148. }
  149. wmlRun.getRunContent().add(t);
  150. wmlP.getParagraphContent().add(wmlRun);
  151. }
  152. return wmlP;
  153. }
  154. private static String stripSpace(String in) {
  155. StringBuffer sb = new StringBuffer();
  156. for (int i = 0; i < in.length(); i++) {
  157. if (in.charAt(i) != ' ') {
  158. sb.append(in.charAt(i));
  159. }
  160. }
  161. return sb.toString();
  162. }
  163. private static void addTODO(org.docx4j.wml.ObjectFactory factory,
  164. org.docx4j.wml.P wmlP, String message) {
  165. org.docx4j.wml.Text t = factory.createText();
  166. t.setValue(message);
  167. org.docx4j.wml.R wmlRun = factory.createR();
  168. wmlRun.getRunContent().add(t);
  169. wmlP.getParagraphContent().add(wmlRun);
  170. }
  171. /**
  172. * This method ensures that the output String has only
  173. * valid XML unicode characters as specified by the
  174. * XML 1.0 standard. For reference, please see
  175. * <a href="http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char">the
  176. * standard</a>. This method will return an empty
  177. * String if the input is null or empty.
  178. *
  179. * See http://cse-mjmcl.cse.bris.ac.uk/blog/2007/02/14/1171465494443.html
  180. *
  181. * @param in The String whose non-valid characters we want to remove.
  182. * @return The in String, stripped of non-valid characters.
  183. */
  184. public static String stripNonValidXMLCharacters(String in) {
  185. StringBuffer out = new StringBuffer(); // Used to hold the output.
  186. char current; // Used to reference the current character.
  187. if (in == null || ("".equals(in))) return ""; // vacancy test.
  188. for (int i = 0; i < in.length(); i++) {
  189. current = in.charAt(i); // NOTE: No IndexOutOfBoundsException caught here; it should not happen.
  190. if ((current == 0x9) ||
  191. (current == 0xA) ||
  192. (current == 0xD) ||
  193. ((current >= 0x20) && (current <= 0xD7FF)) ||
  194. ((current >= 0xE000) && (current <= 0xFFFD)) ||
  195. ((current >= 0x10000) && (current <= 0x10FFFF))) {
  196. out.append(current);
  197. } else {
  198. out.append("[#?]");
  199. }
  200. }
  201. return out.toString();
  202. }
  203. private static int numCol(Table t) {
  204. int col = 0;
  205. for (int i = 0; i < t.numRows(); i++) {
  206. if (t.getRow(i).numCells() > col)
  207. col = t.getRow(i).numCells();
  208. }
  209. return col;
  210. }
  211. private static void handleTable(Table t,
  212. org.apache.poi.hwpf.model.StyleSheet stylesheet,
  213. MainDocumentPart documentPart,
  214. org.docx4j.wml.ObjectFactory factory) {
  215. org.docx4j.wml.Tbl tbl = factory.createTbl();
  216. documentPart.addObject(tbl);
  217. org.docx4j.wml.TblPr tblPr = factory.createTblPr();
  218. tbl.setTblPr(tblPr);
  219. // TODO - set tblPr values
  220. org.docx4j.wml.TblGrid tblGrid = factory.createTblGrid();
  221. tbl.setTblGrid(tblGrid);
  222. // TODO - set tblGrid values
  223. for (int i = 0; i < t.numRows(); i++) {
  224. TableRow tr = t.getRow(i);
  225. org.docx4j.wml.Tr trOut = factory.createTr();
  226. tbl.getEGContentRowContent().add(trOut);
  227. for (int j = 0; j < tr.numCells(); j++) {
  228. TableCell tc = tr.getCell(j);
  229. org.docx4j.wml.Tc tcOut = factory.createTc();
  230. trOut.getEGContentCellContent().add(tcOut);
  231. //System.out.println("CELL[" + i + "][" + j + "]=" + tc.text());
  232. for (int y = 0; y < tc.numParagraphs(); y++) {
  233. Paragraph p = tc.getParagraph(y);
  234. // Nested tables?
  235. // if (p.isInTable()) ???
  236. org.docx4j.wml.P paraToAdd = handleP(p, stylesheet,
  237. documentPart, factory);
  238. tcOut.getEGBlockLevelElts().add(paraToAdd);
  239. log.debug("Added p to tc");
  240. }
  241. }
  242. }
  243. }
  244. public static void main(String[] args) throws Exception {
  245. String localPath = "/home/dev/TargetFeatureSet.doc";
  246. WordprocessingMLPackage out = convert(new FileInputStream(localPath));
  247. String outputfilepath = "/home/dev/tmp/test-out.docx";
  248. SaveToZipFile saver = new SaveToZipFile(out);
  249. saver.save(outputfilepath);
  250. log.info("Done - saved docx as " + outputfilepath);
  251. }
  252. }