PageRenderTime 71ms CodeModel.GetById 33ms RepoModel.GetById 0ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java

https://github.com/solsson/tika
Java | 207 lines | 146 code | 29 blank | 32 comment | 25 complexity | cbe6aea6e2c67bdf72e6eb288fadfc2d MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft.ooxml;
  18. import java.io.IOException;
  19. import java.util.ArrayList;
  20. import java.util.Iterator;
  21. import java.util.List;
  22. import java.util.Locale;
  23. import org.apache.poi.hssf.extractor.ExcelExtractor;
  24. import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
  25. import org.apache.poi.openxml4j.opc.PackagePart;
  26. import org.apache.poi.openxml4j.opc.PackagePartName;
  27. import org.apache.poi.openxml4j.opc.PackageRelationship;
  28. import org.apache.poi.openxml4j.opc.PackagingURIHelper;
  29. import org.apache.poi.openxml4j.opc.TargetMode;
  30. import org.apache.poi.ss.usermodel.Cell;
  31. import org.apache.poi.ss.usermodel.CellStyle;
  32. import org.apache.poi.ss.usermodel.Comment;
  33. import org.apache.poi.ss.usermodel.DataFormatter;
  34. import org.apache.poi.ss.usermodel.HeaderFooter;
  35. import org.apache.poi.ss.usermodel.Row;
  36. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  37. import org.apache.poi.xssf.usermodel.XSSFCell;
  38. import org.apache.poi.xssf.usermodel.XSSFRelation;
  39. import org.apache.poi.xssf.usermodel.XSSFSheet;
  40. import org.apache.poi.xssf.usermodel.XSSFWorkbook;
  41. import org.apache.tika.sax.XHTMLContentHandler;
  42. import org.apache.tika.metadata.Metadata;
  43. import org.apache.tika.metadata.TikaMetadataKeys;
  44. import org.apache.tika.exception.TikaException;
  45. import org.apache.xmlbeans.XmlException;
  46. import org.xml.sax.SAXException;
  47. public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
  48. /**
  49. * Internal <code>DataFormatter</code> for formatting Numbers.
  50. */
  51. private final DataFormatter formatter;
  52. private final XSSFExcelExtractor extractor;
  53. private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
  54. public XSSFExcelExtractorDecorator(
  55. XSSFExcelExtractor extractor, Locale locale) {
  56. super(extractor, TYPE);
  57. this.extractor = extractor;
  58. formatter = new DataFormatter(locale);
  59. }
  60. /**
  61. * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
  62. */
  63. @Override
  64. protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
  65. XmlException, IOException {
  66. XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
  67. for (int i = 0; i < document.getNumberOfSheets(); i++) {
  68. xhtml.startElement("div");
  69. XSSFSheet sheet = (XSSFSheet) document.getSheetAt(i);
  70. xhtml.element("h1", document.getSheetName(i));
  71. // Header(s), if present
  72. extractHeaderFooter(sheet.getFirstHeader(), xhtml);
  73. extractHeaderFooter(sheet.getOddHeader(), xhtml);
  74. extractHeaderFooter(sheet.getEvenHeader(), xhtml);
  75. xhtml.startElement("table");
  76. xhtml.startElement("tbody");
  77. // Rows and cells
  78. for (Object rawR : sheet) {
  79. xhtml.startElement("tr");
  80. Row row = (Row) rawR;
  81. for (Iterator<Cell> ri = row.cellIterator(); ri.hasNext();) {
  82. xhtml.startElement("td");
  83. Cell cell = ri.next();
  84. int type = cell.getCellType();
  85. if (type == Cell.CELL_TYPE_FORMULA) {
  86. type = cell.getCachedFormulaResultType();
  87. }
  88. if (type == Cell.CELL_TYPE_STRING) {
  89. xhtml.characters(cell.getRichStringCellValue()
  90. .getString());
  91. } else if (type == Cell.CELL_TYPE_NUMERIC) {
  92. CellStyle style = cell.getCellStyle();
  93. xhtml.characters(
  94. formatter.formatRawCellContents(cell.getNumericCellValue(),
  95. style.getDataFormat(),
  96. style.getDataFormatString()));
  97. } else {
  98. XSSFCell xc = (XSSFCell) cell;
  99. String rawValue = xc.getRawValue();
  100. if (rawValue != null) {
  101. xhtml.characters(rawValue);
  102. }
  103. }
  104. // Output the comment in the same cell as the content
  105. Comment comment = cell.getCellComment();
  106. if (comment != null) {
  107. xhtml.characters(comment.getString().getString());
  108. }
  109. xhtml.endElement("td");
  110. }
  111. xhtml.endElement("tr");
  112. }
  113. xhtml.endElement("tbody");
  114. xhtml.endElement("table");
  115. // Finally footer(s), if present
  116. extractHeaderFooter(sheet.getFirstFooter(), xhtml);
  117. extractHeaderFooter(sheet.getOddFooter(), xhtml);
  118. extractHeaderFooter(sheet.getEvenFooter(), xhtml);
  119. xhtml.endElement("div");
  120. }
  121. }
  122. private void extractHeaderFooter(HeaderFooter hf, XHTMLContentHandler xhtml)
  123. throws SAXException {
  124. String content = ExcelExtractor._extractHeaderFooter(hf);
  125. if (content.length() > 0) {
  126. xhtml.element("p", content);
  127. }
  128. }
  129. /**
  130. * In Excel files, sheets have things embedded in them,
  131. * and sheet drawings which have the images
  132. */
  133. @Override
  134. protected List<PackagePart> getMainDocumentParts() throws TikaException {
  135. List<PackagePart> parts = new ArrayList<PackagePart>();
  136. XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
  137. for(XSSFSheet sheet : document) {
  138. PackagePart part = sheet.getPackagePart();
  139. // Add the sheet
  140. parts.add(part);
  141. // If it has drawings, return those too
  142. try {
  143. for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
  144. if(rel.getTargetMode() == TargetMode.INTERNAL) {
  145. PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
  146. parts.add( rel.getPackage().getPart(relName) );
  147. }
  148. }
  149. for(PackageRelationship rel : part.getRelationshipsByType(XSSFRelation.VML_DRAWINGS.getRelation())) {
  150. if(rel.getTargetMode() == TargetMode.INTERNAL) {
  151. PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
  152. parts.add( rel.getPackage().getPart(relName) );
  153. }
  154. }
  155. } catch(InvalidFormatException e) {
  156. throw new TikaException("Broken OOXML file", e);
  157. }
  158. }
  159. return parts;
  160. }
  161. @Override
  162. public MetadataExtractor getMetadataExtractor() {
  163. return new MetadataExtractor(extractor, TYPE) {
  164. @Override
  165. public void extract(Metadata metadata) throws TikaException {
  166. super.extract(metadata);
  167. metadata.set(TikaMetadataKeys.PROTECTED, "false");
  168. XSSFWorkbook document = (XSSFWorkbook) extractor.getDocument();
  169. for (int i = 0; i < document.getNumberOfSheets(); i++) {
  170. XSSFSheet sheet = document.getSheetAt(i);
  171. if (sheet.getProtect()) {
  172. metadata.set(TikaMetadataKeys.PROTECTED, "true");
  173. }
  174. }
  175. }
  176. };
  177. }
  178. }