PageRenderTime 6188ms CodeModel.GetById 39ms RepoModel.GetById 7ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java

https://github.com/solsson/tika
Java | 594 lines | 351 code | 65 blank | 178 comment | 56 complexity | 6b55090236799f49ee0f5bd7d54bbbac MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft;
  18. import java.awt.Point;
  19. import java.io.IOException;
  20. import java.text.NumberFormat;
  21. import java.util.ArrayList;
  22. import java.util.Comparator;
  23. import java.util.List;
  24. import java.util.Locale;
  25. import java.util.Map;
  26. import java.util.SortedMap;
  27. import java.util.TreeMap;
  28. import org.apache.poi.ddf.EscherBSERecord;
  29. import org.apache.poi.ddf.EscherBitmapBlip;
  30. import org.apache.poi.ddf.EscherBlipRecord;
  31. import org.apache.poi.ddf.EscherMetafileBlip;
  32. import org.apache.poi.ddf.EscherRecord;
  33. import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
  34. import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
  35. import org.apache.poi.hssf.eventusermodel.HSSFListener;
  36. import org.apache.poi.hssf.eventusermodel.HSSFRequest;
  37. import org.apache.poi.hssf.record.AbstractEscherHolderRecord;
  38. import org.apache.poi.hssf.record.BOFRecord;
  39. import org.apache.poi.hssf.record.BoundSheetRecord;
  40. import org.apache.poi.hssf.record.CellValueRecordInterface;
  41. import org.apache.poi.hssf.record.CountryRecord;
  42. import org.apache.poi.hssf.record.DateWindow1904Record;
  43. import org.apache.poi.hssf.record.DrawingGroupRecord;
  44. import org.apache.poi.hssf.record.EOFRecord;
  45. import org.apache.poi.hssf.record.ExtendedFormatRecord;
  46. import org.apache.poi.hssf.record.FormatRecord;
  47. import org.apache.poi.hssf.record.FormulaRecord;
  48. import org.apache.poi.hssf.record.HyperlinkRecord;
  49. import org.apache.poi.hssf.record.LabelRecord;
  50. import org.apache.poi.hssf.record.LabelSSTRecord;
  51. import org.apache.poi.hssf.record.NumberRecord;
  52. import org.apache.poi.hssf.record.RKRecord;
  53. import org.apache.poi.hssf.record.Record;
  54. import org.apache.poi.hssf.record.SSTRecord;
  55. import org.apache.poi.hssf.record.TextObjectRecord;
  56. import org.apache.poi.hssf.record.chart.SeriesTextRecord;
  57. import org.apache.poi.hssf.record.common.UnicodeString;
  58. import org.apache.poi.hssf.usermodel.HSSFPictureData;
  59. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  60. import org.apache.poi.poifs.filesystem.DocumentInputStream;
  61. import org.apache.poi.poifs.filesystem.Entry;
  62. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  63. import org.apache.tika.exception.TikaException;
  64. import org.apache.tika.io.TikaInputStream;
  65. import org.apache.tika.parser.ParseContext;
  66. import org.apache.tika.sax.XHTMLContentHandler;
  67. import org.xml.sax.SAXException;
  68. /**
  69. * Excel parser implementation which uses POI's Event API
  70. * to handle the contents of a Workbook.
  71. * <p>
  72. * The Event API uses a much smaller memory footprint than
  73. * <code>HSSFWorkbook</code> when processing excel files
  74. * but at the cost of more complexity.
  75. * <p>
  76. * With the Event API a <i>listener</i> is registered for
  77. * specific record types and those records are created,
  78. * fired off to the listener and then discarded as the stream
  79. * is being processed.
  80. *
  81. * @see org.apache.poi.hssf.eventusermodel.HSSFListener
  82. * @see <a href="http://poi.apache.org/hssf/how-to.html#event_api">
  83. * POI Event API How To</a>
  84. */
  85. public class ExcelExtractor extends AbstractPOIFSExtractor {
  86. /**
  87. * <code>true</code> if the HSSFListener should be registered
  88. * to listen for all records or <code>false</code> (the default)
  89. * if the listener should be configured to only receive specified
  90. * records.
  91. */
  92. private boolean listenForAllRecords = false;
  /**
   * Creates an extractor that works with the given parse context.
   *
   * @param context parse context handed on to the superclass constructor
   */
  public ExcelExtractor(ParseContext context) {
      super(context);
  }
  96. /**
  97. * Returns <code>true</code> if this parser is configured to listen
  98. * for all records instead of just the specified few.
  99. */
  100. public boolean isListenForAllRecords() {
  101. return listenForAllRecords;
  102. }
  103. /**
  104. * Specifies whether this parser should to listen for all
  105. * records or just for the specified few.
  106. * <p>
  107. * <strong>Note:</strong> Under normal operation this setting should
  108. * be <code>false</code> (the default), but you can experiment with
  109. * this setting for testing and debugging purposes.
  110. *
  111. * @param listenForAllRecords <code>true</code> if the HSSFListener
  112. * should be registered to listen for all records or <code>false</code>
  113. * if the listener should be configured to only receive specified records.
  114. */
  115. public void setListenForAllRecords(boolean listenForAllRecords) {
  116. this.listenForAllRecords = listenForAllRecords;
  117. }
  118. /**
  119. * Extracts text from an Excel Workbook writing the extracted content
  120. * to the specified {@link Appendable}.
  121. *
  122. * @param filesystem POI file system
  123. * @throws IOException if an error occurs processing the workbook
  124. * or writing the extracted content
  125. */
  126. protected void parse(
  127. POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
  128. Locale locale) throws IOException, SAXException, TikaException {
  129. TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
  130. listener.processFile(filesystem, isListenForAllRecords());
  131. listener.throwStoredException();
  132. for (Entry entry : filesystem.getRoot()) {
  133. if (entry.getName().startsWith("MBD")
  134. && entry instanceof DirectoryEntry) {
  135. try {
  136. handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
  137. } catch (TikaException e) {
  138. // ignore parse errors from embedded documents
  139. }
  140. }
  141. }
  142. }
  143. // ======================================================================
  144. /**
  145. * HSSF Listener implementation which processes the HSSF records.
  146. */
  147. private static class TikaHSSFListener implements HSSFListener {
  148. /**
  149. * XHTML content handler to which the document content is rendered.
  150. */
  151. private final XHTMLContentHandler handler;
  152. /**
  153. * The POIFS Extractor, used for embedded resources.
  154. */
  155. private final AbstractPOIFSExtractor extractor;
  156. /**
  157. * Potential exception thrown by the content handler. When set to
  158. * non-<code>null</code>, causes all subsequent HSSF records to be
  159. * ignored and the stored exception to be thrown when
  160. * {@link #throwStoredException()} is invoked.
  161. */
  162. private Exception exception = null;
  163. private SSTRecord sstRecord;
  164. private short previousSid;
  165. /**
  166. * Internal <code>FormatTrackingHSSFListener</code> to handle cell
  167. * formatting within the extraction.
  168. */
  169. private FormatTrackingHSSFListener formatListener;
  170. /**
  171. * List of worksheet names.
  172. */
  173. private List<String> sheetNames = new ArrayList<String>();
  174. /**
  175. * Index of the current worksheet within the workbook.
  176. * Used to find the worksheet name in the {@link #sheetNames} list.
  177. */
  178. private short currentSheetIndex;
  179. /**
  180. * Content of the current worksheet, or <code>null</code> if no
  181. * worksheet is currently active.
  182. */
  183. private SortedMap<Point, Cell> currentSheet = null;
  184. /**
  185. * Extra text or cells that crop up, typically as part of a
  186. * worksheet but not always.
  187. */
  188. private List<Cell> extraTextCells = new ArrayList<Cell>();
  189. /**
  190. * Format for rendering numbers in the worksheet. Currently we just
  191. * use the platform default formatting.
  192. *
  193. * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
  194. */
  195. private final NumberFormat format;
  196. /**
  197. * These aren't complete when we first see them, as they
  198. * depend on continue records that aren't always
  199. * contiguous. Collect them for later processing.
  200. */
  201. private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
  /**
   * Builds a listener that renders parsed data to the given XHTML
   * content handler.
   *
   * @param handler destination to write the parsed output to
   * @param locale locale used for the number format and for the
   *        wrapping format-tracking listener
   * @param extractor extractor that receives embedded resources
   */
  private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
      this.extractor = extractor;
      this.handler = handler;
      this.format = NumberFormat.getInstance(locale);
      // The format tracker wraps this listener and forwards records to it
      this.formatListener = new FormatTrackingHSSFListener(this, locale);
  }
  214. /**
  215. * Entry point to listener to start the processing of a file.
  216. *
  217. * @param filesystem POI file system.
  218. * @param listenForAllRecords sets whether the listener is configured to listen
  219. * for all records types or not.
  220. * @throws IOException on any IO errors.
  221. * @throws SAXException on any SAX parsing errors.
  222. */
  223. public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
  224. throws IOException, SAXException, TikaException {
  225. // Set up listener and register the records we want to process
  226. HSSFRequest hssfRequest = new HSSFRequest();
  227. if (listenForAllRecords) {
  228. hssfRequest.addListenerForAllRecords(formatListener);
  229. } else {
  230. hssfRequest.addListener(formatListener, BOFRecord.sid);
  231. hssfRequest.addListener(formatListener, EOFRecord.sid);
  232. hssfRequest.addListener(formatListener, DateWindow1904Record.sid);
  233. hssfRequest.addListener(formatListener, CountryRecord.sid);
  234. hssfRequest.addListener(formatListener, BoundSheetRecord.sid);
  235. hssfRequest.addListener(formatListener, SSTRecord.sid);
  236. hssfRequest.addListener(formatListener, FormulaRecord.sid);
  237. hssfRequest.addListener(formatListener, LabelRecord.sid);
  238. hssfRequest.addListener(formatListener, LabelSSTRecord.sid);
  239. hssfRequest.addListener(formatListener, NumberRecord.sid);
  240. hssfRequest.addListener(formatListener, RKRecord.sid);
  241. hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
  242. hssfRequest.addListener(formatListener, TextObjectRecord.sid);
  243. hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
  244. hssfRequest.addListener(formatListener, FormatRecord.sid);
  245. hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
  246. hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
  247. }
  248. // Create event factory and process Workbook (fire events)
  249. DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
  250. HSSFEventFactory eventFactory = new HSSFEventFactory();
  251. eventFactory.processEvents(hssfRequest, documentInputStream);
  252. // Output any extra text that came after all the sheets
  253. processExtraText();
  254. // Look for embeded images, now that the drawing records
  255. // have been fully matched with their continue data
  256. for(DrawingGroupRecord dgr : drawingGroups) {
  257. dgr.decode();
  258. findPictures(dgr.getEscherRecords());
  259. }
  260. }
  261. /**
  262. * Process a HSSF record.
  263. *
  264. * @param record HSSF Record
  265. */
  266. public void processRecord(Record record) {
  267. if (exception == null) {
  268. try {
  269. internalProcessRecord(record);
  270. } catch (TikaException te) {
  271. exception = te;
  272. } catch (IOException ie) {
  273. exception = ie;
  274. } catch (SAXException se) {
  275. exception = se;
  276. }
  277. }
  278. }
  279. public void throwStoredException() throws TikaException, SAXException, IOException {
  280. if (exception != null) {
  281. if(exception instanceof IOException)
  282. throw (IOException)exception;
  283. if(exception instanceof SAXException)
  284. throw (SAXException)exception;
  285. if(exception instanceof TikaException)
  286. throw (TikaException)exception;
  287. throw new TikaException(exception.getMessage());
  288. }
  289. }
  290. private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
  291. switch (record.getSid()) {
  292. case BOFRecord.sid: // start of workbook, worksheet etc. records
  293. BOFRecord bof = (BOFRecord) record;
  294. if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
  295. currentSheetIndex = -1;
  296. } else if (bof.getType() == BOFRecord.TYPE_CHART) {
  297. if(previousSid == EOFRecord.sid) {
  298. // This is a sheet which contains only a chart
  299. newSheet();
  300. } else {
  301. // This is a chart within a normal sheet
  302. // Handling of this is a bit hacky...
  303. if (currentSheet != null) {
  304. processSheet();
  305. currentSheetIndex--;
  306. newSheet();
  307. }
  308. }
  309. } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
  310. newSheet();
  311. }
  312. break;
  313. case EOFRecord.sid: // end of workbook, worksheet etc. records
  314. if (currentSheet != null) {
  315. processSheet();
  316. }
  317. currentSheet = null;
  318. break;
  319. case BoundSheetRecord.sid: // Worksheet index record
  320. BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
  321. sheetNames.add(boundSheetRecord.getSheetname());
  322. break;
  323. case SSTRecord.sid: // holds all the strings for LabelSSTRecords
  324. sstRecord = (SSTRecord) record;
  325. break;
  326. case FormulaRecord.sid: // Cell value from a formula
  327. FormulaRecord formula = (FormulaRecord) record;
  328. addCell(record, new NumberCell(formula.getValue(), format));
  329. break;
  330. case LabelRecord.sid: // strings stored directly in the cell
  331. LabelRecord label = (LabelRecord) record;
  332. addTextCell(record, label.getValue());
  333. break;
  334. case LabelSSTRecord.sid: // Ref. a string in the shared string table
  335. LabelSSTRecord sst = (LabelSSTRecord) record;
  336. UnicodeString unicode = sstRecord.getString(sst.getSSTIndex());
  337. addTextCell(record, unicode.getString());
  338. break;
  339. case NumberRecord.sid: // Contains a numeric cell value
  340. NumberRecord number = (NumberRecord) record;
  341. addTextCell(record, formatListener.formatNumberDateCell(number));
  342. break;
  343. case RKRecord.sid: // Excel internal number record
  344. RKRecord rk = (RKRecord) record;
  345. addCell(record, new NumberCell(rk.getRKNumber(), format));
  346. break;
  347. case HyperlinkRecord.sid: // holds a URL associated with a cell
  348. if (currentSheet != null) {
  349. HyperlinkRecord link = (HyperlinkRecord) record;
  350. Point point =
  351. new Point(link.getFirstColumn(), link.getFirstRow());
  352. Cell cell = currentSheet.get(point);
  353. if (cell != null) {
  354. addCell(record, new LinkedCell(cell, link.getAddress()));
  355. }
  356. }
  357. break;
  358. case TextObjectRecord.sid:
  359. TextObjectRecord tor = (TextObjectRecord) record;
  360. addTextCell(record, tor.getStr().getString());
  361. break;
  362. case SeriesTextRecord.sid: // Chart label or title
  363. SeriesTextRecord str = (SeriesTextRecord) record;
  364. addTextCell(record, str.getText());
  365. break;
  366. case DrawingGroupRecord.sid:
  367. // Collect this now, we'll process later when all
  368. // the continue records are in
  369. drawingGroups.add( (DrawingGroupRecord)record );
  370. break;
  371. }
  372. previousSid = record.getSid();
  373. }
  374. private void processExtraText() throws SAXException {
  375. if(extraTextCells.size() > 0) {
  376. for(Cell cell : extraTextCells) {
  377. handler.startElement("div", "class", "outside");
  378. cell.render(handler);
  379. handler.endElement("div");
  380. }
  381. // Reset
  382. extraTextCells.clear();
  383. }
  384. }
  385. /**
  386. * Adds the given cell (unless <code>null</code>) to the current
  387. * worksheet (if any) at the position (if any) of the given record.
  388. *
  389. * @param record record that holds the cell value
  390. * @param cell cell value (or <code>null</code>)
  391. */
  392. private void addCell(Record record, Cell cell) throws SAXException {
  393. if (cell == null) {
  394. // Ignore empty cells
  395. } else if (currentSheet != null
  396. && record instanceof CellValueRecordInterface) {
  397. // Normal cell inside a worksheet
  398. CellValueRecordInterface value =
  399. (CellValueRecordInterface) record;
  400. Point point = new Point(value.getColumn(), value.getRow());
  401. currentSheet.put(point, cell);
  402. } else {
  403. // Cell outside the worksheets
  404. extraTextCells.add(cell);
  405. }
  406. }
  407. /**
  408. * Adds a text cell with the given text comment. The given text
  409. * is trimmed, and ignored if <code>null</code> or empty.
  410. *
  411. * @param record record that holds the text value
  412. * @param text text content, may be <code>null</code>
  413. * @throws SAXException
  414. */
  415. private void addTextCell(Record record, String text) throws SAXException {
  416. if (text != null) {
  417. text = text.trim();
  418. if (text.length() > 0) {
  419. addCell(record, new TextCell(text));
  420. }
  421. }
  422. }
  423. private void newSheet() {
  424. currentSheetIndex++;
  425. currentSheet = new TreeMap<Point, Cell>(new PointComparator());
  426. }
  427. /**
  428. * Process an excel sheet.
  429. *
  430. * @throws SAXException if an error occurs
  431. */
  432. private void processSheet() throws SAXException {
  433. // Sheet Start
  434. handler.startElement("div", "class", "page");
  435. if (currentSheetIndex < sheetNames.size()) {
  436. handler.element("h1", sheetNames.get(currentSheetIndex));
  437. }
  438. handler.startElement("table");
  439. handler.startElement("tbody");
  440. // Process Rows
  441. int currentRow = 0;
  442. int currentColumn = 0;
  443. handler.startElement("tr");
  444. handler.startElement("td");
  445. for (Map.Entry<Point, Cell> entry : currentSheet.entrySet()) {
  446. while (currentRow < entry.getKey().y) {
  447. handler.endElement("td");
  448. handler.endElement("tr");
  449. handler.startElement("tr");
  450. handler.startElement("td");
  451. currentRow++;
  452. currentColumn = 0;
  453. }
  454. while (currentColumn < entry.getKey().x) {
  455. handler.endElement("td");
  456. handler.startElement("td");
  457. currentColumn++;
  458. }
  459. entry.getValue().render(handler);
  460. }
  461. handler.endElement("td");
  462. handler.endElement("tr");
  463. // Sheet End
  464. handler.endElement("tbody");
  465. handler.endElement("table");
  466. // Finish up
  467. processExtraText();
  468. handler.endElement("div");
  469. }
  470. private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
  471. for(EscherRecord escherRecord : records) {
  472. if (escherRecord instanceof EscherBSERecord) {
  473. EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
  474. if (blip != null) {
  475. // TODO When we have upgraded POI, we can use this code instead
  476. //HSSFPictureData picture = new HSSFPictureData(blip);
  477. //String mimeType = picture.getMimeType();
  478. //TikaInputStream stream = TikaInputStream.get(picture.getData());
  479. // This code is cut'n'paste from a newer version of POI
  480. String mimeType = "";
  481. switch (blip.getRecordId()) {
  482. case EscherMetafileBlip.RECORD_ID_WMF:
  483. mimeType = "image/x-wmf";
  484. break;
  485. case EscherMetafileBlip.RECORD_ID_EMF:
  486. mimeType = "image/x-emf";
  487. break;
  488. case EscherMetafileBlip.RECORD_ID_PICT:
  489. mimeType = "image/x-pict";
  490. break;
  491. case EscherBitmapBlip.RECORD_ID_PNG:
  492. mimeType = "image/png";
  493. break;
  494. case EscherBitmapBlip.RECORD_ID_JPEG:
  495. mimeType = "image/jpeg";
  496. break;
  497. case EscherBitmapBlip.RECORD_ID_DIB:
  498. mimeType = "image/bmp";
  499. break;
  500. default:
  501. mimeType = "image/unknown";
  502. break;
  503. }
  504. TikaInputStream stream = TikaInputStream.get(blip.getPicturedata());
  505. // Handle the embeded resource
  506. extractor.handleEmbeddedResource(
  507. stream, null, mimeType,
  508. handler
  509. );
  510. }
  511. }
  512. // Recursive call.
  513. findPictures(escherRecord.getChildRecords());
  514. }
  515. }
  516. }
  517. /**
  518. * Utility comparator for points.
  519. */
  520. private static class PointComparator implements Comparator<Point> {
  521. public int compare(Point a, Point b) {
  522. int diff = a.y - b.y;
  523. if (diff == 0) {
  524. diff = a.x - b.x;
  525. }
  526. return diff;
  527. }
  528. }
  529. }