PageRenderTime 5280ms CodeModel.GetById 2ms RepoModel.GetById 0ms app.codeStats 0ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

https://github.com/solsson/tika
Java | 138 lines | 92 code | 17 blank | 29 comment | 20 complexity | dff73f83047fcc998beb71b2402c9412 MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft;
  18. import java.io.FileNotFoundException;
  19. import java.io.IOException;
  20. import java.util.List;
  21. import org.apache.poi.hwpf.HWPFDocument;
  22. import org.apache.poi.hwpf.model.PicturesTable;
  23. import org.apache.poi.hwpf.usermodel.Picture;
  24. import org.apache.poi.poifs.filesystem.DirectoryEntry;
  25. import org.apache.poi.poifs.filesystem.Entry;
  26. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  27. import org.apache.tika.exception.TikaException;
  28. import org.apache.tika.io.TikaInputStream;
  29. import org.apache.tika.parser.ParseContext;
  30. import org.apache.tika.sax.XHTMLContentHandler;
  31. import org.xml.sax.SAXException;
  32. public class WordExtractor extends AbstractPOIFSExtractor {
  33. public WordExtractor(ParseContext context) {
  34. super(context);
  35. }
  36. protected void parse(
  37. POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
  38. throws IOException, SAXException, TikaException {
  39. HWPFDocument document = new HWPFDocument(filesystem);
  40. org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
  41. new org.apache.poi.hwpf.extractor.WordExtractor(document);
  42. addTextIfAny(xhtml, "header", wordExtractor.getHeaderText());
  43. for (String paragraph : wordExtractor.getParagraphText()) {
  44. xhtml.element("p", paragraph);
  45. }
  46. for (String paragraph : wordExtractor.getFootnoteText()) {
  47. xhtml.element("p", paragraph);
  48. }
  49. for (String paragraph : wordExtractor.getCommentsText()) {
  50. xhtml.element("p", paragraph);
  51. }
  52. for (String paragraph : wordExtractor.getEndnoteText()) {
  53. xhtml.element("p", paragraph);
  54. }
  55. addTextIfAny(xhtml, "footer", wordExtractor.getFooterText());
  56. // Handle any embeded images
  57. PicturesTable pictureTable = document.getPicturesTable();
  58. if(pictureTable != null) {
  59. List<Picture> pictures = (List<Picture>)pictureTable.getAllPictures(); // TODO Generics fixed in newer version
  60. for(Picture picture : pictures) {
  61. // TODO When we have upgraded POI, we can use this code instead
  62. //String mimeType = picture.getMimeType();
  63. // This code is cut'n'paste from a newer version of POI
  64. String mimeType = "image/unknown";
  65. String extension = picture.suggestFileExtension();
  66. if("jpg".equals(extension)) {
  67. mimeType = "image/jpeg";
  68. }
  69. if("png".equals(extension)) {
  70. mimeType = "image/png";
  71. }
  72. if("gif".equals(extension)) {
  73. mimeType = "image/gif";
  74. }
  75. if("bmp".equals(extension)) {
  76. mimeType = "image/bmp";
  77. }
  78. if("tiff".equals(extension)) {
  79. mimeType = "image/tiff";
  80. }
  81. if("wmf".equals(extension)) {
  82. mimeType = "image/x-wmf";
  83. }
  84. if("emf".equals(extension)) {
  85. mimeType = "image/x-emf";
  86. }
  87. TikaInputStream stream = TikaInputStream.get(picture.getContent());
  88. handleEmbeddedResource(stream, null, mimeType, xhtml);
  89. }
  90. }
  91. // Handle any embeded office documents
  92. try {
  93. DirectoryEntry op =
  94. (DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
  95. for (Entry entry : op) {
  96. if (entry.getName().startsWith("_")
  97. && entry instanceof DirectoryEntry) {
  98. handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
  99. }
  100. }
  101. } catch(FileNotFoundException e) {
  102. }
  103. }
  104. /**
  105. * Outputs a section of text if the given text is non-empty.
  106. *
  107. * @param xhtml XHTML content handler
  108. * @param section the class of the &lt;div/&gt; section emitted
  109. * @param text text to be emitted, if any
  110. * @throws SAXException if an error occurs
  111. */
  112. private void addTextIfAny(
  113. XHTMLContentHandler xhtml, String section, String text)
  114. throws SAXException {
  115. if (text != null && text.length() > 0) {
  116. xhtml.startElement("div", "class", section);
  117. xhtml.element("p", text);
  118. xhtml.endElement("div");
  119. }
  120. }
  121. }