PageRenderTime 67ms CodeModel.GetById 29ms RepoModel.GetById 0ms app.codeStats 1ms

/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java

https://github.com/solsson/tika
Java | 105 lines | 73 code | 12 blank | 20 comment | 11 complexity | fec54de4521538d1d148242a35245b0e MD5 | raw file
Possible License(s): BSD-3-Clause, MPL-2.0, Apache-2.0
  1. /*
  2. * Licensed to the Apache Software Foundation (ASF) under one or more
  3. * contributor license agreements. See the NOTICE file distributed with
  4. * this work for additional information regarding copyright ownership.
  5. * The ASF licenses this file to You under the Apache License, Version 2.0
  6. * (the "License"); you may not use this file except in compliance with
  7. * the License. You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. package org.apache.tika.parser.microsoft.ooxml;
  18. import java.io.IOException;
  19. import java.util.ArrayList;
  20. import java.util.List;
  21. import org.apache.poi.openxml4j.opc.PackagePart;
  22. import org.apache.poi.xslf.XSLFSlideShow;
  23. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  24. import org.apache.poi.xslf.usermodel.XMLSlideShow;
  25. import org.apache.poi.xslf.usermodel.XSLFSlide;
  26. import org.apache.tika.sax.XHTMLContentHandler;
  27. import org.apache.xmlbeans.XmlException;
  28. import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
  29. import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
  30. import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
  31. import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
  32. import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
  33. import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
  34. import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
  35. import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
  36. import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
  37. import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
  38. import org.xml.sax.SAXException;
  39. public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
  40. public XSLFPowerPointExtractorDecorator(XSLFPowerPointExtractor extractor) {
  41. super(extractor, "application/vnd.openxmlformats-officedocument.presentationml.presentation");
  42. }
  43. /**
  44. * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
  45. */
  46. @Override
  47. protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
  48. XmlException, IOException {
  49. XSLFSlideShow slideShow = (XSLFSlideShow) extractor.getDocument();
  50. XMLSlideShow xmlSlideShow = new XMLSlideShow(slideShow);
  51. XSLFSlide[] slides = xmlSlideShow.getSlides();
  52. for (XSLFSlide slide : slides) {
  53. CTSlide rawSlide = slide._getCTSlide();
  54. CTSlideIdListEntry slideId = slide._getCTSlideId();
  55. CTNotesSlide notes = xmlSlideShow._getXSLFSlideShow().getNotes(
  56. slideId);
  57. CTCommentList comments = xmlSlideShow._getXSLFSlideShow()
  58. .getSlideComments(slideId);
  59. xhtml.startElement("div");
  60. extractShapeContent(rawSlide.getCSld().getSpTree(), xhtml);
  61. if (comments != null) {
  62. for (CTComment comment : comments.getCmArray()) {
  63. xhtml.element("p", comment.getText());
  64. }
  65. }
  66. if (notes != null) {
  67. extractShapeContent(notes.getCSld().getSpTree(), xhtml);
  68. }
  69. xhtml.endElement("div");
  70. }
  71. }
  72. private void extractShapeContent(CTGroupShape gs, XHTMLContentHandler xhtml)
  73. throws SAXException {
  74. CTShape[] shapes = gs.getSpArray();
  75. for (CTShape shape : shapes) {
  76. CTTextBody textBody = shape.getTxBody();
  77. if (textBody != null) {
  78. CTTextParagraph[] paras = textBody.getPArray();
  79. for (CTTextParagraph textParagraph : paras) {
  80. CTRegularTextRun[] textRuns = textParagraph.getRArray();
  81. for (CTRegularTextRun textRun : textRuns) {
  82. xhtml.element("p", textRun.getT());
  83. }
  84. }
  85. }
  86. }
  87. }
  88. @Override
  89. protected List<PackagePart> getMainDocumentParts() {
  90. // TODO
  91. return new ArrayList<PackagePart>();
  92. }
  93. }