PageRenderTime 6217ms CodeModel.GetById 27ms RepoModel.GetById 2ms app.codeStats 0ms

/src/main/java/org/olat/search/service/document/file/WordDocument.java

https://bitbucket.org/mg/olat
Java | 117 lines | 78 code | 13 blank | 26 comment | 14 complexity | a0741c6028ba5ef174d2c349231de80a MD5 | raw file
Possible License(s): LGPL-2.1, GPL-3.0, 0BSD, MPL-2.0-no-copyleft-exception, AGPL-3.0, Apache-2.0
  1. /**
  2. * OLAT - Online Learning and Training<br>
  3. * http://www.olat.org
  4. * <p>
  5. * Licensed under the Apache License, Version 2.0 (the "License"); <br>
  6. * you may not use this file except in compliance with the License.<br>
  7. * You may obtain a copy of the License at
  8. * <p>
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. * <p>
  11. * Unless required by applicable law or agreed to in writing,<br>
  12. * software distributed under the License is distributed on an "AS IS" BASIS, <br>
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
  14. * See the License for the specific language governing permissions and <br>
  15. * limitations under the License.
  16. * <p>
  17. * Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br>
  18. * University of Zurich, Switzerland.
  19. * <p>
  20. */
  21. package org.olat.search.service.document.file;
  22. import java.io.BufferedInputStream;
  23. import java.io.IOException;
  24. import java.util.Iterator;
  25. import org.apache.lucene.document.Document;
  26. import org.apache.poi.hwpf.extractor.WordExtractor;
  27. import org.apache.poi.poifs.filesystem.DocumentEntry;
  28. import org.apache.poi.poifs.filesystem.Entry;
  29. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  30. import org.olat.core.logging.OLog;
  31. import org.olat.core.logging.Tracing;
  32. import org.olat.core.util.vfs.VFSLeaf;
  33. import org.olat.search.service.SearchResourceContext;
  34. /**
  35. * Lucene document mapper.
  36. *
  37. * @author Christian Guretzki
  38. */
  39. public class WordDocument extends FileDocument {
  40. private static final OLog log = Tracing.createLoggerFor(WordOOXMLDocument.class);
  41. public final static String FILE_TYPE = "type.file.word";
  42. public WordDocument() {
  43. super();
  44. }
  45. public static Document createDocument(final SearchResourceContext leafResourceContext, final VFSLeaf leaf) throws IOException, DocumentException,
  46. DocumentAccessException {
  47. final WordDocument wordDocument = new WordDocument();
  48. wordDocument.init(leafResourceContext, leaf);
  49. wordDocument.setFileType(FILE_TYPE);
  50. wordDocument.setCssIcon("b_filetype_doc");
  51. if (log.isDebug()) {
  52. log.debug(wordDocument.toString());
  53. }
  54. return wordDocument.getLuceneDocument();
  55. }
  56. @Override
  57. protected String readContent(final VFSLeaf leaf) throws IOException, DocumentException {
  58. BufferedInputStream bis = null;
  59. final StringBuilder sb = new StringBuilder();
  60. try {
  61. bis = new BufferedInputStream(leaf.getInputStream());
  62. final POIFSFileSystem filesystem = new POIFSFileSystem(bis);
  63. final Iterator<?> entries = filesystem.getRoot().getEntries();
  64. while (entries.hasNext()) {
  65. final Entry entry = (Entry) entries.next();
  66. final String name = entry.getName();
  67. if (!(entry instanceof DocumentEntry)) {
  68. // Skip directory entries
  69. } else if ("WordDocument".equals(name)) {
  70. collectWordDocument(filesystem, sb);
  71. }
  72. }
  73. return sb.toString();
  74. } catch (final Exception e) {
  75. throw new DocumentException(e.getMessage());
  76. } finally {
  77. if (bis != null) {
  78. bis.close();
  79. }
  80. }
  81. }
  82. private void collectWordDocument(final POIFSFileSystem filesystem, final StringBuilder sb) throws IOException {
  83. final WordExtractor extractor = new WordExtractor(filesystem);
  84. addTextIfAny(sb, extractor.getHeaderText());
  85. for (final String paragraph : extractor.getParagraphText()) {
  86. sb.append(paragraph).append(' ');
  87. }
  88. for (final String paragraph : extractor.getFootnoteText()) {
  89. sb.append(paragraph).append(' ');
  90. }
  91. for (final String paragraph : extractor.getCommentsText()) {
  92. sb.append(paragraph).append(' ');
  93. }
  94. for (final String paragraph : extractor.getEndnoteText()) {
  95. sb.append(paragraph).append(' ');
  96. }
  97. addTextIfAny(sb, extractor.getFooterText());
  98. }
  99. private void addTextIfAny(final StringBuilder sb, final String text) {
  100. if (text != null && text.length() > 0) {
  101. sb.append(text).append(' ');
  102. }
  103. }
  104. }