PageRenderTime 51ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/src/com/jgaap/generics/DocumentHelper.java

http://github.com/evllabs/JGAAP
Java | 203 lines | 117 code | 20 blank | 66 comment | 32 complexity | 8bd2f482161cbb20b2fe2b3f678da332 MD5 | raw file
  1. /*
  2. * JGAAP -- a graphical program for stylometric authorship attribution
  3. * Copyright (C) 2009,2011 by Patrick Juola
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as
  7. * published by the Free Software Foundation, either version 3 of the
  8. * License, or (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. */
  18. package com.jgaap.generics;
  19. import java.io.*;
  20. import java.net.URL;
  21. import javax.swing.text.BadLocationException;
  22. import javax.swing.text.EditorKit;
  23. import javax.swing.text.html.HTMLDocument;
  24. import javax.swing.text.html.HTMLEditorKit;
  25. import org.apache.log4j.Logger;
  26. import org.apache.pdfbox.pdmodel.PDDocument;
  27. import org.apache.pdfbox.util.PDFTextStripper;
  28. import org.apache.poi.hwpf.HWPFDocument;
  29. import org.apache.poi.hwpf.extractor.WordExtractor;
  30. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  31. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  32. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  33. /**
  34. *
  35. * A helper class for Document that handles all the different ways documents can be loaded
  36. *
  37. * @author Michael Ryan
  38. * @since 5.0.0
  39. */
  40. class DocumentHelper {
  41. static Logger logger = Logger.getLogger(com.jgaap.generics.DocumentHelper.class);
  42. static char[] loadDocument(String filepath, String charset) throws IOException, BadLocationException {
  43. InputStream is;
  44. int fileSize = -1;
  45. if (filepath.startsWith("http://") || filepath.startsWith("https://")) {
  46. URL url = new URL(filepath);
  47. is = url.openStream();
  48. } else if (filepath.startsWith("/com/jgaap/resources")){
  49. is = com.jgaap.JGAAP.class.getResourceAsStream(filepath);
  50. } else {
  51. fileSize = (int) new File(filepath).length();
  52. is = new FileInputStream(filepath);
  53. }
  54. if (filepath.endsWith(".pdf")) {
  55. return loadPDF(is);
  56. } else if (filepath.endsWith(".doc")) {
  57. return loadMSWord(is);
  58. } else if (filepath.endsWith(".docx")){
  59. return loadMSWordDocx(is);
  60. } else if (filepath.endsWith(".htm") || filepath.endsWith(".html")) {
  61. return loadHTML(is);
  62. } else {
  63. if(fileSize==-1)
  64. return readText(is, charset);
  65. else
  66. return readText(is, charset, fileSize);
  67. }
  68. }
  69. static DocType getDocType(String filepath) {
  70. if (filepath.endsWith(".pdf")) {
  71. return DocType.PDF;
  72. } else if (filepath.endsWith(".doc")||filepath.endsWith(".docx")) {
  73. return DocType.DOC;
  74. } else if (filepath.endsWith(".htm") || filepath.endsWith(".html")) {
  75. return DocType.HTML;
  76. } else {
  77. return DocType.GENERIC;
  78. }
  79. }
  80. /**
  81. * Extracts text from a PDF and stores it in the document. Takes an input
  82. * stream rather than a file name.
  83. *
  84. * @param filesInputStream
  85. * An input stream pointing to a PDF file.
  86. * @throws IOException
  87. */
  88. static private char[] loadPDF(InputStream filesInputStream)
  89. throws IOException {
  90. PDDocument doc;
  91. doc = PDDocument.load(filesInputStream);
  92. PDFTextStripper pdfStripper = new PDFTextStripper();
  93. pdfStripper.setSortByPosition(false);
  94. char[] origText = pdfStripper.getText(doc).toCharArray();
  95. doc.close();
  96. return origText;
  97. }
  98. /**
  99. * Extracts text from an HTML document and stores it in the document.
  100. *
  101. * @param filesInputStream
  102. * An input stream pointing to the HTML document to be read.
  103. * @throws BadLocationException
  104. * @throws IOException
  105. */
  106. static private char[] loadHTML(InputStream filesInputStream)
  107. throws IOException, BadLocationException {
  108. EditorKit kit = new HTMLEditorKit();
  109. HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
  110. doc.putProperty("IgnoreCharsetDirective", true);
  111. kit.read(filesInputStream, doc, 0);
  112. char[] origText = doc.getText(0, doc.getLength()).toCharArray();
  113. return origText;
  114. }
  115. /**
  116. * Extracts text from a Word document and stores it in the document.
  117. *
  118. * @param inputStream
  119. * An input stream pointing to the Word document to be read.
  120. * @throws IOException
  121. */
  122. static private char[] loadMSWord(InputStream inputStream)
  123. throws IOException {
  124. POIFSFileSystem fs = new POIFSFileSystem(inputStream);
  125. HWPFDocument doc = new HWPFDocument(fs);
  126. WordExtractor we = new WordExtractor(doc);
  127. char[] origText = we.getText().toCharArray();
  128. return origText;
  129. }
  130. /**
  131. * Extracts text from a Word document and stores it in the document.
  132. *
  133. * @param inputStream
  134. * An input stream pointing to the Word document to be read.
  135. * @throws IOException
  136. */
  137. static private char[] loadMSWordDocx(InputStream inputStream) throws IOException{
  138. XWPFDocument docx = new XWPFDocument(inputStream);
  139. XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
  140. return extractor.getText().toCharArray();
  141. }
  142. /**
  143. * Reads text from a local file. The raw text of
  144. * the file is stored for quick access in an array.
  145. *
  146. * @throws IOException
  147. **/
  148. static private char[] readText(InputStream is, String charset, int length) throws IOException{
  149. Reader reader;
  150. if(charset==null || charset.isEmpty()){
  151. reader = new InputStreamReader(is);
  152. } else {
  153. reader = new InputStreamReader(is, charset);
  154. }
  155. char[] text = new char[length];
  156. int status = reader.read(text);
  157. if(status < length || reader.read() != -1)
  158. logger.warn("Possibility Document too large to stor into memory or using utf-8 with utf-16 chars in it.");
  159. reader.close();
  160. return text;
  161. }
  162. /**
  163. * Reads text from a local file. The raw text of
  164. * the file is stored for quick access in an array.
  165. *
  166. * @throws IOException
  167. **/
  168. static private char[] readText(InputStream is, String charset) throws IOException {
  169. int c;
  170. StringBuilder stringBuilder = new StringBuilder();
  171. BufferedReader reader;
  172. if (charset==null||charset.isEmpty()) {
  173. reader = new BufferedReader(new InputStreamReader(is));
  174. } else {
  175. reader = new BufferedReader(new InputStreamReader(is,charset));
  176. }
  177. while ((c = reader.read()) != -1) {
  178. stringBuilder.append((char)c);
  179. }
  180. reader.close();
  181. return stringBuilder.toString().toCharArray();
  182. }
  183. }