DocumentHelper.java | searchcode

/src/com/jgaap/generics/DocumentHelper.java

http://github.com/evllabs/JGAAP
Java | 203 lines | 117 code | 20 blank | 66 comment | 32 complexity | 8bd2f482161cbb20b2fe2b3f678da332 MD5 | raw file

/*
 * JGAAP -- a graphical program for stylometric authorship attribution
 * Copyright (C) 2009,2011 by Patrick Juola
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package com.jgaap.generics;

import java.io.*;
import java.net.URL;

import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;


/**
 * A helper class for Document that handles all the different ways documents
 * can be loaded.
 * <p>
 * The source of a document is chosen from the prefix of its path (HTTP(S)
 * URL, bundled resource under <code>/com/jgaap/resources</code>, or local
 * file) and the extraction strategy is chosen from its suffix (PDF, Word
 * .doc/.docx, HTML, or plain text).
 * 
 * @author Michael Ryan
 * @since 5.0.0
 */

class DocumentHelper {
	
	static Logger logger = Logger.getLogger(com.jgaap.generics.DocumentHelper.class);

	/** Number of chars requested from the reader per call when streaming text. */
	private static final int READ_BUFFER_SIZE = 8192;

	/**
	 * Loads the document at <code>filepath</code> and returns its text.
	 * 
	 * @param filepath
	 *            an <code>http://</code> or <code>https://</code> URL, a
	 *            resource path starting with <code>/com/jgaap/resources</code>,
	 *            or a path on the local file system
	 * @param charset
	 *            name of the charset used to decode plain text and HTML;
	 *            <code>null</code> or empty selects the platform default.
	 *            PDF and Word documents are decoded by their own extractors
	 *            and do not use this value.
	 * @return the extracted text of the document
	 * @throws IOException
	 *             if the source cannot be opened or read (a missing bundled
	 *             resource is reported as a {@link FileNotFoundException})
	 * @throws BadLocationException
	 *             if the text of an HTML document cannot be retrieved
	 */
	static char[] loadDocument(String filepath, String charset) throws IOException, BadLocationException {
		InputStream is;
		// -1 means "size unknown"; kept as long so files over 2 GiB cannot
		// wrap around to a bogus (possibly negative) int.
		long fileSize = -1;
		if (filepath.startsWith("http://") || filepath.startsWith("https://")) {
			URL url = new URL(filepath);
			is = url.openStream();
		} else if (filepath.startsWith("/com/jgaap/resources")){
			is = com.jgaap.JGAAP.class.getResourceAsStream(filepath);
			if (is == null) {
				// getResourceAsStream signals a missing resource with null;
				// fail here with a clear message instead of a later NPE.
				throw new FileNotFoundException("Resource not found: " + filepath);
			}
		} else {
			fileSize = new File(filepath).length();
			is = new FileInputStream(filepath);
		}
		try {
			if (filepath.endsWith(".pdf")) {
				return loadPDF(is);
			} else if (filepath.endsWith(".doc")) {
				return loadMSWord(is);
			} else if (filepath.endsWith(".docx")){
				return loadMSWordDocx(is);
			} else if (filepath.endsWith(".htm") || filepath.endsWith(".html")) {
				return loadHTML(is, charset);
			} else if (fileSize < 0 || fileSize > Integer.MAX_VALUE) {
				// Unknown size, or too large to pre-size an array with.
				return readText(is, charset);
			} else {
				return readText(is, charset, (int) fileSize);
			}
		} finally {
			// Every branch is done with the stream once it returns or throws.
			closeQuietly(is);
		}
	}

	/**
	 * Classifies a document by the suffix of its path.
	 * 
	 * @param filepath
	 *            path or URL of the document
	 * @return {@link DocType#PDF} for <code>.pdf</code>, {@link DocType#DOC}
	 *         for <code>.doc</code> and <code>.docx</code>,
	 *         {@link DocType#HTML} for <code>.htm</code> and
	 *         <code>.html</code>, otherwise {@link DocType#GENERIC}
	 */
	static DocType getDocType(String filepath) {
		if (filepath.endsWith(".pdf")) {
			return DocType.PDF;
		} else if (filepath.endsWith(".doc")||filepath.endsWith(".docx")) {
			return DocType.DOC;
		} else if (filepath.endsWith(".htm") || filepath.endsWith(".html")) {
			return DocType.HTML;
		} else {
			return DocType.GENERIC;
		}
	}

	/**
	 * Extracts the text of a PDF. Takes an input stream rather than a file
	 * name.
	 * 
	 * @param filesInputStream
	 *            An input stream pointing to a PDF file.
	 * @return the text produced by the PDF text stripper
	 * @throws IOException
	 *             if the PDF cannot be parsed or its text cannot be extracted
	 */
	static private char[] loadPDF(InputStream filesInputStream)
			throws IOException {
		PDDocument doc = PDDocument.load(filesInputStream);
		try {
			PDFTextStripper pdfStripper = new PDFTextStripper();
			pdfStripper.setSortByPosition(false);
			return pdfStripper.getText(doc).toCharArray();
		} finally {
			// Release the parsed document even when extraction fails.
			doc.close();
		}
	}

	/**
	 * Extracts the text of an HTML document. Charset directives inside the
	 * document are ignored; the bytes are decoded with <code>charset</code>.
	 * 
	 * @param filesInputStream
	 *            An input stream pointing to the HTML document to be read.
	 * @param charset
	 *            name of the charset used to decode the stream;
	 *            <code>null</code> or empty selects the platform default
	 * @return the text content of the parsed document
	 * @throws BadLocationException
	 *             if the document text cannot be retrieved
	 * @throws IOException
	 *             if the stream cannot be read or the charset is unsupported
	 */
	static private char[] loadHTML(InputStream filesInputStream, String charset)
			throws IOException, BadLocationException {
		EditorKit kit = new HTMLEditorKit();
		HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
		doc.putProperty("IgnoreCharsetDirective", true);
		// Hand the kit a Reader so the requested charset is honoured; the
		// InputStream overload always decodes with the platform default.
		kit.read(newReader(filesInputStream, charset), doc, 0);
		return doc.getText(0, doc.getLength()).toCharArray();
	}

	/**
	 * Extracts the text of a binary Word (.doc) document.
	 * 
	 * @param inputStream
	 *            An input stream pointing to the Word document to be read.
	 * @return the text reported by the Word extractor
	 * @throws IOException
	 *             if the document cannot be read
	 */
	static private char[] loadMSWord(InputStream inputStream)
			throws IOException {
		POIFSFileSystem fs = new POIFSFileSystem(inputStream);
		HWPFDocument doc = new HWPFDocument(fs);
		WordExtractor we = new WordExtractor(doc);
		return we.getText().toCharArray();
	}
	
	/**
	 * Extracts the text of an Office Open XML Word (.docx) document.
	 * 
	 * @param inputStream
	 *            An input stream pointing to the Word document to be read.
	 * @return the text reported by the Word extractor
	 * @throws IOException
	 *             if the document cannot be read
	 */
	static private char[] loadMSWordDocx(InputStream inputStream) throws IOException{
		XWPFDocument docx = new XWPFDocument(inputStream);
		XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
		return extractor.getText().toCharArray();
	}
	
	/**
	 * Reads all text from a stream whose size in bytes is known in advance.
	 * <p>
	 * <code>length</code> is used to pre-size the buffer. The returned array
	 * holds exactly the characters that were decoded: it is shorter than
	 * <code>length</code> when the encoding uses several bytes per character,
	 * and longer if the stream turns out to hold more than expected.
	 * 
	 * @param is
	 *            the stream to read; it is closed before this method returns
	 * @param charset
	 *            charset name, or <code>null</code>/empty for the platform
	 *            default
	 * @param length
	 *            expected size of the stream in bytes; must not be negative
	 * @return the decoded text
	 * @throws IOException
	 *             if reading fails or the charset is unsupported
	 **/
	static private char[] readText(InputStream is, String charset, int length) throws IOException{
		Reader reader = newReader(is, charset);
		try {
			char[] text = new char[length];
			int filled = 0;
			// A single read() call may return fewer chars than requested,
			// so keep reading until the array is full or the stream ends.
			while (filled < length) {
				int count = reader.read(text, filled, length - filled);
				if (count == -1) {
					break;
				}
				filled += count;
			}
			if (filled < length) {
				// Fewer chars than bytes (multi-byte encoding): trim the
				// unused tail so callers never see '\0' padding.
				char[] trimmed = new char[filled];
				System.arraycopy(text, 0, trimmed, 0, filled);
				return trimmed;
			}
			int next = reader.read();
			if (next == -1) {
				return text;
			}
			// More content than the reported size, e.g. the file grew or its
			// size was misreported. Keep everything instead of truncating.
			logger.warn("Document contained more characters than its reported size; the remainder was read as well.");
			StringBuilder stringBuilder = new StringBuilder();
			stringBuilder.append(text, 0, filled).append((char) next);
			drain(reader, stringBuilder);
			return stringBuilder.toString().toCharArray();
		} finally {
			reader.close();
		}
	}
	
	/**
	 * Reads all text from a stream of unknown size.
	 * 
	 * @param is
	 *            the stream to read; it is closed before this method returns
	 * @param charset
	 *            charset name, or <code>null</code>/empty for the platform
	 *            default
	 * @return the decoded text
	 * @throws IOException
	 *             if reading fails or the charset is unsupported
	 **/
	static private char[] readText(InputStream is, String charset) throws IOException {
		Reader reader = newReader(is, charset);
		try {
			StringBuilder stringBuilder = new StringBuilder();
			drain(reader, stringBuilder);
			return stringBuilder.toString().toCharArray();
		} finally {
			reader.close();
		}
	}

	/**
	 * Wraps a stream in a reader that decodes with <code>charset</code>, or
	 * with the platform default when <code>charset</code> is null or empty.
	 */
	static private Reader newReader(InputStream is, String charset) throws IOException {
		if (charset == null || charset.isEmpty()) {
			return new InputStreamReader(is);
		}
		return new InputStreamReader(is, charset);
	}

	/** Appends everything left in <code>reader</code> to <code>sink</code>. */
	static private void drain(Reader reader, StringBuilder sink) throws IOException {
		char[] buffer = new char[READ_BUFFER_SIZE];
		int count;
		while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
			sink.append(buffer, 0, count);
		}
	}

	/**
	 * Closes a stream, logging instead of throwing, so that a failure to
	 * close never hides the result or the exception of the actual load.
	 */
	static private void closeQuietly(InputStream is) {
		try {
			is.close();
		} catch (IOException e) {
			logger.warn("Failed to close document stream", e);
		}
	}

}