/src/java/axiom/util/TextExtractor.java
Java | 93 lines | 60 code | 11 blank | 22 comment | 7 complexity | f15b52799630b1d8e39906fafdf7b9a8 MD5 | raw file
- /*
- * Axiom Stack Web Application Framework
- * Copyright (C) 2008 Axiom Software Inc.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as
- * published by the Free Software Foundation, either version 3 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- * Axiom Software Inc., 11480 Commerce Park Drive, Third Floor, Reston, VA 20191 USA
- * email: info@axiomsoftwareinc.com
- */
- package axiom.util;
-
- import java.io.InputStream;
- import java.util.Iterator;
-
- import org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.pdfbox.pdmodel.PDDocument;
- import org.pdfbox.util.PDFTextStripper;
-
- public class TextExtractor {
-
- public static String msExcelExtractor(InputStream is) throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(is);
- HSSFWorkbook wb = new HSSFWorkbook(fs);
- StringBuffer sb = new StringBuffer();
-
- final int numSheets = wb.getNumberOfSheets();
- for (int k = 0; k < numSheets; k++) {
- HSSFSheet sheet = wb.getSheetAt(k);
- Iterator rIt = sheet.rowIterator();
- while (rIt.hasNext()) {
- HSSFRow row = (HSSFRow) rIt.next();
- Iterator cIt = row.cellIterator();
- while (cIt.hasNext()) {
- HSSFCell cell = (HSSFCell) cIt.next();
- sb.append(cell.toString()).append(" ");
- }
- }
- }
-
- return sb.toString();
- }
-
- public static String msWordExtractor(InputStream is) throws Exception {
- WordExtractor we = new WordExtractor(is);
- return we.getText();
- }
-
- public static String msPowerPointExtractor(InputStream is) throws Exception{
- QuickButCruddyTextExtractor qbcte = new QuickButCruddyTextExtractor(is);
- return qbcte.getTextAsString();
- }
-
- public static String adobePDFExtractor(InputStream is) throws Exception {
- PDDocument doc = null;
- String pdfStr = null;
- try {
- doc = PDDocument.load(is);
- if (doc.isEncrypted()) {
- //not sure how to handle encrypted
- //unaccessable pdf document
- pdfStr = null;
- } else {
- PDFTextStripper stripper = new PDFTextStripper();
- pdfStr = stripper.getText(doc);
- }
- } finally {
- if (doc != null) {
- doc.close();
- doc = null;
- }
- }
-
- return pdfStr;
- }
-
- }