PageRenderTime 88ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/src/java/axiom/util/TextExtractor.java

https://github.com/SorinS/axiom-stack
Java | 93 lines | 60 code | 11 blank | 22 comment | 7 complexity | f15b52799630b1d8e39906fafdf7b9a8 MD5 | raw file
  1. /*
  2. * Axiom Stack Web Application Framework
  3. * Copyright (C) 2008 Axiom Software Inc.
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as
  7. * published by the Free Software Foundation, either version 3 of the
  8. * License, or (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with this program. If not, see <http://www.gnu.org/licenses/>.
  17. *
  18. * Axiom Software Inc., 11480 Commerce Park Drive, Third Floor, Reston, VA 20191 USA
  19. * email: info@axiomsoftwareinc.com
  20. */
  21. package axiom.util;
  22. import java.io.InputStream;
  23. import java.util.Iterator;
  24. import org.apache.poi.hslf.extractor.QuickButCruddyTextExtractor;
  25. import org.apache.poi.hssf.usermodel.HSSFCell;
  26. import org.apache.poi.hssf.usermodel.HSSFRow;
  27. import org.apache.poi.hssf.usermodel.HSSFSheet;
  28. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  29. import org.apache.poi.hwpf.extractor.WordExtractor;
  30. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  31. import org.pdfbox.pdmodel.PDDocument;
  32. import org.pdfbox.util.PDFTextStripper;
  33. public class TextExtractor {
  34. public static String msExcelExtractor(InputStream is) throws Exception {
  35. POIFSFileSystem fs = new POIFSFileSystem(is);
  36. HSSFWorkbook wb = new HSSFWorkbook(fs);
  37. StringBuffer sb = new StringBuffer();
  38. final int numSheets = wb.getNumberOfSheets();
  39. for (int k = 0; k < numSheets; k++) {
  40. HSSFSheet sheet = wb.getSheetAt(k);
  41. Iterator rIt = sheet.rowIterator();
  42. while (rIt.hasNext()) {
  43. HSSFRow row = (HSSFRow) rIt.next();
  44. Iterator cIt = row.cellIterator();
  45. while (cIt.hasNext()) {
  46. HSSFCell cell = (HSSFCell) cIt.next();
  47. sb.append(cell.toString()).append(" ");
  48. }
  49. }
  50. }
  51. return sb.toString();
  52. }
  53. public static String msWordExtractor(InputStream is) throws Exception {
  54. WordExtractor we = new WordExtractor(is);
  55. return we.getText();
  56. }
  57. public static String msPowerPointExtractor(InputStream is) throws Exception{
  58. QuickButCruddyTextExtractor qbcte = new QuickButCruddyTextExtractor(is);
  59. return qbcte.getTextAsString();
  60. }
  61. public static String adobePDFExtractor(InputStream is) throws Exception {
  62. PDDocument doc = null;
  63. String pdfStr = null;
  64. try {
  65. doc = PDDocument.load(is);
  66. if (doc.isEncrypted()) {
  67. //not sure how to handle encrypted
  68. //unaccessable pdf document
  69. pdfStr = null;
  70. } else {
  71. PDFTextStripper stripper = new PDFTextStripper();
  72. pdfStr = stripper.getText(doc);
  73. }
  74. } finally {
  75. if (doc != null) {
  76. doc.close();
  77. doc = null;
  78. }
  79. }
  80. return pdfStr;
  81. }
  82. }