PageRenderTime 41ms CodeModel.GetById 13ms RepoModel.GetById 0ms app.codeStats 0ms

/DocumentClustering/src/indexerHouse/DocumentFileParsing.java

https://github.com/manoja328/DocumentClustering
Java | 51 lines | 34 code | 9 blank | 8 comment | 3 complexity | c9a5e1543a427274e0badeba4550d47f MD5 | raw file
  1. /*
  2. * To change this template, choose Tools | Templates
  3. * and open the template in the editor.
  4. */
  5. package indexerHouse;
  6. import org.apache.poi.poifs.filesystem.*;
  7. import org.apache.poi.hwpf.*;
  8. import org.apache.poi.hwpf.extractor.*;
  9. import java.io.*;
  10. import org.apache.poi.hslf.extractor.PowerPointExtractor;
  11. import org.apache.poi.hssf.extractor.ExcelExtractor;
  12. /**
  13. *
  14. * @author paradise lost
  15. */
  16. public class DocumentFileParsing {
  17. public String DocParse(String fileName) throws IOException
  18. {
  19. POIFSFileSystem fs = null;
  20. try{
  21. fs = new POIFSFileSystem(new FileInputStream(fileName));
  22. HWPFDocument doc = new HWPFDocument(fs);
  23. WordExtractor we = new WordExtractor(doc);
  24. if(fileName.endsWith(".xls"))
  25. {
  26. ExcelExtractor ex = new ExcelExtractor(fs);
  27. return ex.getText();
  28. }
  29. else if(fileName.endsWith(".ppt"))
  30. {
  31. PowerPointExtractor extractor = new PowerPointExtractor(fs);
  32. return extractor.getText();
  33. }
  34. return we.getText();
  35. }
  36. catch(Exception e)
  37. {
  38. System.out.println("document file cant be indexed");
  39. }
  40. return "done";
  41. }
  42. }