PageRenderTime 6810ms CodeModel.GetById 28ms RepoModel.GetById 0ms app.codeStats 1ms

/reader-server/src/com/topway/reader/server/lucene/IndexFiles.java

https://github.com/cheft/fast-reader
Java | 287 lines | 263 code | 24 blank | 0 comment | 26 complexity | 8e716c0d4bbe4c605731725e169386b5 MD5 | raw file
  1. package com.topway.reader.server.lucene;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.FileReader;
  6. import java.io.IOException;
  7. import java.util.Date;
  8. import net.paoding.analysis.analyzer.PaodingAnalyzer;
  9. import org.apache.lucene.analysis.Analyzer;
  10. import org.apache.lucene.document.Document;
  11. import org.apache.lucene.document.Field;
  12. import org.apache.lucene.index.IndexWriter;
  13. import org.apache.pdfbox.pdfparser.PDFParser;
  14. import org.apache.pdfbox.pdmodel.PDDocument;
  15. import org.apache.pdfbox.util.PDFTextStripper;
  16. import org.apache.poi.POIXMLDocument;
  17. import org.apache.poi.hslf.HSLFSlideShow;
  18. import org.apache.poi.hslf.model.Slide;
  19. import org.apache.poi.hslf.model.TextRun;
  20. import org.apache.poi.hslf.usermodel.SlideShow;
  21. import org.apache.poi.hssf.extractor.ExcelExtractor;
  22. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  23. import org.apache.poi.hwpf.extractor.WordExtractor;
  24. import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
  25. import org.apache.poi.openxml4j.opc.OPCPackage;
  26. import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
  27. import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
  28. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  29. import org.apache.xmlbeans.XmlException;
  30. import org.nutz.ioc.impl.PropertiesProxy;
  31. import org.nutz.ioc.loader.annotation.Inject;
  32. import org.nutz.ioc.loader.annotation.IocBean;
  33. @IocBean
  34. public class IndexFiles {
  35. @Inject
  36. private PropertiesProxy config;
  37. public void buildIndex(String docsPath) {
  38. String indexPath = config.get("serverDir") + "/index";
  39. File docDir = new File(docsPath);
  40. boolean isCreate = true;
  41. File genFile = new File(indexPath + "/segments.gen");
  42. if (!docDir.exists() || !docDir.canRead()) {
  43. System.out.println("Document directory '"
  44. + docDir.getAbsolutePath()
  45. + "' does not exist or is not readable, please check the path");
  46. System.exit(1);
  47. }else if(genFile.exists()){
  48. isCreate = false;
  49. }
  50. System.out.println(isCreate);
  51. try {
  52. Date start = new Date();
  53. System.out.println("Indexing to directory '" + indexPath + "'...");
  54. Analyzer analyzer = new PaodingAnalyzer();
  55. IndexWriter writer = new IndexWriter(indexPath, analyzer, isCreate, IndexWriter.MaxFieldLength.UNLIMITED);
  56. indexDocs(writer, docDir);
  57. writer.optimize();
  58. writer.close();
  59. Date end = new Date();
  60. System.out.println(end.getTime() - start.getTime()
  61. + " total milliseconds");
  62. } catch (IOException e) {
  63. System.out.println(" caught a " + e.getClass()
  64. + "\n with message: " + e.getMessage());
  65. }catch (Exception e) {
  66. e.printStackTrace();
  67. }
  68. }
  69. public void indexDocs(IndexWriter writer, File file) throws Exception {
  70. if (file.canRead()) {
  71. if (file.isDirectory()) {
  72. String[] files = file.list();
  73. if (files != null) {
  74. for (int i = 0; i < files.length; i++) {
  75. indexDocs(writer, new File(file, files[i]));
  76. }
  77. }
  78. } else {
  79. String text = null;
  80. String path = file.getPath();
  81. if (path.contains(".")) {
  82. int index = path.lastIndexOf(".");
  83. String suffix = path.substring(index + 1);
  84. if("txt".equals(suffix)) {
  85. text = readTxt(path);
  86. }else if("xls".equals(suffix)) {
  87. text = readXls(file);
  88. }else if("xlsx".equals(suffix)) {
  89. text = readXlsx(path);
  90. }else if("doc".equals(suffix)) {
  91. text = readDoc(file);
  92. }else if("docx".equals(suffix)) {
  93. text = readDocx(path);
  94. }else if("ppt".equals(suffix)) {
  95. text = readPpt(file);
  96. }else if("pptx".equals(suffix)) {
  97. text = readPptx(path);
  98. }else if("pdf".equals(suffix)) {
  99. text = readPdf(file);
  100. }else {
  101. text = readTxt(path);
  102. }
  103. } else {
  104. text = readTxt(path);
  105. }
  106. Document doc = new Document();
  107. doc.add(new Field("filename", file.getName(),
  108. Field.Store.YES, Field.Index.ANALYZED,
  109. Field.TermVector.WITH_POSITIONS_OFFSETS));
  110. doc.add(new Field("contents", text,
  111. Field.Store.YES, Field.Index.ANALYZED,
  112. Field.TermVector.WITH_POSITIONS_OFFSETS));
  113. doc.add(new Field("path", file.getAbsolutePath(),
  114. Field.Store.YES, Field.Index.ANALYZED));
  115. doc.add(new Field("modified", Long.toString(file.lastModified()),
  116. Field.Store.NO, Field.Index.ANALYZED));
  117. System.out.println("adding " + file);
  118. writer.addDocument(doc);
  119. }
  120. }
  121. }
  122. public String readTxt(String path) {
  123. String text = new String();
  124. BufferedReader br = null;
  125. try {
  126. FileReader read = new FileReader(path);
  127. br = new BufferedReader(read);
  128. String row;
  129. while(( row = br.readLine()) != null){
  130. text += (row + "\n");
  131. }
  132. } catch (IOException e) {
  133. e.printStackTrace();
  134. } finally {
  135. try {
  136. br.close();
  137. } catch (IOException e) {
  138. e.printStackTrace();
  139. }
  140. }
  141. return text;
  142. }
  143. public String readXls(File xls) {
  144. FileInputStream fis = null;
  145. ExcelExtractor extractor = null;
  146. try {
  147. fis = new FileInputStream(xls);
  148. HSSFWorkbook workbook = new HSSFWorkbook(fis);
  149. extractor = new ExcelExtractor(workbook);
  150. extractor.setFormulasNotResults(true);
  151. extractor.setIncludeSheetNames(false);
  152. } catch (IOException ioe) {
  153. ioe.printStackTrace();
  154. }
  155. return extractor.getText();
  156. }
  157. public String readXlsx(String path) {
  158. XSSFExcelExtractor ee = null;
  159. try {
  160. OPCPackage opcPackage = POIXMLDocument.openPackage(path);
  161. ee = new XSSFExcelExtractor(opcPackage);
  162. ee.setFormulasNotResults(true);
  163. ee.setIncludeSheetNames(false);
  164. } catch (IOException e) {
  165. e.printStackTrace();
  166. } catch (XmlException e) {
  167. e.printStackTrace();
  168. } catch (OpenXML4JException e) {
  169. e.printStackTrace();
  170. }
  171. return ee.getText();
  172. }
  173. public String readDoc(File doc) {
  174. FileInputStream fis = null;
  175. WordExtractor extractor = null;
  176. try {
  177. fis = new FileInputStream(doc);
  178. extractor = new WordExtractor(fis);
  179. } catch (IOException ioe) {
  180. ioe.printStackTrace();
  181. }
  182. return extractor.getText();
  183. }
  184. public String readDocx(String path) {
  185. XWPFWordExtractor we = null;
  186. try {
  187. OPCPackage opcPackage = POIXMLDocument.openPackage(path);
  188. we = new XWPFWordExtractor(opcPackage);
  189. } catch (IOException e) {
  190. e.printStackTrace();
  191. } catch (XmlException e) {
  192. e.printStackTrace();
  193. } catch (OpenXML4JException e) {
  194. e.printStackTrace();
  195. }
  196. return we.getText();
  197. }
  198. public String readPpt(File ppt) throws Exception {
  199. FileInputStream fis = null;
  200. String text = new String();
  201. try {
  202. fis = new FileInputStream(ppt);
  203. SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
  204. Slide[] slides = ss.getSlides();
  205. for (int i = 0; i < slides.length; i++) {
  206. TextRun[] t = slides[i].getTextRuns();
  207. for (int j = 0; j < t.length; j++) {
  208. text += t[j].getText();
  209. }
  210. }
  211. } catch (IOException ioe) {
  212. ioe.printStackTrace();
  213. }
  214. return text;
  215. }
  216. public String readPptx(String path) {
  217. XSLFPowerPointExtractor ppe = null;
  218. try {
  219. OPCPackage opcPackage = POIXMLDocument.openPackage(path);
  220. ppe = new XSLFPowerPointExtractor(opcPackage);
  221. } catch (IOException e) {
  222. e.printStackTrace();
  223. } catch (XmlException e) {
  224. e.printStackTrace();
  225. } catch (OpenXML4JException e) {
  226. e.printStackTrace();
  227. }
  228. return ppe.getText();
  229. }
  230. public String readPdf(File pdf) {
  231. String text = new String();
  232. FileInputStream is = null;
  233. PDDocument document = null;
  234. try {
  235. is = new FileInputStream(pdf);
  236. PDFParser parser = new PDFParser(is);
  237. parser.parse();
  238. document = parser.getPDDocument();
  239. PDFTextStripper stripper = new PDFTextStripper();
  240. text = stripper.getText(document);
  241. } catch (Exception e) {
  242. e.printStackTrace();
  243. } finally {
  244. if (is != null) {
  245. try {
  246. is.close();
  247. } catch (Exception e) {
  248. e.printStackTrace();
  249. }
  250. is = null;
  251. }
  252. if (document != null) {
  253. try {
  254. document.close();
  255. } catch (Exception e) {
  256. e.printStackTrace();
  257. }
  258. document = null;
  259. }
  260. }
  261. return text;
  262. }
  263. }