IndexFiles.java | searchcode

/reader-server/src/com/topway/reader/server/lucene/IndexFiles.java

https://github.com/cheft/fast-reader
Java | 287 lines | 263 code | 24 blank | 0 comment | 26 complexity | 8e716c0d4bbe4c605731725e169386b5 MD5 | raw file

package com.topway.reader.server.lucene;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import org.nutz.ioc.impl.PropertiesProxy;
import org.nutz.ioc.loader.annotation.Inject;
import org.nutz.ioc.loader.annotation.IocBean;

/**
 * Builds a Lucene full-text index over a tree of documents.
 *
 * <p>The text of each file is extracted according to its extension
 * (xls/xlsx, doc/docx, ppt/pptx, pdf); every other file is read as plain
 * text. The index lives in the {@code index} sub-directory of the configured
 * {@code serverDir} and is analyzed with {@link PaodingAnalyzer}.
 */
@IocBean
public class IndexFiles {

	@Inject
	private PropertiesProxy config;

	/**
	 * Indexes every readable file found at or below {@code docsPath}.
	 *
	 * <p>A fresh index is created unless {@code segments.gen} already exists
	 * in the index directory, in which case documents are appended to the
	 * existing index. Indexing failures are reported on the console and are
	 * not propagated to the caller.
	 *
	 * @param docsPath file or directory to index
	 */
	public void buildIndex(String docsPath) {
		String indexPath = config.get("serverDir") + "/index";
		File docDir = new File(docsPath);
		if (!docDir.exists() || !docDir.canRead()) {
			System.out.println("Document directory '"
							+ docDir.getAbsolutePath()
							+ "' does not exist or is not readable, please check the path");
			// NOTE(review): this terminates the whole JVM, not just this
			// indexing run - confirm that is intended when the bean runs
			// inside a server.
			System.exit(1);
		}
		// Append when an index is already present, otherwise create one.
		boolean isCreate = !new File(indexPath + "/segments.gen").exists();
		System.out.println(isCreate);
		try {
			long start = System.currentTimeMillis();
			System.out.println("Indexing to directory '" + indexPath + "'...");
			Analyzer analyzer = new PaodingAnalyzer();
			IndexWriter writer = new IndexWriter(indexPath, analyzer, isCreate, IndexWriter.MaxFieldLength.UNLIMITED);
			try {
				indexDocs(writer, docDir);
				writer.optimize();
			} finally {
				// Always close the writer, even when a document fails, so the
				// index write lock is released for the next run.
				writer.close();
			}
			long end = System.currentTimeMillis();
			System.out.println(end - start
					+ " total milliseconds");
		} catch (IOException e) {
			System.out.println(" caught a " + e.getClass()
					+ "\n with message: " + e.getMessage());
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively adds {@code file} (or, for a directory, everything below
	 * it) to the index. Unreadable files and directories are skipped.
	 *
	 * <p>Each document gets the fields {@code filename}, {@code contents},
	 * {@code path} and {@code modified}.
	 *
	 * @param writer open index writer the documents are added to
	 * @param file   file or directory to index
	 * @throws Exception if text extraction or adding the document fails
	 */
	public void indexDocs(IndexWriter writer, File file) throws Exception {
		if (!file.canRead()) {
			return;
		}
		if (file.isDirectory()) {
			String[] children = file.list();
			// list() returns null when the directory cannot be listed.
			if (children != null) {
				for (String child : children) {
					indexDocs(writer, new File(file, child));
				}
			}
			return;
		}

		String text = extractText(file);
		Document doc = new Document();

		doc.add(new Field("filename", file.getName(),
				Field.Store.YES, Field.Index.ANALYZED,
				Field.TermVector.WITH_POSITIONS_OFFSETS));

		doc.add(new Field("contents", text,
				Field.Store.YES, Field.Index.ANALYZED,
				Field.TermVector.WITH_POSITIONS_OFFSETS));

		doc.add(new Field("path", file.getAbsolutePath(),
				Field.Store.YES, Field.Index.ANALYZED));

		doc.add(new Field("modified", Long.toString(file.lastModified()),
				Field.Store.NO, Field.Index.ANALYZED));

		System.out.println("adding " + file);
		writer.addDocument(doc);
	}

	/**
	 * Picks the reader matching the file's extension and returns the
	 * extracted text. The extension is taken from the file name only and is
	 * matched case-insensitively; unknown or missing extensions are read as
	 * plain text. Never returns {@code null}.
	 */
	private String extractText(File file) throws Exception {
		String path = file.getPath();
		String name = file.getName();
		int dot = name.lastIndexOf('.');
		String suffix = dot < 0 ? "" : name.substring(dot + 1);

		String text;
		if ("xls".equalsIgnoreCase(suffix)) {
			text = readXls(file);
		} else if ("xlsx".equalsIgnoreCase(suffix)) {
			text = readXlsx(path);
		} else if ("doc".equalsIgnoreCase(suffix)) {
			text = readDoc(file);
		} else if ("docx".equalsIgnoreCase(suffix)) {
			text = readDocx(path);
		} else if ("ppt".equalsIgnoreCase(suffix)) {
			text = readPpt(file);
		} else if ("pptx".equalsIgnoreCase(suffix)) {
			text = readPptx(path);
		} else if ("pdf".equalsIgnoreCase(suffix)) {
			text = readPdf(file);
		} else {
			text = readTxt(path);
		}
		// A null value would make the Field constructor throw.
		return text == null ? "" : text;
	}

	/** Closes {@code resource} if it is non-null, logging any failure. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Releases an OOXML package without saving it. {@code revert()} is used
	 * rather than {@code close()} because the package is only read here and
	 * nothing must be written back to the source file.
	 */
	private static void releaseQuietly(OPCPackage opcPackage) {
		if (opcPackage != null) {
			opcPackage.revert();
		}
	}

	/**
	 * Reads a file as plain text using the platform default charset, with
	 * every line terminated by {@code "\n"}.
	 *
	 * @param path path of the file to read
	 * @return the text read so far; empty if the file could not be opened
	 */
	public String readTxt(String path) {
		StringBuilder text = new StringBuilder();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(path));
			String row;
			while ((row = br.readLine()) != null) {
				text.append(row).append('\n');
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// br is still null when the file could not be opened.
			closeQuietly(br);
		}
		return text.toString();
	}

	/**
	 * Extracts the text of a binary Excel ({@code .xls}) workbook, showing
	 * formulas instead of their results and omitting sheet names.
	 *
	 * @param xls workbook file
	 * @return the extracted text, or an empty string if the file cannot be read
	 */
	public String readXls(File xls) {
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(xls);
			ExcelExtractor extractor = new ExcelExtractor(new HSSFWorkbook(fis));
			extractor.setFormulasNotResults(true);
			extractor.setIncludeSheetNames(false);
			return extractor.getText();
		} catch (IOException ioe) {
			ioe.printStackTrace();
			return "";
		} finally {
			closeQuietly(fis);
		}
	}

	/**
	 * Extracts the text of an OOXML Excel ({@code .xlsx}) workbook, showing
	 * formulas instead of their results and omitting sheet names.
	 *
	 * @param path path of the workbook
	 * @return the extracted text, or an empty string if the file cannot be read
	 */
	public String readXlsx(String path) {
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			XSSFExcelExtractor ee = new XSSFExcelExtractor(opcPackage);
			ee.setFormulasNotResults(true);
			ee.setIncludeSheetNames(false);
			return ee.getText();
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		} catch (XmlException e) {
			e.printStackTrace();
			return "";
		} catch (OpenXML4JException e) {
			e.printStackTrace();
			return "";
		} finally {
			releaseQuietly(opcPackage);
		}
	}

	/**
	 * Extracts the text of a binary Word ({@code .doc}) document.
	 *
	 * @param doc document file
	 * @return the extracted text, or an empty string if the file cannot be read
	 */
	public String readDoc(File doc) {
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(doc);
			return new WordExtractor(fis).getText();
		} catch (IOException ioe) {
			ioe.printStackTrace();
			return "";
		} finally {
			closeQuietly(fis);
		}
	}

	/**
	 * Extracts the text of an OOXML Word ({@code .docx}) document.
	 *
	 * @param path path of the document
	 * @return the extracted text, or an empty string if the file cannot be read
	 */
	public String readDocx(String path) {
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			return new XWPFWordExtractor(opcPackage).getText();
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		} catch (XmlException e) {
			e.printStackTrace();
			return "";
		} catch (OpenXML4JException e) {
			e.printStackTrace();
			return "";
		} finally {
			releaseQuietly(opcPackage);
		}
	}

	/**
	 * Extracts the text of a binary PowerPoint ({@code .ppt}) presentation by
	 * concatenating the text runs of every slide, without separators.
	 *
	 * @param ppt presentation file
	 * @return the text collected so far; empty if the file cannot be read
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public String readPpt(File ppt) throws Exception {
		StringBuilder text = new StringBuilder();
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(ppt);
			SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
			for (Slide slide : ss.getSlides()) {
				for (TextRun run : slide.getTextRuns()) {
					text.append(run.getText());
				}
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
		} finally {
			closeQuietly(fis);
		}
		return text.toString();
	}

	/**
	 * Extracts the text of an OOXML PowerPoint ({@code .pptx}) presentation.
	 *
	 * @param path path of the presentation
	 * @return the extracted text, or an empty string if the file cannot be read
	 */
	public String readPptx(String path) {
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			return new XSLFPowerPointExtractor(opcPackage).getText();
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		} catch (XmlException e) {
			e.printStackTrace();
			return "";
		} catch (OpenXML4JException e) {
			e.printStackTrace();
			return "";
		} finally {
			releaseQuietly(opcPackage);
		}
	}

	/**
	 * Extracts the text of a PDF document.
	 *
	 * @param pdf PDF file
	 * @return the extracted text, or an empty string if parsing fails
	 */
	public String readPdf(File pdf) {
		String text = "";
		FileInputStream is = null;
		PDDocument document = null;
		try {
			is = new FileInputStream(pdf);
			PDFParser parser = new PDFParser(is);
			parser.parse();
			document = parser.getPDDocument();
			text = new PDFTextStripper().getText(document);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeQuietly(is);
			if (document != null) {
				try {
					document.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
		return text;
	}
}