IndexFiles.java | searchcode

/reader-server/src/com/topway/reader/server/lucene/IndexFiles.java

https://github.com/cheft/fast-reader · Java · 287 lines · 263 code · 24 blank · 0 comment · 26 complexity · 8e716c0d4bbe4c605731725e169386b5 MD5 · raw file


package com.topway.reader.server.lucene;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.Date;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import org.nutz.ioc.impl.PropertiesProxy;
import org.nutz.ioc.loader.annotation.Inject;
import org.nutz.ioc.loader.annotation.IocBean;

/**
 * Builds a Lucene index over a directory tree of documents.
 *
 * <p>Each readable regular file becomes one Lucene {@link Document} with the
 * fields {@code filename}, {@code contents}, {@code path} and {@code modified}.
 * The text for {@code contents} is extracted according to the file extension
 * (xls, xlsx, doc, docx, ppt, pptx, pdf); every other file is read as plain text.
 *
 * <p>All {@code readXxx} methods return an empty string (never {@code null})
 * when the file cannot be opened or parsed with one of the exceptions they catch.
 */
@IocBean
public class IndexFiles {

	/** Injected properties; the {@code serverDir} entry locates the index directory. */
	@Inject
	private PropertiesProxy config;

	/**
	 * Indexes every readable file below {@code docsPath} into
	 * {@code <serverDir>/index}.
	 *
	 * <p>A new index is created unless {@code segments.gen} already exists in the
	 * index directory, in which case documents are appended to the existing index.
	 * NOTE(review): appending adds every file again; no de-duplication of
	 * previously indexed files is done here.
	 *
	 * <p>Failures during indexing are reported on the console and not rethrown.
	 *
	 * @param docsPath directory (or single file) to index
	 */
	public void buildIndex(String docsPath) {
		String indexPath = config.get("serverDir") + "/index";
		File docDir = new File(docsPath);
		boolean isCreate = true;
		File genFile = new File(indexPath + "/segments.gen");
		if (!docDir.exists() || !docDir.canRead()) {
			System.out.println("Document directory '"
							+ docDir.getAbsolutePath()
							+ "' does not exist or is not readable, please check the path");
			// NOTE(review): this terminates the whole JVM. Kept for backward
			// compatibility, but confirm it is really wanted inside an IoC-managed bean.
			System.exit(1);
		} else if (genFile.exists()) {
			isCreate = false;
		}
		System.out.println(isCreate);
		try {
			long start = System.currentTimeMillis();
			System.out.println("Indexing to directory '" + indexPath + "'...");
			Analyzer analyzer = new PaodingAnalyzer();
			IndexWriter writer = new IndexWriter(indexPath, analyzer, isCreate, IndexWriter.MaxFieldLength.UNLIMITED);
			try {
				indexDocs(writer, docDir);
				writer.optimize();
			} finally {
				// Always close the writer, even when indexing fails; otherwise the
				// index write lock stays held and later runs cannot open the index.
				writer.close();
			}

			long end = System.currentTimeMillis();
			System.out.println((end - start) + " total milliseconds");
		} catch (IOException e) {
			System.out.println(" caught a " + e.getClass()
					+ "\n with message: " + e.getMessage());
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively adds {@code file} to the index.
	 *
	 * <p>Unreadable files and directories are skipped silently. Directories are
	 * traversed depth-first; regular files are converted to text and added as one
	 * document each.
	 *
	 * @param writer open index writer that receives the documents
	 * @param file   file or directory to index
	 * @throws Exception if text extraction or adding the document fails
	 */
	public void indexDocs(IndexWriter writer, File file) throws Exception {
		if (!file.canRead()) {
			return;
		}
		if (file.isDirectory()) {
			String[] files = file.list();
			// list() returns null when the directory cannot be listed.
			if (files != null) {
				for (int i = 0; i < files.length; i++) {
					indexDocs(writer, new File(file, files[i]));
				}
			}
			return;
		}

		String text = extractText(file);
		if (text == null) {
			// Field rejects a null value; index the file with empty contents instead.
			text = "";
		}

		Document doc = new Document();

		doc.add(new Field("filename", file.getName(),
				Field.Store.YES, Field.Index.ANALYZED,
				Field.TermVector.WITH_POSITIONS_OFFSETS));

		doc.add(new Field("contents", text,
				Field.Store.YES, Field.Index.ANALYZED,
				Field.TermVector.WITH_POSITIONS_OFFSETS));

		doc.add(new Field("path", file.getAbsolutePath(),
				Field.Store.YES, Field.Index.ANALYZED));

		doc.add(new Field("modified", Long.toString(file.lastModified()),
				Field.Store.NO, Field.Index.ANALYZED));

		System.out.println("adding " + file);
		writer.addDocument(doc);
	}

	/**
	 * Picks the reader matching the file's extension and returns the extracted text.
	 * The extension is taken from the file name (not the whole path) and compared
	 * case-insensitively, so {@code REPORT.PDF} is handled like {@code report.pdf}.
	 * Files with an unknown or missing extension are read as plain text.
	 */
	private String extractText(File file) throws Exception {
		String path = file.getPath();
		String name = file.getName();
		int dot = name.lastIndexOf('.');
		String suffix = dot < 0 ? "" : name.substring(dot + 1);

		if ("xls".equalsIgnoreCase(suffix)) {
			return readXls(file);
		}
		if ("xlsx".equalsIgnoreCase(suffix)) {
			return readXlsx(path);
		}
		if ("doc".equalsIgnoreCase(suffix)) {
			return readDoc(file);
		}
		if ("docx".equalsIgnoreCase(suffix)) {
			return readDocx(path);
		}
		if ("ppt".equalsIgnoreCase(suffix)) {
			return readPpt(file);
		}
		if ("pptx".equalsIgnoreCase(suffix)) {
			return readPptx(path);
		}
		if ("pdf".equalsIgnoreCase(suffix)) {
			return readPdf(file);
		}
		return readTxt(path);
	}

	/**
	 * Reads a file as text using the platform default charset.
	 *
	 * @param path path of the file to read
	 * @return the file's lines, each terminated by {@code "\n"}; whatever was read
	 *         before an I/O error occurred (possibly the empty string)
	 */
	public String readTxt(String path) {
		StringBuilder text = new StringBuilder();
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(path));
			String row;
			while ((row = br.readLine()) != null) {
				text.append(row).append('\n');
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// br is still null when the file could not be opened.
			closeSafely(br);
		}
		return text.toString();
	}

	/**
	 * Extracts the text of a binary Excel ({@code .xls}) workbook, emitting
	 * formulas instead of their results and omitting sheet names.
	 *
	 * @param xls workbook file
	 * @return extracted text, or the empty string if the file cannot be read
	 */
	public String readXls(File xls) {
		String text = "";
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(xls);
			ExcelExtractor extractor = new ExcelExtractor(new HSSFWorkbook(fis));
			extractor.setFormulasNotResults(true);
			extractor.setIncludeSheetNames(false);
			text = extractor.getText();
		} catch (IOException ioe) {
			ioe.printStackTrace();
		} finally {
			closeSafely(fis);
		}
		return text;
	}

	/**
	 * Extracts the text of an OOXML Excel ({@code .xlsx}) workbook, emitting
	 * formulas instead of their results and omitting sheet names.
	 *
	 * @param path path of the workbook
	 * @return extracted text, or the empty string if the file cannot be read
	 */
	public String readXlsx(String path) {
		String text = "";
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			XSSFExcelExtractor ee = new XSSFExcelExtractor(opcPackage);
			ee.setFormulasNotResults(true);
			ee.setIncludeSheetNames(false);
			text = ee.getText();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XmlException e) {
			e.printStackTrace();
		} catch (OpenXML4JException e) {
			e.printStackTrace();
		} finally {
			releasePackage(opcPackage);
		}
		return text;
	}

	/**
	 * Extracts the text of a binary Word ({@code .doc}) document.
	 *
	 * @param doc document file
	 * @return extracted text, or the empty string if the file cannot be read
	 */
	public String readDoc(File doc) {
		String text = "";
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(doc);
			WordExtractor extractor = new WordExtractor(fis);
			text = extractor.getText();
		} catch (IOException ioe) {
			ioe.printStackTrace();
		} finally {
			closeSafely(fis);
		}
		return text;
	}

	/**
	 * Extracts the text of an OOXML Word ({@code .docx}) document.
	 *
	 * @param path path of the document
	 * @return extracted text, or the empty string if the file cannot be read
	 */
	public String readDocx(String path) {
		String text = "";
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			XWPFWordExtractor we = new XWPFWordExtractor(opcPackage);
			text = we.getText();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XmlException e) {
			e.printStackTrace();
		} catch (OpenXML4JException e) {
			e.printStackTrace();
		} finally {
			releasePackage(opcPackage);
		}
		return text;
	}

	/**
	 * Extracts the text of a binary PowerPoint ({@code .ppt}) presentation by
	 * concatenating the text runs of every slide in order.
	 *
	 * @param ppt presentation file
	 * @return concatenated slide text; whatever was collected before an I/O error
	 *         occurred (possibly the empty string)
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public String readPpt(File ppt) throws Exception {
		StringBuilder text = new StringBuilder();
		FileInputStream fis = null;
		try {
			fis = new FileInputStream(ppt);
			SlideShow ss = new SlideShow(new HSLFSlideShow(fis));
			Slide[] slides = ss.getSlides();

			for (int i = 0; i < slides.length; i++) {
				TextRun[] runs = slides[i].getTextRuns();
				for (int j = 0; j < runs.length; j++) {
					text.append(runs[j].getText());
				}
			}
		} catch (IOException ioe) {
			ioe.printStackTrace();
		} finally {
			closeSafely(fis);
		}
		return text.toString();
	}

	/**
	 * Extracts the text of an OOXML PowerPoint ({@code .pptx}) presentation.
	 *
	 * @param path path of the presentation
	 * @return extracted text, or the empty string if the file cannot be read
	 */
	public String readPptx(String path) {
		String text = "";
		OPCPackage opcPackage = null;
		try {
			opcPackage = POIXMLDocument.openPackage(path);
			XSLFPowerPointExtractor ppe = new XSLFPowerPointExtractor(opcPackage);
			text = ppe.getText();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (XmlException e) {
			e.printStackTrace();
		} catch (OpenXML4JException e) {
			e.printStackTrace();
		} finally {
			releasePackage(opcPackage);
		}
		return text;
	}

	/**
	 * Extracts the text of a PDF document.
	 *
	 * @param pdf PDF file
	 * @return extracted text, or the empty string if parsing fails for any reason
	 */
	public String readPdf(File pdf) {
		String text = "";
		FileInputStream is = null;
		PDDocument document = null;
		try {
			is = new FileInputStream(pdf);
			PDFParser parser = new PDFParser(is);
			parser.parse();
			document = parser.getPDDocument();
			PDFTextStripper stripper = new PDFTextStripper();
			text = stripper.getText(document);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			closeSafely(is);
			if (document != null) {
				try {
					document.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}
		return text;
	}

	/** Closes {@code resource} if it is non-null, printing (not rethrowing) any I/O error. */
	private static void closeSafely(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Releases an OPC package that was opened only for reading.
	 * revert() is used instead of close() because the package is only read here:
	 * revert() discards the package without saving, whereas close() on a writable
	 * package would write it back to the source file.
	 */
	private static void releasePackage(OPCPackage opcPackage) {
		if (opcPackage != null) {
			opcPackage.revert();
		}
	}
}

Tech Fingerprint

Alerts (45)

'java.util.Date' Maintainability Info: Prefer using the modern Java Time API (java.time.* classes like LocalDate, ZonedDateTime, Instant) introduced in Java 8 over the legacy java.util.Date and Calendar classes for better API design and thread safety.
8
'System.out.println(' Use a logging framework (e.g., SLF4J, Log4j) for better control and configurability
49 56 59 67 70 130
'.close()' Manual .close() call detected. Prefer using try-with-resources (Java 7+) for automatic and safer resource management, especially handling exceptions during close.
64 150 270 278
'+' Performance Info: Using string concatenation ('+' or '+=') inside loops can be inefficient due to repeated String object creation. Use StringBuilder (or StringBuffer for thread-safety) instead.
70 71 91
'catch (Exception' Catching generic 'Exception' can hide specific runtime issues. Catch more specific exception types whenever possible. Ensure caught exceptions are logged or handled appropriately, not just swallowed.
72 265 271 279
'.printStackTrace()' Avoid printing stack traces directly to std err/out. Use a proper logging framework to handle exceptions consistently and direct output appropriately.
73 147 152 169 182 184 186 198 209 211 213 233 244 246 248 266 272 280
'throws Exception' Declaring 'throws Exception' is too broad. Declare specific checked exceptions that the method might throw, allowing callers to handle them appropriately.
77 218
Complexity hotspot; lines 81 to 82 (total complexity: 3)
81 82
'new FileReader(' Resource creation detected. Ensure resources (streams, connections, etc.) are properly closed using try-with-resources (Java 7+) to prevent leaks.
140
'new FileInputStream(' Resource creation detected. Ensure resources (streams, connections, etc.) are properly closed using try-with-resources (Java 7+) to prevent leaks.
162 195 222 259