/encuestame-business/src/main/java/org/encuestame/business/search/SearchUtils.java
Java | 284 lines | 178 code | 19 blank | 87 comment | 13 complexity | 6a0b4f4e7d61f770522584e775f6cb8b MD5 | raw file
Possible License(s): BSD-3-Clause
- /*
- ************************************************************************************
- * Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009
- * encuestame Development Team.
- * Licensed under the Apache Software License version 2.0
- * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing, software distributed
- * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
- * CONDITIONS OF ANY KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations under the License.
- ************************************************************************************
- */
- package org.encuestame.business.search;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.FileReader;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Iterator;
- import org.apache.commons.lang.StringUtils;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.index.CorruptIndexException;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.store.Directory;
- import org.apache.lucene.store.LockObtainFailedException;
- import org.apache.lucene.util.Version;
- import org.apache.pdfbox.cos.COSDocument;
- import org.apache.pdfbox.pdfparser.PDFParser;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.util.PDFTextStripper;
- import org.apache.poi.POIXMLException;
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFRichTextString;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.ss.usermodel.Cell;
- import org.apache.poi.ss.usermodel.Row;
- import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.encuestame.core.search.DirectoryIndexStore;
- import org.springframework.util.Assert;
- /**
- * Search Utils.
- * @author Morales, Diana Paola paolaATencuestame.org
- * @since Mar 23, 2011
- */
- public class SearchUtils {
- /****/
- protected static final String CONTENT = "content";
- /****/
- protected static final String FULLPATH = "fullpath";
- /****/
- protected static final String FILENAME = "filename";
- /** Lucene Version. **/
- public static final Version LUCENE_VERSION = Version.LUCENE_30;
- /**
- * Log
- */
- private static final Log log = LogFactory.getLog(SearchUtils.class);
- /**
- * Get Filename extension.
- * @param path fullname file
- * @return
- */
- public static String getExtension(final String path) {
- final String ext = path.substring(path.lastIndexOf('.') + 1);
- log.debug("Path file " + path);
- log.debug("Ext file " + ext);
- return ext;
- }
- /**
- * PDF Document content parser.
- * @param is Document content
- * @return
- * @throws IOException
- */
- public static COSDocument parseDocument(final InputStream is) throws IOException {
- PDFParser parser = null;
- parser = new PDFParser(is);
- parser.parse();
- return parser.getDocument();
- }
- /**
- * Add Lucene Document fields.
- * @param file
- * @param docText
- * @return
- * @throws IOException
- */
- public static Document addFields(final File file, final String docText) throws IOException{
- final String fullpath = file.getCanonicalPath();
- final String filename = file.getName();
- final Document doc = new Document();
- if (StringUtils.isNotEmpty(docText)) {
- doc.add(new Field(CONTENT, docText, Field.Store.NO,
- Field.Index.ANALYZED));
- doc.add(new Field(FULLPATH, fullpath,
- Field.Store.YES, Field.Index.NOT_ANALYZED));
- doc.add(new Field(FILENAME, filename, Field.Store.YES,
- Field.Index.NOT_ANALYZED));
- }
- return doc;
- }
- /**
- * Create PDF Document.
- * @param file {@link File}
- * @param Long attachmentId.
- * @return {@link Document}
- * @throws Exception
- */
- public static Document createPdfDocument(final File file) throws Exception {
- InputStream is = new FileInputStream(file);
- COSDocument cosDoc = null;
- String docText = "";
- PDDocument pdDoc = null;
- try {
- cosDoc = parseDocument(is);
- pdDoc = new PDDocument(cosDoc);
- PDFTextStripper stripper = new PDFTextStripper();
- docText = stripper.getText(pdDoc);
- log.debug("PDF Doc Text "+docText.length());
- }
- finally {
- if( pdDoc == null ) {
- log.error("PdDocument is null");
- } else {
- pdDoc.close();
- }
- }
- final Document doc = SearchUtils.addFields(file, docText);
- return doc;
- }
- /**
- * Create Document Word.
- * @param file {@link File}
- * @param Long attachmentId.
- * @return {@link Document}
- * @throws POIXMLException
- * @throws Exception
- */
- public static Document createWordDocument(final File file) throws POIXMLException,
- Exception {
- InputStream is = new FileInputStream(file);
- String bodyText = null;
- try {
- XWPFDocument wd = new XWPFDocument(is);
- XWPFWordExtractor wde = new XWPFWordExtractor(wd);
- bodyText = wde.getText();
- } catch (Exception e) {
- log.debug(e);
- }
- Document doc = SearchUtils.addFields(file, bodyText);
- return doc;
- }
- /**
- * Create Spreadsheets Document.
- * @param file Spreadsheet {@link File}.
- * @param Long attachmentId.
- * @return {@link Document}
- * @throws FileNotFoundException
- */
- public static Document createSpreadsheetsDocument(final File file) throws Exception {
- InputStream is = new FileInputStream(file);
- StringBuilder contents = new StringBuilder();
- POIFSFileSystem fileSystem = new POIFSFileSystem(is);
- HSSFWorkbook workBook = new HSSFWorkbook(fileSystem);
- for (int i = 0; i < workBook.getNumberOfSheets(); i++) {
- HSSFSheet sheet = workBook.getSheetAt(i);
- Iterator<Row> rows = sheet.rowIterator();
- while (rows.hasNext()) {
- HSSFRow row = (HSSFRow) rows.next();
- // Display the row number
- log.debug(row.getRowNum());
- Iterator<Cell> cells = row.cellIterator();
- while (cells.hasNext()) {
- HSSFCell cell = (HSSFCell) cells.next();
- // Display the cell number of the current Row
- switch (cell.getCellType()) {
- case HSSFCell.CELL_TYPE_NUMERIC: {
- log.debug(String.valueOf(cell
- .getNumericCellValue()));
- contents.append(
- String.valueOf(cell.getNumericCellValue()))
- .append(" ");
- break;
- }
- case HSSFCell.CELL_TYPE_STRING: {
- HSSFRichTextString richTextString = cell
- .getRichStringCellValue();
- log.debug(richTextString.toString());
- contents.append(richTextString.toString()).append(" ");
- break;
- }
- case HSSFCell.CELL_TYPE_BOOLEAN: {
- contents.append(
- String.valueOf(cell.getBooleanCellValue()))
- .append(" ");
- break;
- }
- }
- }
- }
- }
- Document doc = SearchUtils.addFields(file, contents.toString());
- return doc;
- }
- /**
- * Create Text Document.
- * @param file Text File.
- * @param Long attachmentId.
- * @return {@link Document}
- * @throws Exception
- */
- public static Document createTextDocument(final File file) throws Exception {
- final String docText = new FileReader(file).toString();
- final Document doc = SearchUtils.addFields(file, docText);
- return doc;
- }
- /**
- * Open Index Writer
- * @param directoryStore
- * @param indexWriter
- * @throws CorruptIndexException
- * @throws LockObtainFailedException
- * @throws IOException
- */
- public static IndexWriter openIndexWriter(
- final DirectoryIndexStore directoryStore, IndexWriter indexWriter)
- throws CorruptIndexException, LockObtainFailedException,
- IOException {
- final Directory directory = directoryStore.getDirectory();
- log.debug("Get Directory ----------" + directory.toString());
- if(indexWriter != null){
- indexWriter.close();
- }
- log.debug("Index Directory is locked? ----------> " + indexWriter.isLocked(directory));
- indexWriter = new IndexWriter(directory, new StandardAnalyzer(
- SearchUtils.LUCENE_VERSION), true,
- IndexWriter.MaxFieldLength.UNLIMITED);
- Assert.notNull(indexWriter);
- return indexWriter;
- }
- /**
- * Close Index writer.
- * @param indexWriter
- * @throws CorruptIndexException
- * @throws IOException
- */
- public static void closeIndexWriter(final IndexWriter indexWriter) throws CorruptIndexException, IOException{
- Assert.notNull(indexWriter);
- if (indexWriter == null){
- log.error("Index writer is null");
- } else {
- indexWriter.close();
- log.debug("Index writer was closed");
- }
- }
- }