PageRenderTime 41ms CodeModel.GetById 11ms RepoModel.GetById 0ms app.codeStats 0ms

/encuestame-business/src/main/java/org/encuestame/business/search/SearchUtils.java

http://github.com/encuestame/encuestame
Java | 284 lines | 178 code | 19 blank | 87 comment | 13 complexity | 6a0b4f4e7d61f770522584e775f6cb8b MD5 | raw file
Possible License(s): BSD-3-Clause
  1. /*
  2. ************************************************************************************
  3. * Copyright (C) 2001-2011 encuestame: system online surveys Copyright (C) 2009
  4. * encuestame Development Team.
  5. * Licensed under the Apache Software License version 2.0
  6. * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
  7. * Unless required by applicable law or agreed to in writing, software distributed
  8. * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  9. * CONDITIONS OF ANY KIND, either express or implied. See the License for the
  10. * specific language governing permissions and limitations under the License.
  11. ************************************************************************************
  12. */
  13. package org.encuestame.business.search;
  14. import java.io.File;
  15. import java.io.FileInputStream;
  16. import java.io.FileNotFoundException;
  17. import java.io.FileReader;
  18. import java.io.IOException;
  19. import java.io.InputStream;
  20. import java.util.Iterator;
  21. import org.apache.commons.lang.StringUtils;
  22. import org.apache.commons.logging.Log;
  23. import org.apache.commons.logging.LogFactory;
  24. import org.apache.lucene.analysis.standard.StandardAnalyzer;
  25. import org.apache.lucene.document.Document;
  26. import org.apache.lucene.document.Field;
  27. import org.apache.lucene.index.CorruptIndexException;
  28. import org.apache.lucene.index.IndexWriter;
  29. import org.apache.lucene.store.Directory;
  30. import org.apache.lucene.store.LockObtainFailedException;
  31. import org.apache.lucene.util.Version;
  32. import org.apache.pdfbox.cos.COSDocument;
  33. import org.apache.pdfbox.pdfparser.PDFParser;
  34. import org.apache.pdfbox.pdmodel.PDDocument;
  35. import org.apache.pdfbox.util.PDFTextStripper;
  36. import org.apache.poi.POIXMLException;
  37. import org.apache.poi.hssf.usermodel.HSSFCell;
  38. import org.apache.poi.hssf.usermodel.HSSFRichTextString;
  39. import org.apache.poi.hssf.usermodel.HSSFRow;
  40. import org.apache.poi.hssf.usermodel.HSSFSheet;
  41. import org.apache.poi.hssf.usermodel.HSSFWorkbook;
  42. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  43. import org.apache.poi.ss.usermodel.Cell;
  44. import org.apache.poi.ss.usermodel.Row;
  45. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  46. import org.apache.poi.xwpf.usermodel.XWPFDocument;
  47. import org.encuestame.core.search.DirectoryIndexStore;
  48. import org.springframework.util.Assert;
  49. /**
  50. * Search Utils.
  51. * @author Morales, Diana Paola paolaATencuestame.org
  52. * @since Mar 23, 2011
  53. */
  54. public class SearchUtils {
  /** Name of the Lucene field that receives the extracted document text (analyzed, not stored). */
  56. protected static final String CONTENT = "content";
  /** Name of the Lucene field that stores the canonical path of the indexed file. */
  58. protected static final String FULLPATH = "fullpath";
  /** Name of the Lucene field that stores the simple name of the indexed file. */
  60. protected static final String FILENAME = "filename";
  61. /** Lucene Version. **/
  62. public static final Version LUCENE_VERSION = Version.LUCENE_30;
  63. /**
  64. * Log
  65. */
  66. private static final Log log = LogFactory.getLog(SearchUtils.class);
  67. /**
  68. * Get Filename extension.
  69. * @param path fullname file
  70. * @return
  71. */
  72. public static String getExtension(final String path) {
  73. final String ext = path.substring(path.lastIndexOf('.') + 1);
  74. log.debug("Path file " + path);
  75. log.debug("Ext file " + ext);
  76. return ext;
  77. }
  78. /**
  79. * PDF Document content parser.
  80. * @param is Document content
  81. * @return
  82. * @throws IOException
  83. */
  84. public static COSDocument parseDocument(final InputStream is) throws IOException {
  85. PDFParser parser = null;
  86. parser = new PDFParser(is);
  87. parser.parse();
  88. return parser.getDocument();
  89. }
  90. /**
  91. * Add Lucene Document fields.
  92. * @param file
  93. * @param docText
  94. * @return
  95. * @throws IOException
  96. */
  97. public static Document addFields(final File file, final String docText) throws IOException{
  98. final String fullpath = file.getCanonicalPath();
  99. final String filename = file.getName();
  100. final Document doc = new Document();
  101. if (StringUtils.isNotEmpty(docText)) {
  102. doc.add(new Field(CONTENT, docText, Field.Store.NO,
  103. Field.Index.ANALYZED));
  104. doc.add(new Field(FULLPATH, fullpath,
  105. Field.Store.YES, Field.Index.NOT_ANALYZED));
  106. doc.add(new Field(FILENAME, filename, Field.Store.YES,
  107. Field.Index.NOT_ANALYZED));
  108. }
  109. return doc;
  110. }
  111. /**
  112. * Create PDF Document.
  113. * @param file {@link File}
  114. * @param Long attachmentId.
  115. * @return {@link Document}
  116. * @throws Exception
  117. */
  118. public static Document createPdfDocument(final File file) throws Exception {
  119. InputStream is = new FileInputStream(file);
  120. COSDocument cosDoc = null;
  121. String docText = "";
  122. PDDocument pdDoc = null;
  123. try {
  124. cosDoc = parseDocument(is);
  125. pdDoc = new PDDocument(cosDoc);
  126. PDFTextStripper stripper = new PDFTextStripper();
  127. docText = stripper.getText(pdDoc);
  128. log.debug("PDF Doc Text "+docText.length());
  129. }
  130. finally {
  131. if( pdDoc == null ) {
  132. log.error("PdDocument is null");
  133. } else {
  134. pdDoc.close();
  135. }
  136. }
  137. final Document doc = SearchUtils.addFields(file, docText);
  138. return doc;
  139. }
  140. /**
  141. * Create Document Word.
  142. * @param file {@link File}
  143. * @param Long attachmentId.
  144. * @return {@link Document}
  145. * @throws POIXMLException
  146. * @throws Exception
  147. */
  148. public static Document createWordDocument(final File file) throws POIXMLException,
  149. Exception {
  150. InputStream is = new FileInputStream(file);
  151. String bodyText = null;
  152. try {
  153. XWPFDocument wd = new XWPFDocument(is);
  154. XWPFWordExtractor wde = new XWPFWordExtractor(wd);
  155. bodyText = wde.getText();
  156. } catch (Exception e) {
  157. log.debug(e);
  158. }
  159. Document doc = SearchUtils.addFields(file, bodyText);
  160. return doc;
  161. }
  162. /**
  163. * Create Spreadsheets Document.
  164. * @param file Spreadsheet {@link File}.
  165. * @param Long attachmentId.
  166. * @return {@link Document}
  167. * @throws FileNotFoundException
  168. */
  169. public static Document createSpreadsheetsDocument(final File file) throws Exception {
  170. InputStream is = new FileInputStream(file);
  171. StringBuilder contents = new StringBuilder();
  172. POIFSFileSystem fileSystem = new POIFSFileSystem(is);
  173. HSSFWorkbook workBook = new HSSFWorkbook(fileSystem);
  174. for (int i = 0; i < workBook.getNumberOfSheets(); i++) {
  175. HSSFSheet sheet = workBook.getSheetAt(i);
  176. Iterator<Row> rows = sheet.rowIterator();
  177. while (rows.hasNext()) {
  178. HSSFRow row = (HSSFRow) rows.next();
  179. // Display the row number
  180. log.debug(row.getRowNum());
  181. Iterator<Cell> cells = row.cellIterator();
  182. while (cells.hasNext()) {
  183. HSSFCell cell = (HSSFCell) cells.next();
  184. // Display the cell number of the current Row
  185. switch (cell.getCellType()) {
  186. case HSSFCell.CELL_TYPE_NUMERIC: {
  187. log.debug(String.valueOf(cell
  188. .getNumericCellValue()));
  189. contents.append(
  190. String.valueOf(cell.getNumericCellValue()))
  191. .append(" ");
  192. break;
  193. }
  194. case HSSFCell.CELL_TYPE_STRING: {
  195. HSSFRichTextString richTextString = cell
  196. .getRichStringCellValue();
  197. log.debug(richTextString.toString());
  198. contents.append(richTextString.toString()).append(" ");
  199. break;
  200. }
  201. case HSSFCell.CELL_TYPE_BOOLEAN: {
  202. contents.append(
  203. String.valueOf(cell.getBooleanCellValue()))
  204. .append(" ");
  205. break;
  206. }
  207. }
  208. }
  209. }
  210. }
  211. Document doc = SearchUtils.addFields(file, contents.toString());
  212. return doc;
  213. }
  214. /**
  215. * Create Text Document.
  216. * @param file Text File.
  217. * @param Long attachmentId.
  218. * @return {@link Document}
  219. * @throws Exception
  220. */
  221. public static Document createTextDocument(final File file) throws Exception {
  222. final String docText = new FileReader(file).toString();
  223. final Document doc = SearchUtils.addFields(file, docText);
  224. return doc;
  225. }
  226. /**
  227. * Open Index Writer
  228. * @param directoryStore
  229. * @param indexWriter
  230. * @throws CorruptIndexException
  231. * @throws LockObtainFailedException
  232. * @throws IOException
  233. */
  234. public static IndexWriter openIndexWriter(
  235. final DirectoryIndexStore directoryStore, IndexWriter indexWriter)
  236. throws CorruptIndexException, LockObtainFailedException,
  237. IOException {
  238. final Directory directory = directoryStore.getDirectory();
  239. log.debug("Get Directory ----------" + directory.toString());
  240. if(indexWriter != null){
  241. indexWriter.close();
  242. }
  243. log.debug("Index Directory is locked? ----------> " + indexWriter.isLocked(directory));
  244. indexWriter = new IndexWriter(directory, new StandardAnalyzer(
  245. SearchUtils.LUCENE_VERSION), true,
  246. IndexWriter.MaxFieldLength.UNLIMITED);
  247. Assert.notNull(indexWriter);
  248. return indexWriter;
  249. }
  250. /**
  251. * Close Index writer.
  252. * @param indexWriter
  253. * @throws CorruptIndexException
  254. * @throws IOException
  255. */
  256. public static void closeIndexWriter(final IndexWriter indexWriter) throws CorruptIndexException, IOException{
  257. Assert.notNull(indexWriter);
  258. if (indexWriter == null){
  259. log.error("Index writer is null");
  260. } else {
  261. indexWriter.close();
  262. log.debug("Index writer was closed");
  263. }
  264. }
  265. }