PageRenderTime 3258ms CodeModel.GetById 12ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/java/org/blackbeanbag/recipe/scanners/WordScanner.java

https://github.com/pperalta/recipe-index
Java | 74 lines | 45 code | 11 blank | 18 comment | 4 complexity | 38fa4d7e6bcf2ed73fc8de18900aa50f MD5 | raw file
  1. package org.blackbeanbag.recipe.scanners;
  2. import org.apache.log4j.Logger;
  3. import org.apache.lucene.document.Document;
  4. import org.apache.lucene.document.Field;
  5. import org.apache.lucene.document.TextField;
  6. import org.apache.poi.hwpf.extractor.WordExtractor;
  7. import org.apache.poi.poifs.filesystem.POIFSFileSystem;
  8. import java.io.FileInputStream;
  9. import java.util.*;
  10. /**
  11. * Implementation of {@link Scanner} that supports Microsoft
  12. * word documents (Word 95 through Word 2003).
  13. */
  14. public class WordScanner implements Scanner {
  15. private static final Logger LOG = Logger.getLogger(WordScanner.class);
  16. /**
  17. * {@inheritDoc}
  18. */
  19. @Override
  20. public boolean supportsFile(String file) {
  21. return file.toLowerCase().endsWith("doc");
  22. }
  23. /**
  24. * {@inheritDoc}
  25. */
  26. @Override
  27. public Document scan(String file) {
  28. try {
  29. POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(file));
  30. WordExtractor extractor = new WordExtractor(fs);
  31. String[] paragraphs = extractor.getParagraphText();
  32. // assuming the first line is the recipe title
  33. String title = paragraphs[0].trim();
  34. Document doc = new Document();
  35. doc.add(new Field("file", file, TextField.TYPE_STORED));
  36. doc.add(new Field("title", title, TextField.TYPE_STORED));
  37. for (String paragraph : paragraphs) {
  38. // Someday this tokenizer will be smarter and distinguish
  39. // between ingredients and amounts. The index (or the search)
  40. // should be able to perform quantity conversions and recognize
  41. // common quantity abbreviations. This may be done with a custom
  42. // Lucene tokenizer.
  43. //
  44. // For now we'll naively index each string that we come across
  45. StringTokenizer t = new StringTokenizer(paragraph);
  46. while (t.hasMoreTokens()) {
  47. doc.add(new Field("ingredient", t.nextToken().trim(), TextField.TYPE_STORED));
  48. }
  49. }
  50. if (LOG.isDebugEnabled()) {
  51. LOG.debug("Scanned file " + file);
  52. }
  53. if (LOG.isTraceEnabled()) {
  54. LOG.trace("Created document " + doc);
  55. }
  56. return doc;
  57. }
  58. catch (Exception e) {
  59. LOG.error("Error parsing file " + file, e);
  60. return null;
  61. }
  62. }
  63. }