/extras/obcene/J2Reader.java

https://github.com/hausdorf/vn · Java · 170 lines · 121 code · 28 blank · 21 comment · 19 complexity · 7d28b850cdb624843b12447807cb073a MD5 · raw file

  1. import java.lang.Integer;
  2. import java.lang.String;
  3. import java.lang.StringBuffer;
  4. import java.util.Iterator;
  5. import java.util.List;
  6. import java.util.HashMap;
  7. import java.util.Set;
  8. import java.io.IOException;
  9. import org.apache.lucene.index.IndexReader;
  10. import org.apache.lucene.document.Document;
  11. import org.apache.lucene.document.Field;
  12. import org.apache.lucene.index.CorruptIndexException;
  13. import com.google.gson.Gson;
  14. class J2Reader {
  15. private IndexReader _reader = null;
  16. public static int NO_DOCS = -1;
  17. public int numDocs = NO_DOCS;
  18. /* initIndex: Initializes a J2Reader instance with an index. */
  19. public void initIndex(String filename)
  20. throws CorruptIndexException, IOException
  21. {
  22. this._reader = IndexReader.open(filename);
  23. this.numDocs = this._reader.numDocs();
  24. }
  25. /* closeIndex: Resets reader */
  26. public void closeIndex() throws CorruptIndexException, IOException {
  27. this._reader.close();
  28. this.numDocs = J2Reader.NO_DOCS;
  29. }
  30. /* docAt: shorthand */
  31. public Document docAt(int index) throws IOException {
  32. return this._reader.document(index);
  33. }
  34. /* countFields: */
  35. public HashMap countFields(Document doc, HashMap fieldMap) {
  36. List fields = doc.getFields();
  37. int numFields = fields.size();
  38. if(fieldMap == null)
  39. fieldMap = new HashMap();
  40. for(int i=0; i < numFields; i++) {
  41. Field f = (Field) fields.get(i);
  42. String mapKey = f.name();
  43. // Attempts to hide the quoted-header field arrangement
  44. if(mapKey.startsWith("quoted-header")) {
  45. if(mapKey.equals("quoted-header-name")) {
  46. mapKey = f.stringValue();
  47. if(mapKey.equals("date")) // date shows up as quoted and regular
  48. continue;
  49. }
  50. else {
  51. continue;
  52. }
  53. }
  54. if(!fieldMap.containsKey(mapKey)) {
  55. fieldMap.put(mapKey, 1);
  56. }
  57. else {
  58. int existingCount = (Integer) fieldMap.get(mapKey);
  59. fieldMap.put(mapKey, existingCount + 1);
  60. }
  61. }
  62. return fieldMap;
  63. }
  64. /* printAsJson: Builds a HashMap to match roughly the structure MongoDB
  65. * uses and prints it as json.
  66. */
  67. public void printDocAsJson(Document doc) {
  68. List fields = doc.getFields();
  69. int numFields = fields.size();
  70. HashMap fieldMap = new HashMap();
  71. for(int i=0; i < numFields; i++) {
  72. Field f = (Field) fields.get(i);
  73. String mapKey = f.name();
  74. String mapVal = f.stringValue();
  75. // Append any existing data
  76. StringBuffer mapValBuffer = new StringBuffer();
  77. /* Catch quoted-header fields. It appears at the beginning of
  78. * the field list (from what I can tell) and doesn't always have
  79. * a confidence entry. */
  80. if(mapKey.startsWith("quoted-header")) {
  81. if(mapKey.equals("quoted-header-name")) {
  82. mapKey = f.stringValue();
  83. if(mapKey.equals("date")) // date shows up as quoted and regular
  84. continue;
  85. Field f_value = (Field) fields.get(++i);
  86. mapVal = f_value.stringValue();
  87. }
  88. else {
  89. continue;
  90. }
  91. }
  92. /* Now that we've identified our map key and buffer, we can check
  93. * our fieldMap for it and either store or append the value */
  94. if(fieldMap.containsKey(mapKey)) {
  95. String existingVal = (String) fieldMap.get(mapKey);
  96. mapValBuffer.append(existingVal + ", ");
  97. }
  98. mapValBuffer.append(mapVal);
  99. fieldMap.put(mapKey, mapValBuffer.toString());
  100. }
  101. Gson g = new Gson();
  102. System.out.println(g.toJson(fieldMap));
  103. }
  104. /* printAsJson: Renders documents between `start` and `finish` as json */
  105. public void printAsJson(int start, int finish) throws IOException {
  106. for(int i = start; i < finish; i++) {
  107. Document doc = this.docAt(i);
  108. this.printDocAsJson(doc);
  109. }
  110. }
  111. public void printAggFields(int start, int finish) throws IOException {
  112. HashMap fieldMap = new HashMap();
  113. // Aggregate window of docs
  114. for(int i = start; i < finish; i++) {
  115. Document doc = this.docAt(i);
  116. this.countFields(doc, fieldMap);
  117. }
  118. // Print out totals
  119. Set keys = fieldMap.keySet();
  120. Iterator it = keys.iterator();
  121. while(it.hasNext()) {
  122. String key = (String) it.next();
  123. int count = (Integer) fieldMap.get(key);
  124. System.out.println("Key => " + key + " || Val => " + count);
  125. }
  126. }
  127. /* main: allows the class to be run like a command line tool. Expects
  128. * a path to a lucene index directory as the first argument.
  129. */
  130. public static void main (String[] args) throws Exception {
  131. if(args.length != 1) {
  132. System.out.println("J2Reader <dirname>");
  133. System.exit(1);
  134. }
  135. String indexName = args[0];
  136. J2Reader reader = new J2Reader();
  137. reader.initIndex(indexName);
  138. reader.printAsJson(0, reader.numDocs);
  139. //reader.printAggFields(0, reader.numDocs);
  140. reader.closeIndex();
  141. }
  142. }