
/lucene-3.6.0/contrib/pruning/src/test/org/apache/lucene/index/TestPruningReader.java

package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.pruning.CarmelTopKTermPruningPolicy;
import org.apache.lucene.index.pruning.PruningPolicy;
import org.apache.lucene.index.pruning.RIDFTermPruningPolicy;
import org.apache.lucene.index.pruning.StorePruningPolicy;
import org.apache.lucene.index.pruning.TFTermPruningPolicy;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;
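
/**
 * Tests {@link PruningReader} with the pruning policies from the contrib module:
 * TF-based, residual-IDF-based and Carmel Top-K term pruning, plus stored-field pruning.
 */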
public class TestPruningReader extends LuceneTestCase {

  // parameters for the Carmel Top-K pruning
  private static final int R = 1;             // number of terms in the query
  private static final int K = 2;             // top K results
  private static final float EPSILON = .001f; // allowed error in score

  RAMDirectory sourceDir = new RAMDirectory();

  /** Once computed based on how the index is created, these are the full scores, i.e. before pruning. */
  private static Map<Term,ScoreDoc[]> fullScores = initFullScores();
  private static Map<Term,ScoreDoc[]> prunedScores = initPrunedScores();
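
  /** Asserts that the postings of term {@code t} enumerate exactly the doc ids in {@code ids}, in that order. */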
  private void assertTD(IndexReader ir, Term t, int[] ids) throws Exception {
    TermPositions td = ir.termPositions(t);
    assertNotNull(td);
    try {
      int i = 0;
      while (td.next()) {
        int doc = td.doc();
        assertEquals(t + ", i=" + i, ids[i], doc);
        i++;
      }
      assertEquals(ids.length, i);
    } finally {
      td.close();
    }
  }
  /**
   * Scores of the full, unpruned index.
   */
  private static Map<Term, ScoreDoc[]> initFullScores() {
    HashMap<Term, ScoreDoc[]> res = new HashMap<Term, ScoreDoc[]>();
    Term t;
    ScoreDoc sd[];
    t = new Term("body", "one");
    sd = new ScoreDoc[] {
      new ScoreDoc(4, 0.74011815f),
      new ScoreDoc(2, 0.54939526f),
      new ScoreDoc(3, 0.54939526f),
      new ScoreDoc(1, 0.44857934f),
      new ScoreDoc(0, 0.42292467f)
    };
    res.put(t, sd);
    t = new Term("body", "two");
    sd = new ScoreDoc[] {
      new ScoreDoc(2, 0.7679404f),
      new ScoreDoc(1, 0.62702066f),
      new ScoreDoc(0, 0.5911608f),
      new ScoreDoc(4, 0.5172657f)
    };
    res.put(t, sd);
    t = new Term("body", "three");
    sd = new ScoreDoc[] {
      new ScoreDoc(3, 0.7679404f),
      new ScoreDoc(1, 0.62702066f),
      new ScoreDoc(0, 0.5911608f)
    };
    res.put(t, sd);
    t = new Term("test", "one");
    sd = new ScoreDoc[] {
      new ScoreDoc(4, 2.9678855f)
    };
    res.put(t, sd);
    t = new Term("allthesame", "allthesame");
    sd = new ScoreDoc[] {
      new ScoreDoc(0, 0.84584934f),
      new ScoreDoc(1, 0.84584934f),
      new ScoreDoc(2, 0.84584934f),
      new ScoreDoc(3, 0.84584934f),
      new ScoreDoc(4, 0.84584934f)
    };
    res.put(t, sd);
    return res;
  }
  /**
   * Expected scores of the pruned index - with EPSILON=0.001, K=2, R=1.
   */
  private static Map<Term, ScoreDoc[]> initPrunedScores() {
    HashMap<Term, ScoreDoc[]> res = new HashMap<Term, ScoreDoc[]>();
    Term t;
    ScoreDoc sd[];
    t = new Term("body", "one");
    sd = new ScoreDoc[] {
      new ScoreDoc(4, 0.74011815f),
      new ScoreDoc(2, 0.54939526f),
      new ScoreDoc(3, 0.54939526f),
    };
    res.put(t, sd);
    t = new Term("body", "two");
    sd = new ScoreDoc[] {
      new ScoreDoc(2, 0.7679404f),
      new ScoreDoc(1, 0.62702066f),
    };
    res.put(t, sd);
    t = new Term("body", "three");
    sd = new ScoreDoc[] {
      new ScoreDoc(3, 0.7679404f),
      new ScoreDoc(1, 0.62702066f),
    };
    res.put(t, sd);
    t = new Term("test", "one");
    sd = new ScoreDoc[] {
      new ScoreDoc(4, 2.9678855f)
    };
    res.put(t, sd);
    t = new Term("allthesame", "allthesame"); // must keep all because all are the same!
    sd = new ScoreDoc[] {
      new ScoreDoc(0, 0.84584934f),
      new ScoreDoc(1, 0.84584934f),
      new ScoreDoc(2, 0.84584934f),
      new ScoreDoc(3, 0.84584934f),
      new ScoreDoc(4, 0.84584934f)
    };
    res.put(t, sd);
    return res;
  }
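
  /** Asserts that the postings of term {@code t} contain exactly {@code count} documents. */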
  private void assertTDCount(IndexReader ir, Term t, int count) throws Exception {
    TermPositions td = ir.termPositions(t);
    assertNotNull(td);
    try {
      int i = 0;
      while (td.next()) i++;
      assertEquals(t.toString(), count, i);
    } finally {
      td.close();
    }
  }
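
  /**
   * Builds the source index: six small documents with "body", "id" and "allthesame"
   * fields (doc 4 additionally has a term-vectored "test" field), then deletes doc 5
   * so the policies are exercised against an index containing a deleted document.
   */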
  public void setUp() throws Exception {
    super.setUp();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter iw = new IndexWriter(sourceDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    doc.add(new Field("body", "one two three four", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", "0", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    doc = new Document();
    doc.add(new Field("body", "one two three one two three", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", "1", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    doc = new Document();
    doc.add(new Field("body", "one two one two one two", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", "2", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    doc = new Document();
    doc.add(new Field("body", "one three one three one three", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", "3", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    doc = new Document();
    doc.add(new Field("body", "one one one one two", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("test", "one two one two three three three four", Field.Store.YES, Field.Index.ANALYZED_NO_NORMS, Field.TermVector.WITH_POSITIONS_OFFSETS));
    doc.add(new Field("id", "4", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    // to be deleted
    doc = new Document();
    doc.add(new Field("body", "one three one three one three five five five", Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("id", "5", Field.Store.YES, Field.Index.NO));
    doc.add(new Field("allthesame", "allthesame", Field.Store.YES, Field.Index.ANALYZED));
    iw.addDocument(doc);
    iw.close();
    IndexReader ir = IndexReader.open(sourceDir, false);
    ir.deleteDocument(5);
    ir.close();
  }
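
  /**
   * RIDF pruning: postings are pruned by residual IDF, i.e. the gap between a term's
   * observed IDF and the IDF a Poisson model would predict from its collection
   * frequency. With the threshold used here only the most popular term, body:one,
   * is removed entirely.
   */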
  public void testRIDFPruning() throws Exception {
    IndexReader in = IndexReader.open(sourceDir, true);
    // remove only very popular terms
    RIDFTermPruningPolicy ridf = new RIDFTermPruningPolicy(in, null, null, -0.12);
    PruningReader tfr = new PruningReader(in, null, ridf);
    assertTDCount(tfr, new Term("body", "one"), 0);
    assertTD(tfr, new Term("body", "two"), new int[]{0, 1, 2, 4});
    assertTD(tfr, new Term("body", "three"), new int[]{0, 1, 3});
    assertTD(tfr, new Term("test", "one"), new int[]{4});
    assertTD(tfr, new Term("body", "four"), new int[]{0});
    assertTD(tfr, new Term("test", "four"), new int[]{4});
  }
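
  /**
   * TF pruning: postings whose within-document term frequency is below the threshold
   * (here 2) are dropped. Verifies both the pruned view itself and a fresh index
   * produced from it via IndexWriter.addIndexes.
   */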
  public void testTfPruning() throws Exception {
    RAMDirectory targetDir = new RAMDirectory();
    IndexReader in = IndexReader.open(sourceDir, true);
    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, null, 2);
    PruningReader tfr = new PruningReader(in, null, tfp);
    // verify
    assertTD(tfr, new Term("body", "one"), new int[]{1, 2, 3, 4});
    assertTD(tfr, new Term("body", "two"), new int[]{1, 2});
    assertTD(tfr, new Term("body", "three"), new int[]{1, 3});
    assertTD(tfr, new Term("test", "one"), new int[]{4});
    assertTDCount(tfr, new Term("body", "four"), 0);
    assertTDCount(tfr, new Term("test", "four"), 0);
    // verify new reader
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    iw.addIndexes(new IndexReader[]{tfr});
    iw.close();
    IndexReader ir = IndexReader.open(targetDir, true);
    assertTD(ir, new Term("body", "one"), new int[]{1, 2, 3, 4});
    assertTD(ir, new Term("body", "two"), new int[]{1, 2});
    assertTD(ir, new Term("body", "three"), new int[]{1, 3});
    assertTD(ir, new Term("test", "one"), new int[]{4});
    tfr.close();
    ir.close();
  }
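
  /**
   * Carmel Top-K pruning (after Carmel et al., "Static Index Pruning for Information
   * Retrieval Systems", SIGIR 2001): keeps only the postings needed so that a query of
   * up to R terms still returns the same top K documents, within a score error of EPSILON.
   */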
  public void testCarmelTopKPruning() throws Exception {
    IndexReader in = IndexReader.open(sourceDir, true);
    // validate full scores - without pruning, just to make sure we test the right thing
    validateDocScores(fullScores, in, false, false); // validate both docs and scores
    // prune reader
    CarmelTopKTermPruningPolicy tfp = new CarmelTopKTermPruningPolicy(in, null, K, EPSILON, R, null);
    PruningReader tfr = new PruningReader(in, null, tfp);
    // create the pruned index
    RAMDirectory targetDir = new RAMDirectory();
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    iw.addIndexes(new IndexReader[]{tfr});
    iw.close();
    in.close();
    // validate scores of the pruned index
    IndexReader ir = IndexReader.open(targetDir, true);
    validateDocScores(prunedScores, ir, false, true); // validate only docs (scores have changed after pruning)
    ir.close();
  }
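
  /** Validates the docs/scores of all test terms against the given baseline. */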
  private void validateDocScores(Map<Term,ScoreDoc[]> baseScores, IndexReader in, boolean print, boolean onlyDocs) throws IOException {
    validateDocScores(baseScores, in, new Term("body", "one"), print, onlyDocs);
    validateDocScores(baseScores, in, new Term("body", "two"), print, onlyDocs);
    validateDocScores(baseScores, in, new Term("body", "three"), print, onlyDocs);
    validateDocScores(baseScores, in, new Term("test", "one"), print, onlyDocs);
    validateDocScores(baseScores, in, new Term("allthesame", "allthesame"), print, onlyDocs);
  }
  /** Validate the doc-scores, optionally also print them. */
  private void validateDocScores(Map<Term,ScoreDoc[]> baseScores, IndexReader in, Term term, boolean print, boolean onlyDocs) throws IOException {
    if (print) {
      printDocScores(baseScores, in, term);
    }
    float delta = .0001f;
    IndexSearcher is = new IndexSearcher(in);
    TermQuery q = new TermQuery(term);
    ScoreDoc[] sd = is.search(q, 100).scoreDocs;
    assertNotNull("unknown result for term: " + term, baseScores.get(term));
    assertEquals("wrong number of results!", baseScores.get(term).length, sd.length);
    for (int i = 0; i < sd.length; i++) {
      assertEquals("wrong doc!", baseScores.get(term)[i].doc, sd[i].doc);
      if (!onlyDocs) {
        assertEquals("wrong score!", baseScores.get(term)[i].score, sd[i].score, delta);
      }
    }
  }
  /** Print the doc scores (in a code format). */
  private void printDocScores(Map<Term,ScoreDoc[]> baseScores, IndexReader in, Term term) throws IOException {
    IndexSearcher is = new IndexSearcher(in);
    TermQuery q = new TermQuery(term);
    ScoreDoc[] scoreDocs = is.search(q, 100).scoreDocs;
    System.out.println("t = new Term(\"" + term.field + "\",\"" + term.text + "\");");
    System.out.println("sd = new ScoreDoc[] {");
    for (ScoreDoc sd : scoreDocs) {
      System.out.println("  new ScoreDoc(" + sd.doc + ", " + sd.score + "f),");
    }
    System.out.println("};"); // close the array initializer so the printed snippet compiles
    System.out.println("res.put(t,sd);");
  }
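
  /**
   * Per-field thresholds override the default: field "test" requires tf >= 3 while the
   * default stays at 2, so of doc 4's "test" terms only "three" (tf=3) survives.
   */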
  public void testThresholds() throws Exception {
    Map<String, Integer> thresholds = new HashMap<String, Integer>();
    thresholds.put("test", 3);
    IndexReader in = IndexReader.open(sourceDir, true);
    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, null, thresholds, 2);
    PruningReader tfr = new PruningReader(in, null, tfp);
    assertTDCount(tfr, new Term("test", "one"), 0);
    assertTDCount(tfr, new Term("test", "two"), 0);
    assertTD(tfr, new Term("test", "three"), new int[]{4});
    assertTDCount(tfr, new Term("test", "four"), 0);
  }
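
  /**
   * Field removal: flagging "test" with DEL_POSTINGS | DEL_STORED drops that field's
   * stored values and postings, while its term vectors remain accessible both from the
   * pruned view and from an index rebuilt from it.
   */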
  public void testRemoveFields() throws Exception {
    RAMDirectory targetDir = new RAMDirectory();
    Map<String, Integer> removeFields = new HashMap<String, Integer>();
    removeFields.put("test", PruningPolicy.DEL_POSTINGS | PruningPolicy.DEL_STORED);
    IndexReader in = IndexReader.open(sourceDir, true);
    TFTermPruningPolicy tfp = new TFTermPruningPolicy(in, removeFields, null, 2);
    StorePruningPolicy stp = new StorePruningPolicy(in, removeFields);
    PruningReader tfr = new PruningReader(in, stp, tfp);
    Document doc = tfr.document(4);
    // removed stored values?
    assertNull(doc.get("test"));
    // removed postings?
    TermEnum te = tfr.terms();
    while (te.next()) {
      assertFalse("test".equals(te.term().field()));
    }
    // but vectors should be present!
    TermFreqVector tv = tfr.getTermFreqVector(4, "test");
    assertNotNull(tv);
    assertEquals(4, tv.getTerms().length); // term "four" not deleted yet from TermEnum
    // verify new reader
    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter iw = new IndexWriter(targetDir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    iw.addIndexes(new IndexReader[]{tfr});
    iw.close();
    IndexReader ir = IndexReader.open(targetDir, true);
    tv = ir.getTermFreqVector(4, "test");
    assertNotNull(tv);
    assertEquals(3, tv.getTerms().length); // term "four" was deleted from TermEnum
  }
}
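
For reference, the end-to-end pattern these tests exercise is: open a reader on the
source index, wrap it in a PruningReader configured with the desired policies, and
materialize the pruned view into a new index via IndexWriter.addIndexes. A minimal
standalone sketch against the Lucene 3.6 pruning contrib follows; the class name,
directory paths, and the tf threshold of 2 are illustrative assumptions, not taken
from the test above.

    import java.io.File;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.PruningReader;
    import org.apache.lucene.index.pruning.TFTermPruningPolicy;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public class PruneIndexExample {
      public static void main(String[] args) throws Exception {
        // Hypothetical source and target index locations.
        Directory source = FSDirectory.open(new File("/path/to/source-index"));
        Directory target = FSDirectory.open(new File("/path/to/pruned-index"));

        IndexReader in = IndexReader.open(source, true); // read-only source reader
        // Drop every posting whose in-document term frequency is below 2
        // (no per-field overrides, no field deletions).
        TFTermPruningPolicy policy = new TFTermPruningPolicy(in, null, null, 2);
        PruningReader pruned = new PruningReader(in, null, policy);

        // Write the pruned view out as a regular, standalone index.
        IndexWriter iw = new IndexWriter(target, new IndexWriterConfig(
            Version.LUCENE_36, new WhitespaceAnalyzer(Version.LUCENE_36)));
        iw.addIndexes(new IndexReader[] { pruned });
        iw.close();
        pruned.close(); // also closes the underlying source reader
      }
    }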