/starter/src/cs276/pe1/spell/KGramSpellingCorrector.java
Java | 101 lines | 69 code | 22 blank | 10 comment | 15 complexity | 6867259a27c72d95f8a52a2ac332424f MD5 | raw file
1package cs276.pe1.spell;
2
3import java.io.File;
4import java.util.List;
5
6import cs276.util.IOUtils;
7import cs276.util.StringUtils;
8import java.util.*;
9
10import cs276.util.Counter;
11
12public class KGramSpellingCorrector implements SpellingCorrector {
13 /** Initializes spelling corrector by indexing kgrams in words from a file */
14
15 private HashMap<String, Counter<String>> index;
16
17 public KGramSpellingCorrector() {
18
19 // instantiate new index
20 index = new HashMap<String, Counter<String>>();
21
22 File path = new File("/afs/ir/class/cs276/pe1-2011/big.txt.gz");
23// File path = new File("/afs/ir/class/cs276/pe1-2011/imdb-plots-20081003.list.gz");
24 for (String line : IOUtils.readLines(IOUtils.openFile(path))) {
25 for (String word : StringUtils.tokenize(line)) {
26
27 //ArrayList<String> bigrams = getBigrams(word);
28 ArrayList<String> bigrams = getKgrams(word, 2);
29 for (String bigram : bigrams) {
30 if (index.containsKey(bigram)) {
31 index.get(bigram).incrementCount(word);
32 } else {
33 Counter<String> words = new Counter<String>();
34 words.incrementCount(word);
35 index.put(bigram, words);
36 }
37 }
38
39 }
40 }
41 }
42
43 public List<String> corrections(String word) {
44 Set<String> wordBigrams = getKgramsSet(word, 2);
45
46 Counter<String> possibleCorrections = new Counter<String>();
47
48 for (String wordBigram : wordBigrams) {
49 Set<String> postings = null;
50 if (index.containsKey(wordBigram)) postings = index.get(wordBigram).keySet();
51 if (postings != null) {
52 for (String posting : postings) {
53 if (!possibleCorrections.containsKey(posting)) {
54 Set<String> intersect = getKgramsSet(posting, 2);
55 intersect.retainAll(wordBigrams);
56
57 Set<String> union = getKgramsSet(posting, 2);
58 union.addAll(wordBigrams);
59
60 possibleCorrections.setCount(posting, ((double) intersect.size())/union.size());
61 }
62 }
63 }
64 }
65
66// System.out.println("Word: " + word);
67// System.out.println("KGram corrections----------------------");
68// for (String entry : possibleCorrections.topK(20)) {
69// double occurrences = index.get(getKgrams(entry,2).get(0)).getCount(entry);
70// System.out.println(entry + " " + possibleCorrections.getCount(entry) + ": " + occurrences);
71// }
72
73 return possibleCorrections.topK(5);
74 }
75
76
77 private Set<String> getKgramsSet(String word, int k) {
78 Set<String> kgrams = new HashSet<String>();
79 for (int i = 0; i < k-1; i++) word = "$" + word + "$";
80 for (int i = 0; i < word.length()-(k-1); i++) {
81 String kgram = word.substring(i, i+k);
82 kgrams.add(kgram);
83 }
84 return kgrams;
85 }
86
87 private ArrayList<String> getKgrams(String word, int k) {
88 ArrayList<String> kgrams = new ArrayList<String>();
89 for (int i = 0; i < k-1; i++) word = "$" + word + "$";
90 for (int i = 0; i < word.length()-(k-1); i++) {
91 String kgram = word.substring(i, i+k);
92 kgrams.add(kgram);
93 }
94 return kgrams;
95 }
96
97 public double getOccurrences(String word) {
98 return(index.get(getKgrams(word,2).get(0)).getCount(word));
99 }
100
101}