PageRenderTime 70ms CodeModel.GetById 39ms app.highlight 20ms RepoModel.GetById 1ms app.codeStats 1ms

/starter/src/cs276/pe1/spell/KGramSpellingCorrector.java

https://github.com/rioleo/huangenius
Java | 101 lines | 69 code | 22 blank | 10 comment | 15 complexity | 6867259a27c72d95f8a52a2ac332424f MD5 | raw file
  1package cs276.pe1.spell;
  2
  3import java.io.File;
  4import java.util.List;
  5
  6import cs276.util.IOUtils;
  7import cs276.util.StringUtils;
  8import java.util.*;
  9
 10import cs276.util.Counter;
 11
 12public class KGramSpellingCorrector implements SpellingCorrector {
 13	/** Initializes spelling corrector by indexing kgrams in words from a file */
 14	
 15	private HashMap<String, Counter<String>> index;
 16	
 17	public KGramSpellingCorrector() {
 18	    
 19	    // instantiate new index
 20	    index = new HashMap<String, Counter<String>>();
 21	    
 22        File path = new File("/afs/ir/class/cs276/pe1-2011/big.txt.gz");
 23//		File path = new File("/afs/ir/class/cs276/pe1-2011/imdb-plots-20081003.list.gz");
 24        for (String line : IOUtils.readLines(IOUtils.openFile(path))) {        
 25            for (String word : StringUtils.tokenize(line)) {
 26            
 27                //ArrayList<String> bigrams = getBigrams(word);           
 28				ArrayList<String> bigrams = getKgrams(word, 2); 
 29                for (String bigram : bigrams) {
 30                    if (index.containsKey(bigram)) {
 31                        index.get(bigram).incrementCount(word);
 32                    } else {
 33                        Counter<String> words = new Counter<String>();
 34                        words.incrementCount(word);
 35                        index.put(bigram, words);
 36                    }
 37                }
 38                
 39            }
 40        }
 41	}
 42
 43	public List<String> corrections(String word) {
 44		Set<String> wordBigrams = getKgramsSet(word, 2);
 45		
 46		Counter<String> possibleCorrections = new Counter<String>();
 47		
 48		for (String wordBigram : wordBigrams) {
 49		    Set<String> postings = null;
 50		    if (index.containsKey(wordBigram)) postings = index.get(wordBigram).keySet();
 51		    if (postings != null) {
 52		        for (String posting : postings) {
 53		            if (!possibleCorrections.containsKey(posting)) {
 54	                    Set<String> intersect = getKgramsSet(posting, 2);
 55                        intersect.retainAll(wordBigrams);
 56	                    
 57	                    Set<String> union = getKgramsSet(posting, 2);
 58	                    union.addAll(wordBigrams);
 59	                    
 60	                    possibleCorrections.setCount(posting, ((double) intersect.size())/union.size());
 61	                }
 62		        }
 63		    }
 64		}
 65		
 66//	    System.out.println("Word: " + word);
 67//	    System.out.println("KGram corrections----------------------");
 68//	    for (String entry : possibleCorrections.topK(20)) {
 69//	        double occurrences = index.get(getKgrams(entry,2).get(0)).getCount(entry);
 70//	        System.out.println(entry + " " + possibleCorrections.getCount(entry) + ": " + occurrences);
 71//	    }
 72		
 73		return possibleCorrections.topK(5);
 74	}
 75	
 76	
 77	private Set<String> getKgramsSet(String word, int k) {
 78	    Set<String> kgrams = new HashSet<String>();
 79	    for (int i = 0; i < k-1; i++) word = "$" + word + "$";
 80        for (int i = 0; i < word.length()-(k-1); i++) {
 81            String kgram = word.substring(i, i+k);
 82            kgrams.add(kgram);
 83        }
 84        return kgrams;
 85	}
 86	
 87	private ArrayList<String> getKgrams(String word, int k) {
 88	    ArrayList<String> kgrams = new ArrayList<String>();
 89	    for (int i = 0; i < k-1; i++) word = "$" + word + "$";
 90        for (int i = 0; i < word.length()-(k-1); i++) {
 91            String kgram = word.substring(i, i+k);
 92            kgrams.add(kgram);
 93        }
 94        return kgrams;
 95	}
 96	
 97	public double getOccurrences(String word) {
 98        return(index.get(getKgrams(word,2).get(0)).getCount(word));
 99	}
100
101}