KGramSpellingCorrector.java

/starter/src/cs276/pe1/spell/KGramSpellingCorrector.java

https://github.com/rioleo/huangenius
Java | 101 lines | 69 code | 22 blank | 10 comment | 15 complexity | 6867259a27c72d95f8a52a2ac332424f MD5 | raw file

package cs276.pe1.spell;

import java.io.File;
import java.util.List;

import cs276.util.IOUtils;
import cs276.util.StringUtils;
import java.util.*;

import cs276.util.Counter;

/**
 * Spelling corrector that proposes candidates by character k-gram overlap.
 *
 * <p>At construction time every token of a corpus file is split into padded
 * bigrams (the word is wrapped in {@code '$'} markers first) and an inverted
 * index from bigram to the words containing it is built. A query word is then
 * scored against every indexed word sharing at least one bigram with it, using
 * the Jaccard coefficient of the two bigram sets.
 */
public class KGramSpellingCorrector implements SpellingCorrector {

    /** Corpus read by the no-argument constructor. */
    private static final String DEFAULT_CORPUS_PATH = "/afs/ir/class/cs276/pe1-2011/big.txt.gz";
    // Alternative corpus that was tried during development:
    //   /afs/ir/class/cs276/pe1-2011/imdb-plots-20081003.list.gz

    /** Length of the character k-grams used for indexing and matching. */
    private static final int K = 2;

    /** Maximum number of candidates requested from the score counter. */
    private static final int MAX_CORRECTIONS = 5;

    /**
     * Inverted index: k-gram -> counter of the words containing that k-gram.
     * A word's count is incremented once per occurrence of the k-gram in each
     * token read from the corpus.
     */
    private final Map<String, Counter<String>> index = new HashMap<String, Counter<String>>();

    /** Initializes the spelling corrector by indexing k-grams of the words in the default corpus. */
    public KGramSpellingCorrector() {
        this(new File(DEFAULT_CORPUS_PATH));
    }

    /**
     * Initializes the spelling corrector by indexing k-grams of the words in the given file.
     *
     * @param path corpus file; it is opened with {@code IOUtils.openFile} and every
     *             line is split into words with {@code StringUtils.tokenize}
     */
    public KGramSpellingCorrector(File path) {
        for (String line : IOUtils.readLines(IOUtils.openFile(path))) {
            for (String word : StringUtils.tokenize(line)) {
                for (String kgram : getKgrams(word, K)) {
                    Counter<String> words = index.get(kgram);
                    if (words == null) {
                        words = new Counter<String>();
                        index.put(kgram, words);
                    }
                    words.incrementCount(word);
                }
            }
        }
    }

    /**
     * Returns the best-scoring indexed words for the given word.
     *
     * <p>Every indexed word sharing at least one k-gram with {@code word} is
     * scored by the Jaccard coefficient of the two k-gram sets; the result is
     * whatever {@code Counter.topK} yields for the highest scores.
     *
     * @param word the (possibly misspelled) query word
     * @return up to {@value #MAX_CORRECTIONS} candidate corrections
     */
    public List<String> corrections(String word) {
        Set<String> wordKgrams = getKgramsSet(word, K);
        Counter<String> possibleCorrections = new Counter<String>();

        for (String kgram : wordKgrams) {
            Counter<String> postings = index.get(kgram);
            if (postings == null) {
                continue; // k-gram never seen in the corpus
            }
            for (String candidate : postings.keySet()) {
                // A candidate reachable through several k-grams is scored only once.
                if (!possibleCorrections.containsKey(candidate)) {
                    possibleCorrections.setCount(
                            candidate, jaccard(wordKgrams, getKgramsSet(candidate, K)));
                }
            }
        }

        return possibleCorrections.topK(MAX_CORRECTIONS);
    }

    /**
     * Computes the Jaccard coefficient |a ∩ b| / |a ∪ b| of two sets without
     * materializing the intersection or the union.
     *
     * @return the coefficient, or 0.0 when both sets are empty
     */
    private static double jaccard(Set<String> a, Set<String> b) {
        int shared = 0;
        for (String element : a) {
            if (b.contains(element)) {
                shared++;
            }
        }
        // |a ∪ b| = |a| + |b| - |a ∩ b|
        int unionSize = a.size() + b.size() - shared;
        if (unionSize == 0) {
            return 0.0;
        }
        return ((double) shared) / unionSize;
    }

    /**
     * Returns the distinct k-grams of the padded word.
     *
     * @return a fresh, mutable set owned by the caller
     */
    private static Set<String> getKgramsSet(String word, int k) {
        return new HashSet<String>(getKgrams(word, k));
    }

    /**
     * Returns the k-grams of the word in left-to-right order, duplicates included.
     *
     * <p>The word is first wrapped in {@code '$'} on both sides, k-1 times, so
     * that its first and last characters also take part in boundary k-grams
     * (for k = 2, "cat" becomes "$cat$" and yields "$c", "ca", "at", "t$").
     */
    private static List<String> getKgrams(String word, int k) {
        String padded = word;
        for (int i = 0; i < k - 1; i++) {
            padded = "$" + padded + "$";
        }
        List<String> kgrams = new ArrayList<String>();
        for (int start = 0; start + k <= padded.length(); start++) {
            kgrams.add(padded.substring(start, start + k));
        }
        return kgrams;
    }

    /**
     * Returns the count stored for the word under its leading k-gram.
     *
     * <p>That count is incremented once per occurrence of the leading k-gram in
     * each corpus token equal to {@code word}; presumably this equals the
     * word's corpus frequency as long as tokens never contain {@code '$'} —
     * verify against the tokenizer.
     *
     * @param word the word to look up
     * @return the stored count, or 0.0 when the word's leading k-gram was never
     *         indexed (this case previously threw a NullPointerException)
     */
    public double getOccurrences(String word) {
        Counter<String> postings = index.get(getKgrams(word, K).get(0));
        if (postings == null) {
            return 0.0;
        }
        return postings.getCount(word);
    }

}