PageRenderTime 43ms CodeModel.GetById 17ms app.highlight 15ms RepoModel.GetById 1ms app.codeStats 0ms

/java/src/main/java/SpellingCorrector.java

https://github.com/kbaribeau/Spelling-Corrector
Java | 174 lines | 143 code | 27 blank | 4 comment | 21 complexity | 5596e42d34acfd5f05c0c1a72bcb98e8 MD5 | raw file
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
  9
 10
// This started when @damaneice asked for a code review. Most of this code is still his.
// All I've done is a little refactoring.
// If I can get my act together, I'll try to refactor it until it's as concise as Norvig's
// Python implementation.
 15public class SpellingCorrector {
 16	private static final String alphabet = "abcdefghijklmnopqrstuvwxyz";
 17	private Map<String,Integer> languageModel = new HashMap<String,Integer>();
 18
 19	public SpellingCorrector(){
 20		buildLanguageModel();
 21	}
 22
 23	public String correct(String word){
 24		Set<String> candidatesSet = findLikelyCandidates(word);
 25
 26		if (candidatesSet.size() == 1){
 27			return candidatesSet.iterator().next();
 28		}
 29
 30		return findMostLikelyCandidateInLanguageModel(candidatesSet);
 31
 32	}
 33
 34	private String findMostLikelyCandidateInLanguageModel(Set<String> candidatesSet) {
 35		int highScore = 0;
 36		String selectedWord = null;
 37		for (String wordCandidate : candidatesSet){
 38			int score = languageModel.get(wordCandidate);
 39			if (score > highScore){
 40				highScore = score;
 41				selectedWord = wordCandidate;
 42			}
 43		}
 44		return selectedWord;
 45	}
 46
 47	public Set<String> applySingleCharacterDeletions(String word) {
 48		Set<String> deletes = new HashSet<String>();
 49		for (int i = 0; i < word.length(); i++){
 50			deletes.add(new StringBuffer(word).deleteCharAt(i).toString());
 51		}
 52		return deletes;
 53	}
 54
 55	public Set<String> applyTranspositions(String word){
 56		Set<String> transpositions = new HashSet<String>();
 57		for (int i = 0; i < word.length() - 1; i++){
 58			transpositions.add(transpose(word, i));
 59		}
 60		return transpositions;
 61	}
 62
 63	private String transpose(String word, int index){
 64		return new StringBuilder(word).
 65				deleteCharAt(index).
 66				insert(index+1, word.charAt(index)).
 67				toString();
 68	}
 69
 70	public Set<String> applyOneLetterTypeos(String word){
 71		Set<String> alterations = new HashSet<String>();
 72		for (int i = 0; i < word.length(); i++){
 73			for(int j = 0; j < alphabet.length(); j++){
 74				String replacement = String.valueOf(alphabet.charAt(j));
 75				alterations.add(
 76						new StringBuilder(word).replace(i, i + 1, replacement).toString());
 77			}
 78		}
 79		return alterations;
 80	}
 81
 82	public Set<String> applyInserts(String word){
 83		Set<String> inserts = new HashSet<String>();
 84		for (int i = 0; i <= word.length(); i++){
 85			for(int j = 0; j < alphabet.length(); j++){
 86				inserts.add(new StringBuilder(word).insert(i, alphabet.charAt(j)).toString());
 87			}
 88		}
 89		return inserts;
 90	}
 91
 92	public Set<String> applyEdits(String word){
 93		Set<String> edits = new HashSet<String>();
 94		edits.addAll(applySingleCharacterDeletions(word));
 95		edits.addAll(applyTranspositions(word));
 96		edits.addAll(applyOneLetterTypeos(word));
 97		edits.addAll(applyInserts(word));
 98		return edits;
 99	}
100
101
102	private Set<String> filterOutNonWords(Set<String> words){
103		Set<String> filteredWords = new HashSet<String>();
104		for (String word : words){
105			if (isWordInLanguageModel(word)){
106				filteredWords.add(word);
107			}
108		}
109		return filteredWords;
110	}
111
112	private boolean isWordInLanguageModel(String word){
113		return languageModel.containsKey(word);
114	}
115
116
117	public Set<String> findLikelyCandidates(String word) {
118
119		Set<String> wordSet = new HashSet<String>();
120		wordSet.add(word);
121		if (isWordInLanguageModel(word)) {
122			return wordSet;
123		}
124
125		Set<String> edits = applyEdits(word);
126		Set<String> candidates = filterOutNonWords(edits);
127		if (!candidates.isEmpty()) {
128			return candidates;
129		}
130
131		Set<String> secondOrderEdits = applyEditsToSet(edits);
132		candidates = filterOutNonWords(secondOrderEdits);
133		if (!candidates.isEmpty()) {
134			return candidates;
135		}
136
137		return wordSet;
138	}
139
140	private Set<String> applyEditsToSet(Set<String> words) {
141		Set<String> edits = new HashSet<String>();
142		for (String word : words){
143			edits.addAll(applyEdits(word));
144		}
145		return edits;
146	}
147
148	public void buildLanguageModel() {
149		try {
150			BufferedReader input = new BufferedReader(new FileReader("../big.txt"));
151			try {
152				String line;
153				while ((line = input.readLine()) != null) {
154					StringTokenizer tok = new StringTokenizer(line);
155					while (tok.hasMoreElements()) {
156						String word = (String) tok.nextElement();
157						if (languageModel.get(word) == null) {
158							 languageModel.put(word, 1);
159						}
160						else{
161							languageModel.put(word, languageModel.get(word) + 1);
162						}
163					}
164				}
165			} finally {
166				input.close();
167			}
168		} catch (IOException ex) {
169			throw new RuntimeException(ex);
170		}
171	}
172
173
174}