/java/src/main/java/SpellingCorrector.java
Java | 174 lines | 143 code | 27 blank | 4 comment | 21 complexity | 5596e42d34acfd5f05c0c1a72bcb98e8 MD5 | raw file
1import java.io.BufferedReader;
2import java.io.FileReader;
3import java.io.IOException;
4import java.util.HashMap;
5import java.util.HashSet;
6import java.util.Map;
7import java.util.Set;
8import java.util.StringTokenizer;
9
10
11//This started when @damaneice asked for a code review. Most of this code is still his.
12//All I've done is a little refactoring.
13//If I can get my act together I'll try and refactor it until it's as concise as Norvig's
14//python implementation
/**
 * Dictionary-based spelling corrector in the style of Peter Norvig's
 * "How to Write a Spelling Corrector".
 *
 * <p>A language model (word -&gt; occurrence count) is built from a text corpus.
 * To correct a word, the corrector looks for known words that are zero, one or
 * two simple edits (deletion, transposition, replacement, insertion) away and
 * returns the one with the highest count.
 *
 * <p>Corpus words are the maximal runs of ASCII letters in the text, folded to
 * lower case, so that they can be matched by the lower-case edits generated here.
 */
public class SpellingCorrector {
    /** Letters used when generating replacement and insertion edits. */
    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";

    /** Corpus read by the no-argument constructor and {@link #buildLanguageModel()}. */
    private static final String DEFAULT_CORPUS_PATH = "../big.txt";

    /** Maps each known word to the number of times it has been counted. */
    private final Map<String,Integer> languageModel = new HashMap<String,Integer>();

    /**
     * Creates a corrector whose language model is built by calling
     * {@link #buildLanguageModel()}.
     *
     * @throws RuntimeException if the corpus file cannot be read
     */
    public SpellingCorrector() {
        buildLanguageModel();
    }

    /**
     * Creates a corrector whose language model is built from the given corpus file.
     *
     * @param corpusPath path of the text file to count words from
     * @throws RuntimeException if the corpus file cannot be read
     */
    public SpellingCorrector(String corpusPath) {
        loadCorpus(corpusPath);
    }

    /**
     * Creates a corrector from pre-computed word counts. The map is copied.
     *
     * @param wordCounts known words and their counts
     * @throws IllegalArgumentException if the map contains a null word or a null count
     */
    public SpellingCorrector(Map<String,Integer> wordCounts) {
        for (Map.Entry<String,Integer> entry : wordCounts.entrySet()) {
            if (entry.getKey() == null || entry.getValue() == null) {
                throw new IllegalArgumentException("wordCounts must not contain null words or counts");
            }
            languageModel.put(entry.getKey(), entry.getValue());
        }
    }

    /**
     * Returns the most likely spelling of {@code word}.
     *
     * @param word the word to correct
     * @return the word itself if it is known or if no known word is within two
     *         edits; otherwise the candidate with the highest count
     */
    public String correct(String word) {
        Set<String> candidatesSet = findLikelyCandidates(word);

        if (candidatesSet.size() == 1) {
            return candidatesSet.iterator().next();
        }

        return findMostLikelyCandidateInLanguageModel(candidatesSet);
    }

    /**
     * Picks the candidate with the highest count in the language model.
     * Candidates missing from the model are skipped; equal counts are resolved
     * in favour of the alphabetically smaller word so the result does not depend
     * on set iteration order.
     *
     * @return the best candidate, or {@code null} if no candidate has a positive count
     */
    private String findMostLikelyCandidateInLanguageModel(Set<String> candidatesSet) {
        int highScore = 0;
        String selectedWord = null;
        for (String wordCandidate : candidatesSet) {
            Integer score = languageModel.get(wordCandidate);
            if (score == null) {
                continue;
            }
            boolean higher = score > highScore;
            boolean tieWonAlphabetically = score == highScore
                    && selectedWord != null
                    && wordCandidate.compareTo(selectedWord) < 0;
            if (higher || tieWonAlphabetically) {
                highScore = score;
                selectedWord = wordCandidate;
            }
        }
        return selectedWord;
    }

    /**
     * Returns every string obtained by removing exactly one character from {@code word}.
     */
    public Set<String> applySingleCharacterDeletions(String word) {
        Set<String> deletes = new HashSet<String>();
        for (int i = 0; i < word.length(); i++) {
            deletes.add(new StringBuilder(word).deleteCharAt(i).toString());
        }
        return deletes;
    }

    /**
     * Returns every string obtained by swapping two adjacent characters of {@code word}.
     */
    public Set<String> applyTranspositions(String word) {
        Set<String> transpositions = new HashSet<String>();
        for (int i = 0; i < word.length() - 1; i++) {
            transpositions.add(transpose(word, i));
        }
        return transpositions;
    }

    /** Swaps the characters at {@code index} and {@code index + 1}. */
    private String transpose(String word, int index) {
        StringBuilder swapped = new StringBuilder(word);
        swapped.setCharAt(index, word.charAt(index + 1));
        swapped.setCharAt(index + 1, word.charAt(index));
        return swapped.toString();
    }

    /**
     * Returns every string obtained by replacing one character of {@code word}
     * with a letter of the alphabet. Because each position is also replaced by
     * every letter including its own, a non-empty lower-case word is part of
     * the result.
     */
    public Set<String> applyOneLetterTypeos(String word) {
        Set<String> alterations = new HashSet<String>();
        for (int i = 0; i < word.length(); i++) {
            for (int j = 0; j < ALPHABET.length(); j++) {
                StringBuilder altered = new StringBuilder(word);
                altered.setCharAt(i, ALPHABET.charAt(j));
                alterations.add(altered.toString());
            }
        }
        return alterations;
    }

    /**
     * Returns every string obtained by inserting one letter of the alphabet at
     * any position of {@code word}, including before the first and after the
     * last character.
     */
    public Set<String> applyInserts(String word) {
        Set<String> inserts = new HashSet<String>();
        for (int i = 0; i <= word.length(); i++) {
            for (int j = 0; j < ALPHABET.length(); j++) {
                inserts.add(new StringBuilder(word).insert(i, ALPHABET.charAt(j)).toString());
            }
        }
        return inserts;
    }

    /**
     * Returns the union of all single deletions, transpositions, replacements
     * and insertions of {@code word}.
     */
    public Set<String> applyEdits(String word) {
        Set<String> edits = new HashSet<String>();
        edits.addAll(applySingleCharacterDeletions(word));
        edits.addAll(applyTranspositions(word));
        edits.addAll(applyOneLetterTypeos(word));
        edits.addAll(applyInserts(word));
        return edits;
    }

    /** Returns the subset of {@code words} that is present in the language model. */
    private Set<String> filterOutNonWords(Set<String> words) {
        Set<String> filteredWords = new HashSet<String>();
        for (String word : words) {
            if (isWordInLanguageModel(word)) {
                filteredWords.add(word);
            }
        }
        return filteredWords;
    }

    private boolean isWordInLanguageModel(String word) {
        return languageModel.containsKey(word);
    }

    /**
     * Finds the known words closest to {@code word}.
     *
     * @param word the word to look up
     * @return the word itself if it is known; otherwise the known words one edit
     *         away; otherwise the known words two edits away; otherwise a set
     *         containing only the word itself
     * @throws NullPointerException if {@code word} is null
     */
    public Set<String> findLikelyCandidates(String word) {
        if (word == null) {
            throw new NullPointerException("word must not be null");
        }

        Set<String> wordSet = new HashSet<String>();
        wordSet.add(word);
        if (isWordInLanguageModel(word)) {
            return wordSet;
        }

        Set<String> edits = applyEdits(word);
        Set<String> candidates = filterOutNonWords(edits);
        if (!candidates.isEmpty()) {
            return candidates;
        }

        candidates = findKnownSecondOrderEdits(edits);
        if (!candidates.isEmpty()) {
            return candidates;
        }

        return wordSet;
    }

    /**
     * Returns the known words that are one edit away from any of the given
     * first-order edits. Each batch of second-order edits is filtered as soon as
     * it is generated, so the full set of second-order strings is never held in
     * memory at once.
     */
    private Set<String> findKnownSecondOrderEdits(Set<String> firstOrderEdits) {
        Set<String> known = new HashSet<String>();
        for (String edit : firstOrderEdits) {
            known.addAll(filterOutNonWords(applyEdits(edit)));
        }
        return known;
    }

    /**
     * Counts the words of the default corpus file ({@code ../big.txt}) into the
     * language model. Counts are added to whatever the model already contains,
     * so calling this more than once counts the corpus more than once.
     *
     * @throws RuntimeException if the corpus file cannot be read
     */
    public void buildLanguageModel() {
        loadCorpus(DEFAULT_CORPUS_PATH);
    }

    /** Reads the file at {@code corpusPath} line by line and counts its words. */
    private void loadCorpus(String corpusPath) {
        try {
            BufferedReader input = new BufferedReader(new FileReader(corpusPath));
            try {
                String line;
                while ((line = input.readLine()) != null) {
                    countWordsIn(line);
                }
            } finally {
                input.close();
            }
        } catch (IOException ex) {
            throw new RuntimeException("Could not read corpus file: " + corpusPath, ex);
        }
    }

    /**
     * Counts every maximal run of ASCII letters in {@code line}, folded to lower
     * case. Punctuation, digits and other characters only separate words, so
     * "The", "the," and "THE." are all counted as "the".
     */
    private void countWordsIn(String line) {
        StringBuilder current = new StringBuilder();
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                // ASCII-only case folding keeps the model within the edit alphabet.
                c = (char) (c - 'A' + 'a');
            }
            if (c >= 'a' && c <= 'z') {
                current.append(c);
            } else if (current.length() > 0) {
                countWord(current.toString());
                current.setLength(0);
            }
        }
        if (current.length() > 0) {
            countWord(current.toString());
        }
    }

    /** Adds one occurrence of {@code word} to the language model. */
    private void countWord(String word) {
        Integer seen = languageModel.get(word);
        languageModel.put(word, seen == null ? 1 : seen + 1);
    }
}