PageRenderTime 1117ms CodeModel.GetById 41ms RepoModel.GetById 0ms app.codeStats 0ms

/java/src/main/java/SpellingCorrector.java

https://github.com/kbaribeau/Spelling-Corrector
Java | 174 lines | 143 code | 27 blank | 4 comment | 21 complexity | 5596e42d34acfd5f05c0c1a72bcb98e8 MD5 | raw file
  1. import java.io.BufferedReader;
  2. import java.io.FileReader;
  3. import java.io.IOException;
  4. import java.util.HashMap;
  5. import java.util.HashSet;
  6. import java.util.Map;
  7. import java.util.Set;
  8. import java.util.StringTokenizer;
  9. //This started when @damaneice asked for a code review. Most of this code is still his.
  10. //All I've done is a little refactoring.
  11. //If I can get my act together I'll try to refactor it until it's as concise as Norvig's
  12. //Python implementation
  /**
   * Frequency-based spelling corrector.
   *
   * <p>A language model (word -> occurrence count) is built from a text corpus. To correct a
   * word, the corrector looks for known words that are zero, one or two simple edits away
   * (deletion, adjacent transposition, single-letter replacement, single-letter insertion) and
   * returns the one with the highest count in the model.
   *
   * <p>Corpus text is lower-cased and split into maximal runs of the letters in the edit
   * alphabet ({@code a}-{@code z}), so {@code "The"}, {@code "the,"} and {@code "THE"} all count
   * towards the model word {@code "the"}. Input passed to {@link #correct(String)} is not
   * normalised; pass lower-case words to get exact matches against the model.
   */
  public class SpellingCorrector {
    /** Letters used to generate replacements/insertions; also the only characters kept in model words. */
    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz";

    /** Corpus file read by {@link #buildLanguageModel()}, resolved against the working directory. */
    private static final String DEFAULT_CORPUS_PATH = "../big.txt";

    /** Maps each known (lower-case, letters-only) word to the number of times the corpus contained it. */
    private final Map<String, Integer> languageModel = new HashMap<String, Integer>();

    /**
     * Creates a corrector whose model is loaded from the default corpus file via
     * {@link #buildLanguageModel()}.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the corpus cannot be read
     */
    public SpellingCorrector() {
      buildLanguageModel();
    }

    /**
     * Creates a corrector whose model is built from the given corpus instead of the default
     * file. The reader is consumed to the end but is not closed; closing it stays the caller's
     * responsibility.
     *
     * @param corpus source of the training text
     * @throws RuntimeException wrapping the {@link IOException} if reading the corpus fails
     */
    public SpellingCorrector(BufferedReader corpus) {
      try {
        addCorpus(corpus);
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }

    /**
     * Returns the most likely intended spelling of {@code word}.
     *
     * @param word the word to correct
     * @return {@code word} itself when it is known or when no known word lies within two edits;
     *     otherwise the most frequent known word among the closest candidates
     * @throws NullPointerException if {@code word} is {@code null}
     */
    public String correct(String word) {
      // A single-candidate set (including the "unknown word, no suggestions" fallback) is handled
      // by the selection method, so no special case is needed here.
      return findMostLikelyCandidateInLanguageModel(findLikelyCandidates(word));
    }

    /**
     * Picks the candidate with the highest count in the language model. Words missing from the
     * model score 0. Ties are broken in favour of the lexicographically smaller word so the
     * result does not depend on hash-set iteration order.
     *
     * @return the best candidate, or {@code null} if {@code candidatesSet} is empty
     */
    private String findMostLikelyCandidateInLanguageModel(Set<String> candidatesSet) {
      String best = null;
      int bestScore = -1; // below any real score, so the first candidate is always accepted
      for (String candidate : candidatesSet) {
        Integer count = languageModel.get(candidate);
        int score = (count == null) ? 0 : count;
        if (score > bestScore || (score == bestScore && candidate.compareTo(best) < 0)) {
          bestScore = score;
          best = candidate;
        }
      }
      return best;
    }

    /**
     * Returns every string obtained by removing exactly one character from {@code word}.
     *
     * @param word the word to edit
     * @return the distinct deletions; empty when {@code word} is empty
     */
    public Set<String> applySingleCharacterDeletions(String word) {
      Set<String> deletes = new HashSet<String>();
      for (int i = 0; i < word.length(); i++) {
        deletes.add(new StringBuilder(word).deleteCharAt(i).toString());
      }
      return deletes;
    }

    /**
     * Returns every string obtained by swapping one pair of adjacent characters in {@code word}.
     *
     * @param word the word to edit
     * @return the distinct transpositions; empty when {@code word} has fewer than two characters
     */
    public Set<String> applyTranspositions(String word) {
      Set<String> transpositions = new HashSet<String>();
      for (int i = 0; i < word.length() - 1; i++) {
        transpositions.add(transpose(word, i));
      }
      return transpositions;
    }

    /** Returns {@code word} with the characters at {@code index} and {@code index + 1} swapped. */
    private String transpose(String word, int index) {
      char[] chars = word.toCharArray();
      char first = chars[index];
      chars[index] = chars[index + 1];
      chars[index + 1] = first;
      return new String(chars);
    }

    /**
     * Returns every string obtained by replacing one character of {@code word} with a letter of
     * the alphabet. Because a character may be replaced by itself, a non-empty {@code word} is
     * part of the result.
     *
     * @param word the word to edit
     * @return the distinct single-letter replacements; empty when {@code word} is empty
     */
    public Set<String> applyOneLetterTypeos(String word) {
      Set<String> alterations = new HashSet<String>();
      for (int i = 0; i < word.length(); i++) {
        for (int j = 0; j < ALPHABET.length(); j++) {
          StringBuilder altered = new StringBuilder(word);
          altered.setCharAt(i, ALPHABET.charAt(j));
          alterations.add(altered.toString());
        }
      }
      return alterations;
    }

    /**
     * Returns every string obtained by inserting one letter of the alphabet at any position of
     * {@code word}, including before the first and after the last character.
     *
     * @param word the word to edit
     * @return the distinct single-letter insertions
     */
    public Set<String> applyInserts(String word) {
      Set<String> inserts = new HashSet<String>();
      for (int i = 0; i <= word.length(); i++) {
        for (int j = 0; j < ALPHABET.length(); j++) {
          inserts.add(new StringBuilder(word).insert(i, ALPHABET.charAt(j)).toString());
        }
      }
      return inserts;
    }

    /**
     * Returns the union of all deletions, transpositions, replacements and insertions of
     * {@code word}, i.e. every string one edit away (plus {@code word} itself when non-empty,
     * via identity replacements).
     *
     * @param word the word to edit
     * @return all first-order edits of {@code word}
     */
    public Set<String> applyEdits(String word) {
      Set<String> edits = new HashSet<String>();
      edits.addAll(applySingleCharacterDeletions(word));
      edits.addAll(applyTranspositions(word));
      edits.addAll(applyOneLetterTypeos(word));
      edits.addAll(applyInserts(word));
      return edits;
    }

    /** Returns the subset of {@code words} that are present in the language model. */
    private Set<String> filterOutNonWords(Set<String> words) {
      Set<String> filteredWords = new HashSet<String>();
      for (String word : words) {
        if (isWordInLanguageModel(word)) {
          filteredWords.add(word);
        }
      }
      return filteredWords;
    }

    /** Returns whether {@code word} occurred in the corpus. */
    private boolean isWordInLanguageModel(String word) {
      return languageModel.containsKey(word);
    }

    /**
     * Finds the closest set of plausible corrections for {@code word}. The first non-empty set
     * among the following is returned:
     * <ol>
     *   <li>{@code word} itself, if it is known;</li>
     *   <li>known words one edit away;</li>
     *   <li>known words two edits away;</li>
     *   <li>{@code word} itself, as a fallback when nothing is known.</li>
     * </ol>
     *
     * @param word the word to look up
     * @return a non-empty candidate set
     * @throws NullPointerException if {@code word} is {@code null}
     */
    public Set<String> findLikelyCandidates(String word) {
      Set<String> wordSet = new HashSet<String>();
      wordSet.add(word);
      if (isWordInLanguageModel(word)) {
        return wordSet;
      }

      Set<String> edits = applyEdits(word);
      Set<String> candidates = filterOutNonWords(edits);
      if (!candidates.isEmpty()) {
        return candidates;
      }

      candidates = findKnownSecondOrderEdits(edits);
      if (!candidates.isEmpty()) {
        return candidates;
      }
      return wordSet;
    }

    /**
     * Returns the known words that are one edit away from any of {@code firstOrderEdits}.
     * Filtering happens per first-order edit so the full set of second-order strings is never
     * held in memory at once; the result equals filtering the complete union.
     */
    private Set<String> findKnownSecondOrderEdits(Set<String> firstOrderEdits) {
      Set<String> known = new HashSet<String>();
      for (String edit : firstOrderEdits) {
        known.addAll(filterOutNonWords(applyEdits(edit)));
      }
      return known;
    }

    /**
     * Reads the default corpus file ({@code ../big.txt}, relative to the working directory,
     * decoded with the default charset) and adds its word counts to the language model. Counts
     * accumulate: calling this again adds to whatever the model already holds.
     *
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be opened or read
     */
    public void buildLanguageModel() {
      try {
        BufferedReader input = new BufferedReader(new FileReader(DEFAULT_CORPUS_PATH));
        try {
          addCorpus(input);
        } finally {
          input.close();
        }
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }

    /** Adds the word counts of every remaining line of {@code corpus} to the language model. */
    private void addCorpus(BufferedReader corpus) throws IOException {
      String line;
      while ((line = corpus.readLine()) != null) {
        countWords(line);
      }
    }

    /**
     * Splits {@code line} into words and counts each one. A word is a maximal run of characters
     * that, after lower-casing, belong to the edit alphabet; everything else (whitespace, digits,
     * punctuation) acts as a separator. This keeps model keys reachable by the edit generators.
     */
    private void countWords(String line) {
      StringBuilder current = new StringBuilder();
      for (int i = 0; i < line.length(); i++) {
        char c = Character.toLowerCase(line.charAt(i));
        if (ALPHABET.indexOf(c) >= 0) {
          current.append(c);
        } else if (current.length() > 0) {
          incrementCount(current.toString());
          current.setLength(0);
        }
      }
      if (current.length() > 0) {
        // The line ended in the middle of a word.
        incrementCount(current.toString());
      }
    }

    /** Increases the model count of {@code word} by one, starting at 1 for unseen words. */
    private void incrementCount(String word) {
      Integer count = languageModel.get(word);
      languageModel.put(word, count == null ? 1 : count + 1);
    }
  }