PageRenderTime 25ms CodeModel.GetById 0ms RepoModel.GetById 1ms app.codeStats 0ms

/src/main/java/com/searchcode/app/util/SearchcodeSpellingCorrector.java

https://github.com/boyter/searchcode-server
Java | 213 lines | 134 code | 45 blank | 34 comment | 29 complexity | 027de46d0de72b0d65b650fe2ff63986 MD5 | raw file
  1. /*
  2. * Copyright (c) 2016 Boyter Online Services
  3. *
  4. * Use of this software is governed by the Fair Source License included
  5. * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
  6. * see the README.md for when this clause will take effect
  7. *
  8. * Version 1.3.15
  9. */
  10. package com.searchcode.app.util;
  11. import com.searchcode.app.config.Values;
  12. import com.searchcode.app.service.Singleton;
  13. import java.util.*;
  14. import java.util.stream.Collectors;
  15. /**
  16. * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and
  17. * the like. Attempts to be highly performing by never changing the first character (for the first pass)
  18. * since we can usually assume that the user got that correct.
  19. */
  20. public class SearchcodeSpellingCorrector implements ISpellingCorrector {
  21. // How many terms to keep in the LRUCACHE
  22. private int LRUCOUNT = Integer.parseInt(Values.DEFAULTSPELLINGCORRECTORSIZE);
  23. private int VARIATIONSCOUNT = 200000;
  24. // word to count map - how may times a word is present - or a weight attached to a word
  25. private Map<String, Integer> dictionary = null;
  26. public SearchcodeSpellingCorrector() {
  27. this.LRUCOUNT = Integer.parseInt(Properties.getProperties().getProperty(Values.SPELLINGCORRECTORSIZE, Values.DEFAULTSPELLINGCORRECTORSIZE));
  28. if (this.LRUCOUNT <= 0) {
  29. this.LRUCOUNT = Integer.parseInt(Values.DEFAULTSPELLINGCORRECTORSIZE);
  30. }
  31. this.dictionary = Collections.synchronizedMap(new LruCache<>(this.LRUCOUNT));
  32. }
  33. @Override
  34. public int getWordCount() {
  35. return dictionary.size();
  36. }
  37. @Override
  38. public boolean reset() {
  39. this.dictionary.clear();
  40. return true;
  41. }
  42. @Override
  43. public List<String> getSampleWords(int count) {
  44. List<String> sampleWords = new ArrayList<>();
  45. for (Map.Entry<String, Integer> entry : this.dictionary.entrySet()) {
  46. sampleWords.add(entry.getValue() + " - " + entry.getKey());
  47. }
  48. int end = sampleWords.size() >= 10 ? 10 : sampleWords.size();
  49. return sampleWords.subList(0, end);
  50. }
  51. @Override
  52. public void putWord(String word) {
  53. word = word.toLowerCase();
  54. if (dictionary.containsKey(word)) {
  55. dictionary.put(word, (dictionary.get(word) + 1));
  56. } else {
  57. dictionary.put(word, 1);
  58. }
  59. }
  60. @Override
  61. public String correct(String word) {
  62. if (Singleton.getHelpers().isNullEmptyOrWhitespace(word)) {
  63. return word;
  64. }
  65. word = word.toLowerCase();
  66. // If the word exists in our dictionary then return
  67. if (dictionary.containsKey(word)) {
  68. return word;
  69. }
  70. Map<String, Integer> possibleMatches = new HashMap<>();
  71. List<String> closeEdits = this.wordEdits(word);
  72. for (String closeEdit : closeEdits) {
  73. if (dictionary.containsKey(closeEdit)) {
  74. possibleMatches.put(closeEdit, this.dictionary.get(closeEdit));
  75. }
  76. }
  77. if (closeEdits.size() > VARIATIONSCOUNT) {
  78. closeEdits = closeEdits.subList(0, VARIATIONSCOUNT);
  79. }
  80. if (!possibleMatches.isEmpty()) {
  81. // Sorted least likely first
  82. Object[] matches = Singleton.getHelpers().sortByValue(possibleMatches).keySet().toArray();
  83. // Try to match anything of the same length first
  84. String bestMatch = Values.EMPTYSTRING;
  85. for (Object o : matches) {
  86. if (o.toString().length() == word.length()) {
  87. bestMatch = o.toString();
  88. }
  89. }
  90. if (!Singleton.getHelpers().isNullEmptyOrWhitespace(bestMatch)) {
  91. return bestMatch;
  92. }
  93. // Just return whatever is the best match
  94. return matches[matches.length - 1].toString();
  95. }
  96. // Ok we did't find anything, so lets run the edits function on the previous results and use those
  97. // this gives us results which are 2 characters away from whatever was entered
  98. List<String> furtherEdits = new ArrayList<>();
  99. for (String closeEdit : closeEdits) {
  100. furtherEdits.addAll(this.wordEdits(closeEdit));
  101. if (furtherEdits.size() > this.VARIATIONSCOUNT) {
  102. break;
  103. }
  104. }
  105. for (String furtherEdit : furtherEdits) {
  106. if (dictionary.containsKey(furtherEdit)) {
  107. possibleMatches.put(furtherEdit, this.dictionary.get(furtherEdit));
  108. }
  109. }
  110. if (!possibleMatches.isEmpty()) {
  111. // Sorted least likely first
  112. Object[] matches = Singleton.getHelpers().sortByValue(possibleMatches).keySet().toArray();
  113. // Try to match anything of the same length first
  114. String bestMatch = Values.EMPTYSTRING;
  115. for (Object o : matches) {
  116. if (o.toString().length() == word.length()) {
  117. bestMatch = o.toString();
  118. }
  119. }
  120. if (!Singleton.getHelpers().isNullEmptyOrWhitespace(bestMatch)) {
  121. return bestMatch;
  122. }
  123. // Just return whatever is the best match
  124. return matches[matches.length - 1].toString();
  125. }
  126. // If unable to find something better return the same string
  127. return word;
  128. }
  129. @Override
  130. public boolean containsWord(String word) {
  131. return dictionary.containsKey(word);
  132. }
  133. /**
  134. * Return a list of strings which are words similar to our one which could potentially be misspellings
  135. * Abuse the fact that a char can be used as an integer
  136. * Assume that they got the first letter correct for all edits to cut on CPU burn time
  137. */
  138. private List<String> wordEdits(String word) {
  139. List<String> closeWords = new ArrayList<String>();
  140. for (int i = 1; i < word.length() + 1; i++) {
  141. for (char character = 'a'; character <= 'z'; character++) {
  142. // Maybe they forgot to type a letter? Try adding one
  143. StringBuilder sb = new StringBuilder(word);
  144. sb.insert(i, character);
  145. closeWords.add(sb.toString());
  146. }
  147. if (closeWords.size() > this.VARIATIONSCOUNT) {
  148. return closeWords;
  149. }
  150. }
  151. for (int i = 1; i < word.length(); i++) {
  152. for (char character = 'a'; character <= 'z'; character++) {
  153. // Maybe they mistyped a single letter? Try replacing them all
  154. StringBuilder sb = new StringBuilder(word);
  155. sb.setCharAt(i, character);
  156. closeWords.add(sb.toString());
  157. // Maybe they added an extra letter? Try deleting one
  158. sb = new StringBuilder(word);
  159. sb.deleteCharAt(i);
  160. closeWords.add(sb.toString());
  161. }
  162. if (closeWords.size() > this.VARIATIONSCOUNT) {
  163. return closeWords;
  164. }
  165. }
  166. return closeWords;
  167. }
  168. }