PageRenderTime 2ms CodeModel.GetById 27ms app.highlight 12ms RepoModel.GetById 0ms app.codeStats 0ms

/src/main/java/com/searchcode/app/util/SearchcodeSpellingCorrector.java

https://github.com/boyter/searchcode-server
Java | 213 lines | 134 code | 45 blank | 34 comment | 29 complexity | 027de46d0de72b0d65b650fe2ff63986 MD5 | raw file
  1/*
  2 * Copyright (c) 2016 Boyter Online Services
  3 *
  4 * Use of this software is governed by the Fair Source License included
  5 * in the LICENSE.TXT file, but will be eventually open under GNU General Public License Version 3
  6 * see the README.md for when this clause will take effect
  7 *
  8 * Version 1.3.15
  9 */
 10
 11package com.searchcode.app.util;
 12
 13
 14import com.searchcode.app.config.Values;
 15import com.searchcode.app.service.Singleton;
 16
 17import java.util.*;
 18import java.util.stream.Collectors;
 19
 20/**
 21 * A simple spell checker based on a few implementations such as the infamous Peter Noving spell checker and
 22 * the like. Attempts to be highly performing by never changing the first character (for the first pass)
 23 * since we can usually assume that the user got that correct.
 24 */
 25public class SearchcodeSpellingCorrector implements ISpellingCorrector {
 26
 27    // How many terms to keep in the LRUCACHE
 28    private int LRUCOUNT = Integer.parseInt(Values.DEFAULTSPELLINGCORRECTORSIZE);
 29
 30    private int VARIATIONSCOUNT = 200000;
 31
 32    // word to count map - how may times a word is present - or a weight attached to a word
 33    private Map<String, Integer> dictionary = null;
 34
 35    public SearchcodeSpellingCorrector() {
 36        this.LRUCOUNT = Integer.parseInt(Properties.getProperties().getProperty(Values.SPELLINGCORRECTORSIZE, Values.DEFAULTSPELLINGCORRECTORSIZE));
 37        if (this.LRUCOUNT <= 0) {
 38            this.LRUCOUNT = Integer.parseInt(Values.DEFAULTSPELLINGCORRECTORSIZE);
 39        }
 40
 41        this.dictionary = Collections.synchronizedMap(new LruCache<>(this.LRUCOUNT));
 42    }
 43
 44    @Override
 45    public int getWordCount() {
 46        return dictionary.size();
 47    }
 48
 49    @Override
 50    public boolean reset() {
 51        this.dictionary.clear();
 52        return true;
 53    }
 54
 55    @Override
 56    public List<String> getSampleWords(int count) {
 57        List<String> sampleWords = new ArrayList<>();
 58
 59        for (Map.Entry<String, Integer> entry : this.dictionary.entrySet()) {
 60            sampleWords.add(entry.getValue() + " - " + entry.getKey());
 61        }
 62
 63        int end = sampleWords.size() >= 10 ? 10 : sampleWords.size();
 64
 65        return sampleWords.subList(0, end);
 66    }
 67
 68    @Override
 69    public void putWord(String word) {
 70        word = word.toLowerCase();
 71        if (dictionary.containsKey(word)) {
 72            dictionary.put(word, (dictionary.get(word) + 1));
 73        } else {
 74            dictionary.put(word, 1);
 75        }
 76    }
 77
 78    @Override
 79    public String correct(String word) {
 80        if (Singleton.getHelpers().isNullEmptyOrWhitespace(word)) {
 81            return word;
 82        }
 83
 84        word = word.toLowerCase();
 85
 86        // If the word exists in our dictionary then return
 87        if (dictionary.containsKey(word)) {
 88            return word;
 89        }
 90
 91        Map<String, Integer> possibleMatches = new HashMap<>();
 92
 93        List<String> closeEdits = this.wordEdits(word);
 94        for (String closeEdit : closeEdits) {
 95            if (dictionary.containsKey(closeEdit)) {
 96                possibleMatches.put(closeEdit, this.dictionary.get(closeEdit));
 97            }
 98        }
 99
100        if (closeEdits.size() > VARIATIONSCOUNT) {
101            closeEdits = closeEdits.subList(0, VARIATIONSCOUNT);
102        }
103
104        if (!possibleMatches.isEmpty()) {
105            // Sorted least likely first
106            Object[] matches = Singleton.getHelpers().sortByValue(possibleMatches).keySet().toArray();
107
108            // Try to match anything of the same length first
109            String bestMatch = Values.EMPTYSTRING;
110            for (Object o : matches) {
111                if (o.toString().length() == word.length()) {
112                    bestMatch = o.toString();
113                }
114            }
115
116            if (!Singleton.getHelpers().isNullEmptyOrWhitespace(bestMatch)) {
117                return bestMatch;
118            }
119
120            // Just return whatever is the best match
121            return matches[matches.length - 1].toString();
122        }
123
124        // Ok we did't find anything, so lets run the edits function on the previous results and use those
125        // this gives us results which are 2 characters away from whatever was entered
126        List<String> furtherEdits = new ArrayList<>();
127        for (String closeEdit : closeEdits) {
128            furtherEdits.addAll(this.wordEdits(closeEdit));
129
130            if (furtherEdits.size() > this.VARIATIONSCOUNT) {
131                break;
132            }
133        }
134
135        for (String furtherEdit : furtherEdits) {
136            if (dictionary.containsKey(furtherEdit)) {
137                possibleMatches.put(furtherEdit, this.dictionary.get(furtherEdit));
138            }
139        }
140
141        if (!possibleMatches.isEmpty()) {
142            // Sorted least likely first
143            Object[] matches = Singleton.getHelpers().sortByValue(possibleMatches).keySet().toArray();
144
145            // Try to match anything of the same length first
146            String bestMatch = Values.EMPTYSTRING;
147            for (Object o : matches) {
148                if (o.toString().length() == word.length()) {
149                    bestMatch = o.toString();
150                }
151            }
152
153            if (!Singleton.getHelpers().isNullEmptyOrWhitespace(bestMatch)) {
154                return bestMatch;
155            }
156
157            // Just return whatever is the best match
158            return matches[matches.length - 1].toString();
159        }
160
161
162        // If unable to find something better return the same string
163        return word;
164    }
165
166    @Override
167    public boolean containsWord(String word) {
168        return dictionary.containsKey(word);
169    }
170
171
172    /**
173     * Return a list of strings which are words similar to our one which could potentially be misspellings
174     * Abuse the fact that a char can be used as an integer
175     * Assume that they got the first letter correct for all edits to cut on CPU burn time
176     */
177    private List<String> wordEdits(String word) {
178        List<String> closeWords = new ArrayList<String>();
179
180        for (int i = 1; i < word.length() + 1; i++) {
181            for (char character = 'a'; character <= 'z'; character++) {
182                // Maybe they forgot to type a letter? Try adding one
183                StringBuilder sb = new StringBuilder(word);
184                sb.insert(i, character);
185                closeWords.add(sb.toString());
186            }
187
188            if (closeWords.size() > this.VARIATIONSCOUNT) {
189                return closeWords;
190            }
191        }
192
193        for (int i = 1; i < word.length(); i++) {
194            for (char character = 'a'; character <= 'z'; character++) {
195                // Maybe they mistyped a single letter? Try replacing them all
196                StringBuilder sb = new StringBuilder(word);
197                sb.setCharAt(i, character);
198                closeWords.add(sb.toString());
199
200                // Maybe they added an extra letter? Try deleting one
201                sb = new StringBuilder(word);
202                sb.deleteCharAt(i);
203                closeWords.add(sb.toString());
204            }
205
206            if (closeWords.size() > this.VARIATIONSCOUNT) {
207                return closeWords;
208            }
209        }
210
211        return closeWords;
212    }
213}