/plugins/VoxSpell/tags/release-1.0.2/voxspellcheck/SuggestionTree.java

# · Java · 213 lines · 156 code · 24 blank · 33 comment · 47 complexity · 2cd7fcd01f2b85ff190438ef816d5221 MD5 · raw file

  1. /*
  2. Copyright (C) 2008 Matthew Gilbert
  3. This program is free software; you can redistribute it and/or
  4. modify it under the terms of the GNU General Public License
  5. as published by the Free Software Foundation; either version 2
  6. of the License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. */
  15. package voxspellcheck;
  16. import java.util.Vector;
  17. import java.util.TreeMap;
  18. import java.util.SortedMap;
  19. import java.util.TreeSet;
  20. import java.util.Collections;
  21. import java.util.Comparator;
  22. import java.util.Set;
  23. import java.util.HashSet;
  24. import org.gjt.sp.util.Log;
  25. public class SuggestionTree
  26. {
  27. protected SortedMap<String, TreeSet<String>> map;
  28. protected WordTrie word_trie;
  29. protected OffsetTrie offset_trie;
  30. public SuggestionTree(OffsetTrie offset_trie_)
  31. {
  32. map = new TreeMap<String, TreeSet<String>>();
  33. offset_trie = offset_trie_;
  34. Vector<String> words = offset_trie.getWords();
  35. for (String s : words) {
  36. String meta = new DoubleMetaphone().encode(s);
  37. if (map.get(meta) == null) {
  38. map.put(meta, new TreeSet<String>());
  39. }
  40. map.get(meta).add(s);
  41. }
  42. word_trie = new WordTrie();
  43. }
  44. public void addWord(String word)
  45. {
  46. if (word.length() > 0) {
  47. String meta = new DoubleMetaphone().encode(word);
  48. if (map.get(meta) == null) {
  49. map.put(meta, new TreeSet<String>());
  50. }
  51. TreeSet<String> words = map.get(meta);
  52. words.add(word);
  53. word_trie.addWord(word);
  54. }
  55. }
  56. /* Find valid words that can be formed from a simple character swap. */
  57. public TreeSet<String> getPermutations(String word)
  58. {
  59. TreeSet<String> permutations = new TreeSet<String>();
  60. int length = word.length();
  61. for (int i = 0; i < length - 1; ++i) {
  62. char[] chars = new char[length];
  63. for (int j = 0; j < length; ++j) {
  64. if (j == i)
  65. chars[j] = word.charAt(j + 1);
  66. else if (j == (i + 1))
  67. chars[j] = word.charAt(j - 1);
  68. else
  69. chars[j] = word.charAt(j);
  70. }
  71. String new_word = new String(chars);
  72. if (offset_trie.find(new_word) || word_trie.find(new_word)) {
  73. permutations.add(new_word);
  74. }
  75. }
  76. return permutations;
  77. }
  78. public TreeSet<String> getSoundalike(String word)
  79. {
  80. DoubleMetaphone dm = new DoubleMetaphone();
  81. String meta = dm.encode(word);
  82. TreeSet<String> suggestions = map.get(meta);
  83. if (suggestions == null)
  84. suggestions = new TreeSet<String>();
  85. String higher = meta;
  86. String lower = meta;
  87. int num_tries = 0;
  88. // TODO: What's appropriate here?
  89. //while (suggestions.size() < 1000) {
  90. while (num_tries < 50) {
  91. //higher = (higher != null) ? map.higherKey(higher) : null;
  92. if (higher != null) {
  93. SortedMap<String, TreeSet<String>> tail = map.tailMap(higher);
  94. Object[] keys = tail.keySet().toArray();
  95. if (keys.length > 0) {
  96. // If higher isn't in the tail, then the first
  97. // result is what we want (i.e. meta is not in
  98. // map, so the next higher or equal
  99. // is a new value). Otherwise, take the next
  100. // higher key.
  101. if (!higher.equals((String)keys[0])) {
  102. higher = (String)keys[0];
  103. } else if (keys.length > 1) {
  104. higher = (String)keys[1];
  105. } else {
  106. higher = null;
  107. }
  108. } else {
  109. higher = null;
  110. }
  111. }
  112. if (higher != null) {
  113. TreeSet<String> add = map.get(higher);
  114. if (add != null)
  115. suggestions.addAll(add);
  116. }
  117. //lower = (lower != null) ? map.lowerKey(lower) : null;
  118. if (lower != null) {
  119. // headMap does not include lower, so don't need to do
  120. // the filtering like for tailMap.
  121. SortedMap<String, TreeSet<String>> head = map.headMap(lower);
  122. lower = null;
  123. try {
  124. lower = head.lastKey();
  125. } catch (java.util.NoSuchElementException ex) {
  126. ;
  127. }
  128. }
  129. if (higher != null) {
  130. TreeSet<String> add = map.get(higher);
  131. if (add != null)
  132. suggestions.addAll(add);
  133. }
  134. if ((lower == null) && (higher == null))
  135. break;
  136. ++num_tries;
  137. }
  138. return suggestions;
  139. }
  140. public Vector<String> getStartsWith(String word)
  141. {
  142. Vector<String> vec = new Vector<String>();
  143. vec.addAll(offset_trie.getWords(word));
  144. vec.addAll(word_trie.getWords(word));
  145. return vec;
  146. }
  147. public Vector<String> getSuggestions(String word)
  148. {
  149. if (word == null || word.trim().length() == 0)
  150. return null;
  151. word = word.trim();
  152. TreeSet<String> permutations = getPermutations(word);
  153. TreeSet<String> suggestions = new TreeSet<String>();
  154. permutations.add(word);
  155. for (String s : permutations)
  156. suggestions.addAll(getSoundalike(s));
  157. suggestions.addAll(getStartsWith(word));
  158. final String input = word;
  159. Comparator<String> c = new Comparator<String>() {
  160. public String s;
  161. public int compare(String s1, String s2) {
  162. int d1 = LevenshteinDistance.LD(input, s1);
  163. int d2 = LevenshteinDistance.LD(input, s2);
  164. return d1 - d2;
  165. }
  166. public boolean equals(Object obj) {
  167. return false;
  168. }
  169. };
  170. /* Sort the words according the levenshtein distance using the
  171. above comparator. The suggestion tree can have the same word
  172. appear twice, so make a unique list. */
  173. Vector<String> vec_suggestions = new Vector<String>(suggestions);
  174. Collections.sort(vec_suggestions, c);
  175. HashSet<String> unique_db = new HashSet<String>();
  176. Vector<String> unique = new Vector<String>();
  177. for (String s : vec_suggestions) {
  178. if (unique_db.contains(s))
  179. continue;
  180. unique.add(s);
  181. }
  182. // Cut it down to the first 100
  183. unique = new Vector<String>(unique.subList(0, 100));
  184. return unique;
  185. }
  186. }