/plugins/VoxSpell/tags/release-1.0.1/voxspellcheck/WordTrie.java

# · Java · 454 lines · 361 code · 59 blank · 34 comment · 73 complexity · d6547cd1924b0d1e4e1c3d281fc88986 MD5 · raw file

  1. /*
  2. Copyright (C) 2008 Matthew Gilbert
  3. This program is free software; you can redistribute it and/or
  4. modify it under the terms of the GNU General Public License
  5. as published by the Free Software Foundation; either version 2
  6. of the License, or (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  14. */
  15. package voxspellcheck;
  16. import java.lang.Comparable;
  17. import java.util.Vector;
  18. import java.util.Stack;
  19. import java.io.BufferedReader;
  20. import java.io.DataInputStream;
  21. import java.io.DataOutputStream;
  22. public class WordTrie implements SpellCheck
  23. {
  24. private static class WordChar implements Comparable
  25. {
  26. public Character c;
  27. public Node next;
  28. public WordChar(Character c_, Node n_)
  29. {
  30. c = c_;
  31. next = n_;
  32. }
  33. public WordChar(Character c_)
  34. {
  35. c = c_;
  36. next = new Node();
  37. }
  38. public int compareTo(Object obj)
  39. {
  40. if (obj instanceof WordChar) {
  41. WordChar wc = (WordChar)obj;
  42. return c.compareTo(wc.c);
  43. } else if (obj instanceof Character) {
  44. Character other_c = (Character)obj;
  45. return c.compareTo(other_c);
  46. } else {
  47. throw new java.lang.IllegalArgumentException();
  48. }
  49. }
  50. public boolean equals(Object obj)
  51. {
  52. if (obj instanceof WordChar) {
  53. WordChar wc = (WordChar)obj;
  54. return c.equals(wc.c);
  55. } else if (obj instanceof Character) {
  56. Character other_c = (Character)obj;
  57. return c.equals(other_c);
  58. } else {
  59. throw new java.lang.IllegalArgumentException();
  60. }
  61. }
  62. }
  63. private static class Node
  64. {
  65. public short length;
  66. public WordChar[] chars;
  67. public Node()
  68. {
  69. length = 0;
  70. chars = new WordChar[0];
  71. }
  72. public boolean contains(Character c)
  73. {
  74. for (WordChar wc : chars) {
  75. if (wc.equals(c))
  76. return true;
  77. if (wc.compareTo(c) > 0)
  78. break;
  79. }
  80. return false;
  81. }
  82. public void set(Character[] array)
  83. {
  84. length = (short)array.length;
  85. chars = new WordChar[array.length];
  86. for (int i = 0; i < length; i++) {
  87. if (array[i].equals(Character.MIN_VALUE)) {
  88. chars[i] = new WordChar(array[i], null);
  89. } else {
  90. chars[i] = new WordChar(array[i]);
  91. }
  92. }
  93. }
  94. public void add(WordChar wc)
  95. {
  96. WordChar[] new_chars = new WordChar[chars.length + 1];
  97. int i;
  98. for (i = 0; i < chars.length; i++) {
  99. new_chars[i] = chars[i];
  100. }
  101. new_chars[i] = wc;
  102. java.util.Arrays.sort(new_chars);
  103. chars = new_chars;
  104. length++;
  105. }
  106. public WordChar get(Character c)
  107. {
  108. for (WordChar wc : chars) {
  109. if (wc.equals(c))
  110. return wc;
  111. if (wc.compareTo(c) > 0)
  112. break;
  113. }
  114. return null;
  115. }
  116. public boolean remove(Character c)
  117. {
  118. if (!contains(c))
  119. return false;
  120. --length;
  121. WordChar[] new_chars = new WordChar[length];
  122. int i = 0;
  123. for (WordChar wc : chars) {
  124. if (wc.equals(c))
  125. continue;
  126. new_chars[i++] = wc;
  127. }
  128. chars = new_chars;
  129. return true;
  130. }
  131. public int length()
  132. {
  133. return chars.length;
  134. }
  135. public String toString()
  136. {
  137. StringBuffer buf = new StringBuffer();
  138. buf.append(length);
  139. buf.append(" - ");
  140. for (WordChar wc : chars) {
  141. buf.append(wc.c + " ");
  142. }
  143. return buf.toString();
  144. }
  145. }
  /** Root node of the trie; every lookup and insertion starts here. */
  public Node root;

  /** Creates an empty trie whose root has no outgoing edges. */
  public WordTrie()
  {
    this.root = new Node();
  }
  151. public boolean add(Node node, String word)
  152. {
  153. if (word.length() == 0) {
  154. if (node.contains(Character.MIN_VALUE)) {
  155. return false;
  156. }
  157. node.add(new WordChar(Character.MIN_VALUE, null));
  158. return true;
  159. }
  160. Character cur = word.charAt(0);
  161. if (!node.contains(cur)) {
  162. node.add(new WordChar(cur));
  163. }
  164. return add(node.get(cur).next, word.substring(1));
  165. }
  166. public void addWord(String word)
  167. {
  168. String trimmed = word.trim();
  169. add(root, trimmed);
  170. }
  171. public void addWordList(BufferedReader input)
  172. {
  173. while (true) {
  174. String line;
  175. try {
  176. line = input.readLine();
  177. } catch (java.io.IOException ex) {
  178. break;
  179. }
  180. if (line == null) {
  181. break;
  182. }
  183. String trimmed = line.trim();
  184. if (trimmed.length() == 0)
  185. continue;
  186. add(root, trimmed);
  187. }
  188. }
  189. public boolean find(Node node, String word)
  190. {
  191. if (word.length() == 0) {
  192. if (node.contains(Character.MIN_VALUE))
  193. return true;
  194. return false;
  195. }
  196. Character cur = word.charAt(0);
  197. if (!node.contains(cur)) {
  198. return false;
  199. }
  200. return find(node.get(cur).next, word.substring(1));
  201. }
  202. public boolean find(String word)
  203. {
  204. return find(root, word);
  205. }
  206. // Returns the node.next of the last character of word; or null if any
  207. // character in word isn't found.
  208. protected Node findNode(Node node, String word)
  209. {
  210. if (word.length() == 0)
  211. return node;
  212. Character cur = word.charAt(0);
  213. if (!node.contains(cur)) {
  214. return null;
  215. }
  216. return findNode(node.get(cur).next, word.substring(1));
  217. }
  218. public void write(DataOutputStream writer, Node node) throws java.io.IOException
  219. {
  220. writer.writeShort(node.length);
  221. for (WordChar wc : node.chars)
  222. writer.writeChar(wc.c);
  223. for (WordChar wc : node.chars) {
  224. if (!wc.c.equals(Character.MIN_VALUE))
  225. write(writer, wc.next);
  226. }
  227. }
  228. public void write(DataOutputStream writer) throws java.io.IOException
  229. {
  230. write(writer, root);
  231. }
  232. public void read(DataInputStream reader, Node node) throws java.io.IOException
  233. {
  234. int length;
  235. try {
  236. length = reader.readShort();
  237. } catch (java.io.EOFException ex) {
  238. return;
  239. }
  240. Character[] array = new Character[length];
  241. for (int i = 0; i < length; i++) {
  242. try {
  243. array[i] = reader.readChar();
  244. } catch (java.io.EOFException ex) {
  245. return;
  246. }
  247. }
  248. node.set(array);
  249. for (WordChar wc : node.chars) {
  250. if (wc.c != Character.MIN_VALUE)
  251. read(reader, wc.next);
  252. }
  253. }
  254. public void read(DataInputStream reader) throws java.io.IOException
  255. {
  256. read(reader, root);
  257. }
  258. private int getNodeCount(Node node)
  259. {
  260. int count = 0;
  261. for (WordChar wc : node.chars) {
  262. if (!wc.c.equals(Character.MIN_VALUE)) {
  263. count += getNodeCount(wc.next);
  264. }
  265. }
  266. // Finally, include this node and return.
  267. return count++;
  268. }
  269. public void write_bf(DataOutputStream writer) throws java.io.IOException
  270. {
  271. Vector<Node> cur = null;
  272. Vector<Node> next = new Vector<Node>();
  273. next.add(root);
  274. do {
  275. cur = next;
  276. next = new Vector<Node>();
  277. for (Node node : cur) {
  278. writer.writeShort(node.length);
  279. for (WordChar wc : node.chars) {
  280. writer.writeChar(wc.c);
  281. if (!wc.c.equals(Character.MIN_VALUE))
  282. next.add(wc.next);
  283. }
  284. }
  285. } while (next.size() > 0);
  286. }
  287. public void read_bf(DataInputStream reader) throws java.io.IOException
  288. {
  289. Vector<Node> cur = null;
  290. Vector<Node> next = new Vector<Node>();
  291. next.add(root);
  292. do {
  293. cur = next;
  294. next = new Vector<Node>();
  295. for (Node node : cur) {
  296. int length = reader.readShort();
  297. WordChar[] chars = new WordChar[length];
  298. node.set(new Character[0]);
  299. for (int i = 0; i < length; ++i) {
  300. WordChar wc = new WordChar(reader.readChar());
  301. chars[i] = wc;
  302. node.add(wc);
  303. if (!wc.c.equals(Character.MIN_VALUE))
  304. next.add(wc.next);
  305. }
  306. }
  307. } while (next.size() > 0);
  308. }
  309. protected int bloom(String s)
  310. {
  311. int res = 0;
  312. for (Character c : s.toCharArray()) {
  313. int i = Character.getNumericValue(c);
  314. res |= (1 << (i & 0x1f));
  315. }
  316. return res;
  317. }
  318. protected void getWords(Vector<String> vec,
  319. Stack<Character> stack,
  320. Node node,
  321. int filter)
  322. {
  323. for (WordChar wc : node.chars) {
  324. if (wc.c.equals(Character.MIN_VALUE)) {
  325. char[] chars = new char[stack.size()];
  326. for (int i = 0; i < stack.size(); ++i) {
  327. chars[i] = stack.get(i);
  328. }
  329. String s = new String(chars);
  330. if (((filter ^ bloom(s)) & filter) == 0)
  331. vec.add(s);
  332. } else {
  333. stack.push(wc.c);
  334. getWords(vec, stack, wc.next, filter);
  335. stack.pop();
  336. }
  337. }
  338. }
  339. public Vector<String> getWords()
  340. {
  341. Vector<String> vec = new Vector<String>();
  342. Stack<Character> stack = new Stack<Character>();
  343. getWords(vec, stack, root, 0);
  344. return vec;
  345. }
  346. // FIXME: This is a hacked interface to support the bloom filter.
  347. public Vector<String> getWords(String prefix)
  348. {
  349. Vector<String> vec = new Vector<String>();
  350. if (prefix.length() == 0)
  351. return vec;
  352. Stack<Character> stack = new Stack<Character>();
  353. stack.push(prefix.charAt(0));
  354. //for (Character c : prefix.substring(0, 1).toCharArray())
  355. // stack.push(c);
  356. Node node = findNode(root, prefix.substring(0, 1));
  357. if (node != null) {
  358. getWords(vec, stack, node, bloom(prefix));
  359. }
  360. return vec;
  361. }
  362. public boolean removeWord(String word)
  363. {
  364. if (!find(word))
  365. return false;
  366. // First remove MIN_VALUE WordChar from final node. Then
  367. // loop through removing WordChar's from nodes if the next
  368. // pointer points to a node with no WordChars.
  369. // findNode will return the node with MIN_VALUE in it for word.
  370. Node node = findNode(root, word);
  371. node.remove(Character.MIN_VALUE);
  372. Stack<Node> stack = new Stack<Node>();
  373. // word has to have at least 1 letter, so push root. findNode always
  374. // returns node.next of the last character, so root can never be
  375. // returned.
  376. stack.push(root);
  377. // Unconventional loop: i is the last param to substring, so 1 past the
  378. // char we want. The last iteration of i will return the node of the
  379. // last letter in word, even though substring doesn't contain the last
  380. // letter. FIXME: fix findNode.
  381. for (int i = 1; i < word.length(); ++i)
  382. stack.push(findNode(root, word.substring(0, i)));
  383. int i = word.length() - 1;
  384. while (!stack.empty()) {
  385. node = stack.pop();
  386. WordChar wc = node.get(word.charAt(i--));
  387. if ((wc.next != null) && (wc.next.length() == 0))
  388. node.remove(wc.c);
  389. }
  390. return true;
  391. }
  392. }