/src/org/arabidopsis/ahocorasick/AhoCorasick.java

http://github.com/DRMacIver/aho-corasick · Java · 178 lines · 76 code · 38 blank · 64 comment · 16 complexity · ae9b906451b1caba0e6970d93e7f4aa8 MD5 · raw file

  1. package org.arabidopsis.ahocorasick;
  2. import java.util.Iterator;
  3. /**
  4. <p>An implementation of the Aho-Corasick string searching
  5. automaton. This implementation of the <a
  6. href="http://portal.acm.org/citation.cfm?id=360855&dl=ACM&coll=GUIDE"
  7. target="_blank">Aho-Corasick</a> algorithm is optimized to work
  8. with bytes.</p>
  9. <p>
  10. Example usage:
  11. <code><pre>
  12. AhoCorasick tree = new AhoCorasick();
  13. tree.add("hello".getBytes(), "hello");
  14. tree.add("world".getBytes(), "world");
  15. tree.prepare();
  16. Iterator searcher = tree.search("hello world".getBytes());
  17. while (searcher.hasNext()) {
  18. SearchResult result = searcher.next();
  19. System.out.println(result.getOutputs());
  20. System.out.println("Found at index: " + result.getLastIndex());
  21. }
  22. </pre></code>
  23. </p>
  24. <h2>Recent changes</h2>
  25. <ul>
  26. <li>Per user request from Carsten Kruege, I've
  27. changed the signature of State.getOutputs() and
  28. SearchResults.getOutputs() to Sets rather than Lists.
  29. </li>
  30. </ul>
  31. */
  32. public class AhoCorasick {
  33. private State root;
  34. private boolean prepared;
  35. public AhoCorasick() {
  36. this.root = new State();
  37. this.prepared = false;
  38. }
  39. /**
  40. Adds a new keyword with the given output. During search, if
  41. the keyword is matched, output will be one of the yielded
  42. elements in SearchResults.getOutputs().
  43. */
  44. public void add(byte[] keyword, int output) {
  45. if (this.prepared)
  46. throw new IllegalStateException
  47. ("can't add keywords after prepare() is called");
  48. State lastState = this.root.extendAll(keyword);
  49. lastState.addOutput(output);
  50. }
  51. /**
  52. Prepares the automaton for searching. This must be called
  53. before any searching().
  54. */
  55. public void prepare() {
  56. this.prepareFailTransitions();
  57. this.prepared = true;
  58. }
  59. /**
  60. Starts a new search, and returns an Iterator of SearchResults.
  61. */
  62. public Iterator<SearchResult> search(byte[] bytes) {
  63. return new Searcher(this, this.startSearch(bytes));
  64. }
  65. /** DANGER DANGER: dense algorithm code ahead. Very order
  66. dependent. Initializes the fail transitions of all states
  67. except for the root.
  68. */
  69. private void prepareFailTransitions() {
  70. Queue q = new Queue();
  71. for(int i = 0; i < 256; i++)
  72. if (this.root.get((byte) i) != null) {
  73. this.root.get((byte) i).setFail(this.root);
  74. q.add(this.root.get((byte) i));
  75. }
  76. this.prepareRoot();
  77. while (! q.isEmpty()) {
  78. State state = q.pop();
  79. byte[] keys = state.keys();
  80. for (int i = 0; i < keys.length; i++) {
  81. State r = state;
  82. byte a = keys[i];
  83. State s = r.get(a);
  84. q.add(s);
  85. r = r.getFail();
  86. while (r.get(a) == null)
  87. r = r.getFail();
  88. s.setFail(r.get(a));
  89. s.addAllOutputs(r.get(a));
  90. }
  91. }
  92. }
  93. /** Sets all the out transitions of the root to itself, if no
  94. transition yet exists at this point.
  95. */
  96. private void prepareRoot() {
  97. for(int i = 0; i < 256; i++)
  98. if (this.root.get((byte) i) == null)
  99. this.root.put((byte) i, this.root);
  100. }
  101. /**
  102. Returns the root of the tree. Package protected, since the
  103. user probably shouldn't touch this.
  104. */
  105. State getRoot() {
  106. return this.root;
  107. }
  108. /**
  109. Begins a new search using the raw interface. Package protected.
  110. */
  111. SearchResult startSearch(byte[] bytes) {
  112. if (! this.prepared)
  113. throw new IllegalStateException
  114. ("can't start search until prepare()");
  115. return continueSearch
  116. (new SearchResult(this.root, bytes, 0));
  117. }
  118. /**
  119. Continues the search, given the initial state described by the
  120. lastResult. Package protected.
  121. */
  122. SearchResult continueSearch(SearchResult lastResult) {
  123. byte[] bytes = lastResult.bytes;
  124. State state = lastResult.lastMatchedState;
  125. for (int i = lastResult.lastIndex; i < bytes.length; i++) {
  126. byte b = bytes[i];
  127. while (state.get(b) == null)
  128. state = state.getFail();
  129. state = state.get(b);
  130. if (state.getOutputs().length > 0)
  131. return new SearchResult(state, bytes, i+1);
  132. }
  133. return null;
  134. }
  135. }