/jEdit/branches/plugin_packages/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

# · Java · 316 lines · 147 code · 41 blank · 128 comment · 29 complexity · b73175fafe014829c17881a3165b992f MD5 · raw file

  1. /*
  2. * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3. * Boyer-Moore algorithm
  4. * :tabSize=8:indentSize=8:noTabs=false:
  5. * :folding=explicit:collapseFolds=1:
  6. *
  7. * Copyright (C) 1999, 2000 mike dillon
  8. * Portions copyright (C) 2001 Tom Locke
  9. * Portions copyright (C) 2001, 2002 Slava Pestov
  10. *
  11. * This program is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU General Public License
  13. * as published by the Free Software Foundation; either version 2
  14. * of the License, or any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  24. */
  25. package org.gjt.sp.jedit.search;
  26. /**
  27. * Implements literal search using the Boyer-Moore algorithm.
  28. * @version $Id: BoyerMooreSearchMatcher.java 17598 2010-04-08 14:52:32Z kpouer $
  29. */
  30. public class BoyerMooreSearchMatcher extends SearchMatcher
  31. {
  32. //{{{ BoyerMooreSearchMatcher constructor
  33. /**
  34. * Creates a new string literal matcher.
  35. * @param pattern the search pattern
  36. * @param ignoreCase <code>true</code> if you want to ignore case
  37. */
  38. public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase)
  39. {
  40. this.pattern = pattern.toCharArray();
  41. if(ignoreCase)
  42. {
  43. for(int i = 0; i < this.pattern.length; i++)
  44. {
  45. this.pattern[i] = Character.toUpperCase(
  46. this.pattern[i]);
  47. }
  48. }
  49. this.ignoreCase = ignoreCase;
  50. pattern_end = this.pattern.length - 1;
  51. } //}}}
  52. //{{{ nextMatch() method
  53. /**
  54. * Returns the offset of the first match of the specified text
  55. * within this matcher.
  56. * @param text The text to search in
  57. * @param start True if the start of the segment is the beginning of the
  58. * buffer
  59. * @param end True if the end of the segment is the end of the buffer
  60. * @param firstTime If false and the search string matched at the start
  61. * offset with length zero, automatically find next match
  62. * @param reverse If true, searching will be performed in a backward
  63. * direction.
  64. * @return an array where the first element is the start offset
  65. * of the match, and the second element is the end offset of
  66. * the match
  67. * @since jEdit 4.2pre4
  68. */
  69. @Override
  70. public SearchMatcher.Match nextMatch(CharSequence text,
  71. boolean start, boolean end, boolean firstTime,
  72. boolean reverse)
  73. {
  74. int pos = match(text,reverse);
  75. if (pos == -1)
  76. {
  77. return null;
  78. }
  79. else
  80. {
  81. returnValue.start = pos;
  82. returnValue.end = pos + pattern.length;
  83. return returnValue;
  84. }
  85. } //}}}
  86. //{{{ match() method
  87. /**
  88. * a good introduction to the Boyer-Moore fast string matching
  89. * algorithm may be found on Moore's website at:
  90. *
  91. * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
  92. *
  93. * @since jEdit 4.3pre5
  94. */
  95. public int match(CharSequence text, boolean reverse)
  96. {
  97. //{{{
  98. // lazily create skip and suffix arrays for either the
  99. // search pattern, or the reversed search pattern
  100. int[] skip, suffix;
  101. if(reverse)
  102. {
  103. if(back_skip == null)
  104. {
  105. back_skip = generateSkipArray(true);
  106. back_suffix = generateSuffixArray(true);
  107. }
  108. skip = back_skip;
  109. suffix = back_suffix;
  110. }
  111. else
  112. {
  113. if(fwd_skip == null)
  114. {
  115. fwd_skip = generateSkipArray(false);
  116. fwd_suffix = generateSuffixArray(false);
  117. }
  118. skip = fwd_skip;
  119. suffix = fwd_suffix;
  120. } //}}}
  121. // position variable for pattern test position
  122. int pos;
  123. // position variable for pattern start
  124. int anchor = 0;
  125. // last possible start position of a match with this pattern;
  126. // this is negative if the pattern is longer than the text
  127. // causing the search loop below to immediately fail
  128. //int last_anchor = reverseSearch
  129. // ? offset + pattern.length - 1
  130. // : length - pattern.length;
  131. char ch = 0;
  132. int bad_char;
  133. int good_suffix;
  134. // the search works by starting the anchor (first character
  135. // of the pattern) at the initial offset. as long as the
  136. // anchor is far enough from the enough of the text for the
  137. // pattern to match, and until the pattern matches, we
  138. // compare the pattern to the text from the last character
  139. // to the first character in reverse order. where a character
  140. // in the pattern mismatches, we use the two heuristics
  141. // based on the mismatch character and its position in the
  142. // pattern to determine the furthest we can move the anchor
  143. // without missing any potential pattern matches.
  144. SEARCH:
  145. while (anchor + pattern_end < text.length())
  146. {
  147. for (pos = pattern_end; pos >= 0; --pos)
  148. {
  149. ch = text.charAt(pos + anchor);
  150. if(ignoreCase)
  151. ch = Character.toUpperCase(ch);
  152. // pattern test
  153. if ((reverse ? ch != pattern[pattern_end - pos]
  154. : ch != pattern[pos]))
  155. {
  156. // character mismatch, determine how many characters to skip
  157. // heuristic #1
  158. bad_char = pos - skip[getSkipIndex(ch)];
  159. // heuristic #2
  160. good_suffix = suffix[pos];
  161. // skip the greater of the two distances provided by the
  162. // heuristics
  163. int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
  164. anchor += skip_index;
  165. // go back to the while loop
  166. continue SEARCH;
  167. }
  168. }
  169. // MATCH: return the position of its first character
  170. return anchor;
  171. }
  172. // MISMATCH: return -1 as defined by API
  173. return -1;
  174. } //}}}
  175. //{{{ toString() method
  176. public String toString()
  177. {
  178. return "BoyerMooreSearchMatcher[" + new String(pattern) + ',' + ignoreCase + ']';
  179. } //}}}
  180. //{{{ Private members
  181. private char[] pattern;
  182. private int pattern_end;
  183. private boolean ignoreCase;
  184. // Boyer-Moore member fields
  185. private int[] fwd_skip;
  186. private int[] fwd_suffix;
  187. private int[] back_skip;
  188. private int[] back_suffix;
  189. // Boyer-Moore helper methods
  190. //{{{ generateSkipArray() method
  191. /*
  192. * the 'skip' array is used to determine for each index in the
  193. * hashed alphabet how many characters can be skipped if
  194. * a mismatch occurs on a characater hashing to that index.
  195. */
  196. private int[] generateSkipArray(boolean reverse)
  197. {
  198. // initialize the skip array to all zeros
  199. int[] skip = new int[256];
  200. // leave the table cleanly-initialized for an empty pattern
  201. if (pattern.length == 0)
  202. return skip;
  203. int pos = 0;
  204. do
  205. {
  206. skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
  207. }
  208. while (++pos < pattern.length);
  209. return skip;
  210. } //}}}
  211. //{{{ getSkipIndex() method
  212. /*
  213. * to avoid our skip table having a length of 2 ^ 16, we hash each
  214. * character of the input into a character in the alphabet [\x00-\xFF]
  215. * using the lower 8 bits of the character's value (resulting in
  216. * a more reasonable skip table of length 2 ^ 8).
  217. *
  218. * the result of this is that more than one character can hash to the
  219. * same index, but since the skip table encodes the position of
  220. * occurence of the character furthest into the string with a particular
  221. * index (whether or not it is the only character with that index), an
  222. * index collision only means that that this heuristic will give a
  223. * sub-optimal skip (i.e. a complete skip table could use the differences
  224. * between colliding characters to maximal effect, at the expense of
  225. * building a table that is over 2 orders of magnitude larger and very
  226. * sparse).
  227. */
  228. private static int getSkipIndex(char ch)
  229. {
  230. return ch & 0x000000FF;
  231. } //}}}
  232. //{{{ generateSuffixArray() method
  233. /*
  234. * XXX: hairy code that is basically just a functional(?) port of some
  235. * other code i barely understood
  236. */
  237. private int[] generateSuffixArray(boolean reverse)
  238. {
  239. int m = pattern.length;
  240. int j = m + 1;
  241. int[] suffix = new int[j];
  242. int[] tmp = new int[j];
  243. tmp[m] = j;
  244. for (int i = m; i > 0; --i)
  245. {
  246. while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
  247. != pattern[reverse ? pattern_end - j + 1 : j - 1])
  248. {
  249. if (suffix[j] == 0)
  250. {
  251. suffix[j] = j - i;
  252. }
  253. j = tmp[j];
  254. }
  255. tmp[i - 1] = --j;
  256. }
  257. int k = tmp[0];
  258. for (j = 0; j <= m; j++)
  259. {
  260. // the code above builds a 1-indexed suffix array,
  261. // but we shift it to be 0-indexed, ignoring the
  262. // original 0-th element
  263. if (j > 0)
  264. {
  265. suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
  266. }
  267. if (j == k)
  268. {
  269. k = tmp[k];
  270. }
  271. }
  272. return suffix;
  273. } //}}}
  274. //}}}
  275. }