PageRenderTime 45ms CodeModel.GetById 21ms RepoModel.GetById 0ms app.codeStats 0ms

/jEdit/tags/jedit-4-2-pre14/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 314 lines | 147 code | 41 blank | 126 comment | 29 complexity | a0cf75b9ea3a4ebfd223a1e389e0dfc5 MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3. * Boyer-Moore algorithm
  4. * :tabSize=8:indentSize=8:noTabs=false:
  5. * :folding=explicit:collapseFolds=1:
  6. *
  7. * Copyright (C) 1999, 2000 mike dillon
  8. * Portions copyright (C) 2001 Tom Locke
  9. * Portions copyright (C) 2001, 2002 Slava Pestov
  10. *
  11. * This program is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU General Public License
  13. * as published by the Free Software Foundation; either version 2
  14. * of the License, or any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  24. */
  25. package org.gjt.sp.jedit.search;
  26. //{{{ Imports
  27. import bsh.BshMethod;
  28. import bsh.NameSpace;
  29. import gnu.regexp.CharIndexed;
  30. import org.gjt.sp.jedit.BeanShell;
  31. //}}}
  32. /**
  33. * Implements literal search using the Boyer-Moore algorithm.
  34. */
  35. public class BoyerMooreSearchMatcher extends SearchMatcher
  36. {
  37. //{{{ BoyerMooreSearchMatcher constructor
  38. /**
  39. * Creates a new string literal matcher.
  40. */
  41. public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase)
  42. {
  43. this.pattern = pattern.toCharArray();
  44. if(ignoreCase)
  45. {
  46. for(int i = 0; i < this.pattern.length; i++)
  47. {
  48. this.pattern[i] = Character.toUpperCase(
  49. this.pattern[i]);
  50. }
  51. }
  52. this.ignoreCase = ignoreCase;
  53. pattern_end = this.pattern.length - 1;
  54. } //}}}
  55. //{{{ nextMatch() method
  56. /**
  57. * Returns the offset of the first match of the specified text
  58. * within this matcher.
  59. * @param text The text to search in
  60. * @param start True if the start of the segment is the beginning of the
  61. * buffer
  62. * @param end True if the end of the segment is the end of the buffer
  63. * @param firstTime If false and the search string matched at the start
  64. * offset with length zero, automatically find next match
  65. * @param reverse If true, searching will be performed in a backward
  66. * direction.
  67. * @return an array where the first element is the start offset
  68. * of the match, and the second element is the end offset of
  69. * the match
  70. * @since jEdit 4.2pre4
  71. */
  72. public SearchMatcher.Match nextMatch(CharIndexed text,
  73. boolean start, boolean end, boolean firstTime,
  74. boolean reverse)
  75. {
  76. int pos = match(text,reverse);
  77. if (pos == -1)
  78. {
  79. return null;
  80. }
  81. else
  82. {
  83. returnValue.start = pos;
  84. returnValue.end = pos + pattern.length;
  85. return returnValue;
  86. }
  87. } //}}}
  88. //{{{ match() method
  89. /*
  90. * a good introduction to the Boyer-Moore fast string matching
  91. * algorithm may be found on Moore's website at:
  92. *
  93. * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
  94. *
  95. */
  96. public int match(CharIndexed text, boolean reverse)
  97. {
  98. //{{{
  99. // lazily create skip and suffix arrays for either the
  100. // search pattern, or the reversed search pattern
  101. int[] skip, suffix;
  102. if(reverse)
  103. {
  104. if(back_skip == null)
  105. {
  106. back_skip = generateSkipArray(true);
  107. back_suffix = generateSuffixArray(true);
  108. }
  109. skip = back_skip;
  110. suffix = back_suffix;
  111. }
  112. else
  113. {
  114. if(fwd_skip == null)
  115. {
  116. fwd_skip = generateSkipArray(false);
  117. fwd_suffix = generateSuffixArray(false);
  118. }
  119. skip = fwd_skip;
  120. suffix = fwd_suffix;
  121. } //}}}
  122. // position variable for pattern test position
  123. int pos;
  124. // position variable for pattern start
  125. int anchor = 0;
  126. // last possible start position of a match with this pattern;
  127. // this is negative if the pattern is longer than the text
  128. // causing the search loop below to immediately fail
  129. //int last_anchor = reverseSearch
  130. // ? offset + pattern.length - 1
  131. // : length - pattern.length;
  132. char ch = 0;
  133. int bad_char;
  134. int good_suffix;
  135. // the search works by starting the anchor (first character
  136. // of the pattern) at the initial offset. as long as the
  137. // anchor is far enough from the enough of the text for the
  138. // pattern to match, and until the pattern matches, we
  139. // compare the pattern to the text from the last character
  140. // to the first character in reverse order. where a character
  141. // in the pattern mismatches, we use the two heuristics
  142. // based on the mismatch character and its position in the
  143. // pattern to determine the furthest we can move the anchor
  144. // without missing any potential pattern matches.
  145. SEARCH:
  146. while (text.isValid())
  147. {
  148. for (pos = pattern_end; pos >= 0; --pos)
  149. {
  150. ch = text.charAt(pos);
  151. if(ignoreCase)
  152. ch = Character.toUpperCase(ch);
  153. // pattern test
  154. if ((reverse ? ch != pattern[pattern_end - pos]
  155. : ch != pattern[pos]))
  156. {
  157. // character mismatch, determine how many characters to skip
  158. // heuristic #1
  159. bad_char = pos - skip[getSkipIndex(ch)];
  160. // heuristic #2
  161. good_suffix = suffix[pos];
  162. // skip the greater of the two distances provided by the
  163. // heuristics
  164. int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
  165. anchor += skip_index;
  166. text.move(skip_index);
  167. // go back to the while loop
  168. continue SEARCH;
  169. }
  170. }
  171. // MATCH: return the position of its first character
  172. return anchor;
  173. }
  174. // MISMATCH: return -1 as defined by API
  175. return -1;
  176. } //}}}
  177. //{{{ Private members
  178. private char[] pattern;
  179. private int pattern_end;
  180. private boolean ignoreCase;
  181. // Boyer-Moore member fields
  182. private int[] fwd_skip;
  183. private int[] fwd_suffix;
  184. private int[] back_skip;
  185. private int[] back_suffix;
  186. //}}}
  187. // Boyer-Moore helper methods
  188. //{{{ generateSkipArray() method
  189. /*
  190. * the 'skip' array is used to determine for each index in the
  191. * hashed alphabet how many characters can be skipped if
  192. * a mismatch occurs on a characater hashing to that index.
  193. */
  194. private int[] generateSkipArray(boolean reverse)
  195. {
  196. // initialize the skip array to all zeros
  197. int[] skip = new int[256];
  198. // leave the table cleanly-initialized for an empty pattern
  199. if (pattern.length == 0)
  200. return skip;
  201. int pos = 0;
  202. do
  203. {
  204. skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
  205. }
  206. while (++pos < pattern.length);
  207. return skip;
  208. } //}}}
  209. //{{{ getSkipIndex() method
  210. /*
  211. * to avoid our skip table having a length of 2 ^ 16, we hash each
  212. * character of the input into a character in the alphabet [\x00-\xFF]
  213. * using the lower 8 bits of the character's value (resulting in
  214. * a more reasonable skip table of length 2 ^ 8).
  215. *
  216. * the result of this is that more than one character can hash to the
  217. * same index, but since the skip table encodes the position of
  218. * occurence of the character furthest into the string with a particular
  219. * index (whether or not it is the only character with that index), an
  220. * index collision only means that that this heuristic will give a
  221. * sub-optimal skip (i.e. a complete skip table could use the differences
  222. * between colliding characters to maximal effect, at the expense of
  223. * building a table that is over 2 orders of magnitude larger and very
  224. * sparse).
  225. */
  226. private static final int getSkipIndex(char ch)
  227. {
  228. return ((int) ch) & 0x000000FF;
  229. } //}}}
  230. //{{{ generateSuffixArray() method
  231. /*
  232. * XXX: hairy code that is basically just a functional(?) port of some
  233. * other code i barely understood
  234. */
  235. private int[] generateSuffixArray(boolean reverse)
  236. {
  237. int m = pattern.length;
  238. int j = m + 1;
  239. int[] suffix = new int[j];
  240. int[] tmp = new int[j];
  241. tmp[m] = j;
  242. for (int i = m; i > 0; --i)
  243. {
  244. while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
  245. != pattern[reverse ? pattern_end - j + 1 : j - 1])
  246. {
  247. if (suffix[j] == 0)
  248. {
  249. suffix[j] = j - i;
  250. }
  251. j = tmp[j];
  252. }
  253. tmp[i - 1] = --j;
  254. }
  255. int k = tmp[0];
  256. for (j = 0; j <= m; j++)
  257. {
  258. // the code above builds a 1-indexed suffix array,
  259. // but we shift it to be 0-indexed, ignoring the
  260. // original 0-th element
  261. if (j > 0)
  262. {
  263. suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
  264. }
  265. if (j == k)
  266. {
  267. k = tmp[k];
  268. }
  269. }
  270. return suffix;
  271. } //}}}
  272. //}}}
  273. }