PageRenderTime 45ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/jEdit/tags/jedit-4-0-pre3/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 322 lines | 156 code | 42 blank | 124 comment | 28 complexity | dbc5444ee1aa0ce917144a51ba915e8c MD5 | raw file
Possible License(s): BSD-3-Clause, AGPL-1.0, Apache-2.0, LGPL-2.0, LGPL-3.0, GPL-2.0, CC-BY-SA-3.0, LGPL-2.1, GPL-3.0, MPL-2.0-no-copyleft-exception, IPL-1.0
  1. /*
  2. * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3. * Boyer-Moore algorithm
  4. * :tabSize=8:indentSize=8:noTabs=false:
  5. * :folding=explicit:collapseFolds=1:
  6. *
  7. * Copyright (C) 1999, 2000 mike dillon
  8. * Portions copyright (C) 2001 Tom Locke
  9. * Portions copyright (C) 2001 Slava Pestov
  10. *
  11. * This program is free software; you can redistribute it and/or
  12. * modify it under the terms of the GNU General Public License
  13. * as published by the Free Software Foundation; either version 2
  14. * of the License, or any later version.
  15. *
  16. * This program is distributed in the hope that it will be useful,
  17. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19. * GNU General Public License for more details.
  20. *
  21. * You should have received a copy of the GNU General Public License
  22. * along with this program; if not, write to the Free Software
  23. * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  24. */
  25. package org.gjt.sp.jedit.search;
  26. //{{{ Imports
  27. import bsh.NameSpace;
  28. import gnu.regexp.CharIndexed;
  29. import javax.swing.text.Segment;
  30. import org.gjt.sp.jedit.BeanShell;
  31. import org.gjt.sp.util.Log;
  32. //}}}
  33. public class BoyerMooreSearchMatcher implements SearchMatcher
  34. {
  35. //{{{ BoyerMooreSearchMatcher constructor
  36. /**
  37. * Creates a new string literal matcher.
  38. */
  39. public BoyerMooreSearchMatcher(String pattern, String replace,
  40. boolean ignoreCase, boolean reverseSearch,
  41. boolean beanshell, String replaceMethod)
  42. {
  43. if (ignoreCase)
  44. {
  45. this.pattern = pattern.toUpperCase().toCharArray();
  46. }
  47. else
  48. {
  49. this.pattern = pattern.toCharArray();
  50. }
  51. if (reverseSearch)
  52. {
  53. char[] tmp = new char[this.pattern.length];
  54. for (int i = 0; i < tmp.length; i++)
  55. {
  56. tmp[i] = this.pattern[this.pattern.length - (i + 1)];
  57. }
  58. this.pattern = tmp;
  59. }
  60. this.replace = replace;
  61. this.ignoreCase = ignoreCase;
  62. this.reverseSearch = reverseSearch;
  63. this.beanshell = beanshell;
  64. if(beanshell)
  65. {
  66. this.replaceMethod = replaceMethod;
  67. replaceNS = new NameSpace(BeanShell.getNameSpace(),
  68. "search and replace");
  69. }
  70. generateSkipArray();
  71. generateSuffixArray();
  72. } //}}}
  73. //{{{ nextMatch() method
  74. /**
  75. * Returns the offset of the first match of the specified text
  76. * within this matcher.
  77. * @param text The text to search in
  78. * @param start True if the start of the segment is the beginning of the
  79. * buffer
  80. * @param end True if the end of the segment is the end of the buffer
  81. * @return an array where the first element is the start offset
  82. * of the match, and the second element is the end offset of
  83. * the match
  84. * @since jEdit 4.0pre3
  85. */
  86. public int[] nextMatch(CharIndexed text, boolean start, boolean end)
  87. {
  88. int pos = match(text);
  89. if (pos == -1)
  90. {
  91. return null;
  92. }
  93. else
  94. {
  95. return new int[] { pos, pos + pattern.length };
  96. }
  97. } //}}}
  98. //{{{ substitute() method
  99. /**
  100. * Returns the specified text, with any substitution specified
  101. * within this matcher performed.
  102. * @param text The text
  103. */
  104. public String substitute(String text) throws Exception
  105. {
  106. if(beanshell)
  107. {
  108. replaceNS.setVariable("_0",text);
  109. Object obj = BeanShell.runCachedBlock(replaceMethod,
  110. null,replaceNS);
  111. if(obj == null)
  112. return null;
  113. else
  114. return obj.toString();
  115. }
  116. else
  117. return replace;
  118. } //}}}
  119. //{{{ match() method
  120. /*
  121. * a good introduction to the Boyer-Moore fast string matching
  122. * algorithm may be found on Moore's website at:
  123. *
  124. * http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
  125. *
  126. */
  127. public int match(CharIndexed text)
  128. {
  129. // position variable for pattern test position
  130. int pos;
  131. // position variable for pattern start
  132. int anchor = 0;
  133. // last possible start position of a match with this pattern;
  134. // this is negative if the pattern is longer than the text
  135. // causing the search loop below to immediately fail
  136. //int last_anchor = reverseSearch
  137. // ? offset + pattern.length - 1
  138. // : length - pattern.length;
  139. // each time the pattern is checked, we start this many
  140. // characters ahead of 'anchor'
  141. int pattern_end = pattern.length - 1;
  142. char ch = 0;
  143. int bad_char;
  144. int good_suffix;
  145. // the search works by starting the anchor (first character
  146. // of the pattern) at the initial offset. as long as the
  147. // anchor is far enough from the enough of the text for the
  148. // pattern to match, and until the pattern matches, we
  149. // compare the pattern to the text from the last character
  150. // to the first character in reverse order. where a character
  151. // in the pattern mismatches, we use the two heuristics
  152. // based on the mismatch character and its position in the
  153. // pattern to determine the furthest we can move the anchor
  154. // without missing any potential pattern matches.
  155. SEARCH:
  156. while (text.isValid())
  157. {
  158. for (pos = pattern_end; pos >= 0; --pos)
  159. {
  160. ch = text.charAt(pos);
  161. if(ignoreCase)
  162. ch = Character.toUpperCase(ch);
  163. // pattern test
  164. if (ch != pattern[pos])
  165. {
  166. // character mismatch, determine how many characters to skip
  167. // heuristic #1
  168. bad_char = pos - skip[getSkipIndex(ch)];
  169. // heuristic #2
  170. good_suffix = suffix[pos];
  171. // skip the greater of the two distances provided by the
  172. // heuristics
  173. int skip = (bad_char > good_suffix) ? bad_char : good_suffix;
  174. anchor += skip;
  175. text.move(skip);
  176. // go back to the while loop
  177. continue SEARCH;
  178. }
  179. }
  180. // MATCH: return the position of its first character
  181. return anchor;
  182. }
  183. // MISMATCH: return -1 as defined by API
  184. return -1;
  185. } //}}}
  186. //{{{ Private members
  187. private char[] pattern;
  188. private String replace;
  189. private boolean ignoreCase;
  190. private boolean reverseSearch;
  191. private boolean beanshell;
  192. private String replaceMethod;
  193. private NameSpace replaceNS;
  194. // Boyer-Moore member fields
  195. private int[] skip;
  196. private int[] suffix;
  197. //}}}
  198. // Boyer-Moore helper methods
  199. //{{{ generateSkipArray() method
  200. /*
  201. * the 'skip' array is used to determine for each index in the
  202. * hashed alphabet how many characters can be skipped if
  203. * a mismatch occurs on a characater hashing to that index.
  204. */
  205. private void generateSkipArray()
  206. {
  207. // initialize the skip array to all zeros
  208. skip = new int[256];
  209. // leave the table cleanly-initialized for an empty pattern
  210. if (pattern.length == 0) return;
  211. int pos = 0;
  212. do
  213. {
  214. skip[getSkipIndex(pattern[pos])] = pos;
  215. }
  216. while (++pos < pattern.length);
  217. } //}}}
  218. //{{{ getSkipIndex() method
  219. /*
  220. * to avoid our skip table having a length of 2 ^ 16, we hash each
  221. * character of the input into a character in the alphabet [\x00-\xFF]
  222. * using the lower 8 bits of the character's value (resulting in
  223. * a more reasonable skip table of length 2 ^ 8).
  224. *
  225. * the result of this is that more than one character can hash to the
  226. * same index, but since the skip table encodes the position of
  227. * occurence of the character furthest into the string with a particular
  228. * index (whether or not it is the only character with that index), an
  229. * index collision only means that that this heuristic will give a
  230. * sub-optimal skip (i.e. a complete skip table could use the differences
  231. * between colliding characters to maximal effect, at the expense of
  232. * building a table that is over 2 orders of magnitude larger and very
  233. * sparse).
  234. */
  235. private static final int getSkipIndex(char ch)
  236. {
  237. return ((int) ch) & 0x000000FF;
  238. } //}}}
  239. //{{{ generateSuffixArray() method
  240. /*
  241. * XXX: hairy code that is basically just a functional(?) port of some
  242. * other code i barely understood
  243. */
  244. private void generateSuffixArray()
  245. {
  246. int m = pattern.length;
  247. int j = m + 1;
  248. suffix = new int[j];
  249. int[] tmp = new int[j];
  250. tmp[m] = j;
  251. for (int i = m; i > 0; --i)
  252. {
  253. while (j <= m && pattern[i - 1] != pattern[j - 1])
  254. {
  255. if (suffix[j] == 0)
  256. {
  257. suffix[j] = j - i;
  258. }
  259. j = tmp[j];
  260. }
  261. tmp[i - 1] = --j;
  262. }
  263. int k = tmp[0];
  264. for (j = 0; j <= m; j++)
  265. {
  266. // the code above builds a 1-indexed suffix array,
  267. // but we shift it to be 0-indexed, ignoring the
  268. // original 0-th element
  269. if (j > 0)
  270. {
  271. suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
  272. }
  273. if (j == k)
  274. {
  275. k = tmp[k];
  276. }
  277. }
  278. } //}}}
  279. //}}}
  280. }