PageRenderTime 493ms CodeModel.GetById 483ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-2-pre4/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 316 lines | 149 code | 41 blank | 126 comment | 29 complexity | 00012e72e34d826054478fe0f8d7a139 MD5 | raw file
  1/*
  2 * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3 *         Boyer-Moore algorithm
  4 * :tabSize=8:indentSize=8:noTabs=false:
  5 * :folding=explicit:collapseFolds=1:
  6 *
  7 * Copyright (C) 1999, 2000 mike dillon
  8 * Portions copyright (C) 2001 Tom Locke
  9 * Portions copyright (C) 2001, 2002 Slava Pestov
 10 *
 11 * This program is free software; you can redistribute it and/or
 12 * modify it under the terms of the GNU General Public License
 13 * as published by the Free Software Foundation; either version 2
 14 * of the License, or any later version.
 15 *
 16 * This program is distributed in the hope that it will be useful,
 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 * GNU General Public License for more details.
 20 *
 21 * You should have received a copy of the GNU General Public License
 22 * along with this program; if not, write to the Free Software
 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 24 */
 25
 26package org.gjt.sp.jedit.search;
 27
 28//{{{ Imports
 29import bsh.BshMethod;
 30import bsh.NameSpace;
 31import gnu.regexp.CharIndexed;
 32import org.gjt.sp.jedit.BeanShell;
 33//}}}
 34
 35/**
 36 * Implements literal search using the Boyer-Moore algorithm.
 37 */
 38public class BoyerMooreSearchMatcher extends SearchMatcher
 39{
 40	//{{{ BoyerMooreSearchMatcher constructor
 41	/**
 42	 * Creates a new string literal matcher.
 43	 */
 44	public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase)
 45	{
 46		this.pattern = pattern.toCharArray();
 47		if(ignoreCase)
 48		{
 49			for(int i = 0; i < this.pattern.length; i++)
 50			{
 51				this.pattern[i] = Character.toUpperCase(
 52					this.pattern[i]);
 53			}
 54		}
 55
 56		this.replace = replace;
 57		this.ignoreCase = ignoreCase;
 58
 59		pattern_end = this.pattern.length - 1;
 60	} //}}}
 61
 62	//{{{ nextMatch() method
 63	/**
 64	 * Returns the offset of the first match of the specified text
 65	 * within this matcher.
 66	 * @param text The text to search in
 67	 * @param start True if the start of the segment is the beginning of the
 68	 * buffer
 69	 * @param end True if the end of the segment is the end of the buffer
 70	 * @param firstTime If false and the search string matched at the start
 71	 * offset with length zero, automatically find next match
 72	 * @param reverse If true, searching will be performed in a backward
 73	 * direction.
 74	 * @return an array where the first element is the start offset
 75	 * of the match, and the second element is the end offset of
 76	 * the match
 77	 * @since jEdit 4.2pre4
 78	 */
 79	public SearchMatcher.Match nextMatch(CharIndexed text,
 80		boolean start, boolean end, boolean firstTime,
 81		boolean reverse)
 82	{
 83		int pos = match(text,reverse);
 84
 85		if (pos == -1)
 86		{
 87			return null;
 88		}
 89		else
 90		{
 91			returnValue.start = pos;
 92			returnValue.end = pos + pattern.length;
 93			return returnValue;
 94		}
 95	} //}}}
 96
 97	//{{{ match() method
 98	/*
 99	 *  a good introduction to the Boyer-Moore fast string matching
100	 *  algorithm may be found on Moore's website at:
101	 *
102	 *   http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
103	 *
104	 */
105	public int match(CharIndexed text, boolean reverse)
106	{
107		//{{{
108		// lazily create skip and suffix arrays for either the
109		// search pattern, or the reversed search pattern
110		int[] skip, suffix;
111		if(reverse)
112		{
113			if(back_skip == null)
114			{
115				back_skip = generateSkipArray(true);
116				back_suffix = generateSuffixArray(true);
117			}
118			skip = back_skip;
119			suffix = back_suffix;
120		}
121		else
122		{
123			if(fwd_skip == null)
124			{
125				fwd_skip = generateSkipArray(false);
126				fwd_suffix = generateSuffixArray(false);
127			}
128			skip = fwd_skip;
129			suffix = fwd_suffix;
130		} //}}}
131
132		// position variable for pattern test position
133		int pos;
134
135		// position variable for pattern start
136		int anchor = 0;
137
138		// last possible start position of a match with this pattern;
139		// this is negative if the pattern is longer than the text
140		// causing the search loop below to immediately fail
141		//int last_anchor = reverseSearch
142		//	? offset + pattern.length - 1
143		//	: length - pattern.length;
144
145		char ch = 0;
146
147		int bad_char;
148		int good_suffix;
149
150		// the search works by starting the anchor (first character
151		// of the pattern) at the initial offset. as long as the
152		// anchor is far enough from the enough of the text for the
153		// pattern to match, and until the pattern matches, we
154		// compare the pattern to the text from the last character
155		// to the first character in reverse order. where a character
156		// in the pattern mismatches, we use the two heuristics
157		// based on the mismatch character and its position in the
158		// pattern to determine the furthest we can move the anchor
159		// without missing any potential pattern matches.
160SEARCH:
161		while (text.isValid())
162		{
163			for (pos = pattern_end; pos >= 0; --pos)
164			{
165				ch = text.charAt(pos);
166				if(ignoreCase)
167					ch = Character.toUpperCase(ch);
168
169				// pattern test
170				if ((reverse ? ch != pattern[pattern_end - pos]
171					: ch != pattern[pos]))
172				{
173					// character mismatch, determine how many characters to skip
174
175					// heuristic #1
176					bad_char = pos - skip[getSkipIndex(ch)];
177
178					// heuristic #2
179					good_suffix = suffix[pos];
180
181					// skip the greater of the two distances provided by the
182					// heuristics
183					int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
184					anchor += skip_index;
185					text.move(skip_index);
186
187					// go back to the while loop
188					continue SEARCH;
189				}
190			}
191
192			// MATCH: return the position of its first character
193			return anchor;
194		}
195
196		// MISMATCH: return -1 as defined by API
197		return -1;
198	} //}}}
199
200	//{{{ Private members
201	private char[] pattern;
202	private int pattern_end;
203	private String replace;
204	private boolean ignoreCase;
205
206	// Boyer-Moore member fields
207	private int[] fwd_skip;
208	private int[] fwd_suffix;
209	private int[] back_skip;
210	private int[] back_suffix;
211	//}}}
212
213	// Boyer-Moore helper methods
214
215	//{{{ generateSkipArray() method
216	/*
217	 *  the 'skip' array is used to determine for each index in the
218	 *  hashed alphabet how many characters can be skipped if
219	 *  a mismatch occurs on a characater hashing to that index.
220	 */
221	private int[] generateSkipArray(boolean reverse)
222	{
223		// initialize the skip array to all zeros
224		int[] skip = new int[256];
225
226		// leave the table cleanly-initialized for an empty pattern
227		if (pattern.length == 0)
228			return skip;
229
230		int pos = 0;
231
232		do
233		{
234			skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
235		}
236		while (++pos < pattern.length);
237
238		return skip;
239	} //}}}
240
241	//{{{ getSkipIndex() method
242	/*
243	 *  to avoid our skip table having a length of 2 ^ 16, we hash each
244	 *  character of the input into a character in the alphabet [\x00-\xFF]
245	 *  using the lower 8 bits of the character's value (resulting in
246	 *  a more reasonable skip table of length 2 ^ 8).
247	 *
248	 *  the result of this is that more than one character can hash to the
249	 *  same index, but since the skip table encodes the position of
250	 *  occurence of the character furthest into the string with a particular
251	 *  index (whether or not it is the only character with that index), an
252	 *  index collision only means that that this heuristic will give a
253	 *  sub-optimal skip (i.e. a complete skip table could use the differences
254	 *  between colliding characters to maximal effect, at the expense of
255	 *  building a table that is over 2 orders of magnitude larger and very
256	 *  sparse).
257	 */
258	private static final int getSkipIndex(char ch)
259	{
260		return ((int) ch) & 0x000000FF;
261	} //}}}
262
263	//{{{ generateSuffixArray() method
264	/*
265	 *  XXX: hairy code that is basically just a functional(?) port of some
266	 *  other code i barely understood
267	 */
268	private int[] generateSuffixArray(boolean reverse)
269	{
270		int m = pattern.length;
271
272		int j = m + 1;
273
274		int[] suffix = new int[j];
275		int[] tmp = new int[j];
276		tmp[m] = j;
277
278		for (int i = m; i > 0; --i)
279		{
280			while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
281				!= pattern[reverse ? pattern_end - j + 1 : j - 1])
282			{
283				if (suffix[j] == 0)
284				{
285					suffix[j] = j - i;
286				}
287
288				j = tmp[j];
289			}
290
291			tmp[i - 1] = --j;
292		}
293
294		int k = tmp[0];
295
296		for (j = 0; j <= m; j++)
297		{
298			// the code above builds a 1-indexed suffix array,
299			// but we shift it to be 0-indexed, ignoring the
300			// original 0-th element
301			if (j > 0)
302			{
303				suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
304			}
305
306			if (j == k)
307			{
308				k = tmp[k];
309			}
310		}
311
312		return suffix;
313	} //}}}
314
315	//}}}
316}