PageRenderTime 59ms CodeModel.GetById 40ms app.highlight 14ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-3-pre5/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 307 lines | 142 code | 40 blank | 125 comment | 29 complexity | 09b28db84df4f0899cf63df84d6163a1 MD5 | raw file
  1/*
  2 * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3 *         Boyer-Moore algorithm
  4 * :tabSize=8:indentSize=8:noTabs=false:
  5 * :folding=explicit:collapseFolds=1:
  6 *
  7 * Copyright (C) 1999, 2000 mike dillon
  8 * Portions copyright (C) 2001 Tom Locke
  9 * Portions copyright (C) 2001, 2002 Slava Pestov
 10 *
 11 * This program is free software; you can redistribute it and/or
 12 * modify it under the terms of the GNU General Public License
 13 * as published by the Free Software Foundation; either version 2
 14 * of the License, or any later version.
 15 *
 16 * This program is distributed in the hope that it will be useful,
 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 * GNU General Public License for more details.
 20 *
 21 * You should have received a copy of the GNU General Public License
 22 * along with this program; if not, write to the Free Software
 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 24 */
 25
 26package org.gjt.sp.jedit.search;
 27
 28/**
 29 * Implements literal search using the Boyer-Moore algorithm.
 30 */
 31public class BoyerMooreSearchMatcher extends SearchMatcher
 32{
 33	//{{{ BoyerMooreSearchMatcher constructor
 34	/**
 35	 * Creates a new string literal matcher.
 36	 */
 37	public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase)
 38	{
 39		this.pattern = pattern.toCharArray();
 40		if(ignoreCase)
 41		{
 42			for(int i = 0; i < this.pattern.length; i++)
 43			{
 44				this.pattern[i] = Character.toUpperCase(
 45					this.pattern[i]);
 46			}
 47		}
 48
 49		this.ignoreCase = ignoreCase;
 50
 51		pattern_end = this.pattern.length - 1;
 52	} //}}}
 53
 54	//{{{ nextMatch() method
 55	/**
 56	 * Returns the offset of the first match of the specified text
 57	 * within this matcher.
 58	 * @param text The text to search in
 59	 * @param start True if the start of the segment is the beginning of the
 60	 * buffer
 61	 * @param end True if the end of the segment is the end of the buffer
 62	 * @param firstTime If false and the search string matched at the start
 63	 * offset with length zero, automatically find next match
 64	 * @param reverse If true, searching will be performed in a backward
 65	 * direction.
 66	 * @return an array where the first element is the start offset
 67	 * of the match, and the second element is the end offset of
 68	 * the match
 69	 * @since jEdit 4.2pre4
 70	 */
 71	public SearchMatcher.Match nextMatch(CharSequence text,
 72		boolean start, boolean end, boolean firstTime,
 73		boolean reverse)
 74	{
 75		int pos = match(text,reverse);
 76
 77		if (pos == -1)
 78		{
 79			return null;
 80		}
 81		else
 82		{
 83			returnValue.start = pos;
 84			returnValue.end = pos + pattern.length;
 85			return returnValue;
 86		}
 87	} //}}}
 88
 89	//{{{ match() method
 90	/**
 91	 *  a good introduction to the Boyer-Moore fast string matching
 92	 *  algorithm may be found on Moore's website at:
 93	 *
 94	 *   http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
 95	 *
 96	 * @since jEdit 4.3pre5
 97	 */
 98	public int match(CharSequence text, boolean reverse)
 99	{
100		//{{{
101		// lazily create skip and suffix arrays for either the
102		// search pattern, or the reversed search pattern
103		int[] skip, suffix;
104		if(reverse)
105		{
106			if(back_skip == null)
107			{
108				back_skip = generateSkipArray(true);
109				back_suffix = generateSuffixArray(true);
110			}
111			skip = back_skip;
112			suffix = back_suffix;
113		}
114		else
115		{
116			if(fwd_skip == null)
117			{
118				fwd_skip = generateSkipArray(false);
119				fwd_suffix = generateSuffixArray(false);
120			}
121			skip = fwd_skip;
122			suffix = fwd_suffix;
123		} //}}}
124
125		// position variable for pattern test position
126		int pos;
127
128		// position variable for pattern start
129		int anchor = 0;
130
131		// last possible start position of a match with this pattern;
132		// this is negative if the pattern is longer than the text
133		// causing the search loop below to immediately fail
134		//int last_anchor = reverseSearch
135		//	? offset + pattern.length - 1
136		//	: length - pattern.length;
137
138		char ch = 0;
139
140		int bad_char;
141		int good_suffix;
142
143		// the search works by starting the anchor (first character
144		// of the pattern) at the initial offset. as long as the
145		// anchor is far enough from the enough of the text for the
146		// pattern to match, and until the pattern matches, we
147		// compare the pattern to the text from the last character
148		// to the first character in reverse order. where a character
149		// in the pattern mismatches, we use the two heuristics
150		// based on the mismatch character and its position in the
151		// pattern to determine the furthest we can move the anchor
152		// without missing any potential pattern matches.
153		SEARCH:
154		while (anchor + pattern_end < text.length())
155		{
156			for (pos = pattern_end; pos >= 0; --pos)
157			{
158				ch = text.charAt(pos + anchor);
159				if(ignoreCase)
160					ch = Character.toUpperCase(ch);
161
162				// pattern test
163				if ((reverse ? ch != pattern[pattern_end - pos]
164					: ch != pattern[pos]))
165				{
166					// character mismatch, determine how many characters to skip
167
168					// heuristic #1
169					bad_char = pos - skip[getSkipIndex(ch)];
170
171					// heuristic #2
172					good_suffix = suffix[pos];
173
174					// skip the greater of the two distances provided by the
175					// heuristics
176					int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
177					anchor += skip_index;
178
179					// go back to the while loop
180					continue SEARCH;
181				}
182			}
183
184			// MATCH: return the position of its first character
185			return anchor;
186		}
187
188		// MISMATCH: return -1 as defined by API
189		return -1;
190	} //}}}
191
192	//{{{ Private members
193	private char[] pattern;
194	private int pattern_end;
195	private boolean ignoreCase;
196
197	// Boyer-Moore member fields
198	private int[] fwd_skip;
199	private int[] fwd_suffix;
200	private int[] back_skip;
201	private int[] back_suffix;
202	//}}}
203
204	// Boyer-Moore helper methods
205
206	//{{{ generateSkipArray() method
207	/*
208	 *  the 'skip' array is used to determine for each index in the
209	 *  hashed alphabet how many characters can be skipped if
210	 *  a mismatch occurs on a characater hashing to that index.
211	 */
212	private int[] generateSkipArray(boolean reverse)
213	{
214		// initialize the skip array to all zeros
215		int[] skip = new int[256];
216
217		// leave the table cleanly-initialized for an empty pattern
218		if (pattern.length == 0)
219			return skip;
220
221		int pos = 0;
222
223		do
224		{
225			skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
226		}
227		while (++pos < pattern.length);
228
229		return skip;
230	} //}}}
231
232	//{{{ getSkipIndex() method
233	/*
234	 *  to avoid our skip table having a length of 2 ^ 16, we hash each
235	 *  character of the input into a character in the alphabet [\x00-\xFF]
236	 *  using the lower 8 bits of the character's value (resulting in
237	 *  a more reasonable skip table of length 2 ^ 8).
238	 *
239	 *  the result of this is that more than one character can hash to the
240	 *  same index, but since the skip table encodes the position of
241	 *  occurence of the character furthest into the string with a particular
242	 *  index (whether or not it is the only character with that index), an
243	 *  index collision only means that that this heuristic will give a
244	 *  sub-optimal skip (i.e. a complete skip table could use the differences
245	 *  between colliding characters to maximal effect, at the expense of
246	 *  building a table that is over 2 orders of magnitude larger and very
247	 *  sparse).
248	 */
249	private static final int getSkipIndex(char ch)
250	{
251		return ch & 0x000000FF;
252	} //}}}
253
254	//{{{ generateSuffixArray() method
255	/*
256	 *  XXX: hairy code that is basically just a functional(?) port of some
257	 *  other code i barely understood
258	 */
259	private int[] generateSuffixArray(boolean reverse)
260	{
261		int m = pattern.length;
262
263		int j = m + 1;
264
265		int[] suffix = new int[j];
266		int[] tmp = new int[j];
267		tmp[m] = j;
268
269		for (int i = m; i > 0; --i)
270		{
271			while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
272				!= pattern[reverse ? pattern_end - j + 1 : j - 1])
273			{
274				if (suffix[j] == 0)
275				{
276					suffix[j] = j - i;
277				}
278
279				j = tmp[j];
280			}
281
282			tmp[i - 1] = --j;
283		}
284
285		int k = tmp[0];
286
287		for (j = 0; j <= m; j++)
288		{
289			// the code above builds a 1-indexed suffix array,
290			// but we shift it to be 0-indexed, ignoring the
291			// original 0-th element
292			if (j > 0)
293			{
294				suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
295			}
296
297			if (j == k)
298			{
299				k = tmp[k];
300			}
301		}
302
303		return suffix;
304	} //}}}
305
306	//}}}
307}