PageRenderTime 326ms CodeModel.GetById 307ms app.highlight 15ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-2-pre14/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 314 lines | 147 code | 41 blank | 126 comment | 29 complexity | a0cf75b9ea3a4ebfd223a1e389e0dfc5 MD5 | raw file
  1/*
  2 * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3 *         Boyer-Moore algorithm
  4 * :tabSize=8:indentSize=8:noTabs=false:
  5 * :folding=explicit:collapseFolds=1:
  6 *
  7 * Copyright (C) 1999, 2000 mike dillon
  8 * Portions copyright (C) 2001 Tom Locke
  9 * Portions copyright (C) 2001, 2002 Slava Pestov
 10 *
 11 * This program is free software; you can redistribute it and/or
 12 * modify it under the terms of the GNU General Public License
 13 * as published by the Free Software Foundation; either version 2
 14 * of the License, or any later version.
 15 *
 16 * This program is distributed in the hope that it will be useful,
 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 * GNU General Public License for more details.
 20 *
 21 * You should have received a copy of the GNU General Public License
 22 * along with this program; if not, write to the Free Software
 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 24 */
 25
 26package org.gjt.sp.jedit.search;
 27
 28//{{{ Imports
 29import bsh.BshMethod;
 30import bsh.NameSpace;
 31import gnu.regexp.CharIndexed;
 32import org.gjt.sp.jedit.BeanShell;
 33//}}}
 34
 35/**
 36 * Implements literal search using the Boyer-Moore algorithm.
 37 */
 38public class BoyerMooreSearchMatcher extends SearchMatcher
 39{
 40	//{{{ BoyerMooreSearchMatcher constructor
 41	/**
 42	 * Creates a new string literal matcher.
 43	 */
 44	public BoyerMooreSearchMatcher(String pattern, boolean ignoreCase)
 45	{
 46		this.pattern = pattern.toCharArray();
 47		if(ignoreCase)
 48		{
 49			for(int i = 0; i < this.pattern.length; i++)
 50			{
 51				this.pattern[i] = Character.toUpperCase(
 52					this.pattern[i]);
 53			}
 54		}
 55
 56		this.ignoreCase = ignoreCase;
 57
 58		pattern_end = this.pattern.length - 1;
 59	} //}}}
 60
 61	//{{{ nextMatch() method
 62	/**
 63	 * Returns the offset of the first match of the specified text
 64	 * within this matcher.
 65	 * @param text The text to search in
 66	 * @param start True if the start of the segment is the beginning of the
 67	 * buffer
 68	 * @param end True if the end of the segment is the end of the buffer
 69	 * @param firstTime If false and the search string matched at the start
 70	 * offset with length zero, automatically find next match
 71	 * @param reverse If true, searching will be performed in a backward
 72	 * direction.
 73	 * @return an array where the first element is the start offset
 74	 * of the match, and the second element is the end offset of
 75	 * the match
 76	 * @since jEdit 4.2pre4
 77	 */
 78	public SearchMatcher.Match nextMatch(CharIndexed text,
 79		boolean start, boolean end, boolean firstTime,
 80		boolean reverse)
 81	{
 82		int pos = match(text,reverse);
 83
 84		if (pos == -1)
 85		{
 86			return null;
 87		}
 88		else
 89		{
 90			returnValue.start = pos;
 91			returnValue.end = pos + pattern.length;
 92			return returnValue;
 93		}
 94	} //}}}
 95
 96	//{{{ match() method
 97	/*
 98	 *  a good introduction to the Boyer-Moore fast string matching
 99	 *  algorithm may be found on Moore's website at:
100	 *
101	 *   http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
102	 *
103	 */
104	public int match(CharIndexed text, boolean reverse)
105	{
106		//{{{
107		// lazily create skip and suffix arrays for either the
108		// search pattern, or the reversed search pattern
109		int[] skip, suffix;
110		if(reverse)
111		{
112			if(back_skip == null)
113			{
114				back_skip = generateSkipArray(true);
115				back_suffix = generateSuffixArray(true);
116			}
117			skip = back_skip;
118			suffix = back_suffix;
119		}
120		else
121		{
122			if(fwd_skip == null)
123			{
124				fwd_skip = generateSkipArray(false);
125				fwd_suffix = generateSuffixArray(false);
126			}
127			skip = fwd_skip;
128			suffix = fwd_suffix;
129		} //}}}
130
131		// position variable for pattern test position
132		int pos;
133
134		// position variable for pattern start
135		int anchor = 0;
136
137		// last possible start position of a match with this pattern;
138		// this is negative if the pattern is longer than the text
139		// causing the search loop below to immediately fail
140		//int last_anchor = reverseSearch
141		//	? offset + pattern.length - 1
142		//	: length - pattern.length;
143
144		char ch = 0;
145
146		int bad_char;
147		int good_suffix;
148
149		// the search works by starting the anchor (first character
150		// of the pattern) at the initial offset. as long as the
151		// anchor is far enough from the enough of the text for the
152		// pattern to match, and until the pattern matches, we
153		// compare the pattern to the text from the last character
154		// to the first character in reverse order. where a character
155		// in the pattern mismatches, we use the two heuristics
156		// based on the mismatch character and its position in the
157		// pattern to determine the furthest we can move the anchor
158		// without missing any potential pattern matches.
159SEARCH:
160		while (text.isValid())
161		{
162			for (pos = pattern_end; pos >= 0; --pos)
163			{
164				ch = text.charAt(pos);
165				if(ignoreCase)
166					ch = Character.toUpperCase(ch);
167
168				// pattern test
169				if ((reverse ? ch != pattern[pattern_end - pos]
170					: ch != pattern[pos]))
171				{
172					// character mismatch, determine how many characters to skip
173
174					// heuristic #1
175					bad_char = pos - skip[getSkipIndex(ch)];
176
177					// heuristic #2
178					good_suffix = suffix[pos];
179
180					// skip the greater of the two distances provided by the
181					// heuristics
182					int skip_index = (bad_char > good_suffix) ? bad_char : good_suffix;
183					anchor += skip_index;
184					text.move(skip_index);
185
186					// go back to the while loop
187					continue SEARCH;
188				}
189			}
190
191			// MATCH: return the position of its first character
192			return anchor;
193		}
194
195		// MISMATCH: return -1 as defined by API
196		return -1;
197	} //}}}
198
199	//{{{ Private members
200	private char[] pattern;
201	private int pattern_end;
202	private boolean ignoreCase;
203
204	// Boyer-Moore member fields
205	private int[] fwd_skip;
206	private int[] fwd_suffix;
207	private int[] back_skip;
208	private int[] back_suffix;
209	//}}}
210
211	// Boyer-Moore helper methods
212
213	//{{{ generateSkipArray() method
214	/*
215	 *  the 'skip' array is used to determine for each index in the
216	 *  hashed alphabet how many characters can be skipped if
217	 *  a mismatch occurs on a characater hashing to that index.
218	 */
219	private int[] generateSkipArray(boolean reverse)
220	{
221		// initialize the skip array to all zeros
222		int[] skip = new int[256];
223
224		// leave the table cleanly-initialized for an empty pattern
225		if (pattern.length == 0)
226			return skip;
227
228		int pos = 0;
229
230		do
231		{
232			skip[getSkipIndex(pattern[reverse ? pattern_end - pos : pos])] = pos;
233		}
234		while (++pos < pattern.length);
235
236		return skip;
237	} //}}}
238
239	//{{{ getSkipIndex() method
240	/*
241	 *  to avoid our skip table having a length of 2 ^ 16, we hash each
242	 *  character of the input into a character in the alphabet [\x00-\xFF]
243	 *  using the lower 8 bits of the character's value (resulting in
244	 *  a more reasonable skip table of length 2 ^ 8).
245	 *
246	 *  the result of this is that more than one character can hash to the
247	 *  same index, but since the skip table encodes the position of
248	 *  occurence of the character furthest into the string with a particular
249	 *  index (whether or not it is the only character with that index), an
250	 *  index collision only means that that this heuristic will give a
251	 *  sub-optimal skip (i.e. a complete skip table could use the differences
252	 *  between colliding characters to maximal effect, at the expense of
253	 *  building a table that is over 2 orders of magnitude larger and very
254	 *  sparse).
255	 */
256	private static final int getSkipIndex(char ch)
257	{
258		return ((int) ch) & 0x000000FF;
259	} //}}}
260
261	//{{{ generateSuffixArray() method
262	/*
263	 *  XXX: hairy code that is basically just a functional(?) port of some
264	 *  other code i barely understood
265	 */
266	private int[] generateSuffixArray(boolean reverse)
267	{
268		int m = pattern.length;
269
270		int j = m + 1;
271
272		int[] suffix = new int[j];
273		int[] tmp = new int[j];
274		tmp[m] = j;
275
276		for (int i = m; i > 0; --i)
277		{
278			while (j <= m && pattern[reverse ? pattern_end - i + 1 : i - 1]
279				!= pattern[reverse ? pattern_end - j + 1 : j - 1])
280			{
281				if (suffix[j] == 0)
282				{
283					suffix[j] = j - i;
284				}
285
286				j = tmp[j];
287			}
288
289			tmp[i - 1] = --j;
290		}
291
292		int k = tmp[0];
293
294		for (j = 0; j <= m; j++)
295		{
296			// the code above builds a 1-indexed suffix array,
297			// but we shift it to be 0-indexed, ignoring the
298			// original 0-th element
299			if (j > 0)
300			{
301				suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
302			}
303
304			if (j == k)
305			{
306				k = tmp[k];
307			}
308		}
309
310		return suffix;
311	} //}}}
312
313	//}}}
314}