PageRenderTime 20ms CodeModel.GetById 10ms app.highlight 7ms RepoModel.GetById 1ms app.codeStats 0ms

/jEdit/tags/jedit-4-0-pre3/org/gjt/sp/jedit/search/BoyerMooreSearchMatcher.java

#
Java | 322 lines | 156 code | 42 blank | 124 comment | 28 complexity | dbc5444ee1aa0ce917144a51ba915e8c MD5 | raw file
  1/*
  2 * BoyerMooreSearchMatcher.java - Literal pattern String matcher utilizing the
  3 *         Boyer-Moore algorithm
  4 * :tabSize=8:indentSize=8:noTabs=false:
  5 * :folding=explicit:collapseFolds=1:
  6 *
  7 * Copyright (C) 1999, 2000 mike dillon
  8 * Portions copyright (C) 2001 Tom Locke
  9 * Portions copyright (C) 2001 Slava Pestov
 10 *
 11 * This program is free software; you can redistribute it and/or
 12 * modify it under the terms of the GNU General Public License
 13 * as published by the Free Software Foundation; either version 2
 14 * of the License, or any later version.
 15 *
 16 * This program is distributed in the hope that it will be useful,
 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 19 * GNU General Public License for more details.
 20 *
 21 * You should have received a copy of the GNU General Public License
 22 * along with this program; if not, write to the Free Software
 23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 24 */
 25
 26package org.gjt.sp.jedit.search;
 27
 28//{{{ Imports
 29import bsh.NameSpace;
 30import gnu.regexp.CharIndexed;
 31import javax.swing.text.Segment;
 32import org.gjt.sp.jedit.BeanShell;
 33import org.gjt.sp.util.Log;
 34//}}}
 35
 36public class BoyerMooreSearchMatcher implements SearchMatcher
 37{
 38	//{{{ BoyerMooreSearchMatcher constructor
 39	/**
 40	 * Creates a new string literal matcher.
 41	 */
 42	public BoyerMooreSearchMatcher(String pattern, String replace,
 43		boolean ignoreCase, boolean reverseSearch,
 44		boolean beanshell, String replaceMethod)
 45	{
 46		if (ignoreCase)
 47		{
 48			this.pattern = pattern.toUpperCase().toCharArray();
 49		}
 50		else
 51		{
 52			this.pattern = pattern.toCharArray();
 53		}
 54
 55		if (reverseSearch)
 56		{
 57			char[] tmp = new char[this.pattern.length];
 58			for (int i = 0; i < tmp.length; i++)
 59			{
 60				tmp[i] = this.pattern[this.pattern.length - (i + 1)];
 61			}
 62			this.pattern = tmp;
 63		}
 64
 65		this.replace = replace;
 66		this.ignoreCase = ignoreCase;
 67		this.reverseSearch = reverseSearch;
 68		this.beanshell = beanshell;
 69
 70		if(beanshell)
 71		{
 72			this.replaceMethod = replaceMethod;
 73			replaceNS = new NameSpace(BeanShell.getNameSpace(),
 74				"search and replace");
 75		}
 76
 77		generateSkipArray();
 78		generateSuffixArray();
 79	} //}}}
 80
 81	//{{{ nextMatch() method
 82	/**
 83	 * Returns the offset of the first match of the specified text
 84	 * within this matcher.
 85	 * @param text The text to search in
 86	 * @param start True if the start of the segment is the beginning of the
 87	 * buffer
 88	 * @param end True if the end of the segment is the end of the buffer
 89	 * @return an array where the first element is the start offset
 90	 * of the match, and the second element is the end offset of
 91	 * the match
 92	 * @since jEdit 4.0pre3
 93	 */
 94	public int[] nextMatch(CharIndexed text, boolean start, boolean end)
 95	{
 96		int pos = match(text);
 97
 98		if (pos == -1)
 99		{
100			return null;
101		}
102		else
103		{
104			return new int[] { pos, pos + pattern.length };
105		}
106	} //}}}
107
108	//{{{ substitute() method
109	/**
110	 * Returns the specified text, with any substitution specified
111	 * within this matcher performed.
112	 * @param text The text
113	 */
114	public String substitute(String text) throws Exception
115	{
116		if(beanshell)
117		{
118			replaceNS.setVariable("_0",text);
119			Object obj = BeanShell.runCachedBlock(replaceMethod,
120				null,replaceNS);
121			if(obj == null)
122				return null;
123			else
124				return obj.toString();
125		}
126		else
127			return replace;
128	} //}}}
129
130	//{{{ match() method
131	/*
132	 *  a good introduction to the Boyer-Moore fast string matching
133	 *  algorithm may be found on Moore's website at:
134	 *
135	 *   http://www.cs.utexas.edu/users/moore/best-ideas/string-searching/
136	 *
137	 */
138	public int match(CharIndexed text)
139	{
140		// position variable for pattern test position
141		int pos;
142
143		// position variable for pattern start
144		int anchor = 0;
145
146		// last possible start position of a match with this pattern;
147		// this is negative if the pattern is longer than the text
148		// causing the search loop below to immediately fail
149		//int last_anchor = reverseSearch
150		//	? offset + pattern.length - 1
151		//	: length - pattern.length;
152
153		// each time the pattern is checked, we start this many
154		// characters ahead of 'anchor'
155		int pattern_end = pattern.length - 1;
156
157		char ch = 0;
158
159		int bad_char;
160		int good_suffix;
161
162		// the search works by starting the anchor (first character
163		// of the pattern) at the initial offset. as long as the
164		// anchor is far enough from the enough of the text for the
165		// pattern to match, and until the pattern matches, we
166		// compare the pattern to the text from the last character
167		// to the first character in reverse order. where a character
168		// in the pattern mismatches, we use the two heuristics
169		// based on the mismatch character and its position in the
170		// pattern to determine the furthest we can move the anchor
171		// without missing any potential pattern matches.
172SEARCH:
173		while (text.isValid())
174		{
175			for (pos = pattern_end; pos >= 0; --pos)
176			{
177				ch = text.charAt(pos);
178				if(ignoreCase)
179					ch = Character.toUpperCase(ch);
180
181				// pattern test
182				if (ch != pattern[pos])
183				{
184					// character mismatch, determine how many characters to skip
185
186					// heuristic #1
187					bad_char = pos - skip[getSkipIndex(ch)];
188
189					// heuristic #2
190					good_suffix = suffix[pos];
191
192					// skip the greater of the two distances provided by the
193					// heuristics
194					int skip = (bad_char > good_suffix) ? bad_char : good_suffix;
195					anchor += skip;
196					text.move(skip);
197
198					// go back to the while loop
199					continue SEARCH;
200				}
201			}
202
203			// MATCH: return the position of its first character
204			return anchor;
205		}
206
207		// MISMATCH: return -1 as defined by API
208		return -1;
209	} //}}}
210
211	//{{{ Private members
212	private char[] pattern;
213	private String replace;
214	private boolean ignoreCase;
215	private boolean reverseSearch;
216	private boolean beanshell;
217	private String replaceMethod;
218	private NameSpace replaceNS;
219
220	// Boyer-Moore member fields
221	private int[] skip;
222	private int[] suffix;
223	//}}}
224
225	// Boyer-Moore helper methods
226
227	//{{{ generateSkipArray() method
228	/*
229	 *  the 'skip' array is used to determine for each index in the
230	 *  hashed alphabet how many characters can be skipped if
231	 *  a mismatch occurs on a characater hashing to that index.
232	 */
233	private void generateSkipArray()
234	{
235		// initialize the skip array to all zeros
236		skip = new int[256];
237
238		// leave the table cleanly-initialized for an empty pattern
239		if (pattern.length == 0) return;
240
241		int pos = 0;
242
243		do
244		{
245			skip[getSkipIndex(pattern[pos])] = pos;
246		}
247		while (++pos < pattern.length);
248	} //}}}
249
250	//{{{ getSkipIndex() method
251	/*
252	 *  to avoid our skip table having a length of 2 ^ 16, we hash each
253	 *  character of the input into a character in the alphabet [\x00-\xFF]
254	 *  using the lower 8 bits of the character's value (resulting in
255	 *  a more reasonable skip table of length 2 ^ 8).
256	 *
257	 *  the result of this is that more than one character can hash to the
258	 *  same index, but since the skip table encodes the position of
259	 *  occurence of the character furthest into the string with a particular
260	 *  index (whether or not it is the only character with that index), an
261	 *  index collision only means that that this heuristic will give a
262	 *  sub-optimal skip (i.e. a complete skip table could use the differences
263	 *  between colliding characters to maximal effect, at the expense of
264	 *  building a table that is over 2 orders of magnitude larger and very
265	 *  sparse).
266	 */
267	private static final int getSkipIndex(char ch)
268	{
269		return ((int) ch) & 0x000000FF;
270	} //}}}
271
272	//{{{ generateSuffixArray() method
273	/*
274	 *  XXX: hairy code that is basically just a functional(?) port of some
275	 *  other code i barely understood
276	 */
277	private void generateSuffixArray()
278	{
279		int m = pattern.length;
280
281		int j = m + 1;
282
283		suffix = new int[j];
284		int[] tmp = new int[j];
285		tmp[m] = j;
286
287		for (int i = m; i > 0; --i)
288		{
289			while (j <= m && pattern[i - 1] != pattern[j - 1])
290			{
291				if (suffix[j] == 0)
292				{
293					suffix[j] = j - i;
294				}
295
296				j = tmp[j];
297			}
298
299			tmp[i - 1] = --j;
300		}
301
302		int k = tmp[0];
303
304		for (j = 0; j <= m; j++)
305		{
306			// the code above builds a 1-indexed suffix array,
307			// but we shift it to be 0-indexed, ignoring the
308			// original 0-th element
309			if (j > 0)
310			{
311				suffix[j - 1] = (suffix[j] == 0) ? k : suffix[j];
312			}
313
314			if (j == k)
315			{
316				k = tmp[k];
317			}
318		}
319	} //}}}
320
321	//}}}
322}