PageRenderTime 65ms CodeModel.GetById 9ms app.highlight 47ms RepoModel.GetById 1ms app.codeStats 0ms

/cssed-0.4.0/scintilla/src/RESearch.cxx

#
C++ | 835 lines | 506 code | 67 blank | 262 comment | 151 complexity | 663b69e9b6ecefea4cb372cdeca8c46d MD5 | raw file
  1// Scintilla source code edit control
  2/** @file RESearch.cxx
  3 ** Regular expression search library.
  4 **/
  5
  6/*
  7 * regex - Regular expression pattern matching  and replacement
  8 *
  9 * By:  Ozan S. Yigit (oz)
 10 *      Dept. of Computer Science
 11 *      York University
 12 *
 13 * Original code available from http://www.cs.yorku.ca/~oz/
 14 * Translation to C++ by Neil Hodgson neilh@scintilla.org
 15 * Removed all use of register.
 16 * Converted to modern function prototypes.
 17 * Put all global/static variables into an object so this code can be
 18 * used from multiple threads etc.
 19 *
 20 * These routines are the PUBLIC DOMAIN equivalents of regex
 21 * routines as found in 4.nBSD UN*X, with minor extensions.
 22 *
 23 * These routines are derived from various implementations found
 24 * in software tools books, and Conroy's grep. They are NOT derived
 25 * from licensed/restricted software.
 26 * For more interesting/academic/complicated implementations,
 27 * see Henry Spencer's regexp routines, or GNU Emacs pattern
 28 * matching module.
 29 *
 30 * Modification history removed.
 31 *
 32 * Interfaces:
 33 *      RESearch::Compile:        compile a regular expression into a NFA.
 34 *
 35 *			char *RESearch::Compile(s)
 36 *			char *s;
 37 *
 38 *      RESearch::Execute:        execute the NFA to match a pattern.
 39 *
 40 *			int RESearch::Execute(s)
 41 *			char *s;
 42 *
 43 *	RESearch::ModifyWord		change RESearch::Execute's understanding of what a "word"
 44 *			looks like (for \< and \>) by adding into the
 45 *			hidden word-syntax table.
 46 *
 47 *			void RESearch::ModifyWord(s)
 48 *			char *s;
 49 *
 50 *      RESearch::Substitute:	substitute the matched portions in a new string.
 51 *
 52 *			int RESearch::Substitute(src, dst)
 53 *			char *src;
 54 *			char *dst;
 55 *
 56 *	re_fail:	failure routine for RESearch::Execute.
 57 *
 58 *			void re_fail(msg, op)
 59 *			char *msg;
 60 *			char op;
 61 *
 62 * Regular Expressions:
 63 *
 64 *      [1]     char    matches itself, unless it is a special
 65 *                      character (metachar): . \ [ ] * + ^ $
 66 *
 67 *      [2]     .       matches any character.
 68 *
 69 *      [3]     \       matches the character following it, except
 70 *			when followed by a left or right round bracket,
 71 *			a digit 1 to 9 or a left or right angle bracket.
 72 *			(see [7], [8] and [9])
 73 *			It is used as an escape character for all
 74 *			other meta-characters, and itself. When used
 75 *			in a set ([4]), it is treated as an ordinary
 76 *			character.
 77 *
 78 *      [4]     [set]   matches one of the characters in the set.
 79 *                      If the first character in the set is "^",
 80 *                      it matches a character NOT in the set, i.e.
 81 *			complements the set. A shorthand S-E is
 82 *			used to specify a set of characters S upto
 83 *			E, inclusive. The special characters "]" and
 84 *			"-" have no special meaning if they appear
 85 *			as the first chars in the set.
 86 *                      examples:        match:
 87 *
 88 *                              [a-z]    any lowercase alpha
 89 *
 90 *                              [^]-]    any char except ] and -
 91 *
 92 *                              [^A-Z]   any char except uppercase
 93 *                                       alpha
 94 *
 95 *                              [a-zA-Z] any alpha
 96 *
 97 *      [5]     *       any regular expression form [1] to [4], followed by
 98 *                      closure char (*) matches zero or more matches of
 99 *                      that form.
100 *
101 *      [6]     +       same as [5], except it matches one or more.
102 *
103 *      [7]             a regular expression in the form [1] to [10], enclosed
104 *                      as \(form\) matches what form matches. The enclosure
105 *                      creates a set of tags, used for [8] and for
106 *                      pattern substution. The tagged forms are numbered
107 *			starting from 1.
108 *
109 *      [8]             a \ followed by a digit 1 to 9 matches whatever a
110 *                      previously tagged regular expression ([7]) matched.
111 *
112 *	[9]	\<	a regular expression starting with a \< construct
113 *		\>	and/or ending with a \> construct, restricts the
114 *			pattern matching to the beginning of a word, and/or
115 *			the end of a word. A word is defined to be a character
116 *			string beginning and/or ending with the characters
117 *			A-Z a-z 0-9 and _. It must also be preceded and/or
118 *			followed by any character outside those mentioned.
119 *
120 *      [10]            a composite regular expression xy where x and y
121 *                      are in the form [1] to [10] matches the longest
122 *                      match of x followed by a match for y.
123 *
124 *      [11]	^	a regular expression starting with a ^ character
125 *		$	and/or ending with a $ character, restricts the
126 *                      pattern matching to the beginning of the line,
127 *                      or the end of line. [anchors] Elsewhere in the
128 *			pattern, ^ and $ are treated as ordinary characters.
129 *
130 *
131 * Acknowledgements:
132 *
133 *	HCR's Hugh Redelmeier has been most helpful in various
134 *	stages of development. He convinced me to include BOW
135 *	and EOW constructs, originally invented by Rob Pike at
136 *	the University of Toronto.
137 *
138 * References:
139 *              Software tools			Kernighan & Plauger
140 *              Software tools in Pascal        Kernighan & Plauger
141 *              Grep [rsx-11 C dist]            David Conroy
142 *		ed - text editor		Un*x Programmer's Manual
143 *		Advanced editing on Un*x	B. W. Kernighan
144 *		RegExp routines			Henry Spencer
145 *
146 * Notes:
147 *
148 *	This implementation uses a bit-set representation for character
149 *	classes for speed and compactness. Each character is represented
150 *	by one bit in a 128-bit block. Thus, CCL always takes a
151 *	constant 16 bytes in the internal nfa, and RESearch::Execute does a single
152 *	bit comparison to locate the character in the set.
153 *
154 * Examples:
155 *
156 *	pattern:	foo*.*
157 *	compile:	CHR f CHR o CLO CHR o END CLO ANY END END
158 *	matches:	fo foo fooo foobar fobar foxx ...
159 *
160 *	pattern:	fo[ob]a[rz]
161 *	compile:	CHR f CHR o CCL bitset CHR a CCL bitset END
162 *	matches:	fobar fooar fobaz fooaz
163 *
164 *	pattern:	foo\\+
165 *	compile:	CHR f CHR o CHR o CHR \ CLO CHR \ END END
166 *	matches:	foo\ foo\\ foo\\\  ...
167 *
168 *	pattern:	\(foo\)[1-3]\1	(same as foo[1-3]foo)
169 *	compile:	BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
170 *	matches:	foo1foo foo2foo foo3foo
171 *
172 *	pattern:	\(fo.*\)-\1
173 *	compile:	BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
174 *	matches:	foo-foo fo-fo fob-fob foobar-foobar ...
175 */
176
177#include "RESearch.h"

178
179#define OKP     1

180#define NOP     0

181
182#define CHR     1

183#define ANY     2

184#define CCL     3

185#define BOL     4

186#define EOL     5

187#define BOT     6

188#define EOT     7

189#define BOW	8

190#define EOW	9

191#define REF     10

192#define CLO     11

193
194#define END     0

195
196/*
197 * The following defines are not meant to be changeable.
198 * They are for readability only.
199 */
200#define BLKIND	0370

201#define BITIND	07

202
203#define ASCIIB	0177

204
205const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
206
207#define badpat(x)	(*nfa = END, x)

208
209RESearch::RESearch() {
210	Init();
211}
212
213RESearch::~RESearch() {
214	Clear();
215}
216
217void RESearch::Init() {
218	sta = NOP;               	/* status of lastpat */
219	bol = 0;
220	for (int i=0; i<MAXTAG; i++)
221		pat[i] = 0;
222	for (int j=0; j<BITBLK; j++)
223		bittab[j] = 0;
224}
225
226void RESearch::Clear() {
227	for (int i=0; i<MAXTAG; i++) {
228		delete []pat[i];
229		pat[i] = 0;
230		bopat[i] = NOTFOUND;
231		eopat[i] = NOTFOUND;
232	}
233}
234
235bool RESearch::GrabMatches(CharacterIndexer &ci) {
236	bool success = true;
237	for (unsigned int i=0; i<MAXTAG; i++) {
238		if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
239			unsigned int len = eopat[i] - bopat[i];
240			pat[i] = new char[len + 1];
241			if (pat[i]) {
242				for (unsigned int j=0; j<len; j++)
243					pat[i][j] = ci.CharAt(bopat[i] + j);
244				pat[i][len] = '\0';
245			} else {
246				success = false;
247			}
248		}
249	}
250	return success;
251}
252
253void RESearch::ChSet(char c) {
254	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
255}
256
257void RESearch::ChSetWithCase(char c, bool caseSensitive) {
258	if (caseSensitive) {
259		ChSet(c);
260	} else {
261		if ((c >= 'a') && (c <= 'z')) {
262			ChSet(c);
263			ChSet(static_cast<char>(c - 'a' + 'A'));
264		} else if ((c >= 'A') && (c <= 'Z')) {
265			ChSet(c);
266			ChSet(static_cast<char>(c - 'A' + 'a'));
267		} else {
268			ChSet(c);
269		}
270	}
271}
272
273const char escapeValue(char ch) {
274	switch (ch) {
275	case 'a':	return '\a';
276	case 'b':	return '\b';
277	case 'f':	return '\f';
278	case 'n':	return '\n';
279	case 'r':	return '\r';
280	case 't':	return '\t';
281	case 'v':	return '\v';
282	}
283	return 0;
284}
285
286const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
287	char *mp=nfa;          /* nfa pointer       */
288	char *lp;              /* saved pointer..   */
289	char *sp=nfa;          /* another one..     */
290    char *mpMax = mp + MAXNFA - BITBLK - 10;
291
292	int tagi = 0;          /* tag stack index   */
293	int tagc = 1;          /* actual tag count  */
294
295	int n;
296	char mask;		/* xor mask -CCL/NCL */
297	int c1, c2;
298
299	if (!pat || !length)
300		if (sta)
301			return 0;
302		else
303			return badpat("No previous regular expression");
304	sta = NOP;
305
306	const char *p=pat;               /* pattern pointer   */
307	for (int i=0; i<length; i++, p++) {
308		if (mp > mpMax)
309			return badpat("Pattern too long");
310		lp = mp;
311		switch(*p) {
312
313		case '.':               /* match any char..  */
314			*mp++ = ANY;
315			break;
316
317		case '^':               /* match beginning.. */
318			if (p == pat)
319				*mp++ = BOL;
320			else {
321				*mp++ = CHR;
322				*mp++ = *p;
323			}
324			break;
325
326		case '$':               /* match endofline.. */
327			if (!*(p+1))
328				*mp++ = EOL;
329			else {
330				*mp++ = CHR;
331				*mp++ = *p;
332			}
333			break;
334
335		case '[':               /* match char class..*/
336			*mp++ = CCL;
337
338			i++;
339			if (*++p == '^') {
340				mask = '\377';
341				i++;
342				p++;
343			} else
344				mask = 0;
345
346			if (*p == '-') {		/* real dash */
347				i++;
348				ChSet(*p++);
349			}
350			if (*p == ']') {	/* real brace */
351				i++;
352				ChSet(*p++);
353			}
354			while (*p && *p != ']') {
355				if (*p == '-' && *(p+1) && *(p+1) != ']') {
356					i++;
357					p++;
358					c1 = *(p-2) + 1;
359					i++;
360					c2 = *p++;
361					while (c1 <= c2) {
362						ChSetWithCase(static_cast<char>(c1++), caseSensitive);
363					}
364				} else if (*p == '\\' && *(p+1)) {
365					i++;
366					p++;
367					char escape = escapeValue(*p);
368					if (escape)
369						ChSetWithCase(escape, caseSensitive);
370					else
371						ChSetWithCase(*p, caseSensitive);
372					i++;
373					p++;
374				} else {
375					i++;
376					ChSetWithCase(*p++, caseSensitive);
377				}
378			}
379			if (!*p)
380				return badpat("Missing ]");
381
382			for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
383				*mp++ = static_cast<char>(mask ^ bittab[n]);
384
385			break;
386
387		case '*':               /* match 0 or more.. */
388		case '+':               /* match 1 or more.. */
389			if (p == pat)
390				return badpat("Empty closure");
391			lp = sp;		/* previous opcode */
392			if (*lp == CLO)		/* equivalence..   */
393				break;
394			switch(*lp) {
395
396			case BOL:
397			case BOT:
398			case EOT:
399			case BOW:
400			case EOW:
401			case REF:
402				return badpat("Illegal closure");
403			default:
404				break;
405			}
406
407			if (*p == '+')
408				for (sp = mp; lp < sp; lp++)
409					*mp++ = *lp;
410
411			*mp++ = END;
412			*mp++ = END;
413			sp = mp;
414			while (--mp > lp)
415				*mp = mp[-1];
416			*mp = CLO;
417			mp = sp;
418			break;
419
420		case '\\':              /* tags, backrefs .. */
421			i++;
422			switch(*++p) {
423
424			case '<':
425				*mp++ = BOW;
426				break;
427			case '>':
428				if (*sp == BOW)
429					return badpat("Null pattern inside \\<\\>");
430				*mp++ = EOW;
431				break;
432			case '1':
433			case '2':
434			case '3':
435			case '4':
436			case '5':
437			case '6':
438			case '7':
439			case '8':
440			case '9':
441				n = *p-'0';
442				if (tagi > 0 && tagstk[tagi] == n)
443					return badpat("Cyclical reference");
444				if (tagc > n) {
445					*mp++ = static_cast<char>(REF);
446					*mp++ = static_cast<char>(n);
447				}
448				else
449					return badpat("Undetermined reference");
450				break;
451			case 'a':
452			case 'b':
453			case 'n':
454			case 'f':
455			case 'r':
456			case 't':
457			case 'v':
458				*mp++ = CHR;
459				*mp++ = escapeValue(*p);
460				break;
461			default:
462				if (!posix && *p == '(') {
463					if (tagc < MAXTAG) {
464						tagstk[++tagi] = tagc;
465						*mp++ = BOT;
466						*mp++ = static_cast<char>(tagc++);
467					}
468					else
469						return badpat("Too many \\(\\) pairs");
470				} else if (!posix && *p == ')') {
471					if (*sp == BOT)
472						return badpat("Null pattern inside \\(\\)");
473					if (tagi > 0) {
474						*mp++ = static_cast<char>(EOT);
475						*mp++ = static_cast<char>(tagstk[tagi--]);
476					}
477					else
478						return badpat("Unmatched \\)");
479				} else {
480					*mp++ = CHR;
481					*mp++ = *p;
482				}
483			}
484			break;
485
486		default :               /* an ordinary char  */
487			if (posix && *p == '(') {
488				if (tagc < MAXTAG) {
489					tagstk[++tagi] = tagc;
490					*mp++ = BOT;
491					*mp++ = static_cast<char>(tagc++);
492				}
493				else
494					return badpat("Too many () pairs");
495			} else if (posix && *p == ')') {
496				if (*sp == BOT)
497					return badpat("Null pattern inside ()");
498				if (tagi > 0) {
499					*mp++ = static_cast<char>(EOT);
500					*mp++ = static_cast<char>(tagstk[tagi--]);
501				}
502				else
503					return badpat("Unmatched )");
504			} else if (caseSensitive) {
505				*mp++ = CHR;
506				*mp++ = *p;
507			} else {
508				*mp++ = CCL;
509				mask = 0;
510				ChSetWithCase(*p, false);
511				for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
512					*mp++ = static_cast<char>(mask ^ bittab[n]);
513			}
514			break;
515		}
516		sp = lp;
517	}
518	if (tagi > 0)
519		return badpat((posix ? "Unmatched (" : "Unmatched \\("));
520	*mp = END;
521	sta = OKP;
522	return 0;
523}
524
525/*
526 * RESearch::Execute:
527 * 	execute nfa to find a match.
528 *
529 *	special cases: (nfa[0])
530 *		BOL
531 *			Match only once, starting from the
532 *			beginning.
533 *		CHR
534 *			First locate the character without
535 *			calling PMatch, and if found, call
536 *			PMatch for the remaining string.
537 *		END
538 *			RESearch::Compile failed, poor luser did not
539 *			check for it. Fail fast.
540 *
541 *	If a match is found, bopat[0] and eopat[0] are set
542 *	to the beginning and the end of the matched fragment,
543 *	respectively.
544 *
545 */
546
547int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
548	char c;
549	int ep = NOTFOUND;
550	char *ap = nfa;
551
552	bol = lp;
553	failure = 0;
554
555	Clear();
556
557	switch(*ap) {
558
559	case BOL:			/* anchored: match from BOL only */
560		ep = PMatch(ci, lp, endp, ap);
561		break;
562	case EOL:			/* just searching for end of line normal path doesn't work */
563		if (*(ap+1) == END) {
564			lp = endp;
565			ep = lp;
566			break;
567		} else {
568			return 0;
569		}
570	case CHR:			/* ordinary char: locate it fast */
571		c = *(ap+1);
572		while ((lp < endp) && (ci.CharAt(lp) != c))
573			lp++;
574		if (lp >= endp)		/* if EOS, fail, else fall thru. */
575			return 0;
576	default:			/* regular matching all the way. */
577		while (lp < endp) {
578			ep = PMatch(ci, lp, endp, ap);
579			if (ep != NOTFOUND)
580				break;
581			lp++;
582		}
583		break;
584	case END:			/* munged automaton. fail always */
585		return 0;
586	}
587	if (ep == NOTFOUND)
588		return 0;
589
590	bopat[0] = lp;
591	eopat[0] = ep;
592	return 1;
593}
594
595/*
596 * PMatch: internal routine for the hard part
597 *
598 * 	This code is partly snarfed from an early grep written by
599 *	David Conroy. The backref and tag stuff, and various other
600 *	innovations are by oz.
601 *
602 *	special case optimizations: (nfa[n], nfa[n+1])
603 *		CLO ANY
604 *			We KNOW .* will match everything upto the
605 *			end of line. Thus, directly go to the end of
606 *			line, without recursive PMatch calls. As in
607 *			the other closure cases, the remaining pattern
608 *			must be matched by moving backwards on the
609 *			string recursively, to find a match for xy
610 *			(x is ".*" and y is the remaining pattern)
611 *			where the match satisfies the LONGEST match for
612 *			x followed by a match for y.
613 *		CLO CHR
614 *			We can again scan the string forward for the
615 *			single char and at the point of failure, we
616 *			execute the remaining nfa recursively, same as
617 *			above.
618 *
619 *	At the end of a successful match, bopat[n] and eopat[n]
620 *	are set to the beginning and end of subpatterns matched
621 *	by tagged expressions (n = 1 to 9).
622 *
623 */
624
625extern void re_fail(char *,char);
626
627/*
628 * character classification table for word boundary operators BOW
629 * and EOW. the reason for not using ctype macros is that we can
630 * let the user add into our own table. see RESearch::ModifyWord. This table
631 * is not in the bitset form, since we may wish to extend it in the
632 * future for other character classifications.
633 *
634 *	TRUE for 0-9 A-Z a-z _
635 */
636static char chrtyp[MAXCHR] = {
637	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
638	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
639	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
640	0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
641	0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
642	1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
643	0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
644	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
645	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
646	1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
647	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
648	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
649	1, 1, 1, 0, 0, 0, 0, 0
650	};
651
652#define inascii(x)	(0177&(x))

653#define iswordc(x) 	chrtyp[inascii(x)]

654#define isinset(x,y) 	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])

655
656/*
657 * skip values for CLO XXX to skip past the closure
658 */
659
660#define ANYSKIP	2 	/* [CLO] ANY END ...	     */

661#define CHRSKIP	3	/* [CLO] CHR chr END ...     */

662#define CCLSKIP 34	/* [CLO] CCL 32bytes END ... */

663
664int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
665	int op, c, n;
666	int e;		/* extra pointer for CLO */
667	int bp;		/* beginning of subpat.. */
668	int ep;		/* ending of subpat..	 */
669	int are;			/* to save the line ptr. */
670
671	while ((op = *ap++) != END)
672		switch(op) {
673
674		case CHR:
675			if (ci.CharAt(lp++) != *ap++)
676				return NOTFOUND;
677			break;
678		case ANY:
679			if (lp++ >= endp)
680				return NOTFOUND;
681			break;
682		case CCL:
683			c = ci.CharAt(lp++);
684			if (!isinset(ap,c))
685				return NOTFOUND;
686			ap += BITBLK;
687			break;
688		case BOL:
689			if (lp != bol)
690				return NOTFOUND;
691			break;
692		case EOL:
693			if (lp < endp)
694				return NOTFOUND;
695			break;
696		case BOT:
697			bopat[*ap++] = lp;
698			break;
699		case EOT:
700			eopat[*ap++] = lp;
701			break;
702 		case BOW:
703			if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
704				return NOTFOUND;
705			break;
706		case EOW:
707			if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
708				return NOTFOUND;
709			break;
710		case REF:
711			n = *ap++;
712			bp = bopat[n];
713			ep = eopat[n];
714			while (bp < ep)
715				if (ci.CharAt(bp++) != ci.CharAt(lp++))
716					return NOTFOUND;
717			break;
718		case CLO:
719			are = lp;
720			switch(*ap) {
721
722			case ANY:
723				while (lp < endp)
724					lp++;
725				n = ANYSKIP;
726				break;
727			case CHR:
728				c = *(ap+1);
729				while ((lp < endp) && (c == ci.CharAt(lp)))
730					lp++;
731				n = CHRSKIP;
732				break;
733			case CCL:
734				while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
735					lp++;
736				n = CCLSKIP;
737				break;
738			default:
739				failure = true;
740				//re_fail("closure: bad nfa.", *ap);
741				return NOTFOUND;
742			}
743
744			ap += n;
745
746			while (lp >= are) {
747				if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
748					return e;
749				--lp;
750			}
751			return NOTFOUND;
752		default:
753			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
754			return NOTFOUND;
755		}
756	return lp;
757}
758
759/*
760 * RESearch::ModifyWord:
761 *	add new characters into the word table to change RESearch::Execute's
762 *	understanding of what a word should look like. Note that we
763 *	only accept additions into the word definition.
764 *
765 *	If the string parameter is 0 or null string, the table is
766 *	reset back to the default containing A-Z a-z 0-9 _. [We use
767 *	the compact bitset representation for the default table]
768 */
769
770static char deftab[16] = {
771	0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
772	'\376', '\377', '\377', 007
773};
774
775void RESearch::ModifyWord(char *s) {
776	int i;
777
778	if (!s || !*s) {
779		for (i = 0; i < MAXCHR; i++)
780			if (!isinset(deftab,i))
781				iswordc(i) = 0;
782	}
783	else
784		while(*s)
785			iswordc(*s++) = 1;
786}
787
788/*
789 * RESearch::Substitute:
790 *	substitute the matched portions of the src in dst.
791 *
792 *	&	substitute the entire matched pattern.
793 *
794 *	\digit	substitute a subpattern, with the given	tag number.
795 *		Tags are numbered from 1 to 9. If the particular
796 *		tagged subpattern does not exist, null is substituted.
797 */
798int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
799	char c;
800	int  pin;
801	int bp;
802	int ep;
803
804	if (!*src || !bopat[0])
805		return 0;
806
807	while ((c = *src++) != 0) {
808		switch(c) {
809
810		case '&':
811			pin = 0;
812			break;
813
814		case '\\':
815			c = *src++;
816			if (c >= '0' && c <= '9') {
817				pin = c - '0';
818				break;
819			}
820
821		default:
822			*dst++ = c;
823			continue;
824		}
825
826		if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
827			while (ci.CharAt(bp) && bp < ep)
828				*dst++ = ci.CharAt(bp++);
829			if (bp < ep)
830				return 0;
831		}
832	}
833	*dst = (char) 0;
834	return 1;
835}