PageRenderTime 57ms CodeModel.GetById 22ms RepoModel.GetById 0ms app.codeStats 0ms

/cssed-0.4.0/scintilla/src/RESearch.cxx

#
C++ | 835 lines | 506 code | 67 blank | 262 comment | 151 complexity | 663b69e9b6ecefea4cb372cdeca8c46d MD5 | raw file
Possible License(s): GPL-2.0
  1. // Scintilla source code edit control
  2. /** @file RESearch.cxx
  3. ** Regular expression search library.
  4. **/
  5. /*
  6. * regex - Regular expression pattern matching and replacement
  7. *
  8. * By: Ozan S. Yigit (oz)
  9. * Dept. of Computer Science
  10. * York University
  11. *
  12. * Original code available from http://www.cs.yorku.ca/~oz/
  13. * Translation to C++ by Neil Hodgson neilh@scintilla.org
  14. * Removed all use of register.
  15. * Converted to modern function prototypes.
  16. * Put all global/static variables into an object so this code can be
  17. * used from multiple threads etc.
  18. *
  19. * These routines are the PUBLIC DOMAIN equivalents of regex
  20. * routines as found in 4.nBSD UN*X, with minor extensions.
  21. *
  22. * These routines are derived from various implementations found
  23. * in software tools books, and Conroy's grep. They are NOT derived
  24. * from licensed/restricted software.
  25. * For more interesting/academic/complicated implementations,
  26. * see Henry Spencer's regexp routines, or GNU Emacs pattern
  27. * matching module.
  28. *
  29. * Modification history removed.
  30. *
  31. * Interfaces:
  32. * RESearch::Compile: compile a regular expression into a NFA.
  33. *
  34. * char *RESearch::Compile(s)
  35. * char *s;
  36. *
  37. * RESearch::Execute: execute the NFA to match a pattern.
  38. *
  39. * int RESearch::Execute(s)
  40. * char *s;
  41. *
  42. * RESearch::ModifyWord change RESearch::Execute's understanding of what a "word"
  43. * looks like (for \< and \>) by adding into the
  44. * hidden word-syntax table.
  45. *
  46. * void RESearch::ModifyWord(s)
  47. * char *s;
  48. *
  49. * RESearch::Substitute: substitute the matched portions in a new string.
  50. *
  51. * int RESearch::Substitute(src, dst)
  52. * char *src;
  53. * char *dst;
  54. *
  55. * re_fail: failure routine for RESearch::Execute.
  56. *
  57. * void re_fail(msg, op)
  58. * char *msg;
  59. * char op;
  60. *
  61. * Regular Expressions:
  62. *
  63. * [1] char matches itself, unless it is a special
  64. * character (metachar): . \ [ ] * + ^ $
  65. *
  66. * [2] . matches any character.
  67. *
  68. * [3] \ matches the character following it, except
  69. * when followed by a left or right round bracket,
  70. * a digit 1 to 9 or a left or right angle bracket.
  71. * (see [7], [8] and [9])
  72. * It is used as an escape character for all
  73. * other meta-characters, and itself. When used
  74. * in a set ([4]), it is treated as an ordinary
  75. * character.
  76. *
  77. * [4] [set] matches one of the characters in the set.
  78. * If the first character in the set is "^",
  79. * it matches a character NOT in the set, i.e.
  80. * complements the set. A shorthand S-E is
  81. * used to specify a set of characters S up to
  82. * E, inclusive. The special characters "]" and
  83. * "-" have no special meaning if they appear
  84. * as the first chars in the set.
  85. * examples: match:
  86. *
  87. * [a-z] any lowercase alpha
  88. *
  89. * [^]-] any char except ] and -
  90. *
  91. * [^A-Z] any char except uppercase
  92. * alpha
  93. *
  94. * [a-zA-Z] any alpha
  95. *
  96. * [5] * any regular expression form [1] to [4], followed by
  97. * closure char (*) matches zero or more matches of
  98. * that form.
  99. *
  100. * [6] + same as [5], except it matches one or more.
  101. *
  102. * [7] a regular expression in the form [1] to [10], enclosed
  103. * as \(form\) matches what form matches. The enclosure
  104. * creates a set of tags, used for [8] and for
  105. * pattern substitution. The tagged forms are numbered
  106. * starting from 1.
  107. *
  108. * [8] a \ followed by a digit 1 to 9 matches whatever a
  109. * previously tagged regular expression ([7]) matched.
  110. *
  111. * [9] \< a regular expression starting with a \< construct
  112. * \> and/or ending with a \> construct, restricts the
  113. * pattern matching to the beginning of a word, and/or
  114. * the end of a word. A word is defined to be a character
  115. * string beginning and/or ending with the characters
  116. * A-Z a-z 0-9 and _. It must also be preceded and/or
  117. * followed by any character outside those mentioned.
  118. *
  119. * [10] a composite regular expression xy where x and y
  120. * are in the form [1] to [10] matches the longest
  121. * match of x followed by a match for y.
  122. *
  123. * [11] ^ a regular expression starting with a ^ character
  124. * $ and/or ending with a $ character, restricts the
  125. * pattern matching to the beginning of the line,
  126. * or the end of line. [anchors] Elsewhere in the
  127. * pattern, ^ and $ are treated as ordinary characters.
  128. *
  129. *
  130. * Acknowledgements:
  131. *
  132. * HCR's Hugh Redelmeier has been most helpful in various
  133. * stages of development. He convinced me to include BOW
  134. * and EOW constructs, originally invented by Rob Pike at
  135. * the University of Toronto.
  136. *
  137. * References:
  138. * Software tools Kernighan & Plauger
  139. * Software tools in Pascal Kernighan & Plauger
  140. * Grep [rsx-11 C dist] David Conroy
  141. * ed - text editor Un*x Programmer's Manual
  142. * Advanced editing on Un*x B. W. Kernighan
  143. * RegExp routines Henry Spencer
  144. *
  145. * Notes:
  146. *
  147. * This implementation uses a bit-set representation for character
  148. * classes for speed and compactness. Each character is represented
  149. * by one bit in a block of BITBLK bytes (16 in the original code; the
  150. * CCLSKIP note below implies 32 here). Thus, CCL always takes a constant
  151. * BITBLK bytes in the nfa, and RESearch::Execute does a single bit comparison to locate the character in the set.
  152. *
  153. * Examples:
  154. *
  155. * pattern: foo*.*
  156. * compile: CHR f CHR o CLO CHR o END CLO ANY END END
  157. * matches: fo foo fooo foobar fobar foxx ...
  158. *
  159. * pattern: fo[ob]a[rz]
  160. * compile: CHR f CHR o CCL bitset CHR a CCL bitset END
  161. * matches: fobar fooar fobaz fooaz
  162. *
  163. * pattern: foo\\+
  164. * compile: CHR f CHR o CHR o CHR \ CLO CHR \ END END
  165. * matches: foo\ foo\\ foo\\\ ...
  166. *
  167. * pattern: \(foo\)[1-3]\1 (same as foo[1-3]foo)
  168. * compile: BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
  169. * matches: foo1foo foo2foo foo3foo
  170. *
  171. * pattern: \(fo.*\)-\1
  172. * compile: BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
  173. * matches: foo-foo fo-fo fob-fob foobar-foobar ...
  174. */
  175. #include "RESearch.h"
  /* Values of RESearch::sta: set to NOP when a compile starts, OKP when it succeeds. */
  #define NOP 0
  #define OKP 1

  /*
   * Opcodes written into the nfa by RESearch::Compile and interpreted by
   * RESearch::PMatch. END terminates the program and each closure body.
   */
  #define END 0
  #define CHR 1    /* followed by the literal character */
  #define ANY 2
  #define CCL 3    /* followed by a BITBLK-byte bit set */
  #define BOL 4
  #define EOL 5
  #define BOT 6    /* followed by the tag number */
  #define EOT 7    /* followed by the tag number */
  #define BOW 8
  #define EOW 9
  #define REF 10   /* followed by the tag number */
  #define CLO 11

  /*
   * The following defines are not meant to be changeable.
   * They are for readability only.
   */
  #define BLKIND 0xF8   /* selects the byte of a bit set (value >> 3) */
  #define BITIND 0x07   /* selects the bit inside that byte */
  #define ASCIIB 0x7F

  /* Single-bit masks, indexed by (character & BITIND). */
  const char bitarr[] = {
      '\001', '\002', '\004', '\010', '\020', '\040', '\100', '\200'
  };

  /* Mark the compiled program as unusable and yield the error message x. */
  #define badpat(x) (*nfa = END, x)
  199. RESearch::RESearch() {
  200. Init();
  201. }
  202. RESearch::~RESearch() {
  203. Clear();
  204. }
  205. void RESearch::Init() {
  206. sta = NOP; /* status of lastpat */
  207. bol = 0;
  208. for (int i=0; i<MAXTAG; i++)
  209. pat[i] = 0;
  210. for (int j=0; j<BITBLK; j++)
  211. bittab[j] = 0;
  212. }
  213. void RESearch::Clear() {
  214. for (int i=0; i<MAXTAG; i++) {
  215. delete []pat[i];
  216. pat[i] = 0;
  217. bopat[i] = NOTFOUND;
  218. eopat[i] = NOTFOUND;
  219. }
  220. }
  221. bool RESearch::GrabMatches(CharacterIndexer &ci) {
  222. bool success = true;
  223. for (unsigned int i=0; i<MAXTAG; i++) {
  224. if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
  225. unsigned int len = eopat[i] - bopat[i];
  226. pat[i] = new char[len + 1];
  227. if (pat[i]) {
  228. for (unsigned int j=0; j<len; j++)
  229. pat[i][j] = ci.CharAt(bopat[i] + j);
  230. pat[i][len] = '\0';
  231. } else {
  232. success = false;
  233. }
  234. }
  235. }
  236. return success;
  237. }
  238. void RESearch::ChSet(char c) {
  239. bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
  240. }
  241. void RESearch::ChSetWithCase(char c, bool caseSensitive) {
  242. if (caseSensitive) {
  243. ChSet(c);
  244. } else {
  245. if ((c >= 'a') && (c <= 'z')) {
  246. ChSet(c);
  247. ChSet(static_cast<char>(c - 'a' + 'A'));
  248. } else if ((c >= 'A') && (c <= 'Z')) {
  249. ChSet(c);
  250. ChSet(static_cast<char>(c - 'A' + 'a'));
  251. } else {
  252. ChSet(c);
  253. }
  254. }
  255. }
  /**
   * Map the letter of a backslash escape (a b f n r t v) to the control
   * character it denotes.
   *
   * @return the control character, or 0 when ch is not one of those letters.
   */
  const char escapeValue(char ch) {
      static const char letters[] = "abfnrtv";
      static const char values[] = "\a\b\f\n\r\t\v";
      for (int k = 0; letters[k] != '\0'; k++) {
          if (letters[k] == ch)
              return values[k];
      }
      return 0;
  }
  268. const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
  269. char *mp=nfa; /* nfa pointer */
  270. char *lp; /* saved pointer.. */
  271. char *sp=nfa; /* another one.. */
  272. char *mpMax = mp + MAXNFA - BITBLK - 10;
  273. int tagi = 0; /* tag stack index */
  274. int tagc = 1; /* actual tag count */
  275. int n;
  276. char mask; /* xor mask -CCL/NCL */
  277. int c1, c2;
  278. if (!pat || !length)
  279. if (sta)
  280. return 0;
  281. else
  282. return badpat("No previous regular expression");
  283. sta = NOP;
  284. const char *p=pat; /* pattern pointer */
  285. for (int i=0; i<length; i++, p++) {
  286. if (mp > mpMax)
  287. return badpat("Pattern too long");
  288. lp = mp;
  289. switch(*p) {
  290. case '.': /* match any char.. */
  291. *mp++ = ANY;
  292. break;
  293. case '^': /* match beginning.. */
  294. if (p == pat)
  295. *mp++ = BOL;
  296. else {
  297. *mp++ = CHR;
  298. *mp++ = *p;
  299. }
  300. break;
  301. case '$': /* match endofline.. */
  302. if (!*(p+1))
  303. *mp++ = EOL;
  304. else {
  305. *mp++ = CHR;
  306. *mp++ = *p;
  307. }
  308. break;
  309. case '[': /* match char class..*/
  310. *mp++ = CCL;
  311. i++;
  312. if (*++p == '^') {
  313. mask = '\377';
  314. i++;
  315. p++;
  316. } else
  317. mask = 0;
  318. if (*p == '-') { /* real dash */
  319. i++;
  320. ChSet(*p++);
  321. }
  322. if (*p == ']') { /* real brace */
  323. i++;
  324. ChSet(*p++);
  325. }
  326. while (*p && *p != ']') {
  327. if (*p == '-' && *(p+1) && *(p+1) != ']') {
  328. i++;
  329. p++;
  330. c1 = *(p-2) + 1;
  331. i++;
  332. c2 = *p++;
  333. while (c1 <= c2) {
  334. ChSetWithCase(static_cast<char>(c1++), caseSensitive);
  335. }
  336. } else if (*p == '\\' && *(p+1)) {
  337. i++;
  338. p++;
  339. char escape = escapeValue(*p);
  340. if (escape)
  341. ChSetWithCase(escape, caseSensitive);
  342. else
  343. ChSetWithCase(*p, caseSensitive);
  344. i++;
  345. p++;
  346. } else {
  347. i++;
  348. ChSetWithCase(*p++, caseSensitive);
  349. }
  350. }
  351. if (!*p)
  352. return badpat("Missing ]");
  353. for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
  354. *mp++ = static_cast<char>(mask ^ bittab[n]);
  355. break;
  356. case '*': /* match 0 or more.. */
  357. case '+': /* match 1 or more.. */
  358. if (p == pat)
  359. return badpat("Empty closure");
  360. lp = sp; /* previous opcode */
  361. if (*lp == CLO) /* equivalence.. */
  362. break;
  363. switch(*lp) {
  364. case BOL:
  365. case BOT:
  366. case EOT:
  367. case BOW:
  368. case EOW:
  369. case REF:
  370. return badpat("Illegal closure");
  371. default:
  372. break;
  373. }
  374. if (*p == '+')
  375. for (sp = mp; lp < sp; lp++)
  376. *mp++ = *lp;
  377. *mp++ = END;
  378. *mp++ = END;
  379. sp = mp;
  380. while (--mp > lp)
  381. *mp = mp[-1];
  382. *mp = CLO;
  383. mp = sp;
  384. break;
  385. case '\\': /* tags, backrefs .. */
  386. i++;
  387. switch(*++p) {
  388. case '<':
  389. *mp++ = BOW;
  390. break;
  391. case '>':
  392. if (*sp == BOW)
  393. return badpat("Null pattern inside \\<\\>");
  394. *mp++ = EOW;
  395. break;
  396. case '1':
  397. case '2':
  398. case '3':
  399. case '4':
  400. case '5':
  401. case '6':
  402. case '7':
  403. case '8':
  404. case '9':
  405. n = *p-'0';
  406. if (tagi > 0 && tagstk[tagi] == n)
  407. return badpat("Cyclical reference");
  408. if (tagc > n) {
  409. *mp++ = static_cast<char>(REF);
  410. *mp++ = static_cast<char>(n);
  411. }
  412. else
  413. return badpat("Undetermined reference");
  414. break;
  415. case 'a':
  416. case 'b':
  417. case 'n':
  418. case 'f':
  419. case 'r':
  420. case 't':
  421. case 'v':
  422. *mp++ = CHR;
  423. *mp++ = escapeValue(*p);
  424. break;
  425. default:
  426. if (!posix && *p == '(') {
  427. if (tagc < MAXTAG) {
  428. tagstk[++tagi] = tagc;
  429. *mp++ = BOT;
  430. *mp++ = static_cast<char>(tagc++);
  431. }
  432. else
  433. return badpat("Too many \\(\\) pairs");
  434. } else if (!posix && *p == ')') {
  435. if (*sp == BOT)
  436. return badpat("Null pattern inside \\(\\)");
  437. if (tagi > 0) {
  438. *mp++ = static_cast<char>(EOT);
  439. *mp++ = static_cast<char>(tagstk[tagi--]);
  440. }
  441. else
  442. return badpat("Unmatched \\)");
  443. } else {
  444. *mp++ = CHR;
  445. *mp++ = *p;
  446. }
  447. }
  448. break;
  449. default : /* an ordinary char */
  450. if (posix && *p == '(') {
  451. if (tagc < MAXTAG) {
  452. tagstk[++tagi] = tagc;
  453. *mp++ = BOT;
  454. *mp++ = static_cast<char>(tagc++);
  455. }
  456. else
  457. return badpat("Too many () pairs");
  458. } else if (posix && *p == ')') {
  459. if (*sp == BOT)
  460. return badpat("Null pattern inside ()");
  461. if (tagi > 0) {
  462. *mp++ = static_cast<char>(EOT);
  463. *mp++ = static_cast<char>(tagstk[tagi--]);
  464. }
  465. else
  466. return badpat("Unmatched )");
  467. } else if (caseSensitive) {
  468. *mp++ = CHR;
  469. *mp++ = *p;
  470. } else {
  471. *mp++ = CCL;
  472. mask = 0;
  473. ChSetWithCase(*p, false);
  474. for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
  475. *mp++ = static_cast<char>(mask ^ bittab[n]);
  476. }
  477. break;
  478. }
  479. sp = lp;
  480. }
  481. if (tagi > 0)
  482. return badpat((posix ? "Unmatched (" : "Unmatched \\("));
  483. *mp = END;
  484. sta = OKP;
  485. return 0;
  486. }
  487. /*
  488. * RESearch::Execute:
  489. * execute nfa to find a match.
  490. *
  491. * special cases: (nfa[0])
  492. * BOL
  493. * Match only once, starting from the
  494. * beginning.
  495. * CHR
  496. * First locate the character without
  497. * calling PMatch, and if found, call
  498. * PMatch for the remaining string.
  499. * END
  500. * RESearch::Compile failed, poor luser did not
  501. * check for it. Fail fast.
  502. *
  503. * If a match is found, bopat[0] and eopat[0] are set
  504. * to the beginning and the end of the matched fragment,
  505. * respectively.
  506. *
  507. */
  508. int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
  509. char c;
  510. int ep = NOTFOUND;
  511. char *ap = nfa;
  512. bol = lp;
  513. failure = 0;
  514. Clear();
  515. switch(*ap) {
  516. case BOL: /* anchored: match from BOL only */
  517. ep = PMatch(ci, lp, endp, ap);
  518. break;
  519. case EOL: /* just searching for end of line normal path doesn't work */
  520. if (*(ap+1) == END) {
  521. lp = endp;
  522. ep = lp;
  523. break;
  524. } else {
  525. return 0;
  526. }
  527. case CHR: /* ordinary char: locate it fast */
  528. c = *(ap+1);
  529. while ((lp < endp) && (ci.CharAt(lp) != c))
  530. lp++;
  531. if (lp >= endp) /* if EOS, fail, else fall thru. */
  532. return 0;
  533. default: /* regular matching all the way. */
  534. while (lp < endp) {
  535. ep = PMatch(ci, lp, endp, ap);
  536. if (ep != NOTFOUND)
  537. break;
  538. lp++;
  539. }
  540. break;
  541. case END: /* munged automaton. fail always */
  542. return 0;
  543. }
  544. if (ep == NOTFOUND)
  545. return 0;
  546. bopat[0] = lp;
  547. eopat[0] = ep;
  548. return 1;
  549. }
  550. /*
  551. * PMatch: internal routine for the hard part
  552. *
  553. * This code is partly snarfed from an early grep written by
  554. * David Conroy. The backref and tag stuff, and various other
  555. * innovations are by oz.
  556. *
  557. * special case optimizations: (nfa[n], nfa[n+1])
  558. * CLO ANY
  559. * We KNOW .* will match everything up to the
  560. * end of line. Thus, directly go to the end of
  561. * line, without recursive PMatch calls. As in
  562. * the other closure cases, the remaining pattern
  563. * must be matched by moving backwards on the
  564. * string recursively, to find a match for xy
  565. * (x is ".*" and y is the remaining pattern)
  566. * where the match satisfies the LONGEST match for
  567. * x followed by a match for y.
  568. * CLO CHR
  569. * We can again scan the string forward for the
  570. * single char and at the point of failure, we
  571. * execute the remaining nfa recursively, same as
  572. * above.
  573. *
  574. * At the end of a successful match, bopat[n] and eopat[n]
  575. * are set to the beginning and end of subpatterns matched
  576. * by tagged expressions (n = 1 to 9).
  577. *
  578. */
  /* Failure hook described in the file header; the only calls to it in PMatch are commented out. */
  extern void re_fail(char *msg, char op);
  580. /*
  581. * character classification table for word boundary operators BOW
  582. * and EOW. the reason for not using ctype macros is that we can
  583. * let the user add into our own table. see RESearch::ModifyWord. This table
  584. * is not in the bitset form, since we may wish to extend it in the
  585. * future for other character classifications.
  586. *
  587. * TRUE for 0-9 A-Z a-z _
  588. */
  589. static char chrtyp[MAXCHR] = {
  590. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  591. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  592. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  593. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  594. 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
  595. 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
  596. 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
  597. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  598. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  599. 1, 0, 0, 0, 0, 1, 0, 1, 1, 1,
  600. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  601. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  602. 1, 1, 1, 0, 0, 0, 0, 0
  603. };
  /* Fold a character value into the 7-bit range covered by chrtyp. */
  #define inascii(x) (0177&(x))
  /* Word-character flag for x; expands to an lvalue so ModifyWord can assign to it. */
  #define iswordc(x) chrtyp[inascii(x)]
  /* Non-zero when character y is a member of the bit set starting at x. */
  #define isinset(x,y) ((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
  /*
  * skip values for CLO XXX to skip past the closure
  */
  #define ANYSKIP 2 /* [CLO] ANY END ... */
  #define CHRSKIP 3 /* [CLO] CHR chr END ... */
  /* Derived from BITBLK so it cannot drift from the set size that Compile
     emits and PMatch steps over (34 when BITBLK is 32). */
  #define CCLSKIP ((BITBLK) + 2) /* [CLO] CCL BITBLK-bytes END ... */
  613. int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
  614. int op, c, n;
  615. int e; /* extra pointer for CLO */
  616. int bp; /* beginning of subpat.. */
  617. int ep; /* ending of subpat.. */
  618. int are; /* to save the line ptr. */
  619. while ((op = *ap++) != END)
  620. switch(op) {
  621. case CHR:
  622. if (ci.CharAt(lp++) != *ap++)
  623. return NOTFOUND;
  624. break;
  625. case ANY:
  626. if (lp++ >= endp)
  627. return NOTFOUND;
  628. break;
  629. case CCL:
  630. c = ci.CharAt(lp++);
  631. if (!isinset(ap,c))
  632. return NOTFOUND;
  633. ap += BITBLK;
  634. break;
  635. case BOL:
  636. if (lp != bol)
  637. return NOTFOUND;
  638. break;
  639. case EOL:
  640. if (lp < endp)
  641. return NOTFOUND;
  642. break;
  643. case BOT:
  644. bopat[*ap++] = lp;
  645. break;
  646. case EOT:
  647. eopat[*ap++] = lp;
  648. break;
  649. case BOW:
  650. if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
  651. return NOTFOUND;
  652. break;
  653. case EOW:
  654. if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
  655. return NOTFOUND;
  656. break;
  657. case REF:
  658. n = *ap++;
  659. bp = bopat[n];
  660. ep = eopat[n];
  661. while (bp < ep)
  662. if (ci.CharAt(bp++) != ci.CharAt(lp++))
  663. return NOTFOUND;
  664. break;
  665. case CLO:
  666. are = lp;
  667. switch(*ap) {
  668. case ANY:
  669. while (lp < endp)
  670. lp++;
  671. n = ANYSKIP;
  672. break;
  673. case CHR:
  674. c = *(ap+1);
  675. while ((lp < endp) && (c == ci.CharAt(lp)))
  676. lp++;
  677. n = CHRSKIP;
  678. break;
  679. case CCL:
  680. while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
  681. lp++;
  682. n = CCLSKIP;
  683. break;
  684. default:
  685. failure = true;
  686. //re_fail("closure: bad nfa.", *ap);
  687. return NOTFOUND;
  688. }
  689. ap += n;
  690. while (lp >= are) {
  691. if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
  692. return e;
  693. --lp;
  694. }
  695. return NOTFOUND;
  696. default:
  697. //re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
  698. return NOTFOUND;
  699. }
  700. return lp;
  701. }
  702. /*
  703. * RESearch::ModifyWord:
  704. * add new characters into the word table to change RESearch::Execute's
  705. * understanding of what a word should look like. Note that we
  706. * only accept additions into the word definition.
  707. *
  708. * If the string parameter is 0 or null string, the table is
  709. * reset back to the default containing A-Z a-z 0-9 _. [We use
  710. * the compact bitset representation for the default table]
  711. */
  712. static char deftab[16] = {
  713. 0, 0, 0, 0, 0, 0, '\377', 003, '\376', '\377', '\377', '\207',
  714. '\376', '\377', '\377', 007
  715. };
  716. void RESearch::ModifyWord(char *s) {
  717. int i;
  718. if (!s || !*s) {
  719. for (i = 0; i < MAXCHR; i++)
  720. if (!isinset(deftab,i))
  721. iswordc(i) = 0;
  722. }
  723. else
  724. while(*s)
  725. iswordc(*s++) = 1;
  726. }
  727. /*
  728. * RESearch::Substitute:
  729. * substitute the matched portions of the src in dst.
  730. *
  731. * & substitute the entire matched pattern.
  732. *
  733. * \digit substitute a subpattern, with the given tag number.
  734. * Tags are numbered from 1 to 9. If the particular
  735. * tagged subpattern does not exist, null is substituted.
  736. */
  737. int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
  738. char c;
  739. int pin;
  740. int bp;
  741. int ep;
  742. if (!*src || !bopat[0])
  743. return 0;
  744. while ((c = *src++) != 0) {
  745. switch(c) {
  746. case '&':
  747. pin = 0;
  748. break;
  749. case '\\':
  750. c = *src++;
  751. if (c >= '0' && c <= '9') {
  752. pin = c - '0';
  753. break;
  754. }
  755. default:
  756. *dst++ = c;
  757. continue;
  758. }
  759. if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
  760. while (ci.CharAt(bp) && bp < ep)
  761. *dst++ = ci.CharAt(bp++);
  762. if (bp < ep)
  763. return 0;
  764. }
  765. }
  766. *dst = (char) 0;
  767. return 1;
  768. }