PageRenderTime 35ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 1ms

/src/regex.cc

https://github.com/snmsts/xyzzy
C++ | 2695 lines | 2421 code | 268 blank | 6 comment | 578 complexity | 226b71576c0cab93d6ce542a7404c697 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #include "stdafx.h"
  2. #include "ed.h"
  3. #include "regex.h"
  4. extern u_char char_no_translate_table[];
  5. extern u_char char_translate_upcase_table[];
  // Opcodes of the compiled regexp program stored in a Char buffer.
  // The numeric values are implied by declaration order and are written
  // into compiled patterns, so the order must not change.
  enum
  {
    // Zero-width anchors and single-character matchers.
    BEGLINE = 0,                 // emitted for '^'
    ENDLINE,                     // emitted for '$'
    BEGBUF,                      // emitted for "\`"
    ENDBUF,                      // emitted for "\'"
    ANYCHAR,                     // emitted for '.'

    // Grouping and back references; each is followed by a register number.
    START_SAVE_REGS,
    END_SAVE_REGS,
    BACKREF,

    // Literal run: followed by a count and that many characters.
    NORMAL_CHARS,

    // Alternation; BRANCH/BRANCH_BACKTRACK are followed by a length.
    END_BRANCH,
    BRANCH,
    BRANCH_BACKTRACK,

    // Repetition; each is followed by min, max and a length.
    CLOSURE,
    CLOSURE_BACKTRACK,
    CLOSURE_SIMPLE,
    SHORTEST_CLOSURE,
    SHORTEST_CLOSURE_BACKTRACK,
    SHORTEST_CLOSURE_SIMPLE,

    // Bracket expressions; followed by the encoded character class.
    CHAR_CLASS,
    CHAR_CLASS_NOT,

    // Word-related operators ("\<", "\>", "\b", "\B", "\w", "\W").
    BEGWORD,
    ENDWORD,
    WORDBOUND,
    NOT_WORDBOUND,
    WORDCHAR,
    NOT_WORDCHAR,

    // Syntax-class operators ("\sC", "\SC"); followed by the syntax code.
    SYNTAX_SPEC,
    NOT_SYNTAX_SPEC,

    // Symbol-related operators ("\_<", "\_>", "\_b", "\_B", "\_s", "\_S").
    BEGSYMBOL,
    ENDSYMBOL,
    SYMBOLBOUND,
    NOT_SYMBOLBOUND,
    SYMBOLCHAR,
    NOT_SYMBOLCHAR
  };
  43. #define NBITS (sizeof (Char) * 8)
  44. class charclass
  45. {
  46. Char hi[256 / NBITS];
  47. Char lo[256][256 / NBITS];
  48. static void set (Char *b, int n)
  49. {b[n / NBITS] |= 1 << (n % NBITS);}
  50. static int isset (const Char *b, int n)
  51. {return b[n / NBITS] & (1 << (n % NBITS));}
  52. public:
  53. struct cc
  54. {
  55. int nent;
  56. struct
  57. {
  58. u_char hi;
  59. u_char all;
  60. u_char lower;
  61. u_char upper;
  62. } f[256];
  63. };
  64. charclass () {bzero (hi, sizeof hi);}
  65. void set (Char c)
  66. {
  67. int h = c >> 8;
  68. int l = c & 255;
  69. if (!isset (hi, h))
  70. {
  71. set (hi, h);
  72. bzero (lo[h], sizeof lo[h]);
  73. }
  74. set (lo[h], l);
  75. }
  76. int count_size (cc &) const;
  77. Char *copy (Char *, cc &, int) const;
  78. };
  79. int
  80. charclass::count_size (cc &f) const
  81. {
  82. int size = 1;
  83. f.nent = 0;
  84. for (int h = 0; h < 256; h++)
  85. if (isset (hi, h))
  86. {
  87. int l;
  88. for (l = 0; l < 256 / NBITS && !lo[h][l]; l++)
  89. ;
  90. int u;
  91. for (u = 256 / NBITS - 1; u > l && !lo[h][u]; u--)
  92. ;
  93. u++;
  94. int i;
  95. for (i = l; i < u; i++)
  96. if (lo[h][i] != Char (-1))
  97. break;
  98. size += 2;
  99. if (i < u)
  100. {
  101. size += u - l;
  102. f.f[f.nent].all = 0;
  103. }
  104. else
  105. f.f[f.nent].all = 1;
  106. f.f[f.nent].lower = l;
  107. f.f[f.nent].upper = u;
  108. f.f[f.nent].hi = h;
  109. f.nent++;
  110. }
  111. return size;
  112. }
  113. Char *
  114. charclass::copy (Char *b, cc &f, int size) const
  115. {
  116. *b++ = size;
  117. for (int i = 0; i < f.nent; i++)
  118. {
  119. *b++ = (f.f[i].hi << 8) + f.f[i].all;
  120. *b++ = (f.f[i].upper << 8) + f.f[i].lower;
  121. if (!f.f[i].all)
  122. for (int j = f.f[i].lower; j < f.f[i].upper; j++)
  123. *b++ = lo[f.f[i].hi][j];
  124. }
  125. return b;
  126. }
  127. #define INFINITY (CHAR_LIMIT - 1)
  128. class regexp_compile
  129. {
  130. struct stack
  131. {
  132. int buf;
  133. int remain_branch;
  134. int reg;
  135. int branch_start;
  136. };
  137. enum {MAX_STACK_DEPTH = 10};
  138. stack r_stackb[MAX_STACK_DEPTH];
  139. stack *r_stackp;
  140. int r_allocated;
  141. Char *r_last_start;
  142. Char *r_branch_start;
  143. int r_remain_branch;
  144. Char *r_normal_char;
  145. int r_regnum;
  146. const u_char *r_translate;
  147. const syntax_table *r_syntax_table;
  148. Char *extend_buffer (Char *, int);
  149. Char *branch (Char *);
  150. Char *closure (Char *, int, int, int);
  151. static void error (message_code e) {FEsimple_error (e);}
  152. const Char *char_class (const Char *, const Char *, Char *&);
  153. void char_class_fastmap (const Char *, char *) const;
  154. void char_class_not_fastmap (const Char *, char *) const;
  155. static int check_inner_closure (Char *, Char *);
  156. static int check_postfix (const Char *&, const Char *);
  157. public:
  158. Char *r_buffer;
  159. int r_used;
  160. int r_has_backref;
  161. regexp_compile (const u_char *, const syntax_table *);
  162. regexp_compile (const u_char *, const syntax_table *, int); // XXX
  163. void compile (const Char *, int);
  164. static int match_void_p (const Char *, const Char *);
  165. static int match_bol_p (const Char *, const Char *);
  166. int compile_fastmap (char *, const Char *, const Char *) const;
  167. ~regexp_compile ()
  168. {
  169. if (r_buffer)
  170. xfree (r_buffer);
  171. }
  172. #ifdef DEBUG
  173. void dump () const;
  174. static void dump (const Char *, int);
  175. #endif
  176. };
  177. regexp_compile::regexp_compile (const u_char *translate, const syntax_table *syntax_tab)
  178. : r_translate (translate), r_syntax_table (syntax_tab), r_buffer (0)
  179. {
  180. r_allocated = 128;
  181. r_buffer = (Char *)xmalloc (sizeof (Char) * r_allocated);
  182. r_stackp = r_stackb;
  183. r_last_start = 0;
  184. r_branch_start = r_buffer;
  185. r_remain_branch = 0;
  186. r_normal_char = 0;
  187. r_regnum = 1;
  188. r_has_backref = 0;
  189. }
  190. regexp_compile::regexp_compile (const u_char *translate, const syntax_table *syntax_tab, int)
  191. : r_translate (translate), r_syntax_table (syntax_tab), r_buffer (0)
  192. {
  193. }
  194. Char *
  195. regexp_compile::extend_buffer (Char *b, int req)
  196. {
  197. if (b + req > r_buffer + r_allocated)
  198. {
  199. int size = r_allocated + max (req, 64);
  200. Char *p = (Char *)xrealloc (r_buffer, sizeof (Char) * size);
  201. r_allocated = size;
  202. b = p + (b - r_buffer);
  203. r_branch_start = p + (r_branch_start - r_buffer);
  204. if (r_last_start)
  205. r_last_start = p + (r_last_start - r_buffer);
  206. if (r_normal_char)
  207. r_normal_char = p + (r_normal_char - r_buffer);
  208. r_buffer = p;
  209. }
  210. return b;
  211. }
  212. #ifdef DEBUG
  213. void
  214. regexp_compile::dump (const Char *p0, int size)
  215. {
  216. for (const Char *p = p0, *pe = p + size; p < pe;)
  217. {
  218. printf ("%5d: ", p - p0);
  219. Char c = *p++;
  220. switch (c)
  221. {
  222. case BEGLINE:
  223. printf ("begline");
  224. break;
  225. case ENDLINE:
  226. printf ("endline");
  227. break;
  228. case BEGBUF:
  229. printf ("begbuf");
  230. break;
  231. case ENDBUF:
  232. printf ("endbuf");
  233. break;
  234. case ANYCHAR:
  235. printf ("anychar");
  236. break;
  237. case START_SAVE_REGS:
  238. printf ("start-save-regs: %d", *p++);
  239. break;
  240. case END_SAVE_REGS:
  241. printf ("end-save-regs: %d", *p++);
  242. break;
  243. case BACKREF:
  244. printf ("backref: %d", *p++);
  245. break;
  246. case NORMAL_CHARS:
  247. {
  248. printf ("normal-chars: ");
  249. int i;
  250. for (i = 1; i <= *p; i++)
  251. putchar (p[i]);
  252. p += i;
  253. }
  254. break;
  255. case CHAR_CLASS:
  256. case CHAR_CLASS_NOT:
  257. {
  258. printf (c == CHAR_CLASS ? "char-class: " : "char-class-not: ");
  259. const Char *p2 = p + *p;
  260. for (p++; p < p2;)
  261. {
  262. int u = p[1] >> 8, l = p[1] & 0xff;
  263. printf ("%02x(%d-%d)", *p >> 8, l, u);
  264. if (*p & 0xff)
  265. {
  266. p += 2;
  267. printf ("ALL");
  268. }
  269. else
  270. p += 2 + u - l;
  271. }
  272. }
  273. break;
  274. case END_BRANCH:
  275. printf ("end-branch");
  276. break;
  277. case BRANCH:
  278. printf ("branch: %d", *p++);
  279. break;
  280. case BRANCH_BACKTRACK:
  281. printf ("branch-backtrack: %d", *p++);
  282. break;
  283. case CLOSURE:
  284. printf ("closure: ");
  285. printf ("min: %d ", *p++);
  286. printf ("max: %d ", *p++);
  287. printf ("start: %d ", *p++);
  288. break;
  289. case CLOSURE_BACKTRACK:
  290. printf ("closure-backtrack: ");
  291. printf ("min: %d ", *p++);
  292. printf ("max: %d ", *p++);
  293. printf ("start: %d ", *p++);
  294. break;
  295. case CLOSURE_SIMPLE:
  296. printf ("closure-simple: ");
  297. printf ("min: %d ", *p++);
  298. printf ("max: %d ", *p++);
  299. printf ("start: %d ", *p++);
  300. break;
  301. case SHORTEST_CLOSURE:
  302. printf ("shortest-closure: ");
  303. printf ("min: %d ", *p++);
  304. printf ("max: %d ", *p++);
  305. printf ("start: %d ", *p++);
  306. break;
  307. case SHORTEST_CLOSURE_BACKTRACK:
  308. printf ("shortest-closure-backtrack: ");
  309. printf ("min: %d ", *p++);
  310. printf ("max: %d ", *p++);
  311. printf ("start: %d ", *p++);
  312. break;
  313. case SHORTEST_CLOSURE_SIMPLE:
  314. printf ("shortest-closure-simple: ");
  315. printf ("min: %d ", *p++);
  316. printf ("max: %d ", *p++);
  317. printf ("start: %d ", *p++);
  318. break;
  319. case BEGWORD:
  320. printf ("begword\n");
  321. break;
  322. case ENDWORD:
  323. printf ("endword\n");
  324. break;
  325. case WORDBOUND:
  326. printf ("word-bound\n");
  327. break;
  328. case NOT_WORDBOUND:
  329. printf ("not-word-bound\n");
  330. break;
  331. case WORDCHAR:
  332. printf ("word-char\n");
  333. break;
  334. case NOT_WORDCHAR:
  335. printf ("not-word-char\n");
  336. break;
  337. case SYNTAX_SPEC:
  338. printf ("syntax-spec: %d\n", *p++);
  339. break;
  340. case NOT_SYNTAX_SPEC:
  341. printf ("not-syntax-spec: %d\n", *p++);
  342. break;
  343. case BEGSYMBOL:
  344. printf ("begsymbol\n");
  345. break;
  346. case ENDSYMBOL:
  347. printf ("endsymbol\n");
  348. break;
  349. case SYMBOLBOUND:
  350. printf ("symbol-bound\n");
  351. break;
  352. case NOT_SYMBOLBOUND:
  353. printf ("not-symbol-bound\n");
  354. break;
  355. case SYMBOLCHAR:
  356. printf ("symbol-char\n");
  357. break;
  358. case NOT_SYMBOLCHAR:
  359. printf ("not-symbol-char\n");
  360. break;
  361. default:
  362. printf ("UNKNOWN CODE: %d", c);
  363. break;
  364. }
  365. putchar ('\n');
  366. }
  367. fflush (stdout);
  368. }
  369. void
  370. regexp_compile::dump () const
  371. {
  372. //dump (r_buffer, r_used);
  373. }
  374. #endif /* DEBUG */
  375. int
  376. regexp_compile::check_inner_closure (Char *p, Char *pe)
  377. {
  378. int n = 0;
  379. while (p < pe)
  380. {
  381. switch (*p++)
  382. {
  383. case BEGLINE:
  384. case ENDLINE:
  385. case BEGBUF:
  386. case ENDBUF:
  387. case ANYCHAR:
  388. case BEGWORD:
  389. case ENDWORD:
  390. case WORDBOUND:
  391. case NOT_WORDBOUND:
  392. case WORDCHAR:
  393. case NOT_WORDCHAR:
  394. case END_BRANCH:
  395. case BEGSYMBOL:
  396. case ENDSYMBOL:
  397. case SYMBOLBOUND:
  398. case NOT_SYMBOLBOUND:
  399. case SYMBOLCHAR:
  400. case NOT_SYMBOLCHAR:
  401. break;
  402. case NORMAL_CHARS:
  403. p += *p + 1;
  404. break;
  405. case SYNTAX_SPEC:
  406. case NOT_SYNTAX_SPEC:
  407. case START_SAVE_REGS:
  408. case END_SAVE_REGS:
  409. case BACKREF:
  410. p++;
  411. break;
  412. case CHAR_CLASS:
  413. case CHAR_CLASS_NOT:
  414. p += *p;
  415. break;
  416. case BRANCH:
  417. case BRANCH_BACKTRACK:
  418. do
  419. {
  420. if (check_inner_closure (p + 1, p + *p - 1))
  421. p[-1] = BRANCH_BACKTRACK;
  422. p += *p - 1;
  423. }
  424. while (*p++ != END_BRANCH);
  425. n = 1;
  426. break;
  427. case CLOSURE:
  428. case CLOSURE_BACKTRACK:
  429. case CLOSURE_SIMPLE:
  430. if (check_inner_closure (p + 3, p + p[2] - 1))
  431. p[-1] = CLOSURE_BACKTRACK;
  432. p += p[2] - 1;
  433. n = 1;
  434. break;
  435. case SHORTEST_CLOSURE:
  436. case SHORTEST_CLOSURE_BACKTRACK:
  437. case SHORTEST_CLOSURE_SIMPLE:
  438. if (check_inner_closure (p + 3, p + p[2] - 1))
  439. p[-1] = SHORTEST_CLOSURE_BACKTRACK;
  440. p += p[2] - 1;
  441. n = 1;
  442. break;
  443. }
  444. }
  445. return n;
  446. }
  447. Char *
  448. regexp_compile::branch (Char *b)
  449. {
  450. Char *p = r_branch_start;
  451. memmove (p + 2, p, sizeof (Char) * (b - p));
  452. b += 2;
  453. if (b - p >= CHAR_LIMIT)
  454. error (Eregexp_too_long);
  455. p[0] = BRANCH;
  456. p[1] = b - p;
  457. r_normal_char = 0;
  458. return b;
  459. }
  460. Char *
  461. regexp_compile::closure (Char *b, int min, int max, int shortest)
  462. {
  463. Char *p = r_last_start;
  464. memmove (p + 4, p, sizeof (Char) * (b - p));
  465. b += 4;
  466. Char *endp;
  467. switch (p[4])
  468. {
  469. case NORMAL_CHARS:
  470. endp = p[5] == 1 ? &p[6] + p[5] : 0;
  471. break;
  472. case ANYCHAR:
  473. case WORDCHAR:
  474. case NOT_WORDCHAR:
  475. case SYMBOLCHAR:
  476. case NOT_SYMBOLCHAR:
  477. endp = &p[4] + 1;
  478. break;
  479. case SYNTAX_SPEC:
  480. case NOT_SYNTAX_SPEC:
  481. endp = &p[4] + 2;
  482. break;
  483. case CHAR_CLASS:
  484. case CHAR_CLASS_NOT:
  485. endp = &p[5] + p[5];
  486. break;
  487. default:
  488. endp = 0;
  489. break;
  490. }
  491. if (endp == b)
  492. p[0] = shortest ? SHORTEST_CLOSURE_SIMPLE : CLOSURE_SIMPLE;
  493. else
  494. p[0] = shortest ? SHORTEST_CLOSURE : CLOSURE;
  495. if (b - p >= CHAR_LIMIT)
  496. error (Eregexp_too_long);
  497. p[1] = min;
  498. p[2] = max;
  499. p[3] = b - p;
  500. r_normal_char = 0;
  501. return b;
  502. }
  503. const Char *
  504. regexp_compile::char_class (const Char *p, const Char *pe, Char *&b)
  505. {
  506. r_last_start = b;
  507. if (p == pe)
  508. error (Eunmatched_bracket);
  509. if (*p == '^')
  510. {
  511. *b++ = CHAR_CLASS_NOT;
  512. p++;
  513. }
  514. else
  515. *b++ = CHAR_CLASS;
  516. charclass ccl;
  517. const Char *p0 = p + 1;
  518. while (1)
  519. {
  520. if (p == pe)
  521. error (Eunmatched_bracket);
  522. lChar c = *p++;
  523. if (c == ']' && p != p0)
  524. break;
  525. if (p < pe - 1 && *p == '-' && p[1] != ']')
  526. {
  527. lChar c2 = p[1];
  528. p += 2;
  529. for (; c <= c2; c++)
  530. if (ascii_char_p (c))
  531. ccl.set (Char (r_translate[c]));
  532. else
  533. ccl.set (Char (c));
  534. }
  535. else
  536. if (ascii_char_p (c))
  537. ccl.set (Char (r_translate[c]));
  538. else
  539. ccl.set (Char (c));
  540. }
  541. charclass::cc f;
  542. int size = ccl.count_size (f);
  543. b = extend_buffer (b, size);
  544. Char *b2 = ccl.copy (b, f, size);
  545. assert (b2 == b + size);
  546. b = b2;
  547. return p;
  548. }
  549. inline int
  550. regexp_compile::check_postfix (const Char *&p, const Char *pe)
  551. {
  552. if (p == pe || *p != '?')
  553. return 0;
  554. p++;
  555. return 1;
  556. }
  557. void
  558. regexp_compile::compile (const Char *pattern, int size)
  559. {
  560. const Char *p = pattern, *pe = p + size;
  561. Char *b = r_buffer;
  562. while (p < pe)
  563. {
  564. b = extend_buffer (b, 32);
  565. Char c = *p++;
  566. switch (c)
  567. {
  568. case '^':
  569. if (r_last_start)
  570. goto normal_char;
  571. *b++ = BEGLINE;
  572. break;
  573. case '$':
  574. if (p == pe || (p < pe - 1 && *p == '\\' && (p[1] == '|' || p[1] == ')')))
  575. *b++ = ENDLINE;
  576. else
  577. goto normal_char;
  578. break;
  579. case '.':
  580. r_last_start = b;
  581. *b++ = ANYCHAR;
  582. break;
  583. case '[':
  584. p = char_class (p, pe, b);
  585. break;
  586. case '*':
  587. if (!r_last_start)
  588. goto normal_char;
  589. b = closure (b, 0, INFINITY, check_postfix (p, pe));
  590. break;
  591. case '+':
  592. if (!r_last_start)
  593. goto normal_char;
  594. b = closure (b, 1, INFINITY, check_postfix (p, pe));
  595. break;
  596. case '?':
  597. if (!r_last_start)
  598. goto normal_char;
  599. b = closure (b, 0, 1, check_postfix (p, pe));
  600. break;
  601. case '\\':
  602. if (p == pe)
  603. error (Ere_invalid_pattern);
  604. c = *p++;
  605. switch (c)
  606. {
  607. case '{':
  608. if (!r_last_start)
  609. goto normal_char;
  610. else
  611. {
  612. /*
  613. {M} - x == M
  614. {M,} - x >= M
  615. {,M} - 0 <= x <= M
  616. {M,N} - M <= x <= N */
  617. int minrep, maxrep;
  618. const Char *op = p;
  619. minrep = 0;
  620. while (1)
  621. {
  622. if (p == pe)
  623. error (Ere_unmatched_lbrace);
  624. c = *p++;
  625. if (c < '0' || c > '9')
  626. break;
  627. minrep = minrep * 10 + c - '0';
  628. }
  629. if (c == ',')
  630. {
  631. const Char *op2 = p;
  632. maxrep = 0;
  633. while (1)
  634. {
  635. if (p == pe)
  636. error (Ere_unmatched_lbrace);
  637. c = *p++;
  638. if (c < '0' || c > '9')
  639. break;
  640. maxrep = maxrep * 10 + c - '0';
  641. maxrep = min (maxrep, INFINITY);
  642. }
  643. if (p == op2 + 1)
  644. {
  645. if (p == op + 2)
  646. error (Ere_malformed_repeat_count);
  647. maxrep = INFINITY;
  648. }
  649. }
  650. else
  651. maxrep = minrep;
  652. if (p == op + 1 || c != '\\' || p == pe || *p++ != '}')
  653. error (Ere_malformed_repeat_count);
  654. b = closure (b, minrep, maxrep, check_postfix (p, pe));
  655. }
  656. break;
  657. case '(':
  658. if (r_stackp == r_stackb + MAX_STACK_DEPTH)
  659. error (Ere_nesting_too_deep);
  660. r_stackp->buf = b - r_buffer;
  661. r_stackp->remain_branch = r_remain_branch;
  662. r_stackp->branch_start = r_branch_start - r_buffer;
  663. if (p + 2 <= pe && *p == '?' && p[1] == ':')
  664. {
  665. r_stackp->reg = MAX_REGS;
  666. p += 2;
  667. }
  668. else
  669. {
  670. r_stackp->reg = r_regnum;
  671. if (r_regnum < MAX_REGS)
  672. {
  673. *b++ = START_SAVE_REGS;
  674. *b++ = r_regnum;
  675. }
  676. r_regnum++;
  677. }
  678. r_stackp++;
  679. r_remain_branch = 0;
  680. r_last_start = 0;
  681. r_branch_start = b;
  682. r_normal_char = 0;
  683. break;
  684. case ')':
  685. if (r_stackp == r_stackb)
  686. error (Ere_unmatched_rparen);
  687. if (r_remain_branch)
  688. {
  689. b = branch (b);
  690. *b++ = END_BRANCH;
  691. }
  692. r_stackp--;
  693. r_last_start = r_buffer + r_stackp->buf;
  694. r_remain_branch = r_stackp->remain_branch;
  695. r_branch_start = r_buffer + r_stackp->branch_start;
  696. if (r_stackp->reg < MAX_REGS)
  697. {
  698. *b++ = END_SAVE_REGS;
  699. *b++ = r_stackp->reg;
  700. }
  701. break;
  702. case '|':
  703. *b++ = END_BRANCH;
  704. b = branch (b);
  705. r_branch_start = b;
  706. r_remain_branch = 1;
  707. r_last_start = 0;
  708. break;
  709. case '1':
  710. case '2':
  711. case '3':
  712. case '4':
  713. case '5':
  714. case '6':
  715. case '7':
  716. case '8':
  717. case '9':
  718. {
  719. int n = c - '0';
  720. if (n >= r_regnum)
  721. goto normal_char;
  722. for (stack *p = r_stackb; p < r_stackp; p++)
  723. if (p->reg == n)
  724. goto normal_char;
  725. r_last_start = b;
  726. *b++ = BACKREF;
  727. *b++ = n;
  728. r_has_backref = 1;
  729. }
  730. break;
  731. case '`':
  732. if (r_last_start)
  733. goto normal_char;
  734. *b++ = BEGBUF;
  735. break;
  736. case '\'':
  737. if (p == pe || (p < pe - 1 && *p == '\\' && (p[1] == '|' || p[1] == ')')))
  738. *b++ = ENDBUF;
  739. else
  740. goto normal_char;
  741. break;
  742. case '<':
  743. *b++ = BEGWORD;
  744. break;
  745. case '>':
  746. *b++ = ENDWORD;
  747. break;
  748. case 'b':
  749. *b++ = WORDBOUND;
  750. break;
  751. case 'B':
  752. *b++ = NOT_WORDBOUND;
  753. break;
  754. case 'w':
  755. r_last_start = b;
  756. *b++ = WORDCHAR;
  757. break;
  758. case 'W':
  759. r_last_start = b;
  760. *b++ = NOT_WORDCHAR;
  761. break;
  762. case '_':
  763. if (p == pe)
  764. error (Ere_invalid_pattern);
  765. c = *p++;
  766. switch (c)
  767. {
  768. case '<':
  769. *b++ = BEGSYMBOL;
  770. break;
  771. case '>':
  772. *b++ = ENDSYMBOL;
  773. break;
  774. case 'b':
  775. *b++ = SYMBOLBOUND;
  776. break;
  777. case 'B':
  778. *b++ = NOT_SYMBOLBOUND;
  779. break;
  780. case 's':
  781. r_last_start = b;
  782. *b++ = SYMBOLCHAR;
  783. break;
  784. case 'S':
  785. r_last_start = b;
  786. *b++ = NOT_SYMBOLCHAR;
  787. break;
  788. default:
  789. p--;
  790. c = '_';
  791. goto normal_char;
  792. }
  793. break;
  794. case 's':
  795. if (p == pe)
  796. error (Ere_invalid_pattern);
  797. c = *p++;
  798. if (!ascii_char_p (c) || syntax_spec_table[c] == -1)
  799. error (Einvalid_syntax_spec);
  800. r_last_start = b;
  801. *b++ = SYNTAX_SPEC;
  802. *b++ = syntax_spec_table[c];
  803. break;
  804. case 'S':
  805. if (p == pe)
  806. error (Ere_invalid_pattern);
  807. c = *p++;
  808. if (!ascii_char_p (c) || syntax_spec_table[c] == -1)
  809. error (Einvalid_syntax_spec);
  810. r_last_start = b;
  811. *b++ = NOT_SYNTAX_SPEC;
  812. *b++ = syntax_spec_table[c];
  813. break;
  814. default:
  815. goto normal_char;
  816. }
  817. break;
  818. default:
  819. normal_char:
  820. if (!r_normal_char || r_normal_char + *r_normal_char + 1 != b
  821. || *r_normal_char >= CHAR_LIMIT - 1
  822. || (p < pe && (*p == '*' || *p == '+' || *p == '?'))
  823. || (p < pe - 1 && *p == '\\' && p[1] == '{'))
  824. {
  825. r_last_start = b;
  826. *b++ = NORMAL_CHARS;
  827. r_normal_char = b;
  828. *b++ = 0;
  829. }
  830. *b++ = ascii_char_p (c) ? r_translate[c] : c;
  831. (*r_normal_char)++;
  832. }
  833. }
  834. if (r_stackp != r_stackb)
  835. error (Ere_unmatched_lparen);
  836. if (r_remain_branch)
  837. {
  838. b = branch (b);
  839. *b++ = END_BRANCH;
  840. }
  841. r_used = b - r_buffer;
  842. check_inner_closure (r_buffer, b);
  843. #ifdef DEBUG
  844. dump ();
  845. #endif
  846. }
  847. int
  848. regexp_compile::match_void_p (const Char *p, const Char *pe)
  849. {
  850. while (p < pe)
  851. switch (*p++)
  852. {
  853. case BEGLINE:
  854. case BEGBUF:
  855. break;
  856. case NORMAL_CHARS:
  857. case ANYCHAR:
  858. case CHAR_CLASS:
  859. case CHAR_CLASS_NOT:
  860. case WORDCHAR:
  861. case NOT_WORDCHAR:
  862. case SYNTAX_SPEC:
  863. case NOT_SYNTAX_SPEC:
  864. case SYMBOLCHAR:
  865. case NOT_SYMBOLCHAR:
  866. return 0;
  867. case BEGWORD:
  868. case ENDWORD:
  869. case WORDBOUND:
  870. case NOT_WORDBOUND:
  871. case BEGSYMBOL:
  872. case ENDSYMBOL:
  873. case SYMBOLBOUND:
  874. case NOT_SYMBOLBOUND:
  875. break;
  876. case ENDLINE:
  877. case ENDBUF:
  878. case END_BRANCH:
  879. return 1;
  880. case CLOSURE:
  881. case CLOSURE_BACKTRACK:
  882. case CLOSURE_SIMPLE:
  883. case SHORTEST_CLOSURE:
  884. case SHORTEST_CLOSURE_BACKTRACK:
  885. case SHORTEST_CLOSURE_SIMPLE:
  886. if (*p && !match_void_p (p + 3, p + p[2] - 1))
  887. return 0;
  888. p += p[2] - 1;
  889. break;
  890. case BRANCH:
  891. case BRANCH_BACKTRACK:
  892. {
  893. int f = 0;
  894. do
  895. {
  896. if (match_void_p (p + 1, p + *p - 1))
  897. f = 1;
  898. p += *p - 1;
  899. }
  900. while (*p++ != END_BRANCH);
  901. if (!f)
  902. return 0;
  903. }
  904. break;
  905. case BACKREF:
  906. return 1;
  907. case START_SAVE_REGS:
  908. case END_SAVE_REGS:
  909. p++;
  910. break;
  911. }
  912. return 1;
  913. }
  914. int
  915. regexp_compile::match_bol_p (const Char *p, const Char *pe)
  916. {
  917. while (p < pe)
  918. switch (*p++)
  919. {
  920. default:
  921. return 0;
  922. case BEGLINE:
  923. case BEGBUF:
  924. return 1;
  925. case START_SAVE_REGS:
  926. case END_SAVE_REGS:
  927. p++;
  928. break;
  929. case BRANCH:
  930. case BRANCH_BACKTRACK:
  931. do
  932. {
  933. if (!match_bol_p (p + 1, p + *p - 1))
  934. return 0;
  935. p += *p - 1;
  936. }
  937. while (*p++ != END_BRANCH);
  938. return 1;
  939. case CLOSURE:
  940. case CLOSURE_BACKTRACK:
  941. case CLOSURE_SIMPLE:
  942. case SHORTEST_CLOSURE:
  943. case SHORTEST_CLOSURE_BACKTRACK:
  944. case SHORTEST_CLOSURE_SIMPLE:
  945. return *p && match_bol_p (p + 3, p + p[2] - 1);
  946. }
  947. return 0;
  948. }
  949. void
  950. regexp_compile::char_class_fastmap (const Char *p, char *fastmap) const
  951. {
  952. const Char *pe = p + *p;
  953. for (p++; p < pe;)
  954. {
  955. int h = *p >> 8;
  956. if (*p & 0xff)
  957. {
  958. if (h)
  959. fastmap[h] = p[1] == ((256 / NBITS) << 16) + 0 ? 1 : -1;
  960. else
  961. {
  962. for (int u = (p[1] >> 8) * NBITS, l = (p[1] & 0xff) * NBITS; l < u; l++)
  963. fastmap[l] = 1;
  964. }
  965. p += 2;
  966. }
  967. else
  968. {
  969. int u = p[1] >> 8, l = p[1] & 0xff;
  970. p += 2;
  971. if (h)
  972. fastmap[h] = -1;
  973. else
  974. {
  975. for (int i = u - l - 1; i >= 0; i--)
  976. {
  977. int ii = (i + l) * NBITS;
  978. for (int j = 0; j < NBITS; j++)
  979. if (p[i] & (1 << j))
  980. fastmap[ii + j] = 1;
  981. }
  982. }
  983. p += u - l;
  984. }
  985. }
  986. }
  987. void
  988. regexp_compile::char_class_not_fastmap (const Char *p, char *fastmap) const
  989. {
  990. char tem[256];
  991. bzero (tem, sizeof tem);
  992. char_class_fastmap (p, tem);
  993. for (int i = 0; i < 256; i++)
  994. if (tem[i] <= 0)
  995. fastmap[i] = 1;
  996. }
  997. int
  998. regexp_compile::compile_fastmap (char *fastmap, const Char *p, const Char *pe) const
  999. {
  1000. while (p < pe)
  1001. {
  1002. switch (*p++)
  1003. {
  1004. case BEGLINE:
  1005. case BEGBUF:
  1006. case ENDBUF:
  1007. break;
  1008. case ENDLINE:
  1009. fastmap['\n'] = 1;
  1010. return 1;
  1011. case ANYCHAR:
  1012. {
  1013. int n = fastmap['\n'];
  1014. memset (fastmap, 1, 256);
  1015. fastmap['\n'] = n;
  1016. return 1;
  1017. }
  1018. case START_SAVE_REGS:
  1019. case END_SAVE_REGS:
  1020. p++;
  1021. break;
  1022. case BACKREF:
  1023. return 0;
  1024. case NORMAL_CHARS:
  1025. {
  1026. Char c = p[1];
  1027. if (c < 256)
  1028. fastmap[c] = 1;
  1029. else
  1030. fastmap[c >> 8] = 1;
  1031. return 1;
  1032. }
  1033. case END_BRANCH:
  1034. return 0;
  1035. case BRANCH:
  1036. case BRANCH_BACKTRACK:
  1037. {
  1038. int f = 1;
  1039. do
  1040. {
  1041. if (!compile_fastmap (fastmap, p + 1, p + *p - 1))
  1042. f = 0;
  1043. p += *p - 1;
  1044. }
  1045. while (*p++ != END_BRANCH);
  1046. if (f)
  1047. return 1;
  1048. break;
  1049. }
  1050. case CLOSURE:
  1051. case CLOSURE_BACKTRACK:
  1052. case CLOSURE_SIMPLE:
  1053. case SHORTEST_CLOSURE:
  1054. case SHORTEST_CLOSURE_BACKTRACK:
  1055. case SHORTEST_CLOSURE_SIMPLE:
  1056. if (compile_fastmap (fastmap, p + 3, p + p[2] - 1) && *p)
  1057. return 1;
  1058. p += p[2] - 1;
  1059. break;
  1060. case CHAR_CLASS:
  1061. char_class_fastmap (p, fastmap);
  1062. return 1;
  1063. case CHAR_CLASS_NOT:
  1064. char_class_not_fastmap (p, fastmap);
  1065. return 1;
  1066. case BEGWORD:
  1067. case ENDWORD:
  1068. case WORDBOUND:
  1069. case NOT_WORDBOUND:
  1070. case BEGSYMBOL:
  1071. case ENDSYMBOL:
  1072. case SYMBOLBOUND:
  1073. case NOT_SYMBOLBOUND:
  1074. break;
  1075. case WORDCHAR:
  1076. {
  1077. for (int i = 0; i < 256; i++)
  1078. if (re_syntax_word_p (xchar_syntax (r_syntax_table, i)))
  1079. fastmap[i] = 1;
  1080. return 1;
  1081. }
  1082. case NOT_WORDCHAR:
  1083. {
  1084. for (int i = 0; i < 256; i++)
  1085. if (!re_syntax_word_p (xchar_syntax (r_syntax_table, i)))
  1086. fastmap[i] = 1;
  1087. return 1;
  1088. }
  1089. case SYMBOLCHAR:
  1090. {
  1091. for (int i = 0; i < 256; i++)
  1092. if (re_syntax_symbol_p (xchar_syntax (r_syntax_table, i)))
  1093. fastmap[i] = 1;
  1094. return 1;
  1095. }
  1096. case NOT_SYMBOLCHAR:
  1097. {
  1098. for (int i = 0; i < 256; i++)
  1099. if (!re_syntax_symbol_p (xchar_syntax (r_syntax_table, i)))
  1100. fastmap[i] = 1;
  1101. return 1;
  1102. }
  1103. case SYNTAX_SPEC:
  1104. {
  1105. int n = *p++;
  1106. for (int i = 0; i < 256; i++)
  1107. if (xchar_syntax (r_syntax_table, i) == n)
  1108. fastmap[i] = 1;
  1109. return 1;
  1110. }
  1111. case NOT_SYNTAX_SPEC:
  1112. {
  1113. int n = *p++;
  1114. for (int i = 0; i < 256; i++)
  1115. if (xchar_syntax (r_syntax_table, i) != n)
  1116. fastmap[i] = 1;
  1117. return 1;
  1118. }
  1119. }
  1120. }
  1121. return 0;
  1122. }
  1123. struct re_point: public Point
  1124. {
  1125. point_t p_min;
  1126. point_t p_max;
  1127. int bobp (const Regexp &re) const
  1128. {
  1129. return (p_point == re.last_match ()
  1130. ? re.last_match_char () == lChar_EOF
  1131. : p_point <= p_min);
  1132. }
  1133. int eobp () const {return p_point >= p_max;}
  1134. void forward ()
  1135. {
  1136. if (++p_offset == p_chunk->c_used && p_chunk->c_next)
  1137. {
  1138. p_offset = 0;
  1139. p_chunk = p_chunk->c_next;
  1140. }
  1141. p_point++;
  1142. }
  1143. void backward ()
  1144. {
  1145. if (--p_offset < 0 && p_chunk->c_prev)
  1146. {
  1147. p_chunk = p_chunk->c_prev;
  1148. p_offset = p_chunk->c_used - 1;
  1149. }
  1150. p_point--;
  1151. }
  1152. Char prevch (const Regexp &re) const
  1153. {return (p_point == re.last_match ()
  1154. ? Char (re.last_match_char ())
  1155. : Point::prevch ());}
  1156. Char nextch () const {return ch ();}
  1157. Char getch ()
  1158. {
  1159. Char c = nextch ();
  1160. forward ();
  1161. return c;
  1162. }
  1163. void back (int);
  1164. int nextl ()
  1165. {
  1166. Char *p = p_chunk->c_text;
  1167. for (int i = p_point; i < p_max; i++)
  1168. if (p[i] == '\n')
  1169. {
  1170. p_point = p_offset = i + 1;
  1171. return 1;
  1172. }
  1173. return 0;
  1174. }
  1175. };
  1176. void
  1177. re_point::back (int d)
  1178. {
  1179. Chunk *cp = p_chunk;
  1180. while (p_offset + d < 0)
  1181. {
  1182. d += p_offset + 1;
  1183. p_point -= p_offset + 1;
  1184. cp = cp->c_prev;
  1185. p_offset = cp->c_used - 1;
  1186. }
  1187. p_offset += d;
  1188. p_point += d;
  1189. p_chunk = cp;
  1190. }
  1191. inline int
  1192. Regexp::bobp (const re_point &point) const
  1193. {
  1194. return (point.p_point == last_match ()
  1195. ? last_match_char () == lChar_EOF
  1196. : point.p_point <= range ().p1);
  1197. }
  1198. inline int
  1199. Regexp::eobp (const re_point &point) const
  1200. {
  1201. return point.p_point >= range ().p2;
  1202. }
  1203. void
  1204. Regexp::compile (const Char *p, int size, int use_fastmap)
  1205. {
  1206. regexp_compile re (re_translate, re_syntax_table);
  1207. re.compile (p, size);
  1208. re_pattern = re.r_buffer;
  1209. re.r_buffer = 0;
  1210. re_size = re.r_used;
  1211. re_has_backref = re.r_has_backref;
  1212. if (use_fastmap)
  1213. {
  1214. re_match_bol_p = regexp_compile::match_bol_p (re_pattern, re_pattern + re_size);
  1215. re_match_void_p = regexp_compile::match_void_p (re_pattern, re_pattern + re_size);
  1216. bzero (re_fastmap, sizeof re_fastmap);
  1217. if (!re_match_void_p
  1218. && !re.compile_fastmap (re_fastmap, re_pattern, re_pattern + re_size))
  1219. re_match_void_p = 1;
  1220. }
  1221. }
  1222. void
  1223. Regexp::compile (lisp object, int use_fastmap)
  1224. {
  1225. assert (regexpp (object));
  1226. assert (xregexp_pattern (object));
  1227. re_object = object;
  1228. re_pattern = xregexp_pattern (object);
  1229. re_size = xregexp_length (object);
  1230. re_translate = (xregexp_flags (object) & lregexp::TRANSLATE
  1231. ? char_translate_upcase_table : char_no_translate_table);
  1232. re_match_void_p = xregexp_flags (object) & lregexp::MATCH_VOID;
  1233. re_match_bol_p = xregexp_flags (object) & lregexp::MATCH_BOL;
  1234. re_has_backref = xregexp_flags (object) & lregexp::HAS_BACKREF;
  1235. if (use_fastmap)
  1236. {
  1237. regexp_compile re (re_translate, re_syntax_table, 0);
  1238. bzero (re_fastmap, sizeof re_fastmap);
  1239. if (!re_match_void_p
  1240. && !re.compile_fastmap (re_fastmap, re_pattern, re_pattern + re_size))
  1241. re_match_void_p = 1;
  1242. }
  1243. }
  1244. int
  1245. Regexp::merge_fastmap (lisp object, char *fastmap, const syntax_table *tab)
  1246. {
  1247. if (xregexp_flags (object) & lregexp::MATCH_VOID)
  1248. return 0;
  1249. const u_char *translate = (xregexp_flags (object) & lregexp::TRANSLATE
  1250. ? char_translate_upcase_table
  1251. : char_no_translate_table);
  1252. regexp_compile re (translate, tab, 0);
  1253. char buf[256];
  1254. bzero (buf, sizeof buf);
  1255. if (!re.compile_fastmap (buf, xregexp_pattern (object),
  1256. xregexp_pattern (object) + xregexp_length (object)))
  1257. return 0;
  1258. if (xregexp_flags (object) & lregexp::TRANSLATE)
  1259. for (int u = 'A', l = 'a'; u <= 'Z'; u++, l++)
  1260. buf[u] = buf[l] = buf[u] | buf[l];
  1261. for (int i = 0; i < 256; i++)
  1262. fastmap[i] |= buf[i];
  1263. return 1;
  1264. }
  1265. lisp
  1266. Regexp::make_regexp (lisp source) const
  1267. {
  1268. assert (re_pattern);
  1269. lisp re = ::make_regexp ();
  1270. xregexp_pattern (re) = (Char *)xmalloc (sizeof (Char) * re_size);
  1271. bcopy (re_pattern, xregexp_pattern (re), re_size);
  1272. xregexp_length (re) = re_size;
  1273. xregexp_flags (re) = 0;
  1274. if (re_match_void_p)
  1275. xregexp_flags (re) |= lregexp::MATCH_VOID;
  1276. if (re_match_bol_p)
  1277. xregexp_flags (re) |= lregexp::MATCH_BOL;
  1278. if (re_translate != char_no_translate_table)
  1279. xregexp_flags (re) |= lregexp::TRANSLATE;
  1280. if (re_has_backref)
  1281. xregexp_flags (re) |= lregexp::HAS_BACKREF;
  1282. xregexp_source (re) = source;
  1283. return re;
  1284. }
  1285. int
  1286. Regexp::match (const re_point &point)
  1287. {
  1288. re_failure.init ();
  1289. re_regs.nregs = 0;
  1290. re_point tem = point;
  1291. if (!match (tem, re_pattern, re_pattern + re_size))
  1292. return 0;
  1293. re_regs.start[0] = point.p_point;
  1294. re_regs.end[0] = tem.p_point;
  1295. return 1;
  1296. }
  1297. int
  1298. Regexp::match_char_class (const Char *p, Char c) const
  1299. {
  1300. c = ascii_char_p (c) ? re_translate[c] : c;
  1301. int hi = c >> 8, lo = c & 0xff;
  1302. const Char *pe = p + *p;
  1303. for (p++; p < pe;)
  1304. {
  1305. if (*p & 0xff)
  1306. {
  1307. if ((*p >> 8) == hi)
  1308. {
  1309. int x = lo / NBITS;
  1310. return x >= (p[1] & 0xff) && x < (p[1] >> 8);
  1311. }
  1312. p += 2;
  1313. }
  1314. else
  1315. {
  1316. int u = p[1] >> 8, l = p[1] & 0xff;
  1317. if ((*p >> 8) == hi)
  1318. {
  1319. int x = lo / NBITS;
  1320. if (x < l || x >= u)
  1321. return 0;
  1322. return p[2 + x - l] & (1 << (lo % NBITS));
  1323. }
  1324. p += 2 + u - l;
  1325. }
  1326. }
  1327. return 0;
  1328. }
  1329. inline int
  1330. Regexp::backref (re_point &point, int no) const
  1331. {
  1332. if (no > re_regs.nregs || re_regs.start[no] < 0 || re_regs.end[no] < 0)
  1333. return 0;
  1334. if (re_regs.start[no] > re_regs.end[no] || re_regs.end[no] > point.p_point)
  1335. return 0;
  1336. re_point ref = point;
  1337. ref.back (re_regs.start[no] - ref.p_point);
  1338. for (int n = re_regs.end[no] - re_regs.start[no]; n > 0; n--)
  1339. {
  1340. if (point.eobp ())
  1341. return 0;
  1342. Char c1 = point.getch ();
  1343. if (ascii_char_p (c1))
  1344. c1 = re_translate[c1];
  1345. Char c2 = ref.getch ();
  1346. if (ascii_char_p (c2))
  1347. c2 = re_translate[c2];
  1348. if (c1 != c2)
  1349. return 0;
  1350. }
  1351. return 1;
  1352. }
  1353. static int
  1354. regs_equal1 (const Regexp::sregs &p, const Regexp::sregs &q)
  1355. {
  1356. if (p.nregs != q.nregs)
  1357. return 0;
  1358. for (int i = 1; i <= p.nregs; i++)
  1359. if (p.start[i] != q.start[i] || p.end[i] != q.end[i])
  1360. return 0;
  1361. return 1;
  1362. }
  1363. static void
  1364. copy_regs1 (Regexp::sregs &d, const Regexp::sregs &s)
  1365. {
  1366. d.nregs = s.nregs;
  1367. for (int i = 1; i <= d.nregs; i++)
  1368. {
  1369. d.start[i] = s.start[i];
  1370. d.end[i] = s.end[i];
  1371. }
  1372. }
// Match an alternation and keep the best overall result.
//
// P points at the length word of the first alternative; P[-1] is that
// alternative's opcode (BRANCH or BRANCH_BACKTRACK).  The body of an
// alternative is [P + 1, P + *P - 1).  PE is the end of the pattern.
// Every alternative is tried from the same starting POINT with the same
// starting registers; after an alternative matches, the rest of the
// pattern (from PREST to PE) must match as well.  Among all successful
// combinations the one ending furthest is kept; ties are broken in favour
// of the alternative that itself ended later.
// On success POINT and re_regs are set to the winning result and 1 is
// returned; otherwise 0 is returned.
inline int
Regexp::branch (re_point &point, const Char *p, const Char *pe)
{
  sregs save_regs;
  sregs match_regs;
  re_point longest;
  longest.p_point = -1;         // -1 means "nothing matched so far"
  point_t longest_end = -1;
  // Walk over the alternatives until the END_BRANCH opcode is found;
  // PREST is then the first pattern word after the whole alternation.
  const Char *prest = p;
  do
    prest += *prest - 1;
  while (*prest++ != END_BRANCH);
  copy_regs1 (save_regs, re_regs);
  do
    {
      // Each alternative starts from the original point and registers.
      re_point tem = point;
      copy_regs1 (re_regs, save_regs);
      if (p[-1] == BRANCH_BACKTRACK)
        {
          if (match (tem, p + 1, p + *p - 1))
            {
              point_t end = tem.p_point;
              while (1)
                {
                  if (match (tem, prest, pe))
                    {
                      if (tem.p_point > longest.p_point
                          || (tem.p_point == longest.p_point
                              && end > longest_end))
                        {
                          longest = tem;
                          longest_end = end;
                          copy_regs1 (match_regs, re_regs);
                        }
                    }
                  // Retry the alternative with an ever smaller upper
                  // limit (p_max) to find shorter ways it can match.
                  do
                    {
                      if (--end < point.p_point)
                        goto branch_backtrack_end;
                      tem = point;
                      tem.p_max = end;
                      copy_regs1 (re_regs, save_regs);
                    }
                  while (!match (tem, p + 1, p + *p - 1));
                  // The rest of the pattern sees the real limit again.
                  tem.p_max = point.p_max;
                }
            branch_backtrack_end:;
            }
        }
      else
        {
          // Plain BRANCH: the alternative is tried once only.
          if (match (tem, p + 1, p + *p - 1))
            {
              point_t end = tem.p_point;
              if (match (tem, prest, pe))
                {
                  if (tem.p_point > longest.p_point
                      || (tem.p_point == longest.p_point
                          && end > longest_end))
                    {
                      longest = tem;
                      longest_end = end;
                      copy_regs1 (match_regs, re_regs);
                    }
                }
            }
        }
      // Advance to the length word of the next alternative.
      p += *p;
    }
  while (p < prest);
  if (longest.p_point == -1)
    return 0;
  point = longest;
  copy_regs1 (re_regs, match_regs);
  return 1;
}
  1449. void
  1450. Regexp::start_save_regs (int n, point_t point)
  1451. {
  1452. if (n > re_regs.nregs)
  1453. {
  1454. for (int i = re_regs.nregs + 1; i < n; i++)
  1455. {
  1456. re_regs.start[i] = -1;
  1457. re_regs.end[i] = -1;
  1458. }
  1459. re_regs.nregs = n;
  1460. }
  1461. re_regs.start[n] = point;
  1462. re_regs.end[n] = -1;
  1463. }
// Close capture register N by recording POINT as its end position.
inline void
Regexp::end_save_regs (int n, point_t point)
{
  re_regs.end[n] = point;
}
  1469. inline int
  1470. Regexp::repeat_max (Char n)
  1471. {
  1472. return n == INFINITY ? INT_MAX - 1 : n;
  1473. }
// Match a repeated sub-pattern followed by the rest of the pattern.
//
// At P the pattern holds: minimum count, maximum count (see repeat_max),
// and a length word from which PREST, the start of the pattern following
// the closure, is derived.  The repeated body is [P, PREST) once P has
// been advanced past these three words; PE is the end of the pattern.
// If SHORTEST is non-zero the first repetition count (lowest first) that
// lets the rest match wins; otherwise the result whose rest ends furthest
// is kept, with later repetition counts winning ties.
// On success POINT and re_regs describe the chosen match and 1 is
// returned; otherwise 0.
inline int
Regexp::closure (re_point &point, const Char *p, const Char *pe, int shortest)
{
  sregs save_regs;
  sregs match_regs;
  re_point longest;
  longest.p_point = -1;         // -1 means "nothing matched so far"
  int nmatches = 0;
  const int nmin = *p++;
  const int nmax = repeat_max (*p++);
  const Char *const prest = p + *p - 3;
  p++;
  while (1)
    {
      if (nmatches >= nmin)
        {
          // Enough repetitions: see whether the rest matches from here.
          re_point tem = point;
          copy_regs1 (save_regs, re_regs);
          if (match (tem, prest, pe))
            {
              if (shortest)
                {
                  point = tem;
                  return 1;
                }
              if (tem.p_point >= longest.p_point)
                {
                  longest = tem;
                  copy_regs1 (match_regs, re_regs);
                }
            }
          // Undo register changes made while trying the rest.
          copy_regs1 (re_regs, save_regs);
        }
      if (nmatches >= nmax)
        break;
      point_t opoint = point.p_point;
      if (!match (point, p, prest))
        break;
      if (point.p_point == opoint)
        // The body matched without consuming anything; jump the counter
        // to the maximum so that the loop ends after one more try of the
        // rest.
        nmatches = nmax;
      else
        nmatches++;
    }
  if (longest.p_point == -1)
    return 0;
  point = longest;
  copy_regs1 (re_regs, match_regs);
  return 1;
}
// Start with an empty failure cache: the next free entry is the first
// element of the entry buffer and every hash bucket is cleared.
Regexp::record_failure::record_failure ()
     : m_ep (m_entbuf)
{
  bzero (m_tab, sizeof m_tab);
}
// Combine the pattern address PAT, the position POINT and the limit MAX
// into a hash value.  Callers reduce it modulo TABSIZE to pick a bucket.
inline u_int
Regexp::record_failure::hashval (const Char *pat, point_t point, point_t max)
{
  return (pointer_t (pat) << 24) + point + (max << 12);
}
  1533. inline void
  1534. Regexp::record_failure::init ()
  1535. {
  1536. if (m_ep != m_entbuf)
  1537. {
  1538. m_ep = m_entbuf;
  1539. bzero (m_tab, sizeof m_tab);
  1540. }
  1541. }
  1542. inline int
  1543. Regexp::record_failure::find (const Char *pat, point_t point, point_t max) const
  1544. {
  1545. for (const ent *e = m_tab[hashval (pat, point, max) % TABSIZE]; e; e = e->cdr)
  1546. if (e->point == point && e->max == max && e->pat == pat)
  1547. return 1;
  1548. return 0;
  1549. }
  1550. inline void
  1551. Regexp::record_failure::add (const Char *pat, const re_point &point)
  1552. {
  1553. if (m_ep != m_entbuf + numberof (m_entbuf))
  1554. {
  1555. u_int h = hashval (pat, point.p_point, point.p_max) % TABSIZE;
  1556. m_ep->pat = pat;
  1557. m_ep->point = point.p_point;
  1558. m_ep->max = point.p_max;
  1559. m_ep->cdr = m_tab[h];
  1560. m_tab[h] = m_ep;
  1561. m_ep++;
  1562. }
  1563. }
// Growable list of matcher states used by Regexp::closure_backtrack.
//
// Storage starts in the embedded array b_initstack and moves to the heap
// when more than STACK_GROW entries are needed (see push); the destructor
// releases the heap block if one was allocated.  All members are public
// and are accessed directly by closure_backtrack.
class backtrack_stack
{
public:
  // MAX_STACK: capacity at which push reports an error instead of growing.
  // STACK_GROW: initial capacity and growth increment, in entries.
  enum {MAX_STACK = 4096, STACK_GROW = 256};
  // One saved state: capture registers, position, and a flag supplied by
  // the caller of push.
  struct stack
    {
      Regexp::sregs regs;
      re_point point;
      int match_void;
    };
  stack b_initstack[STACK_GROW];   // embedded initial storage
  stack *b_stack;                  // current storage (b_initstack or heap)
  int b_used;                      // number of valid entries
  int b_allocated;                 // capacity of b_stack in entries
  backtrack_stack () : b_stack (b_initstack), b_used (0), b_allocated (STACK_GROW) {}
  ~backtrack_stack ()
    {if (b_stack != b_initstack) xfree (b_stack);}
  int push (const re_point &, const Regexp::sregs &, int);
  // Forget all entries; the storage is kept.
  void clear () {b_used = 0;}
  int match (const re_point &, const Regexp::sregs &, int, int) const;
};
  1585. int
  1586. backtrack_stack::match (const re_point &point, const Regexp::sregs &regs,
  1587. int match_void, int has_backref) const
  1588. {
  1589. for (int i = 0; i < b_used; i++)
  1590. if (point.p_point == b_stack[i].point.p_point
  1591. && match_void == b_stack[i].match_void
  1592. && (!has_backref || regs_equal1 (regs, b_stack[i].regs)))
  1593. return 1;
  1594. return 0;
  1595. }
  1596. int
  1597. backtrack_stack::push (const re_point &point, const Regexp::sregs &regs,
  1598. int match_void)
  1599. {
  1600. if (b_used == b_allocated)
  1601. {
  1602. if (b_allocated >= MAX_STACK)
  1603. FEsimple_error (Ecomplex_regexp);
  1604. b_allocated += STACK_GROW;
  1605. if (b_stack == b_initstack)
  1606. {
  1607. b_stack = (stack *)xmalloc (sizeof *b_stack * b_allocated);
  1608. memcpy (b_stack, b_initstack, sizeof b_initstack);
  1609. }
  1610. else
  1611. b_stack = (stack *)xrealloc (b_stack, sizeof *b_stack * b_allocated);
  1612. }
  1613. copy_regs1 (b_stack[b_used].regs, regs);
  1614. b_stack[b_used].point = point;
  1615. b_stack[b_used].match_void = match_void;
  1616. b_used++;
  1617. return match_void;
  1618. }
// Set of unsigned values used by Regexp::closure_backtrack to remember
// offsets it has already processed.
//
// Values below BITMAP_SIZE are kept in a bitmap; larger values are
// appended to a heap array that grows in steps of BUF_GROW (see add).
class state_buf
{
  enum {BITMAP_SIZE = 4096};                 // number of values held as bits
  // MAX_BUF: capacity at which add reports an error instead of growing.
  // BUF_GROW: growth increment of s_buf, in entries.
  enum {MAX_BUF = 4096, BUF_GROW = 128};
  u_char s_bitmap[BITMAP_SIZE / CHAR_BIT];   // one bit per small value
  u_long *s_buf;                             // overflow list (may be null)
  int s_used;                                // entries used in s_buf
  int s_allocated;                           // capacity of s_buf
public:
  state_buf () : s_buf (0), s_used (0), s_allocated (0)
    {bzero (s_bitmap, sizeof s_bitmap);}
  ~state_buf () {xfree (s_buf);}
  void add (u_long);
  int test (u_long) const;
};
  1634. void
  1635. state_buf::add (u_long x)
  1636. {
  1637. if (x < BITMAP_SIZE)
  1638. s_bitmap[x / CHAR_BIT] |= 1 << (x % CHAR_BIT);
  1639. else
  1640. {
  1641. if (s_used == s_allocated)
  1642. {
  1643. if (s_allocated >= MAX_BUF)
  1644. FEsimple_error (Ecomplex_regexp);
  1645. s_allocated += BUF_GROW;
  1646. s_buf = (u_long *)xrealloc (s_buf, sizeof *s_buf * s_allocated);
  1647. }
  1648. s_buf[s_used++] = x;
  1649. }
  1650. }
  1651. int
  1652. state_buf::test (u_long x) const
  1653. {
  1654. if (x < BITMAP_SIZE)
  1655. return s_bitmap[x / CHAR_BIT] & (1 << (x % CHAR_BIT));
  1656. for (int i = 0; i < s_used; i++)
  1657. if (x == s_buf[i])
  1658. return 1;
  1659. return 0;
  1660. }
  1661. inline int
  1662. Regexp::compare_regs (const sregs &r1, const sregs &r2)
  1663. {
  1664. int i;
  1665. for (i = 1; i <= r1.nregs; i++)
  1666. {
  1667. if (i > r2.nregs)
  1668. return 1;
  1669. int l1 = r1.end[i] - r1.start[i];
  1670. int l2 = r2.end[i] - r2.start[i];
  1671. if (l1 != l2)
  1672. return l1 - l2;
  1673. if (r1.start[i] != r2.start[i])
  1674. return r1.start[i] - r2.start[i];
  1675. }
  1676. return i <= r2.nregs ? -1 : 0;
  1677. }
// Match a repeated sub-pattern whose body can itself match in several
// ways, followed by the rest of the pattern.
//
// The pattern layout at P is the same as for closure(): minimum count,
// maximum count, a length word giving PREST, then the body [P, PREST).
// Instead of following a single path, the function keeps a set of states
// (position plus registers) reached after a given number of repetitions in
// FSTACK and builds the set for one more repetition in TSTACK.
// If SHORTEST is non-zero the first state from which the rest matches
// wins; otherwise the result ending furthest is kept, ties being decided
// by compare_regs.
// On success POINT and re_regs describe the chosen match and 1 is
// returned; otherwise 0.
inline int
Regexp::closure_backtrack (re_point &point, const Char *p, const Char *pe,
                           int shortest)
{
  backtrack_stack stack_1, stack_2;
  backtrack_stack *fstack = &stack_1, *tstack = &stack_2;
  state_buf state;              // offsets from BEG that were expanded already
  sregs match_regs;
  re_point longest;
  longest.p_point = -1;         // -1 means "nothing matched so far"
  point_t beg = point.p_point;
  int nmatches = 0;
  const int nmin = *p++;
  const int nmax = repeat_max (*p++);
  const Char *const prest = p + *p - 3;
  p++;
  // The only state after zero repetitions is the starting point.
  fstack->push (point, re_regs, 0);
  int match_void = 0;
  while (1)
    {
      // Phase 1: try the rest of the pattern from the current states.
      // Before the minimum count is reached only states flagged
      // match_void are tried.
      if (nmatches >= nmin || match_void)
        for (int i = fstack->b_used - 1; i >= 0; i--)
          if (nmatches >= nmin || fstack->b_stack[i].match_void)
            {
              re_point tem = fstack->b_stack[i].point;
              copy_regs1 (re_regs, fstack->b_stack[i].regs);
              if (match (tem, prest, pe))
                {
                  if (shortest)
                    {
                      point = tem;
                      return 1;
                    }
                  if (tem.p_point > longest.p_point
                      || (tem.p_point == longest.p_point
                          && compare_regs (re_regs, match_regs) > 0))
                    {
                      longest = tem;
                      copy_regs1 (match_regs, re_regs);
                    }
                }
              else if (!re_has_backref)
                // Cache the failure of the rest at this position; this is
                // done only for patterns without back references.
                re_failure.add (prest, fstack->b_stack[i].point);
            }
      if (nmatches >= nmax)
        break;
      // Phase 2: a state whose offset was already expanded in an earlier
      // round is flagged match_void, which excludes it from phase 3.
      for (int i = 0; i < fstack->b_used; i++)
        if (!fstack->b_stack[i].match_void)
          {
            int l = fstack->b_stack[i].point.p_point - beg;
            if (state.test (l))
              fstack->b_stack[i].match_void = 1;
            else
              state.add (l);
          }
      match_void = 0;
      tstack->clear ();
      // Phase 3: run the body once more from every remaining state and
      // collect the resulting states in TSTACK.
      for (int i = 0; i < fstack->b_used; i++)
        if (!fstack->b_stack[i].match_void)
          {
            point = fstack->b_stack[i].point;
            point_t opoint = point.p_point;
            copy_regs1 (re_regs, fstack->b_stack[i].regs);
            if (match (point, p, prest))
              {
                // Skip results from which the rest is already known to
                // fail (the cache is not used with back references).
                if (re_has_backref || !re_failure.find (prest, point.p_point, point.p_max))
                  {
                    // EQ: the body matched without consuming anything.
                    int eq = point.p_point == opoint;
                    if (!fstack->match (point, re_regs, eq, re_has_backref)
                        && !tstack->match (point, re_regs, eq, re_has_backref))
                      match_void |= tstack->push (point, re_regs, eq);
                  }
                // Retry the body with an ever smaller upper limit to find
                // the shorter ways in which it can match.
                point_t omax = point.p_max;
                point_t end = point.p_point;
                while (--end >= fstack->b_stack[i].point.p_point)
                  {
                    point = fstack->b_stack[i].point;
                    point.p_max = end;
                    copy_regs1 (re_regs, fstack->b_stack[i].regs);
                    if (match (point, p, prest)
                        && (re_has_backref
                            || !re_failure.find (prest, point.p_point, omax)))
                      {
                        int eq = point.p_point == opoint;
                        if (!fstack->match (point, re_regs, eq, re_has_backref)
                            && !tstack->match (point, re_regs, eq, re_has_backref))
                          {
                            // Store the state with the real limit restored.
                            point.p_max = omax;
                            match_void |= tstack->push (point, re_regs, eq);
                          }
                      }
                    // NOTE(review): QUIT is defined outside this chunk;
                    // presumably it lets the user interrupt a long-running
                    // match - confirm.
                    QUIT;
                  }
              }
          }
      if (!tstack->b_used)
        break;
      nmatches++;
      swap (fstack, tstack);
    }
  if (longest.p_point == -1)
    return 0;
  point = longest;
  copy_regs1 (re_regs, match_regs);
  return 1;
}
// Greedy repetition of a body that matches exactly one character,
// followed by the rest of the pattern.
//
// The pattern layout at P is: minimum count, maximum count, a length word
// giving PREST, then the single-character opcode.  The maximum is clamped
// to the number of characters left before POINT.p_max.  The function first
// consumes as many characters as the opcode accepts (up to the maximum)
// and then backs off one character at a time, down to the minimum, trying
// the rest of the pattern (PREST..PE) at each position.
// On success POINT and re_regs describe the chosen match and 1 is
// returned; otherwise 0.
int
Regexp::simple_closure (re_point &point, const Char *p, const Char *pe)
{
  const int nmin = *p++;
  const int nmax = min (repeat_max (*p++),
                        int (point.p_max - point.p_point));
  int nregs = re_regs.nregs;    // register count to restore before each try
  int nmatches = 0;
  const Char *const prest = p + *p - 3;
  p++;
  // Greedy scan.  Every case advances POINT while the next character is
  // accepted and leaves the number of consumed characters in NMATCHES.
  switch (*p)
    {
    case ANYCHAR:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (point.nextch () == '\n')
            break;
          point.forward ();
        }
      break;
    case NORMAL_CHARS:
      // p[2] is the literal to compare with; in match() the word after
      // NORMAL_CHARS is the character count, so p[1] is that count here.
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          Char cc = point.nextch ();
          if (ascii_char_p (cc))
            cc = re_translate[cc];
          if (cc != p[2])
            break;
          point.forward ();
        }
      break;
    case WORDCHAR:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (!syntax_word_p (point.nextch ()))
            break;
          point.forward ();
        }
      break;
    case NOT_WORDCHAR:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (syntax_word_p (point.nextch ()))
            break;
          point.forward ();
        }
      break;
    case SYMBOLCHAR:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (!syntax_symbol_p (point.nextch ()))
            break;
          point.forward ();
        }
      break;
    case NOT_SYMBOLCHAR:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (syntax_symbol_p (point.nextch ()))
            break;
          point.forward ();
        }
      break;
    case SYNTAX_SPEC:
      // p[1] is the syntax code the character must have.
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (char_syntax (point.nextch ()) != p[1])
            break;
          point.forward ();
        }
      break;
    case NOT_SYNTAX_SPEC:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (char_syntax (point.nextch ()) == p[1])
            break;
          point.forward ();
        }
      break;
    case CHAR_CLASS:
      // The class data starts right after the opcode.
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (!match_char_class (p + 1, point.nextch ()))
            break;
          point.forward ();
        }
      break;
    case CHAR_CLASS_NOT:
      for (nmatches = 0; nmatches < nmax; nmatches++)
        {
          if (match_char_class (p + 1, point.nextch ()))
            break;
          point.forward ();
        }
      break;
    }
  if (nmatches < nmin)
    return 0;
  sregs match_regs;
  re_point longest;
  longest.p_point = -1;         // -1 means "nothing matched so far"
  // Back off: try the rest at the current position, then give back one
  // character at a time until the minimum count is reached.
  while (1)
    {
      re_regs.nregs = nregs;
      re_point tem = point;
      if (match (tem, prest, pe) && tem.p_point > longest.p_point)
        {
          // A match that reaches the limit, or a closure that ends the
          // pattern, is accepted at once without backing off further.
          if (tem.p_point == tem.p_max || prest == pe)
            {
              point = tem;
              return 1;
            }
          longest = tem;
          copy_regs1 (match_regs, re_regs);
        }
      if (nmatches == nmin)
        break;
      nmatches--;
      point.backward ();
    }
  if (longest.p_point == -1)
    return 0;
  point = longest;
  copy_regs1 (re_regs, match_regs);
  return 1;
}
// Non-greedy repetition of a body that matches exactly one character,
// followed by the rest of the pattern.
//
// The pattern layout at P is the same as for simple_closure().  Starting
// with the minimum count, the rest of the pattern (PREST..PE) is tried
// after every repetition; the first success wins.  Between tries one more
// character is consumed if the opcode accepts it.
// Returns 1 with POINT at the end of the match, or 0 when the maximum
// count is reached or the next character is not accepted.
int
Regexp::shortest_simple_closure (re_point &point, const Char *p, const Char *pe)
{
  const int nmin = *p++;
  const int nmax = min (repeat_max (*p++),
                        int (point.p_max - point.p_point));
  int nregs = re_regs.nregs;    // register count to restore before each try
  const Char *const prest = p + *p - 3;
  p++;
  for (int nmatches = 0;; nmatches++)
    {
      if (nmatches >= nmin)
        {
          re_regs.nregs = nregs;
          re_point tem = point;
          if (match (tem, prest, pe))
            {
              point = tem;
              return 1;
            }
        }
      if (nmatches >= nmax)
        return 0;
      // Check whether the opcode accepts one more character.
      Char cc = point.nextch ();
      switch (*p)
        {
        case ANYCHAR:
          if (cc == '\n')
            return 0;
          break;
        case NORMAL_CHARS:
          // p[2] is the literal to compare with.
          if (ascii_char_p (cc))
            cc = re_translate[cc];
          if (cc != p[2])
            return 0;
          break;
        case WORDCHAR:
          if (!syntax_word_p (cc))
            return 0;
          break;
        case NOT_WORDCHAR:
          if (syntax_word_p (cc))
            return 0;
          break;
        case SYMBOLCHAR:
          if (!syntax_symbol_p (cc))
            return 0;
          break;
        case NOT_SYMBOLCHAR:
          if (syntax_symbol_p (cc))
            return 0;
          break;
        case SYNTAX_SPEC:
          // p[1] is the syntax code the character must have.
          if (char_syntax (cc) != p[1])
            return 0;
          break;
        case NOT_SYNTAX_SPEC:
          if (char_syntax (cc) == p[1])
            return 0;
          break;
        case CHAR_CLASS:
          // The class data starts right after the opcode.
          if (!match_char_class (p + 1, cc))
            return 0;
          break;
        case CHAR_CLASS_NOT:
          if (match_char_class (p + 1, cc))
            return 0;
          break;
        }
      point.forward ();
    }
}
// Interpret the compiled pattern [P, PE) at POINT.
//
// Each opcode either consumes input / checks a condition and lets the loop
// continue, or (branches and closures) hands the remainder of the pattern
// to a helper whose result is returned directly.  END_BRANCH ends the
// current alternative successfully.
// Returns 1 when the pattern matched, with POINT advanced past the match,
// and 0 otherwise.  On failure POINT may have been advanced over the
// characters that were read.
int
Regexp::match (re_point &point, const Char *p, const Char *pe)
{
  while (p < pe)
    {
      Char re = *p++;
      switch (re)
        {
        // --- line and buffer anchors -------------------------------
        case BEGLINE:
          if (!bobp (point) && point.prevch (*this) != '\n')
            return 0;
          break;
        case ENDLINE:
          if (!eobp (point) && point.nextch () != '\n')
            return 0;
          break;
        case BEGBUF:
          if (!bobp (point))
            return 0;
          break;
        case ENDBUF:
          if (!eobp (point))
            return 0;
          break;
        // --- single characters and literals ------------------------
        case ANYCHAR:
          // Any character except newline.
          if (point.eobp () || point.getch () == '\n')
            return 0;
          break;
        case NORMAL_CHARS:
          {
            // Operand: character count followed by that many characters.
            int n = *p++;
            for (int i = 0; i < n; i++)
              {
                if (point.eobp ())
                  return 0;
                Char c = point.getch ();
                if (ascii_char_p (c))
                  c = re_translate[c];
                if (c != *p++)
                  return 0;
              }
            break;
          }
        // --- word boundaries and word characters -------------------
        case BEGWORD:
          // Previous character is not a word character (or there is
          // none) and the next one is.
          if ((point.bobp (*this) || !syntax_word_p (point.prevch (*this)))
              && (!point.eobp () && syntax_word_p (point.nextch ())))
            break;
          return 0;
        case ENDWORD:
          // Previous character is a word character and the next one is
          // not (or there is none).
          if ((!point.bobp (*this) && syntax_word_p (point.prevch (*this)))
              && (point.eobp () || !syntax_word_p (point.nextch ())))
            break;
          return 0;
        case WORDBOUND:
          // Exactly one side is a word character.
          if ((point.bobp (*this) || !syntax_word_p (point.prevch (*this)))
              != (point.eobp () || !syntax_word_p (point.nextch ())))
            break;
          return 0;
        case NOT_WORDBOUND:
          // Fails outright when there is no character on either side.
          if (point.bobp (*this) && point.eobp ())
            return 0;
          if ((point.bobp (*this) || !syntax_word_p (point.prevch (*this)))
              == (point.eobp () || !syntax_word_p (point.nextch ())))
            break;
          return 0;
        case WORDCHAR:
          if (point.eobp () || !syntax_word_p (point.getch ()))
            return 0;
          break;
        case NOT_WORDCHAR:
          if (point.eobp () || syntax_word_p (point.getch ()))
            return 0;
          break;
        // --- symbol boundaries and symbol characters ---------------
        // Same structure as the word cases, using syntax_symbol_p.
        case BEGSYMBOL:
          if ((point.bobp (*this)
               || !syntax_symbol_p (point.prevch (*this)))
              && (!point.eobp ()
                  && syntax_symbol_p (point.nextch ())))
            break;
          return 0;
        case ENDSYMBOL:
          if ((!point.bobp (*this)
               && syntax_symbol_p (point.prevch (*this)))
              && (point.eobp ()
                  || !syntax_symbol_p (point.nextch ())))
            break;
          return 0;
        case SYMBOLBOUND:
          if ((point.bobp (*this)
               || !syntax_symbol_p (point.prevch (*this)))
              != (point.eobp ()
                  || !syntax_symbol_p (point.nextch ())))
            break;
          return 0;
        case NOT_SYMBOLBOUND:
          if (point.bobp (*this) && point.eobp ())
            return 0;
          if ((point.bobp (*this)
               || !syntax_symbol_p (point.prevch (*this)))
              == (point.eobp ()
                  || !syntax_symbol_p (point.nextch ())))
            break;
          return 0;
        case SYMBOLCHAR:
          if (point.eobp () || !syntax_symbol_p (point.getch ()))
            return 0;
          break;
        case NOT_SYMBOLCHAR:
          if (point.eobp () || syntax_symbol_p (point.getch ()))
            return 0;
          break;
        // --- syntax code tests; operand: the syntax code -----------
        case SYNTAX_SPEC:
          {
            if (point.eobp ())
              return 0;
            if (char_syntax (point.getch ()) != *p++)
              return 0;
            break;
          }
        case NOT_SYNTAX_SPEC:
          {
            if (point.eobp ())
              return 0;
            if (char_syntax (point.getch ()) == *p++)
              return 0;
            break;
          }
        // --- capture registers; operand: the register number -------
        case START_SAVE_REGS:
          start_save_regs (*p++, point.p_point);
          break;
        case END_SAVE_REGS:
          end_save_regs (*p++, point.p_point);
          break;
        // --- character classes; *p is the length of the class data -
        case CHAR_CLASS:
          if (point.eobp () || !match_char_class (p, point.getch ()))
            return 0;
          p += *p;
          break;
        case CHAR_CLASS_NOT:
          if (point.eobp () || match_char_class (p, point.getch ()))
            return 0;
          p += *p;
          break;
        // --- back reference; operand: the register number ----------
        case BACKREF:
          if (!backref (point, *p++))
            return 0;
          break;
        // --- control opcodes ---------------------------------------
        case END_BRANCH:
          return 1;
        // The helpers below match the construct AND the remainder of the
        // pattern up to PE, so their result is the final result.
        case BRANCH:
        case BRANCH_BACKTRACK:
          return branch (point, p, pe);
        case CLOSURE_SIMPLE:
          return simple_closure (point, p, pe);
        case SHORTEST_CLOSURE_SIMPLE:
          return shortest_simple_closure (point, p, pe);
        case CLOSURE:
        case SHORTEST_CLOSURE:
          return closure (point, p, pe, re == SHORTEST_CLOSURE);
        case CLOSURE_BACKTRACK:
        case SHORTEST_CLOSURE_BACKTRACK:
          return closure_backtrack (point, p, pe, re == SHORTEST_CLOSURE_BACKTRACK);
        }
    }
  return 1;
}
  2148. class re_search
  2149. {
  2150. public:
  2151. virtual int nextl (re_point &) const = 0;
  2152. };
// Line stepping for searches over an in-memory string: simply forwards to
// re_point::nextl.
class re_search_string: public re_search
{
public:
  virtual int nextl (re_point &point) const
    {return point.nextl ();}
};
  2159. class re_search_buffer: public re_search
  2160. {
  2161. const Buffer *bufp;
  2162. public:
  2163. re_search_buffer (const Buffer *bp) : bufp (bp) {}
  2164. virtual int nextl (re_point &point) const
  2165. {return bufp->line_forward (point, 1) && point.p_point <= point.p_max;}
  2166. };
  2167. int
  2168. Regexp::search (const re_search &re, re_point &point)
  2169. {
  2170. if (re_match_bol_p)
  2171. {
  2172. if (!point.bobp (*this) && point.prevch (*this) != '\n' && !re.nextl (point))
  2173. return 0;
  2174. while (1)
  2175. {
  2176. if (!re_match_void_p)
  2177. {
  2178. if (point.eobp ())
  2179. return 0;
  2180. Char c = point.nextch ();
  2181. if (c < 0x100)
  2182. c = re_translate[c];
  2183. else
  2184. c >>= 8;
  2185. if (!re_fastmap[c])
  2186. goto fail;
  2187. }
  2188. if (match (point))
  2189. return 1;
  2190. fail:
  2191. if (!re.nextl (point))
  2192. return 0;
  2193. }
  2194. }
  2195. if (!re_match_void_p)
  2196. {
  2197. for (; !point.eobp (); point.forward ())
  2198. {
  2199. Char c = point.nextch ();
  2200. if (c < 0x100)
  2201. c = re_translate[c];
  2202. else
  2203. c >>= 8;
  2204. if (re_fastmap[c] && match (point))
  2205. return 1;
  2206. }
  2207. return 0;
  2208. }
  2209. while (1)
  2210. {
  2211. if (match (point))
  2212. return 1;
  2213. if (point.eobp ())
  2214. return 0;
  2215. point.forward ();
  2216. }
  2217. }
  2218. inline void
  2219. Regexp::init_match (const re_point &point, point_t last_match,
  2220. lChar last_match_char)
  2221. {
  2222. re_range.p1 = point.p_min;
  2223. re_range.p2 = point.p_max;
  2224. re_last_match = last_match;
  2225. re_last_match_char = last_match_char;
  2226. }
  2227. inline void
  2228. Regexp::init_match (const Buffer *bp, point_t last_match,
  2229. lChar last_match_char)
  2230. {
  2231. re_range.p1 = bp->b_contents.p1;
  2232. re_range.p2 = bp->b_contents.p2;
  2233. re_last_match = last_match;
  2234. re_last_match_char = last_match_char;
  2235. }
  2236. int
  2237. Regexp::search (const Char *string, int size, int offset)
  2238. {
  2239. Chunk chunk;
  2240. chunk.c_used = size;
  2241. chunk.c_text = (Char *)string;
  2242. chunk.c_prev = 0;
  2243. chunk.c_next = 0;
  2244. re_point point;
  2245. point.p_point = offset;
  2246. point.p_offset = offset;
  2247. point.p_chunk = &chunk;
  2248. point.p_min = 0;
  2249. point.p_max = size;
  2250. init_match (point, -1, 0);
  2251. return search (re_search_string (), point);
  2252. }
  2253. int
  2254. Regexp::search (const Buffer *bp, const Point &start,
  2255. point_t p1, point_t p2,
  2256. point_t last_match, lChar last_match_char)
  2257. {
  2258. re_point point;
  2259. point.p_point = start.p_point;
  2260. point.p_offset = start.p_offset;
  2261. point.p_chunk = start.p_chunk;
  2262. point.p_min = p1;
  2263. point.p_max = p2;
  2264. init_match (bp, last_match, last_match_char);
  2265. return search (re_search_buffer (bp), point);
  2266. }
  2267. int
  2268. Regexp::search_backward (const Buffer *bp, const Point &start,
  2269. point_t p1, point_t p2,
  2270. point_t last_match, lChar last_match_char)
  2271. {
  2272. re_point point;
  2273. point.p_point = start.p_point;
  2274. point.p_offset = start.p_offset;
  2275. point.p_chunk = start.p_chunk;
  2276. point.p_min = p1;
  2277. point.p_max = p2;
  2278. init_match (bp, last_match, last_match_char);
  2279. if (re_match_bol_p)
  2280. {
  2281. if (!point.bobp (*this) && point.prevch (*this) != '\n')
  2282. {
  2283. bp->go_bol (point);
  2284. if (point.p_point < p1)
  2285. return 0;
  2286. }
  2287. while (1)
  2288. {
  2289. if (!re_match_void_p)
  2290. {
  2291. if (point.eobp ())
  2292. goto fail;
  2293. Char c = point.nextch ();
  2294. if (c < 0x100)
  2295. c = re_translate[c];
  2296. else
  2297. c >>= 8;
  2298. if (!re_fastmap[c])
  2299. goto fail;
  2300. }
  2301. if (match (point))
  2302. return 1;
  2303. fail:
  2304. if (!bp->line_backward (point, 1) || point.p_point < p1)
  2305. return 0;
  2306. }
  2307. }
  2308. if (!re_match_void_p)
  2309. {
  2310. if (point.eobp ())
  2311. {
  2312. if (point.bobp (*this))
  2313. return 0;
  2314. point.backward ();
  2315. }
  2316. while (1)
  2317. {
  2318. Char c = point.nextch ();
  2319. if (c < 0x100)
  2320. c = re_translate[c];
  2321. else
  2322. c >>= 8;
  2323. if (re_fastmap[c] && match (point))
  2324. return 1;
  2325. if (point.bobp (*this))
  2326. return 0;
  2327. point.backward ();
  2328. }
  2329. }
  2330. while (1)
  2331. {
  2332. if (match (point))
  2333. return 1;
  2334. if (point.bobp (*this))
  2335. return 0;
  2336. point.backward ();
  2337. }
  2338. }
  2339. int
  2340. Regexp::match (const Char *string, int size, int offset)
  2341. {
  2342. Chunk chunk;
  2343. chunk.c_used = size;
  2344. chunk.c_text = (Char *)string;
  2345. chunk.c_prev = 0;
  2346. chunk.c_next = 0;
  2347. re_point point;
  2348. point.p_point = offset;
  2349. point.p_offset = offset;
  2350. point.p_chunk = &chunk;
  2351. point.p_min = 0;
  2352. point.p_max = size;
  2353. init_match (point, -1, 0);
  2354. return match (point);
  2355. }
  2356. int
  2357. Regexp::match (const Buffer *bp, const Point &start,
  2358. point_t p1, point_t p2)
  2359. {
  2360. re_point point;
  2361. point.p_point = start.p_point;
  2362. point.p_offset = start.p_offset;
  2363. point.p_chunk = start.p_chunk;
  2364. point.p_min = p1;
  2365. point.p_max = p2;
  2366. init_match (bp, -1, 0);
  2367. return match (point);
  2368. }
  2369. int
  2370. Regexp::smart_case_fold_p (const Char *p, int l)
  2371. {
  2372. const Char *const pe = p + l;
  2373. while (p < pe)
  2374. {
  2375. Char c = *p++;
  2376. switch (c)
  2377. {
  2378. default:
  2379. if (upper_char_p (c))
  2380. return 0;
  2381. break;
  2382. case '[':
  2383. if (p < pe && *p == '^')
  2384. p++;
  2385. if (p < pe && *p == ']')
  2386. p++;
  2387. for (; p < pe && *p != ']'; p++)
  2388. if (upper_char_p (*p))
  2389. return 0;
  2390. break;
  2391. case '\\':
  2392. if (p == pe)
  2393. break;
  2394. c = *p++;
  2395. switch (c)
  2396. {
  2397. case 'B':
  2398. case 'W':
  2399. break;
  2400. case 'S':
  2401. p++;
  2402. break;
  2403. case '_':
  2404. if (p == pe)
  2405. break;
  2406. c = *p++;
  2407. switch (c)
  2408. {
  2409. case 'B':
  2410. case 'S':
  2411. break;
  2412. default:
  2413. if (upper_char_p (c))
  2414. return 0;
  2415. break;
  2416. }
  2417. break;
  2418. default:
  2419. if (upper_char_p (c))
  2420. return 0;
  2421. break;
  2422. }
  2423. break;
  2424. }
  2425. }
  2426. return 1;
  2427. }