PageRenderTime 88ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 1ms

/regcomp.c

https://github.com/diabolo/ruby
C | 6314 lines | 5385 code | 864 blank | 65 comment | 1386 complexity | 7830f555aaa385edcde24e93f132e8e9 MD5 | raw file
Possible License(s): GPL-2.0, BSD-3-Clause
  1. /**********************************************************************
  2. regcomp.c - Oniguruma (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include "regparse.h"
  30. OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
  31. extern OnigCaseFoldType
  32. onig_get_default_case_fold_flag(void)
  33. {
  34. return OnigDefaultCaseFoldFlag;
  35. }
  36. extern int
  37. onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
  38. {
  39. OnigDefaultCaseFoldFlag = case_fold_flag;
  40. return 0;
  41. }
  42. #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  43. static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
  44. #endif
  45. static UChar*
  46. str_dup(UChar* s, UChar* end)
  47. {
  48. ptrdiff_t len = end - s;
  49. if (len > 0) {
  50. UChar* r = (UChar* )xmalloc(len + 1);
  51. CHECK_NULL_RETURN(r);
  52. xmemcpy(r, s, len);
  53. r[len] = (UChar )0;
  54. return r;
  55. }
  56. else return NULL;
  57. }
  58. static void
  59. swap_node(Node* a, Node* b)
  60. {
  61. Node c;
  62. c = *a; *a = *b; *b = c;
  63. if (NTYPE(a) == NT_STR) {
  64. StrNode* sn = NSTR(a);
  65. if (sn->capa == 0) {
  66. size_t len = sn->end - sn->s;
  67. sn->s = sn->buf;
  68. sn->end = sn->s + len;
  69. }
  70. }
  71. if (NTYPE(b) == NT_STR) {
  72. StrNode* sn = NSTR(b);
  73. if (sn->capa == 0) {
  74. size_t len = sn->end - sn->s;
  75. sn->s = sn->buf;
  76. sn->end = sn->s + len;
  77. }
  78. }
  79. }
  80. static OnigDistance
  81. distance_add(OnigDistance d1, OnigDistance d2)
  82. {
  83. if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
  84. return ONIG_INFINITE_DISTANCE;
  85. else {
  86. if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
  87. else return ONIG_INFINITE_DISTANCE;
  88. }
  89. }
  90. static OnigDistance
  91. distance_multiply(OnigDistance d, int m)
  92. {
  93. if (m == 0) return 0;
  94. if (d < ONIG_INFINITE_DISTANCE / m)
  95. return d * m;
  96. else
  97. return ONIG_INFINITE_DISTANCE;
  98. }
  99. static int
  100. bitset_is_empty(BitSetRef bs)
  101. {
  102. int i;
  103. for (i = 0; i < (int )BITSET_SIZE; i++) {
  104. if (bs[i] != 0) return 0;
  105. }
  106. return 1;
  107. }
  108. #ifdef ONIG_DEBUG
  109. static int
  110. bitset_on_num(BitSetRef bs)
  111. {
  112. int i, n;
  113. n = 0;
  114. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  115. if (BITSET_AT(bs, i)) n++;
  116. }
  117. return n;
  118. }
  119. #endif
  120. extern int
  121. onig_bbuf_init(BBuf* buf, int size)
  122. {
  123. if (size <= 0) {
  124. size = 0;
  125. buf->p = NULL;
  126. }
  127. else {
  128. buf->p = (UChar* )xmalloc(size);
  129. if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
  130. }
  131. buf->alloc = size;
  132. buf->used = 0;
  133. return 0;
  134. }
  135. #ifdef USE_SUBEXP_CALL
  136. static int
  137. unset_addr_list_init(UnsetAddrList* uslist, int size)
  138. {
  139. UnsetAddr* p;
  140. p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
  141. CHECK_NULL_RETURN_MEMERR(p);
  142. uslist->num = 0;
  143. uslist->alloc = size;
  144. uslist->us = p;
  145. return 0;
  146. }
  147. static void
  148. unset_addr_list_end(UnsetAddrList* uslist)
  149. {
  150. if (IS_NOT_NULL(uslist->us))
  151. xfree(uslist->us);
  152. }
  153. static int
  154. unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
  155. {
  156. UnsetAddr* p;
  157. int size;
  158. if (uslist->num >= uslist->alloc) {
  159. size = uslist->alloc * 2;
  160. p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
  161. CHECK_NULL_RETURN_MEMERR(p);
  162. uslist->alloc = size;
  163. uslist->us = p;
  164. }
  165. uslist->us[uslist->num].offset = offset;
  166. uslist->us[uslist->num].target = node;
  167. uslist->num++;
  168. return 0;
  169. }
  170. #endif /* USE_SUBEXP_CALL */
  171. static int
  172. add_opcode(regex_t* reg, int opcode)
  173. {
  174. BBUF_ADD1(reg, opcode);
  175. return 0;
  176. }
  177. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  178. static int
  179. add_state_check_num(regex_t* reg, int num)
  180. {
  181. StateCheckNumType n = (StateCheckNumType )num;
  182. BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM);
  183. return 0;
  184. }
  185. #endif
  186. static int
  187. add_rel_addr(regex_t* reg, int addr)
  188. {
  189. RelAddrType ra = (RelAddrType )addr;
  190. BBUF_ADD(reg, &ra, SIZE_RELADDR);
  191. return 0;
  192. }
  193. static int
  194. add_abs_addr(regex_t* reg, int addr)
  195. {
  196. AbsAddrType ra = (AbsAddrType )addr;
  197. BBUF_ADD(reg, &ra, SIZE_ABSADDR);
  198. return 0;
  199. }
  200. static int
  201. add_length(regex_t* reg, int len)
  202. {
  203. LengthType l = (LengthType )len;
  204. BBUF_ADD(reg, &l, SIZE_LENGTH);
  205. return 0;
  206. }
  207. static int
  208. add_mem_num(regex_t* reg, int num)
  209. {
  210. MemNumType n = (MemNumType )num;
  211. BBUF_ADD(reg, &n, SIZE_MEMNUM);
  212. return 0;
  213. }
  214. static int
  215. add_pointer(regex_t* reg, void* addr)
  216. {
  217. PointerType ptr = (PointerType )addr;
  218. BBUF_ADD(reg, &ptr, SIZE_POINTER);
  219. return 0;
  220. }
  221. static int
  222. add_option(regex_t* reg, OnigOptionType option)
  223. {
  224. BBUF_ADD(reg, &option, SIZE_OPTION);
  225. return 0;
  226. }
  227. static int
  228. add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
  229. {
  230. int r;
  231. r = add_opcode(reg, opcode);
  232. if (r) return r;
  233. r = add_rel_addr(reg, addr);
  234. return r;
  235. }
  236. static int
  237. add_bytes(regex_t* reg, UChar* bytes, int len)
  238. {
  239. BBUF_ADD(reg, bytes, len);
  240. return 0;
  241. }
  242. static int
  243. add_bitset(regex_t* reg, BitSetRef bs)
  244. {
  245. BBUF_ADD(reg, bs, SIZE_BITSET);
  246. return 0;
  247. }
  248. static int
  249. add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
  250. {
  251. int r;
  252. r = add_opcode(reg, opcode);
  253. if (r) return r;
  254. r = add_option(reg, option);
  255. return r;
  256. }
  257. static int compile_length_tree(Node* node, regex_t* reg);
  258. static int compile_tree(Node* node, regex_t* reg);
  259. #define IS_NEED_STR_LEN_OP_EXACT(op) \
  260. ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
  261. (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
  262. static int
  263. select_str_opcode(int mb_len, int str_len, int ignore_case)
  264. {
  265. int op;
  266. if (ignore_case) {
  267. switch (str_len) {
  268. case 1: op = OP_EXACT1_IC; break;
  269. default: op = OP_EXACTN_IC; break;
  270. }
  271. }
  272. else {
  273. switch (mb_len) {
  274. case 1:
  275. switch (str_len) {
  276. case 1: op = OP_EXACT1; break;
  277. case 2: op = OP_EXACT2; break;
  278. case 3: op = OP_EXACT3; break;
  279. case 4: op = OP_EXACT4; break;
  280. case 5: op = OP_EXACT5; break;
  281. default: op = OP_EXACTN; break;
  282. }
  283. break;
  284. case 2:
  285. switch (str_len) {
  286. case 1: op = OP_EXACTMB2N1; break;
  287. case 2: op = OP_EXACTMB2N2; break;
  288. case 3: op = OP_EXACTMB2N3; break;
  289. default: op = OP_EXACTMB2N; break;
  290. }
  291. break;
  292. case 3:
  293. op = OP_EXACTMB3N;
  294. break;
  295. default:
  296. op = OP_EXACTMBN;
  297. break;
  298. }
  299. }
  300. return op;
  301. }
  302. static int
  303. compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
  304. {
  305. int r;
  306. int saved_num_null_check = reg->num_null_check;
  307. if (empty_info != 0) {
  308. r = add_opcode(reg, OP_NULL_CHECK_START);
  309. if (r) return r;
  310. r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
  311. if (r) return r;
  312. reg->num_null_check++;
  313. }
  314. r = compile_tree(node, reg);
  315. if (r) return r;
  316. if (empty_info != 0) {
  317. if (empty_info == NQ_TARGET_IS_EMPTY)
  318. r = add_opcode(reg, OP_NULL_CHECK_END);
  319. else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
  320. r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
  321. else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
  322. r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
  323. if (r) return r;
  324. r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
  325. }
  326. return r;
  327. }
  328. #ifdef USE_SUBEXP_CALL
  329. static int
  330. compile_call(CallNode* node, regex_t* reg)
  331. {
  332. int r;
  333. r = add_opcode(reg, OP_CALL);
  334. if (r) return r;
  335. r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
  336. node->target);
  337. if (r) return r;
  338. r = add_abs_addr(reg, 0 /*dummy addr.*/);
  339. return r;
  340. }
  341. #endif
  342. static int
  343. compile_tree_n_times(Node* node, int n, regex_t* reg)
  344. {
  345. int i, r;
  346. for (i = 0; i < n; i++) {
  347. r = compile_tree(node, reg);
  348. if (r) return r;
  349. }
  350. return 0;
  351. }
  352. static int
  353. add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance str_len,
  354. regex_t* reg ARG_UNUSED, int ignore_case)
  355. {
  356. int len;
  357. int op = select_str_opcode(mb_len, str_len, ignore_case);
  358. len = SIZE_OPCODE;
  359. if (op == OP_EXACTMBN) len += SIZE_LENGTH;
  360. if (IS_NEED_STR_LEN_OP_EXACT(op))
  361. len += SIZE_LENGTH;
  362. len += mb_len * str_len;
  363. return len;
  364. }
  365. static int
  366. add_compile_string(UChar* s, int mb_len, int str_len,
  367. regex_t* reg, int ignore_case)
  368. {
  369. int op = select_str_opcode(mb_len, str_len, ignore_case);
  370. add_opcode(reg, op);
  371. if (op == OP_EXACTMBN)
  372. add_length(reg, mb_len);
  373. if (IS_NEED_STR_LEN_OP_EXACT(op)) {
  374. if (op == OP_EXACTN_IC)
  375. add_length(reg, mb_len * str_len);
  376. else
  377. add_length(reg, str_len);
  378. }
  379. add_bytes(reg, s, mb_len * str_len);
  380. return 0;
  381. }
  382. static int
  383. compile_length_string_node(Node* node, regex_t* reg)
  384. {
  385. int rlen, r, len, prev_len, slen, ambig;
  386. OnigEncoding enc = reg->enc;
  387. UChar *p, *prev;
  388. StrNode* sn;
  389. sn = NSTR(node);
  390. if (sn->end <= sn->s)
  391. return 0;
  392. ambig = NSTRING_IS_AMBIG(node);
  393. p = prev = sn->s;
  394. prev_len = enclen(enc, p, sn->end);
  395. p += prev_len;
  396. slen = 1;
  397. rlen = 0;
  398. for (; p < sn->end; ) {
  399. len = enclen(enc, p, sn->end);
  400. if (len == prev_len) {
  401. slen++;
  402. }
  403. else {
  404. r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
  405. rlen += r;
  406. prev = p;
  407. slen = 1;
  408. prev_len = len;
  409. }
  410. p += len;
  411. }
  412. r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
  413. rlen += r;
  414. return rlen;
  415. }
  416. static int
  417. compile_length_string_raw_node(StrNode* sn, regex_t* reg)
  418. {
  419. if (sn->end <= sn->s)
  420. return 0;
  421. return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
  422. }
  423. static int
  424. compile_string_node(Node* node, regex_t* reg)
  425. {
  426. int r, len, prev_len, slen, ambig;
  427. OnigEncoding enc = reg->enc;
  428. UChar *p, *prev, *end;
  429. StrNode* sn;
  430. sn = NSTR(node);
  431. if (sn->end <= sn->s)
  432. return 0;
  433. end = sn->end;
  434. ambig = NSTRING_IS_AMBIG(node);
  435. p = prev = sn->s;
  436. prev_len = enclen(enc, p, end);
  437. p += prev_len;
  438. slen = 1;
  439. for (; p < end; ) {
  440. len = enclen(enc, p, end);
  441. if (len == prev_len) {
  442. slen++;
  443. }
  444. else {
  445. r = add_compile_string(prev, prev_len, slen, reg, ambig);
  446. if (r) return r;
  447. prev = p;
  448. slen = 1;
  449. prev_len = len;
  450. }
  451. p += len;
  452. }
  453. return add_compile_string(prev, prev_len, slen, reg, ambig);
  454. }
  455. static int
  456. compile_string_raw_node(StrNode* sn, regex_t* reg)
  457. {
  458. if (sn->end <= sn->s)
  459. return 0;
  460. return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
  461. }
  462. static int
  463. add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
  464. {
  465. #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  466. add_length(reg, mbuf->used);
  467. return add_bytes(reg, mbuf->p, mbuf->used);
  468. #else
  469. int r, pad_size;
  470. UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
  471. GET_ALIGNMENT_PAD_SIZE(p, pad_size);
  472. add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
  473. if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
  474. r = add_bytes(reg, mbuf->p, mbuf->used);
  475. /* padding for return value from compile_length_cclass_node() to be fix. */
  476. pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
  477. if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
  478. return r;
  479. #endif
  480. }
  481. static int
  482. compile_length_cclass_node(CClassNode* cc, regex_t* reg)
  483. {
  484. int len;
  485. if (IS_NCCLASS_SHARE(cc)) {
  486. len = SIZE_OPCODE + SIZE_POINTER;
  487. return len;
  488. }
  489. if (IS_NULL(cc->mbuf)) {
  490. len = SIZE_OPCODE + SIZE_BITSET;
  491. }
  492. else {
  493. if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
  494. len = SIZE_OPCODE;
  495. }
  496. else {
  497. len = SIZE_OPCODE + SIZE_BITSET;
  498. }
  499. #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  500. len += SIZE_LENGTH + cc->mbuf->used;
  501. #else
  502. len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
  503. #endif
  504. }
  505. return len;
  506. }
  507. static int
  508. compile_cclass_node(CClassNode* cc, regex_t* reg)
  509. {
  510. int r;
  511. if (IS_NCCLASS_SHARE(cc)) {
  512. add_opcode(reg, OP_CCLASS_NODE);
  513. r = add_pointer(reg, cc);
  514. return r;
  515. }
  516. if (IS_NULL(cc->mbuf)) {
  517. if (IS_NCCLASS_NOT(cc))
  518. add_opcode(reg, OP_CCLASS_NOT);
  519. else
  520. add_opcode(reg, OP_CCLASS);
  521. r = add_bitset(reg, cc->bs);
  522. }
  523. else {
  524. if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
  525. if (IS_NCCLASS_NOT(cc))
  526. add_opcode(reg, OP_CCLASS_MB_NOT);
  527. else
  528. add_opcode(reg, OP_CCLASS_MB);
  529. r = add_multi_byte_cclass(cc->mbuf, reg);
  530. }
  531. else {
  532. if (IS_NCCLASS_NOT(cc))
  533. add_opcode(reg, OP_CCLASS_MIX_NOT);
  534. else
  535. add_opcode(reg, OP_CCLASS_MIX);
  536. r = add_bitset(reg, cc->bs);
  537. if (r) return r;
  538. r = add_multi_byte_cclass(cc->mbuf, reg);
  539. }
  540. }
  541. return r;
  542. }
  543. static int
  544. entry_repeat_range(regex_t* reg, int id, int lower, int upper)
  545. {
  546. #define REPEAT_RANGE_ALLOC 4
  547. OnigRepeatRange* p;
  548. if (reg->repeat_range_alloc == 0) {
  549. p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
  550. CHECK_NULL_RETURN_MEMERR(p);
  551. reg->repeat_range = p;
  552. reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
  553. }
  554. else if (reg->repeat_range_alloc <= id) {
  555. int n;
  556. n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
  557. p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
  558. sizeof(OnigRepeatRange) * n);
  559. CHECK_NULL_RETURN_MEMERR(p);
  560. reg->repeat_range = p;
  561. reg->repeat_range_alloc = n;
  562. }
  563. else {
  564. p = reg->repeat_range;
  565. }
  566. p[id].lower = lower;
  567. p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
  568. return 0;
  569. }
  570. static int
  571. compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
  572. regex_t* reg)
  573. {
  574. int r;
  575. int num_repeat = reg->num_repeat;
  576. r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
  577. if (r) return r;
  578. r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
  579. reg->num_repeat++;
  580. if (r) return r;
  581. r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
  582. if (r) return r;
  583. r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
  584. if (r) return r;
  585. r = compile_tree_empty_check(qn->target, reg, empty_info);
  586. if (r) return r;
  587. if (
  588. #ifdef USE_SUBEXP_CALL
  589. reg->num_call > 0 ||
  590. #endif
  591. IS_QUANTIFIER_IN_REPEAT(qn)) {
  592. r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
  593. }
  594. else {
  595. r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
  596. }
  597. if (r) return r;
  598. r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
  599. return r;
  600. }
  601. static int
  602. is_anychar_star_quantifier(QtfrNode* qn)
  603. {
  604. if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
  605. NTYPE(qn->target) == NT_CANY)
  606. return 1;
  607. else
  608. return 0;
  609. }
  610. #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
  611. #define CKN_ON (ckn > 0)
  612. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  613. static int
  614. compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
  615. {
  616. int len, mod_tlen, cklen;
  617. int ckn;
  618. int infinite = IS_REPEAT_INFINITE(qn->upper);
  619. int empty_info = qn->target_empty_info;
  620. int tlen = compile_length_tree(qn->target, reg);
  621. if (tlen < 0) return tlen;
  622. ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
  623. cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
  624. /* anychar repeat */
  625. if (NTYPE(qn->target) == NT_CANY) {
  626. if (qn->greedy && infinite) {
  627. if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
  628. return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
  629. else
  630. return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
  631. }
  632. }
  633. if (empty_info != 0)
  634. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  635. else
  636. mod_tlen = tlen;
  637. if (infinite && qn->lower <= 1) {
  638. if (qn->greedy) {
  639. if (qn->lower == 1)
  640. len = SIZE_OP_JUMP;
  641. else
  642. len = 0;
  643. len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
  644. }
  645. else {
  646. if (qn->lower == 0)
  647. len = SIZE_OP_JUMP;
  648. else
  649. len = 0;
  650. len += mod_tlen + SIZE_OP_PUSH + cklen;
  651. }
  652. }
  653. else if (qn->upper == 0) {
  654. if (qn->is_refered != 0) /* /(?<n>..){0}/ */
  655. len = SIZE_OP_JUMP + tlen;
  656. else
  657. len = 0;
  658. }
  659. else if (qn->upper == 1 && qn->greedy) {
  660. if (qn->lower == 0) {
  661. if (CKN_ON) {
  662. len = SIZE_OP_STATE_CHECK_PUSH + tlen;
  663. }
  664. else {
  665. len = SIZE_OP_PUSH + tlen;
  666. }
  667. }
  668. else {
  669. len = tlen;
  670. }
  671. }
  672. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  673. len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
  674. }
  675. else {
  676. len = SIZE_OP_REPEAT_INC
  677. + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
  678. if (CKN_ON)
  679. len += SIZE_OP_STATE_CHECK;
  680. }
  681. return len;
  682. }
  683. static int
  684. compile_quantifier_node(QtfrNode* qn, regex_t* reg)
  685. {
  686. int r, mod_tlen;
  687. int ckn;
  688. int infinite = IS_REPEAT_INFINITE(qn->upper);
  689. int empty_info = qn->target_empty_info;
  690. int tlen = compile_length_tree(qn->target, reg);
  691. if (tlen < 0) return tlen;
  692. ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
  693. if (is_anychar_star_quantifier(qn)) {
  694. r = compile_tree_n_times(qn->target, qn->lower, reg);
  695. if (r) return r;
  696. if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
  697. if (IS_MULTILINE(reg->options))
  698. r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
  699. else
  700. r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
  701. if (r) return r;
  702. if (CKN_ON) {
  703. r = add_state_check_num(reg, ckn);
  704. if (r) return r;
  705. }
  706. return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  707. }
  708. else {
  709. if (IS_MULTILINE(reg->options)) {
  710. r = add_opcode(reg, (CKN_ON ?
  711. OP_STATE_CHECK_ANYCHAR_ML_STAR
  712. : OP_ANYCHAR_ML_STAR));
  713. }
  714. else {
  715. r = add_opcode(reg, (CKN_ON ?
  716. OP_STATE_CHECK_ANYCHAR_STAR
  717. : OP_ANYCHAR_STAR));
  718. }
  719. if (r) return r;
  720. if (CKN_ON)
  721. r = add_state_check_num(reg, ckn);
  722. return r;
  723. }
  724. }
  725. if (empty_info != 0)
  726. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  727. else
  728. mod_tlen = tlen;
  729. if (infinite && qn->lower <= 1) {
  730. if (qn->greedy) {
  731. if (qn->lower == 1) {
  732. r = add_opcode_rel_addr(reg, OP_JUMP,
  733. (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
  734. if (r) return r;
  735. }
  736. if (CKN_ON) {
  737. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  738. if (r) return r;
  739. r = add_state_check_num(reg, ckn);
  740. if (r) return r;
  741. r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
  742. }
  743. else {
  744. r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
  745. }
  746. if (r) return r;
  747. r = compile_tree_empty_check(qn->target, reg, empty_info);
  748. if (r) return r;
  749. r = add_opcode_rel_addr(reg, OP_JUMP,
  750. -(mod_tlen + (int )SIZE_OP_JUMP
  751. + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
  752. }
  753. else {
  754. if (qn->lower == 0) {
  755. r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
  756. if (r) return r;
  757. }
  758. r = compile_tree_empty_check(qn->target, reg, empty_info);
  759. if (r) return r;
  760. if (CKN_ON) {
  761. r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
  762. if (r) return r;
  763. r = add_state_check_num(reg, ckn);
  764. if (r) return r;
  765. r = add_rel_addr(reg,
  766. -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
  767. }
  768. else
  769. r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
  770. }
  771. }
  772. else if (qn->upper == 0) {
  773. if (qn->is_refered != 0) { /* /(?<n>..){0}/ */
  774. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  775. if (r) return r;
  776. r = compile_tree(qn->target, reg);
  777. }
  778. else
  779. r = 0;
  780. }
  781. else if (qn->upper == 1 && qn->greedy) {
  782. if (qn->lower == 0) {
  783. if (CKN_ON) {
  784. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  785. if (r) return r;
  786. r = add_state_check_num(reg, ckn);
  787. if (r) return r;
  788. r = add_rel_addr(reg, tlen);
  789. }
  790. else {
  791. r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
  792. }
  793. if (r) return r;
  794. }
  795. r = compile_tree(qn->target, reg);
  796. }
  797. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  798. if (CKN_ON) {
  799. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  800. if (r) return r;
  801. r = add_state_check_num(reg, ckn);
  802. if (r) return r;
  803. r = add_rel_addr(reg, SIZE_OP_JUMP);
  804. }
  805. else {
  806. r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
  807. }
  808. if (r) return r;
  809. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  810. if (r) return r;
  811. r = compile_tree(qn->target, reg);
  812. }
  813. else {
  814. r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
  815. if (CKN_ON) {
  816. if (r) return r;
  817. r = add_opcode(reg, OP_STATE_CHECK);
  818. if (r) return r;
  819. r = add_state_check_num(reg, ckn);
  820. }
  821. }
  822. return r;
  823. }
  824. #else /* USE_COMBINATION_EXPLOSION_CHECK */
  825. static int
  826. compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
  827. {
  828. int len, mod_tlen;
  829. int infinite = IS_REPEAT_INFINITE(qn->upper);
  830. int empty_info = qn->target_empty_info;
  831. int tlen = compile_length_tree(qn->target, reg);
  832. if (tlen < 0) return tlen;
  833. /* anychar repeat */
  834. if (NTYPE(qn->target) == NT_CANY) {
  835. if (qn->greedy && infinite) {
  836. if (IS_NOT_NULL(qn->next_head_exact))
  837. return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
  838. else
  839. return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
  840. }
  841. }
  842. if (empty_info != 0)
  843. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  844. else
  845. mod_tlen = tlen;
  846. if (infinite &&
  847. (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  848. if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
  849. len = SIZE_OP_JUMP;
  850. }
  851. else {
  852. len = tlen * qn->lower;
  853. }
  854. if (qn->greedy) {
  855. if (IS_NOT_NULL(qn->head_exact))
  856. len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
  857. else if (IS_NOT_NULL(qn->next_head_exact))
  858. len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
  859. else
  860. len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
  861. }
  862. else
  863. len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
  864. }
  865. else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
  866. len = SIZE_OP_JUMP + tlen;
  867. }
  868. else if (!infinite && qn->greedy &&
  869. (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
  870. <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  871. len = tlen * qn->lower;
  872. len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
  873. }
  874. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  875. len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
  876. }
  877. else {
  878. len = SIZE_OP_REPEAT_INC
  879. + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
  880. }
  881. return len;
  882. }
  883. static int
  884. compile_quantifier_node(QtfrNode* qn, regex_t* reg)
  885. {
  886. int i, r, mod_tlen;
  887. int infinite = IS_REPEAT_INFINITE(qn->upper);
  888. int empty_info = qn->target_empty_info;
  889. int tlen = compile_length_tree(qn->target, reg);
  890. if (tlen < 0) return tlen;
  891. if (is_anychar_star_quantifier(qn)) {
  892. r = compile_tree_n_times(qn->target, qn->lower, reg);
  893. if (r) return r;
  894. if (IS_NOT_NULL(qn->next_head_exact)) {
  895. if (IS_MULTILINE(reg->options))
  896. r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
  897. else
  898. r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
  899. if (r) return r;
  900. return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  901. }
  902. else {
  903. if (IS_MULTILINE(reg->options))
  904. return add_opcode(reg, OP_ANYCHAR_ML_STAR);
  905. else
  906. return add_opcode(reg, OP_ANYCHAR_STAR);
  907. }
  908. }
  909. if (empty_info != 0)
  910. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  911. else
  912. mod_tlen = tlen;
  913. if (infinite &&
  914. (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  915. if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
  916. if (qn->greedy) {
  917. if (IS_NOT_NULL(qn->head_exact))
  918. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
  919. else if (IS_NOT_NULL(qn->next_head_exact))
  920. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
  921. else
  922. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
  923. }
  924. else {
  925. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
  926. }
  927. if (r) return r;
  928. }
  929. else {
  930. r = compile_tree_n_times(qn->target, qn->lower, reg);
  931. if (r) return r;
  932. }
  933. if (qn->greedy) {
  934. if (IS_NOT_NULL(qn->head_exact)) {
  935. r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
  936. mod_tlen + SIZE_OP_JUMP);
  937. if (r) return r;
  938. add_bytes(reg, NSTR(qn->head_exact)->s, 1);
  939. r = compile_tree_empty_check(qn->target, reg, empty_info);
  940. if (r) return r;
  941. r = add_opcode_rel_addr(reg, OP_JUMP,
  942. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
  943. }
  944. else if (IS_NOT_NULL(qn->next_head_exact)) {
  945. r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
  946. mod_tlen + SIZE_OP_JUMP);
  947. if (r) return r;
  948. add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  949. r = compile_tree_empty_check(qn->target, reg, empty_info);
  950. if (r) return r;
  951. r = add_opcode_rel_addr(reg, OP_JUMP,
  952. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
  953. }
  954. else {
  955. r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
  956. if (r) return r;
  957. r = compile_tree_empty_check(qn->target, reg, empty_info);
  958. if (r) return r;
  959. r = add_opcode_rel_addr(reg, OP_JUMP,
  960. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
  961. }
  962. }
  963. else {
  964. r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
  965. if (r) return r;
  966. r = compile_tree_empty_check(qn->target, reg, empty_info);
  967. if (r) return r;
  968. r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
  969. }
  970. }
  971. else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
  972. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  973. if (r) return r;
  974. r = compile_tree(qn->target, reg);
  975. }
  976. else if (!infinite && qn->greedy &&
  977. (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
  978. <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  979. int n = qn->upper - qn->lower;
  980. r = compile_tree_n_times(qn->target, qn->lower, reg);
  981. if (r) return r;
  982. for (i = 0; i < n; i++) {
  983. r = add_opcode_rel_addr(reg, OP_PUSH,
  984. (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
  985. if (r) return r;
  986. r = compile_tree(qn->target, reg);
  987. if (r) return r;
  988. }
  989. }
  990. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  991. r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
  992. if (r) return r;
  993. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  994. if (r) return r;
  995. r = compile_tree(qn->target, reg);
  996. }
  997. else {
  998. r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
  999. }
  1000. return r;
  1001. }
  1002. #endif /* USE_COMBINATION_EXPLOSION_CHECK */
  1003. static int
  1004. compile_length_option_node(EncloseNode* node, regex_t* reg)
  1005. {
  1006. int tlen;
  1007. OnigOptionType prev = reg->options;
  1008. reg->options = node->option;
  1009. tlen = compile_length_tree(node->target, reg);
  1010. reg->options = prev;
  1011. if (tlen < 0) return tlen;
  1012. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1013. return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
  1014. + tlen + SIZE_OP_SET_OPTION;
  1015. }
  1016. else
  1017. return tlen;
  1018. }
  1019. static int
  1020. compile_option_node(EncloseNode* node, regex_t* reg)
  1021. {
  1022. int r;
  1023. OnigOptionType prev = reg->options;
  1024. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1025. r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
  1026. if (r) return r;
  1027. r = add_opcode_option(reg, OP_SET_OPTION, prev);
  1028. if (r) return r;
  1029. r = add_opcode(reg, OP_FAIL);
  1030. if (r) return r;
  1031. }
  1032. reg->options = node->option;
  1033. r = compile_tree(node->target, reg);
  1034. reg->options = prev;
  1035. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1036. if (r) return r;
  1037. r = add_opcode_option(reg, OP_SET_OPTION, prev);
  1038. }
  1039. return r;
  1040. }
  1041. static int
  1042. compile_length_enclose_node(EncloseNode* node, regex_t* reg)
  1043. {
  1044. int len;
  1045. int tlen;
  1046. if (node->type == ENCLOSE_OPTION)
  1047. return compile_length_option_node(node, reg);
  1048. if (node->target) {
  1049. tlen = compile_length_tree(node->target, reg);
  1050. if (tlen < 0) return tlen;
  1051. }
  1052. else
  1053. tlen = 0;
  1054. switch (node->type) {
  1055. case ENCLOSE_MEMORY:
  1056. #ifdef USE_SUBEXP_CALL
  1057. if (IS_ENCLOSE_CALLED(node)) {
  1058. len = SIZE_OP_MEMORY_START_PUSH + tlen
  1059. + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
  1060. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1061. len += (IS_ENCLOSE_RECURSION(node)
  1062. ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
  1063. else
  1064. len += (IS_ENCLOSE_RECURSION(node)
  1065. ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
  1066. }
  1067. else
  1068. #endif
  1069. {
  1070. if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
  1071. len = SIZE_OP_MEMORY_START_PUSH;
  1072. else
  1073. len = SIZE_OP_MEMORY_START;
  1074. len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
  1075. ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
  1076. }
  1077. break;
  1078. case ENCLOSE_STOP_BACKTRACK:
  1079. if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
  1080. QtfrNode* qn = NQTFR(node->target);
  1081. tlen = compile_length_tree(qn->target, reg);
  1082. if (tlen < 0) return tlen;
  1083. len = tlen * qn->lower
  1084. + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
  1085. }
  1086. else {
  1087. len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
  1088. }
  1089. break;
  1090. default:
  1091. return ONIGERR_TYPE_BUG;
  1092. break;
  1093. }
  1094. return len;
  1095. }
  1096. static int get_char_length_tree(Node* node, regex_t* reg, int* len);
  1097. static int
  1098. compile_enclose_node(EncloseNode* node, regex_t* reg)
  1099. {
  1100. int r, len;
  1101. if (node->type == ENCLOSE_OPTION)
  1102. return compile_option_node(node, reg);
  1103. switch (node->type) {
  1104. case ENCLOSE_MEMORY:
  1105. #ifdef USE_SUBEXP_CALL
  1106. if (IS_ENCLOSE_CALLED(node)) {
  1107. r = add_opcode(reg, OP_CALL);
  1108. if (r) return r;
  1109. node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
  1110. node->state |= NST_ADDR_FIXED;
  1111. r = add_abs_addr(reg, (int )node->call_addr);
  1112. if (r) return r;
  1113. len = compile_length_tree(node->target, reg);
  1114. len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
  1115. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1116. len += (IS_ENCLOSE_RECURSION(node)
  1117. ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
  1118. else
  1119. len += (IS_ENCLOSE_RECURSION(node)
  1120. ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
  1121. r = add_opcode_rel_addr(reg, OP_JUMP, len);
  1122. if (r) return r;
  1123. }
  1124. #endif
  1125. if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
  1126. r = add_opcode(reg, OP_MEMORY_START_PUSH);
  1127. else
  1128. r = add_opcode(reg, OP_MEMORY_START);
  1129. if (r) return r;
  1130. r = add_mem_num(reg, node->regnum);
  1131. if (r) return r;
  1132. r = compile_tree(node->target, reg);
  1133. if (r) return r;
  1134. #ifdef USE_SUBEXP_CALL
  1135. if (IS_ENCLOSE_CALLED(node)) {
  1136. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1137. r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
  1138. ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
  1139. else
  1140. r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
  1141. ? OP_MEMORY_END_REC : OP_MEMORY_END));
  1142. if (r) return r;
  1143. r = add_mem_num(reg, node->regnum);
  1144. if (r) return r;
  1145. r = add_opcode(reg, OP_RETURN);
  1146. }
  1147. else
  1148. #endif
  1149. {
  1150. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1151. r = add_opcode(reg, OP_MEMORY_END_PUSH);
  1152. else
  1153. r = add_opcode(reg, OP_MEMORY_END);
  1154. if (r) return r;
  1155. r = add_mem_num(reg, node->regnum);
  1156. }
  1157. break;
  1158. case ENCLOSE_STOP_BACKTRACK:
  1159. if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
  1160. QtfrNode* qn = NQTFR(node->target);
  1161. r = compile_tree_n_times(qn->target, qn->lower, reg);
  1162. if (r) return r;
  1163. len = compile_length_tree(qn->target, reg);
  1164. if (len < 0) return len;
  1165. r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
  1166. if (r) return r;
  1167. r = compile_tree(qn->target, reg);
  1168. if (r) return r;
  1169. r = add_opcode(reg, OP_POP);
  1170. if (r) return r;
  1171. r = add_opcode_rel_addr(reg, OP_JUMP,
  1172. -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
  1173. }
  1174. else {
  1175. r = add_opcode(reg, OP_PUSH_STOP_BT);
  1176. if (r) return r;
  1177. r = compile_tree(node->target, reg);
  1178. if (r) return r;
  1179. r = add_opcode(reg, OP_POP_STOP_BT);
  1180. }
  1181. break;
  1182. default:
  1183. return ONIGERR_TYPE_BUG;
  1184. break;
  1185. }
  1186. return r;
  1187. }
  1188. static int
  1189. compile_length_anchor_node(AnchorNode* node, regex_t* reg)
  1190. {
  1191. int len;
  1192. int tlen = 0;
  1193. if (node->target) {
  1194. tlen = compile_length_tree(node->target, reg);
  1195. if (tlen < 0) return tlen;
  1196. }
  1197. switch (node->type) {
  1198. case ANCHOR_PREC_READ:
  1199. len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
  1200. break;
  1201. case ANCHOR_PREC_READ_NOT:
  1202. len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
  1203. break;
  1204. case ANCHOR_LOOK_BEHIND:
  1205. len = SIZE_OP_LOOK_BEHIND + tlen;
  1206. break;
  1207. case ANCHOR_LOOK_BEHIND_NOT:
  1208. len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
  1209. break;
  1210. default:
  1211. len = SIZE_OPCODE;
  1212. break;
  1213. }
  1214. return len;
  1215. }
  1216. static int
  1217. compile_anchor_node(AnchorNode* node, regex_t* reg)
  1218. {
  1219. int r, len;
  1220. switch (node->type) {
  1221. case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
  1222. case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
  1223. case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
  1224. case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
  1225. case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
  1226. case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
  1227. case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break;
  1228. case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break;
  1229. #ifdef USE_WORD_BEGIN_END
  1230. case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break;
  1231. case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break;
  1232. #endif
  1233. case ANCHOR_PREC_READ:
  1234. r = add_opcode(reg, OP_PUSH_POS);
  1235. if (r) return r;
  1236. r = compile_tree(node->target, reg);
  1237. if (r) return r;
  1238. r = add_opcode(reg, OP_POP_POS);
  1239. break;
  1240. case ANCHOR_PREC_READ_NOT:
  1241. len = compile_length_tree(node->target, reg);
  1242. if (len < 0) return len;
  1243. r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
  1244. if (r) return r;
  1245. r = compile_tree(node->target, reg);
  1246. if (r) return r;
  1247. r = add_opcode(reg, OP_FAIL_POS);
  1248. break;
  1249. case ANCHOR_LOOK_BEHIND:
  1250. {
  1251. int n;
  1252. r = add_opcode(reg, OP_LOOK_BEHIND);
  1253. if (r) return r;
  1254. if (node->char_len < 0) {
  1255. r = get_char_length_tree(node->target, reg, &n);
  1256. if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  1257. }
  1258. else
  1259. n = node->char_len;
  1260. r = add_length(reg, n);
  1261. if (r) return r;
  1262. r = compile_tree(node->target, reg);
  1263. }
  1264. break;
  1265. case ANCHOR_LOOK_BEHIND_NOT:
  1266. {
  1267. int n;
  1268. len = compile_length_tree(node->target, reg);
  1269. r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
  1270. len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
  1271. if (r) return r;
  1272. if (node->char_len < 0) {
  1273. r = get_char_length_tree(node->target, reg, &n);
  1274. if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  1275. }
  1276. else
  1277. n = node->char_len;
  1278. r = add_length(reg, n);
  1279. if (r) return r;
  1280. r = compile_tree(node->target, reg);
  1281. if (r) return r;
  1282. r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
  1283. }
  1284. break;
  1285. default:
  1286. return ONIGERR_TYPE_BUG;
  1287. break;
  1288. }
  1289. return r;
  1290. }
  1291. static int
  1292. compile_length_tree(Node* node, regex_t* reg)
  1293. {
  1294. int len, type, r;
  1295. type = NTYPE(node);
  1296. switch (type) {
  1297. case NT_LIST:
  1298. len = 0;
  1299. do {
  1300. r = compile_length_tree(NCAR(node), reg);
  1301. if (r < 0) return r;
  1302. len += r;
  1303. } while (IS_NOT_NULL(node = NCDR(node)));
  1304. r = len;
  1305. break;
  1306. case NT_ALT:
  1307. {
  1308. int n;
  1309. n = r = 0;
  1310. do {
  1311. r += compile_length_tree(NCAR(node), reg);
  1312. n++;
  1313. } while (IS_NOT_NULL(node = NCDR(node)));
  1314. r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
  1315. }
  1316. break;
  1317. case NT_STR:
  1318. if (NSTRING_IS_RAW(node))
  1319. r = compile_length_string_raw_node(NSTR(node), reg);
  1320. else
  1321. r = compile_length_string_node(node, reg);
  1322. break;
  1323. case NT_CCLASS:
  1324. r = compile_length_cclass_node(NCCLASS(node), reg);
  1325. break;
  1326. case NT_CTYPE:
  1327. case NT_CANY:
  1328. r = SIZE_OPCODE;
  1329. break;
  1330. case NT_BREF:
  1331. {
  1332. BRefNode* br = NBREF(node);
  1333. #ifdef USE_BACKREF_WITH_LEVEL
  1334. if (IS_BACKREF_NEST_LEVEL(br)) {
  1335. r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
  1336. SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
  1337. }
  1338. else
  1339. #endif
  1340. if (br->back_num == 1) {
  1341. r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
  1342. ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
  1343. }
  1344. else {
  1345. r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
  1346. }
  1347. }
  1348. break;
  1349. #ifdef USE_SUBEXP_CALL
  1350. case NT_CALL:
  1351. r = SIZE_OP_CALL;
  1352. break;
  1353. #endif
  1354. case NT_QTFR:
  1355. r = compile_length_quantifier_node(NQTFR(node), reg);
  1356. break;
  1357. case NT_ENCLOSE:
  1358. r = compile_length_enclose_node(NENCLOSE(node), reg);
  1359. break;
  1360. case NT_ANCHOR:
  1361. r = compile_length_anchor_node(NANCHOR(node), reg);
  1362. break;
  1363. default:
  1364. return ONIGERR_TYPE_BUG;
  1365. break;
  1366. }
  1367. return r;
  1368. }
  1369. static int
  1370. compile_tree(Node* node, regex_t* reg)
  1371. {
  1372. int n, type, len, pos, r = 0;
  1373. type = NTYPE(node);
  1374. switch (type) {
  1375. case NT_LIST:
  1376. do {
  1377. r = compile_tree(NCAR(node), reg);
  1378. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1379. break;
  1380. case NT_ALT:
  1381. {
  1382. Node* x = node;
  1383. len = 0;
  1384. do {
  1385. len += compile_length_tree(NCAR(x), reg);
  1386. if (NCDR(x) != NULL) {
  1387. len += SIZE_OP_PUSH + SIZE_OP_JUMP;
  1388. }
  1389. } while (IS_NOT_NULL(x = NCDR(x)));
  1390. pos = reg->used + len; /* goal position */
  1391. do {
  1392. len = compile_length_tree(NCAR(node), reg);
  1393. if (IS_NOT_NULL(NCDR(node))) {
  1394. r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
  1395. if (r) break;
  1396. }
  1397. r = compile_tree(NCAR(node), reg);
  1398. if (r) break;
  1399. if (IS_NOT_NULL(NCDR(node))) {
  1400. len = pos - (reg->used + SIZE_OP_JUMP);
  1401. r = add_opcode_rel_addr(reg, OP_JUMP, len);
  1402. if (r) break;
  1403. }
  1404. } while (IS_NOT_NULL(node = NCDR(node)));
  1405. }
  1406. break;
  1407. case NT_STR:
  1408. if (NSTRING_IS_RAW(node))
  1409. r = compile_string_raw_node(NSTR(node), reg);
  1410. else
  1411. r = compile_string_node(node, reg);
  1412. break;
  1413. case NT_CCLASS:
  1414. r = compile_cclass_node(NCCLASS(node), reg);
  1415. break;
  1416. case NT_CTYPE:
  1417. {
  1418. int op;
  1419. switch (NCTYPE(node)->ctype) {
  1420. case ONIGENC_CTYPE_WORD:
  1421. if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
  1422. else op = OP_WORD;
  1423. break;
  1424. default:
  1425. return ONIGERR_TYPE_BUG;
  1426. break;
  1427. }
  1428. r = add_opcode(reg, op);
  1429. }
  1430. break;
  1431. case NT_CANY:
  1432. if (IS_MULTILINE(reg->options))
  1433. r = add_opcode(reg, OP_ANYCHAR_ML);
  1434. else
  1435. r = add_opcode(reg, OP_ANYCHAR);
  1436. break;
  1437. case NT_BREF:
  1438. {
  1439. BRefNode* br = NBREF(node);
  1440. #ifdef USE_BACKREF_WITH_LEVEL
  1441. if (IS_BACKREF_NEST_LEVEL(br)) {
  1442. r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
  1443. if (r) return r;
  1444. r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
  1445. if (r) return r;
  1446. r = add_length(reg, br->nest_level);
  1447. if (r) return r;
  1448. goto add_bacref_mems;
  1449. }
  1450. else
  1451. #endif
  1452. if (br->back_num == 1) {
  1453. n = br->back_static[0];
  1454. if (IS_IGNORECASE(reg->options)) {
  1455. r = add_opcode(reg, OP_BACKREFN_IC);
  1456. if (r) return r;
  1457. r = add_mem_num(reg, n);
  1458. }
  1459. else {
  1460. switch (n) {
  1461. case 1: r = add_opcode(reg, OP_BACKREF1); break;
  1462. case 2: r = add_opcode(reg, OP_BACKREF2); break;
  1463. default:
  1464. r = add_opcode(reg, OP_BACKREFN);
  1465. if (r) return r;
  1466. r = add_mem_num(reg, n);
  1467. break;
  1468. }
  1469. }
  1470. }
  1471. else {
  1472. int i;
  1473. int* p;
  1474. if (IS_IGNORECASE(reg->options)) {
  1475. r = add_opcode(reg, OP_BACKREF_MULTI_IC);
  1476. }
  1477. else {
  1478. r = add_opcode(reg, OP_BACKREF_MULTI);
  1479. }
  1480. if (r) return r;
  1481. #ifdef USE_BACKREF_WITH_LEVEL
  1482. add_bacref_mems:
  1483. #endif
  1484. r = add_length(reg, br->back_num);
  1485. if (r) return r;
  1486. p = BACKREFS_P(br);
  1487. for (i = br->back_num - 1; i >= 0; i--) {
  1488. r = add_mem_num(reg, p[i]);
  1489. if (r) return r;
  1490. }
  1491. }
  1492. }
  1493. break;
  1494. #ifdef USE_SUBEXP_CALL
  1495. case NT_CALL:
  1496. r = compile_call(NCALL(node), reg);
  1497. break;
  1498. #endif
  1499. case NT_QTFR:
  1500. r = compile_quantifier_node(NQTFR(node), reg);
  1501. break;
  1502. case NT_ENCLOSE:
  1503. r = compile_enclose_node(NENCLOSE(node), reg);
  1504. break;
  1505. case NT_ANCHOR:
  1506. r = compile_anchor_node(NANCHOR(node), reg);
  1507. break;
  1508. default:
  1509. #ifdef ONIG_DEBUG
  1510. fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
  1511. #endif
  1512. break;
  1513. }
  1514. return r;
  1515. }
  1516. #ifdef USE_NAMED_GROUP
  1517. static int
  1518. noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
  1519. {
  1520. int r = 0;
  1521. Node* node = *plink;
  1522. switch (NTYPE(node)) {
  1523. case NT_LIST:
  1524. case NT_ALT:
  1525. do {
  1526. r = noname_disable_map(&(NCAR(node)), map, counter);
  1527. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1528. break;
  1529. case NT_QTFR:
  1530. {
  1531. Node** ptarget = &(NQTFR(node)->target);
  1532. Node* old = *ptarget;
  1533. r = noname_disable_map(ptarget, map, counter);
  1534. if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
  1535. onig_reduce_nested_quantifier(node, *ptarget);
  1536. }
  1537. }
  1538. break;
  1539. case NT_ENCLOSE:
  1540. {
  1541. EncloseNode* en = NENCLOSE(node);
  1542. if (en->type == ENCLOSE_MEMORY) {
  1543. if (IS_ENCLOSE_NAMED_GROUP(en)) {
  1544. (*counter)++;
  1545. map[en->regnum].new_val = *counter;
  1546. en->regnum = *counter;
  1547. r = noname_disable_map(&(en->target), map, counter);
  1548. }
  1549. else {
  1550. *plink = en->target;
  1551. en->target = NULL_NODE;
  1552. onig_node_free(node);
  1553. r = noname_disable_map(plink, map, counter);
  1554. }
  1555. }
  1556. else
  1557. r = noname_disable_map(&(en->target), map, counter);
  1558. }
  1559. break;
  1560. case NT_ANCHOR:
  1561. {
  1562. AnchorNode* an = NANCHOR(node);
  1563. switch (an->type) {
  1564. case ANCHOR_PREC_READ:
  1565. case ANCHOR_PREC_READ_NOT:
  1566. case ANCHOR_LOOK_BEHIND:
  1567. case ANCHOR_LOOK_BEHIND_NOT:
  1568. r = noname_disable_map(&(an->target), map, counter);
  1569. break;
  1570. }
  1571. }
  1572. break;
  1573. default:
  1574. break;
  1575. }
  1576. return r;
  1577. }
  1578. static int
  1579. renumber_node_backref(Node* node, GroupNumRemap* map)
  1580. {
  1581. int i, pos, n, old_num;
  1582. int *backs;
  1583. BRefNode* bn = NBREF(node);
  1584. if (! IS_BACKREF_NAME_REF(bn))
  1585. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  1586. old_num = bn->back_num;
  1587. if (IS_NULL(bn->back_dynamic))
  1588. backs = bn->back_static;
  1589. else
  1590. backs = bn->back_dynamic;
  1591. for (i = 0, pos = 0; i < old_num; i++) {
  1592. n = map[backs[i]].new_val;
  1593. if (n > 0) {
  1594. backs[pos] = n;
  1595. pos++;
  1596. }
  1597. }
  1598. bn->back_num = pos;
  1599. return 0;
  1600. }
  1601. static int
  1602. renumber_by_map(Node* node, GroupNumRemap* map)
  1603. {
  1604. int r = 0;
  1605. switch (NTYPE(node)) {
  1606. case NT_LIST:
  1607. case NT_ALT:
  1608. do {
  1609. r = renumber_by_map(NCAR(node), map);
  1610. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1611. break;
  1612. case NT_QTFR:
  1613. r = renumber_by_map(NQTFR(node)->target, map);
  1614. break;
  1615. case NT_ENCLOSE:
  1616. r = renumber_by_map(NENCLOSE(node)->target, map);
  1617. break;
  1618. case NT_BREF:
  1619. r = renumber_node_backref(node, map);
  1620. break;
  1621. case NT_ANCHOR:
  1622. {
  1623. AnchorNode* an = NANCHOR(node);
  1624. switch (an->type) {
  1625. case ANCHOR_PREC_READ:
  1626. case ANCHOR_PREC_READ_NOT:
  1627. case ANCHOR_LOOK_BEHIND:
  1628. case ANCHOR_LOOK_BEHIND_NOT:
  1629. r = renumber_by_map(an->target, map);
  1630. break;
  1631. }
  1632. }
  1633. break;
  1634. default:
  1635. break;
  1636. }
  1637. return r;
  1638. }
  1639. static int
  1640. numbered_ref_check(Node* node)
  1641. {
  1642. int r = 0;
  1643. switch (NTYPE(node)) {
  1644. case NT_LIST:
  1645. case NT_ALT:
  1646. do {
  1647. r = numbered_ref_check(NCAR(node));
  1648. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1649. break;
  1650. case NT_QTFR:
  1651. r = numbered_ref_check(NQTFR(node)->target);
  1652. break;
  1653. case NT_ENCLOSE:
  1654. r = numbered_ref_check(NENCLOSE(node)->target);
  1655. break;
  1656. case NT_BREF:
  1657. if (! IS_BACKREF_NAME_REF(NBREF(node)))
  1658. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  1659. break;
  1660. default:
  1661. break;
  1662. }
  1663. return r;
  1664. }
  1665. static int
  1666. disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
  1667. {
  1668. int r, i, pos, counter;
  1669. BitStatusType loc;
  1670. GroupNumRemap* map;
  1671. map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
  1672. CHECK_NULL_RETURN_MEMERR(map);
  1673. for (i = 1; i <= env->num_mem; i++) {
  1674. map[i].new_val = 0;
  1675. }
  1676. counter = 0;
  1677. r = noname_disable_map(root, map, &counter);
  1678. if (r != 0) return r;
  1679. r = renumber_by_map(*root, map);
  1680. if (r != 0) return r;
  1681. for (i = 1, pos = 1; i <= env->num_mem; i++) {
  1682. if (map[i].new_val > 0) {
  1683. SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
  1684. pos++;
  1685. }
  1686. }
  1687. loc = env->capture_history;
  1688. BIT_STATUS_CLEAR(env->capture_history);
  1689. for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
  1690. if (BIT_STATUS_AT(loc, i)) {
  1691. BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
  1692. }
  1693. }
  1694. env->num_mem = env->num_named;
  1695. reg->num_mem = env->num_named;
  1696. return onig_renumber_name_table(reg, map);
  1697. }
  1698. #endif /* USE_NAMED_GROUP */
  1699. #ifdef USE_SUBEXP_CALL
  1700. static int
  1701. unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
  1702. {
  1703. int i, offset;
  1704. EncloseNode* en;
  1705. AbsAddrType addr;
  1706. for (i = 0; i < uslist->num; i++) {
  1707. en = NENCLOSE(uslist->us[i].target);
  1708. if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
  1709. addr = en->call_addr;
  1710. offset = uslist->us[i].offset;
  1711. BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
  1712. }
  1713. return 0;
  1714. }
  1715. #endif
  1716. #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
  1717. static int
  1718. quantifiers_memory_node_info(Node* node)
  1719. {
  1720. int r = 0;
  1721. switch (NTYPE(node)) {
  1722. case NT_LIST:
  1723. case NT_ALT:
  1724. {
  1725. int v;
  1726. do {
  1727. v = quantifiers_memory_node_info(NCAR(node));
  1728. if (v > r) r = v;
  1729. } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
  1730. }
  1731. break;
  1732. #ifdef USE_SUBEXP_CALL
  1733. case NT_CALL:
  1734. if (IS_CALL_RECURSION(NCALL(node))) {
  1735. return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
  1736. }
  1737. else
  1738. r = quantifiers_memory_node_info(NCALL(node)->target);
  1739. break;
  1740. #endif
  1741. case NT_QTFR:
  1742. {
  1743. QtfrNode* qn = NQTFR(node);
  1744. if (qn->upper != 0) {
  1745. r = quantifiers_memory_node_info(qn->target);
  1746. }
  1747. }
  1748. break;
  1749. case NT_ENCLOSE:
  1750. {
  1751. EncloseNode* en = NENCLOSE(node);
  1752. switch (en->type) {
  1753. case ENCLOSE_MEMORY:
  1754. return NQ_TARGET_IS_EMPTY_MEM;
  1755. break;
  1756. case ENCLOSE_OPTION:
  1757. case ENCLOSE_STOP_BACKTRACK:
  1758. r = quantifiers_memory_node_info(en->target);
  1759. break;
  1760. default:
  1761. break;
  1762. }
  1763. }
  1764. break;
  1765. case NT_BREF:
  1766. case NT_STR:
  1767. case NT_CTYPE:
  1768. case NT_CCLASS:
  1769. case NT_CANY:
  1770. case NT_ANCHOR:
  1771. default:
  1772. break;
  1773. }
  1774. return r;
  1775. }
  1776. #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
  1777. static int
  1778. get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
  1779. {
  1780. OnigDistance tmin;
  1781. int r = 0;
  1782. *min = 0;
  1783. switch (NTYPE(node)) {
  1784. case NT_BREF:
  1785. {
  1786. int i;
  1787. int* backs;
  1788. Node** nodes = SCANENV_MEM_NODES(env);
  1789. BRefNode* br = NBREF(node);
  1790. if (br->state & NST_RECURSION) break;
  1791. backs = BACKREFS_P(br);
  1792. if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  1793. r = get_min_match_length(nodes[backs[0]], min, env);
  1794. if (r != 0) break;
  1795. for (i = 1; i < br->back_num; i++) {
  1796. if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  1797. r = get_min_match_length(nodes[backs[i]], &tmin, env);
  1798. if (r != 0) break;
  1799. if (*min > tmin) *min = tmin;
  1800. }
  1801. }
  1802. break;
  1803. #ifdef USE_SUBEXP_CALL
  1804. case NT_CALL:
  1805. if (IS_CALL_RECURSION(NCALL(node))) {
  1806. EncloseNode* en = NENCLOSE(NCALL(node)->target);
  1807. if (IS_ENCLOSE_MIN_FIXED(en))
  1808. *min = en->min_len;
  1809. }
  1810. else
  1811. r = get_min_match_length(NCALL(node)->target, min, env);
  1812. break;
  1813. #endif
  1814. case NT_LIST:
  1815. do {
  1816. r = get_min_match_length(NCAR(node), &tmin, env);
  1817. if (r == 0) *min += tmin;
  1818. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1819. break;
  1820. case NT_ALT:
  1821. {
  1822. Node *x, *y;
  1823. y = node;
  1824. do {
  1825. x = NCAR(y);
  1826. r = get_min_match_length(x, &tmin, env);
  1827. if (r != 0) break;
  1828. if (y == node) *min = tmin;
  1829. else if (*min > tmin) *min = tmin;
  1830. } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
  1831. }
  1832. break;
  1833. case NT_STR:
  1834. {
  1835. StrNode* sn = NSTR(node);
  1836. *min = sn->end - sn->s;
  1837. }
  1838. break;
  1839. case NT_CTYPE:
  1840. *min = 1;
  1841. break;
  1842. case NT_CCLASS:
  1843. case NT_CANY:
  1844. *min = 1;
  1845. break;
  1846. case NT_QTFR:
  1847. {
  1848. QtfrNode* qn = NQTFR(node);
  1849. if (qn->lower > 0) {
  1850. r = get_min_match_length(qn->target, min, env);
  1851. if (r == 0)
  1852. *min = distance_multiply(*min, qn->lower);
  1853. }
  1854. }
  1855. break;
  1856. case NT_ENCLOSE:
  1857. {
  1858. EncloseNode* en = NENCLOSE(node);
  1859. switch (en->type) {
  1860. case ENCLOSE_MEMORY:
  1861. #ifdef USE_SUBEXP_CALL
  1862. if (IS_ENCLOSE_MIN_FIXED(en))
  1863. *min = en->min_len;
  1864. else {
  1865. r = get_min_match_length(en->target, min, env);
  1866. if (r == 0) {
  1867. en->min_len = *min;
  1868. SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
  1869. }
  1870. }
  1871. break;
  1872. #endif
  1873. case ENCLOSE_OPTION:
  1874. case ENCLOSE_STOP_BACKTRACK:
  1875. r = get_min_match_length(en->target, min, env);
  1876. break;
  1877. }
  1878. }
  1879. break;
  1880. case NT_ANCHOR:
  1881. default:
  1882. break;
  1883. }
  1884. return r;
  1885. }
  1886. static int
  1887. get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
  1888. {
  1889. OnigDistance tmax;
  1890. int r = 0;
  1891. *max = 0;
  1892. switch (NTYPE(node)) {
  1893. case NT_LIST:
  1894. do {
  1895. r = get_max_match_length(NCAR(node), &tmax, env);
  1896. if (r == 0)
  1897. *max = distance_add(*max, tmax);
  1898. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1899. break;
  1900. case NT_ALT:
  1901. do {
  1902. r = get_max_match_length(NCAR(node), &tmax, env);
  1903. if (r == 0 && *max < tmax) *max = tmax;
  1904. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1905. break;
  1906. case NT_STR:
  1907. {
  1908. StrNode* sn = NSTR(node);
  1909. *max = sn->end - sn->s;
  1910. }
  1911. break;
  1912. case NT_CTYPE:
  1913. *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  1914. break;
  1915. case NT_CCLASS:
  1916. case NT_CANY:
  1917. *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  1918. break;
  1919. case NT_BREF:
  1920. {
  1921. int i;
  1922. int* backs;
  1923. Node** nodes = SCANENV_MEM_NODES(env);
  1924. BRefNode* br = NBREF(node);
  1925. if (br->state & NST_RECURSION) {
  1926. *max = ONIG_INFINITE_DISTANCE;
  1927. break;
  1928. }
  1929. backs = BACKREFS_P(br);
  1930. for (i = 0; i < br->back_num; i++) {
  1931. if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  1932. r = get_max_match_length(nodes[backs[i]], &tmax, env);
  1933. if (r != 0) break;
  1934. if (*max < tmax) *max = tmax;
  1935. }
  1936. }
  1937. break;
  1938. #ifdef USE_SUBEXP_CALL
  1939. case NT_CALL:
  1940. if (! IS_CALL_RECURSION(NCALL(node)))
  1941. r = get_max_match_length(NCALL(node)->target, max, env);
  1942. else
  1943. *max = ONIG_INFINITE_DISTANCE;
  1944. break;
  1945. #endif
  1946. case NT_QTFR:
  1947. {
  1948. QtfrNode* qn = NQTFR(node);
  1949. if (qn->upper != 0) {
  1950. r = get_max_match_length(qn->target, max, env);
  1951. if (r == 0 && *max != 0) {
  1952. if (! IS_REPEAT_INFINITE(qn->upper))
  1953. *max = distance_multiply(*max, qn->upper);
  1954. else
  1955. *max = ONIG_INFINITE_DISTANCE;
  1956. }
  1957. }
  1958. }
  1959. break;
  1960. case NT_ENCLOSE:
  1961. {
  1962. EncloseNode* en = NENCLOSE(node);
  1963. switch (en->type) {
  1964. case ENCLOSE_MEMORY:
  1965. #ifdef USE_SUBEXP_CALL
  1966. if (IS_ENCLOSE_MAX_FIXED(en))
  1967. *max = en->max_len;
  1968. else {
  1969. r = get_max_match_length(en->target, max, env);
  1970. if (r == 0) {
  1971. en->max_len = *max;
  1972. SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
  1973. }
  1974. }
  1975. break;
  1976. #endif
  1977. case ENCLOSE_OPTION:
  1978. case ENCLOSE_STOP_BACKTRACK:
  1979. r = get_max_match_length(en->target, max, env);
  1980. break;
  1981. }
  1982. }
  1983. break;
  1984. case NT_ANCHOR:
  1985. default:
  1986. break;
  1987. }
  1988. return r;
  1989. }
  1990. #define GET_CHAR_LEN_VARLEN -1
  1991. #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
  1992. /* fixed size pattern node only */
  1993. static int
  1994. get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
  1995. {
  1996. int tlen;
  1997. int r = 0;
  1998. level++;
  1999. *len = 0;
  2000. switch (NTYPE(node)) {
  2001. case NT_LIST:
  2002. do {
  2003. r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
  2004. if (r == 0)
  2005. *len = distance_add(*len, tlen);
  2006. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2007. break;
  2008. case NT_ALT:
  2009. {
  2010. int tlen2;
  2011. int varlen = 0;
  2012. r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
  2013. while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
  2014. r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
  2015. if (r == 0) {
  2016. if (tlen != tlen2)
  2017. varlen = 1;
  2018. }
  2019. }
  2020. if (r == 0) {
  2021. if (varlen != 0) {
  2022. if (level == 1)
  2023. r = GET_CHAR_LEN_TOP_ALT_VARLEN;
  2024. else
  2025. r = GET_CHAR_LEN_VARLEN;
  2026. }
  2027. else
  2028. *len = tlen;
  2029. }
  2030. }
  2031. break;
  2032. case NT_STR:
  2033. {
  2034. StrNode* sn = NSTR(node);
  2035. UChar *s = sn->s;
  2036. while (s < sn->end) {
  2037. s += enclen(reg->enc, s, sn->end);
  2038. (*len)++;
  2039. }
  2040. }
  2041. break;
  2042. case NT_QTFR:
  2043. {
  2044. QtfrNode* qn = NQTFR(node);
  2045. if (qn->lower == qn->upper) {
  2046. r = get_char_length_tree1(qn->target, reg, &tlen, level);
  2047. if (r == 0)
  2048. *len = distance_multiply(tlen, qn->lower);
  2049. }
  2050. else
  2051. r = GET_CHAR_LEN_VARLEN;
  2052. }
  2053. break;
  2054. #ifdef USE_SUBEXP_CALL
  2055. case NT_CALL:
  2056. if (! IS_CALL_RECURSION(NCALL(node)))
  2057. r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
  2058. else
  2059. r = GET_CHAR_LEN_VARLEN;
  2060. break;
  2061. #endif
  2062. case NT_CTYPE:
  2063. *len = 1;
  2064. break;
  2065. case NT_CCLASS:
  2066. case NT_CANY:
  2067. *len = 1;
  2068. break;
  2069. case NT_ENCLOSE:
  2070. {
  2071. EncloseNode* en = NENCLOSE(node);
  2072. switch (en->type) {
  2073. case ENCLOSE_MEMORY:
  2074. #ifdef USE_SUBEXP_CALL
  2075. if (IS_ENCLOSE_CLEN_FIXED(en))
  2076. *len = en->char_len;
  2077. else {
  2078. r = get_char_length_tree1(en->target, reg, len, level);
  2079. if (r == 0) {
  2080. en->char_len = *len;
  2081. SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
  2082. }
  2083. }
  2084. break;
  2085. #endif
  2086. case ENCLOSE_OPTION:
  2087. case ENCLOSE_STOP_BACKTRACK:
  2088. r = get_char_length_tree1(en->target, reg, len, level);
  2089. break;
  2090. default:
  2091. break;
  2092. }
  2093. }
  2094. break;
  2095. case NT_ANCHOR:
  2096. break;
  2097. default:
  2098. r = GET_CHAR_LEN_VARLEN;
  2099. break;
  2100. }
  2101. return r;
  2102. }
  2103. static int
  2104. get_char_length_tree(Node* node, regex_t* reg, int* len)
  2105. {
  2106. return get_char_length_tree1(node, reg, len, 0);
  2107. }
  2108. /* x is not included y ==> 1 : 0 */
  2109. static int
  2110. is_not_included(Node* x, Node* y, regex_t* reg)
  2111. {
  2112. int i, len;
  2113. OnigCodePoint code;
  2114. UChar *p, c;
  2115. int ytype;
  2116. retry:
  2117. ytype = NTYPE(y);
  2118. switch (NTYPE(x)) {
  2119. case NT_CTYPE:
  2120. {
  2121. switch (ytype) {
  2122. case NT_CTYPE:
  2123. if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
  2124. NCTYPE(y)->not != NCTYPE(x)->not)
  2125. return 1;
  2126. else
  2127. return 0;
  2128. break;
  2129. case NT_CCLASS:
  2130. swap:
  2131. {
  2132. Node* tmp;
  2133. tmp = x; x = y; y = tmp;
  2134. goto retry;
  2135. }
  2136. break;
  2137. case NT_STR:
  2138. goto swap;
  2139. break;
  2140. default:
  2141. break;
  2142. }
  2143. }
  2144. break;
  2145. case NT_CCLASS:
  2146. {
  2147. CClassNode* xc = NCCLASS(x);
  2148. switch (ytype) {
  2149. case NT_CTYPE:
  2150. switch (NCTYPE(y)->ctype) {
  2151. case ONIGENC_CTYPE_WORD:
  2152. if (NCTYPE(y)->not == 0) {
  2153. if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
  2154. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2155. if (BITSET_AT(xc->bs, i)) {
  2156. if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
  2157. }
  2158. }
  2159. return 1;
  2160. }
  2161. return 0;
  2162. }
  2163. else {
  2164. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2165. if (! IS_CODE_SB_WORD(reg->enc, i)) {
  2166. if (!IS_NCCLASS_NOT(xc)) {
  2167. if (BITSET_AT(xc->bs, i))
  2168. return 0;
  2169. }
  2170. else {
  2171. if (! BITSET_AT(xc->bs, i))
  2172. return 0;
  2173. }
  2174. }
  2175. }
  2176. return 1;
  2177. }
  2178. break;
  2179. default:
  2180. break;
  2181. }
  2182. break;
  2183. case NT_CCLASS:
  2184. {
  2185. int v;
  2186. CClassNode* yc = NCCLASS(y);
  2187. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2188. v = BITSET_AT(xc->bs, i);
  2189. if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
  2190. (v == 0 && IS_NCCLASS_NOT(xc))) {
  2191. v = BITSET_AT(yc->bs, i);
  2192. if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
  2193. (v == 0 && IS_NCCLASS_NOT(yc)))
  2194. return 0;
  2195. }
  2196. }
  2197. if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
  2198. (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
  2199. return 1;
  2200. return 0;
  2201. }
  2202. break;
  2203. case NT_STR:
  2204. goto swap;
  2205. break;
  2206. default:
  2207. break;
  2208. }
  2209. }
  2210. break;
  2211. case NT_STR:
  2212. {
  2213. StrNode* xs = NSTR(x);
  2214. if (NSTRING_LEN(x) == 0)
  2215. break;
  2216. c = *(xs->s);
  2217. switch (ytype) {
  2218. case NT_CTYPE:
  2219. switch (NCTYPE(y)->ctype) {
  2220. case ONIGENC_CTYPE_WORD:
  2221. if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
  2222. return NCTYPE(y)->not;
  2223. else
  2224. return !(NCTYPE(y)->not);
  2225. break;
  2226. default:
  2227. break;
  2228. }
  2229. break;
  2230. case NT_CCLASS:
  2231. {
  2232. CClassNode* cc = NCCLASS(y);
  2233. code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
  2234. xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
  2235. return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
  2236. }
  2237. break;
  2238. case NT_STR:
  2239. {
  2240. UChar *q;
  2241. StrNode* ys = NSTR(y);
  2242. len = NSTRING_LEN(x);
  2243. if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
  2244. if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
  2245. /* tiny version */
  2246. return 0;
  2247. }
  2248. else {
  2249. for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
  2250. if (*p != *q) return 1;
  2251. }
  2252. }
  2253. }
  2254. break;
  2255. default:
  2256. break;
  2257. }
  2258. }
  2259. break;
  2260. default:
  2261. break;
  2262. }
  2263. return 0;
  2264. }
  2265. static Node*
  2266. get_head_value_node(Node* node, int exact, regex_t* reg)
  2267. {
  2268. Node* n = NULL_NODE;
  2269. switch (NTYPE(node)) {
  2270. case NT_BREF:
  2271. case NT_ALT:
  2272. case NT_CANY:
  2273. #ifdef USE_SUBEXP_CALL
  2274. case NT_CALL:
  2275. #endif
  2276. break;
  2277. case NT_CTYPE:
  2278. case NT_CCLASS:
  2279. if (exact == 0) {
  2280. n = node;
  2281. }
  2282. break;
  2283. case NT_LIST:
  2284. n = get_head_value_node(NCAR(node), exact, reg);
  2285. break;
  2286. case NT_STR:
  2287. {
  2288. StrNode* sn = NSTR(node);
  2289. if (sn->end <= sn->s)
  2290. break;
  2291. if (exact != 0 &&
  2292. !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
  2293. }
  2294. else {
  2295. n = node;
  2296. }
  2297. }
  2298. break;
  2299. case NT_QTFR:
  2300. {
  2301. QtfrNode* qn = NQTFR(node);
  2302. if (qn->lower > 0) {
  2303. if (IS_NOT_NULL(qn->head_exact))
  2304. n = qn->head_exact;
  2305. else
  2306. n = get_head_value_node(qn->target, exact, reg);
  2307. }
  2308. }
  2309. break;
  2310. case NT_ENCLOSE:
  2311. {
  2312. EncloseNode* en = NENCLOSE(node);
  2313. switch (en->type) {
  2314. case ENCLOSE_OPTION:
  2315. {
  2316. OnigOptionType options = reg->options;
  2317. reg->options = NENCLOSE(node)->option;
  2318. n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
  2319. reg->options = options;
  2320. }
  2321. break;
  2322. case ENCLOSE_MEMORY:
  2323. case ENCLOSE_STOP_BACKTRACK:
  2324. n = get_head_value_node(en->target, exact, reg);
  2325. break;
  2326. }
  2327. }
  2328. break;
  2329. case NT_ANCHOR:
  2330. if (NANCHOR(node)->type == ANCHOR_PREC_READ)
  2331. n = get_head_value_node(NANCHOR(node)->target, exact, reg);
  2332. break;
  2333. default:
  2334. break;
  2335. }
  2336. return n;
  2337. }
  2338. static int
  2339. check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
  2340. {
  2341. int type, r = 0;
  2342. type = NTYPE(node);
  2343. if ((NTYPE2BIT(type) & type_mask) == 0)
  2344. return 1;
  2345. switch (type) {
  2346. case NT_LIST:
  2347. case NT_ALT:
  2348. do {
  2349. r = check_type_tree(NCAR(node), type_mask, enclose_mask,
  2350. anchor_mask);
  2351. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2352. break;
  2353. case NT_QTFR:
  2354. r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
  2355. anchor_mask);
  2356. break;
  2357. case NT_ENCLOSE:
  2358. {
  2359. EncloseNode* en = NENCLOSE(node);
  2360. if ((en->type & enclose_mask) == 0)
  2361. return 1;
  2362. r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
  2363. }
  2364. break;
  2365. case NT_ANCHOR:
  2366. type = NANCHOR(node)->type;
  2367. if ((type & anchor_mask) == 0)
  2368. return 1;
  2369. if (NANCHOR(node)->target)
  2370. r = check_type_tree(NANCHOR(node)->target,
  2371. type_mask, enclose_mask, anchor_mask);
  2372. break;
  2373. default:
  2374. break;
  2375. }
  2376. return r;
  2377. }
  2378. #ifdef USE_SUBEXP_CALL
  2379. #define RECURSION_EXIST 1
  2380. #define RECURSION_INFINITE 2
  2381. static int
  2382. subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
  2383. {
  2384. int type;
  2385. int r = 0;
  2386. type = NTYPE(node);
  2387. switch (type) {
  2388. case NT_LIST:
  2389. {
  2390. Node *x;
  2391. OnigDistance min;
  2392. int ret;
  2393. x = node;
  2394. do {
  2395. ret = subexp_inf_recursive_check(NCAR(x), env, head);
  2396. if (ret < 0 || ret == RECURSION_INFINITE) return ret;
  2397. r |= ret;
  2398. if (head) {
  2399. ret = get_min_match_length(NCAR(x), &min, env);
  2400. if (ret != 0) return ret;
  2401. if (min != 0) head = 0;
  2402. }
  2403. } while (IS_NOT_NULL(x = NCDR(x)));
  2404. }
  2405. break;
  2406. case NT_ALT:
  2407. {
  2408. int ret;
  2409. r = RECURSION_EXIST;
  2410. do {
  2411. ret = subexp_inf_recursive_check(NCAR(node), env, head);
  2412. if (ret < 0 || ret == RECURSION_INFINITE) return ret;
  2413. r &= ret;
  2414. } while (IS_NOT_NULL(node = NCDR(node)));
  2415. }
  2416. break;
  2417. case NT_QTFR:
  2418. r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
  2419. if (r == RECURSION_EXIST) {
  2420. if (NQTFR(node)->lower == 0) r = 0;
  2421. }
  2422. break;
  2423. case NT_ANCHOR:
  2424. {
  2425. AnchorNode* an = NANCHOR(node);
  2426. switch (an->type) {
  2427. case ANCHOR_PREC_READ:
  2428. case ANCHOR_PREC_READ_NOT:
  2429. case ANCHOR_LOOK_BEHIND:
  2430. case ANCHOR_LOOK_BEHIND_NOT:
  2431. r = subexp_inf_recursive_check(an->target, env, head);
  2432. break;
  2433. }
  2434. }
  2435. break;
  2436. case NT_CALL:
  2437. r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
  2438. break;
  2439. case NT_ENCLOSE:
  2440. if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
  2441. return 0;
  2442. else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  2443. return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
  2444. else {
  2445. SET_ENCLOSE_STATUS(node, NST_MARK2);
  2446. r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
  2447. CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
  2448. }
  2449. break;
  2450. default:
  2451. break;
  2452. }
  2453. return r;
  2454. }
  2455. static int
  2456. subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
  2457. {
  2458. int type;
  2459. int r = 0;
  2460. type = NTYPE(node);
  2461. switch (type) {
  2462. case NT_LIST:
  2463. case NT_ALT:
  2464. do {
  2465. r = subexp_inf_recursive_check_trav(NCAR(node), env);
  2466. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2467. break;
  2468. case NT_QTFR:
  2469. r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
  2470. break;
  2471. case NT_ANCHOR:
  2472. {
  2473. AnchorNode* an = NANCHOR(node);
  2474. switch (an->type) {
  2475. case ANCHOR_PREC_READ:
  2476. case ANCHOR_PREC_READ_NOT:
  2477. case ANCHOR_LOOK_BEHIND:
  2478. case ANCHOR_LOOK_BEHIND_NOT:
  2479. r = subexp_inf_recursive_check_trav(an->target, env);
  2480. break;
  2481. }
  2482. }
  2483. break;
  2484. case NT_ENCLOSE:
  2485. {
  2486. EncloseNode* en = NENCLOSE(node);
  2487. if (IS_ENCLOSE_RECURSION(en)) {
  2488. SET_ENCLOSE_STATUS(node, NST_MARK1);
  2489. r = subexp_inf_recursive_check(en->target, env, 1);
  2490. if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
  2491. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  2492. }
  2493. r = subexp_inf_recursive_check_trav(en->target, env);
  2494. }
  2495. break;
  2496. default:
  2497. break;
  2498. }
  2499. return r;
  2500. }
  2501. static int
  2502. subexp_recursive_check(Node* node)
  2503. {
  2504. int r = 0;
  2505. switch (NTYPE(node)) {
  2506. case NT_LIST:
  2507. case NT_ALT:
  2508. do {
  2509. r |= subexp_recursive_check(NCAR(node));
  2510. } while (IS_NOT_NULL(node = NCDR(node)));
  2511. break;
  2512. case NT_QTFR:
  2513. r = subexp_recursive_check(NQTFR(node)->target);
  2514. break;
  2515. case NT_ANCHOR:
  2516. {
  2517. AnchorNode* an = NANCHOR(node);
  2518. switch (an->type) {
  2519. case ANCHOR_PREC_READ:
  2520. case ANCHOR_PREC_READ_NOT:
  2521. case ANCHOR_LOOK_BEHIND:
  2522. case ANCHOR_LOOK_BEHIND_NOT:
  2523. r = subexp_recursive_check(an->target);
  2524. break;
  2525. }
  2526. }
  2527. break;
  2528. case NT_CALL:
  2529. r = subexp_recursive_check(NCALL(node)->target);
  2530. if (r != 0) SET_CALL_RECURSION(node);
  2531. break;
  2532. case NT_ENCLOSE:
  2533. if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
  2534. return 0;
  2535. else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  2536. return 1; /* recursion */
  2537. else {
  2538. SET_ENCLOSE_STATUS(node, NST_MARK2);
  2539. r = subexp_recursive_check(NENCLOSE(node)->target);
  2540. CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
  2541. }
  2542. break;
  2543. default:
  2544. break;
  2545. }
  2546. return r;
  2547. }
  2548. static int
  2549. subexp_recursive_check_trav(Node* node, ScanEnv* env)
  2550. {
  2551. #define FOUND_CALLED_NODE 1
  2552. int type;
  2553. int r = 0;
  2554. type = NTYPE(node);
  2555. switch (type) {
  2556. case NT_LIST:
  2557. case NT_ALT:
  2558. {
  2559. int ret;
  2560. do {
  2561. ret = subexp_recursive_check_trav(NCAR(node), env);
  2562. if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
  2563. else if (ret < 0) return ret;
  2564. } while (IS_NOT_NULL(node = NCDR(node)));
  2565. }
  2566. break;
  2567. case NT_QTFR:
  2568. r = subexp_recursive_check_trav(NQTFR(node)->target, env);
  2569. if (NQTFR(node)->upper == 0) {
  2570. if (r == FOUND_CALLED_NODE)
  2571. NQTFR(node)->is_refered = 1;
  2572. }
  2573. break;
  2574. case NT_ANCHOR:
  2575. {
  2576. AnchorNode* an = NANCHOR(node);
  2577. switch (an->type) {
  2578. case ANCHOR_PREC_READ:
  2579. case ANCHOR_PREC_READ_NOT:
  2580. case ANCHOR_LOOK_BEHIND:
  2581. case ANCHOR_LOOK_BEHIND_NOT:
  2582. r = subexp_recursive_check_trav(an->target, env);
  2583. break;
  2584. }
  2585. }
  2586. break;
  2587. case NT_ENCLOSE:
  2588. {
  2589. EncloseNode* en = NENCLOSE(node);
  2590. if (! IS_ENCLOSE_RECURSION(en)) {
  2591. if (IS_ENCLOSE_CALLED(en)) {
  2592. SET_ENCLOSE_STATUS(node, NST_MARK1);
  2593. r = subexp_recursive_check(en->target);
  2594. if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
  2595. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  2596. }
  2597. }
  2598. r = subexp_recursive_check_trav(en->target, env);
  2599. if (IS_ENCLOSE_CALLED(en))
  2600. r |= FOUND_CALLED_NODE;
  2601. }
  2602. break;
  2603. default:
  2604. break;
  2605. }
  2606. return r;
  2607. }
  2608. static int
  2609. setup_subexp_call(Node* node, ScanEnv* env)
  2610. {
  2611. int type;
  2612. int r = 0;
  2613. type = NTYPE(node);
  2614. switch (type) {
  2615. case NT_LIST:
  2616. do {
  2617. r = setup_subexp_call(NCAR(node), env);
  2618. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2619. break;
  2620. case NT_ALT:
  2621. do {
  2622. r = setup_subexp_call(NCAR(node), env);
  2623. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2624. break;
  2625. case NT_QTFR:
  2626. r = setup_subexp_call(NQTFR(node)->target, env);
  2627. break;
  2628. case NT_ENCLOSE:
  2629. r = setup_subexp_call(NENCLOSE(node)->target, env);
  2630. break;
  2631. case NT_CALL:
  2632. {
  2633. CallNode* cn = NCALL(node);
  2634. Node** nodes = SCANENV_MEM_NODES(env);
  2635. if (cn->group_num != 0) {
  2636. int gnum = cn->group_num;
  2637. #ifdef USE_NAMED_GROUP
  2638. if (env->num_named > 0 &&
  2639. IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  2640. !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
  2641. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  2642. }
  2643. #endif
  2644. if (gnum > env->num_mem) {
  2645. onig_scan_env_set_error_string(env,
  2646. ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
  2647. return ONIGERR_UNDEFINED_GROUP_REFERENCE;
  2648. }
  2649. #ifdef USE_NAMED_GROUP
  2650. set_call_attr:
  2651. #endif
  2652. cn->target = nodes[cn->group_num];
  2653. if (IS_NULL(cn->target)) {
  2654. onig_scan_env_set_error_string(env,
  2655. ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
  2656. return ONIGERR_UNDEFINED_NAME_REFERENCE;
  2657. }
  2658. SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
  2659. BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
  2660. cn->unset_addr_list = env->unset_addr_list;
  2661. }
  2662. #ifdef USE_NAMED_GROUP
  2663. else {
  2664. int *refs;
  2665. int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
  2666. &refs);
  2667. if (n <= 0) {
  2668. onig_scan_env_set_error_string(env,
  2669. ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
  2670. return ONIGERR_UNDEFINED_NAME_REFERENCE;
  2671. }
  2672. else if (n > 1) {
  2673. onig_scan_env_set_error_string(env,
  2674. ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
  2675. return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
  2676. }
  2677. else {
  2678. cn->group_num = refs[0];
  2679. goto set_call_attr;
  2680. }
  2681. }
  2682. #endif
  2683. }
  2684. break;
  2685. case NT_ANCHOR:
  2686. {
  2687. AnchorNode* an = NANCHOR(node);
  2688. switch (an->type) {
  2689. case ANCHOR_PREC_READ:
  2690. case ANCHOR_PREC_READ_NOT:
  2691. case ANCHOR_LOOK_BEHIND:
  2692. case ANCHOR_LOOK_BEHIND_NOT:
  2693. r = setup_subexp_call(an->target, env);
  2694. break;
  2695. }
  2696. }
  2697. break;
  2698. default:
  2699. break;
  2700. }
  2701. return r;
  2702. }
  2703. #endif
  2704. /* divide different length alternatives in look-behind.
  2705. (?<=A|B) ==> (?<=A)|(?<=B)
  2706. (?<!A|B) ==> (?<!A)(?<!B)
  2707. */
  2708. static int
  2709. divide_look_behind_alternatives(Node* node)
  2710. {
  2711. Node *head, *np, *insert_node;
  2712. AnchorNode* an = NANCHOR(node);
  2713. int anc_type = an->type;
  2714. head = an->target;
  2715. np = NCAR(head);
  2716. swap_node(node, head);
  2717. NCAR(node) = head;
  2718. NANCHOR(head)->target = np;
  2719. np = node;
  2720. while ((np = NCDR(np)) != NULL_NODE) {
  2721. insert_node = onig_node_new_anchor(anc_type);
  2722. CHECK_NULL_RETURN_MEMERR(insert_node);
  2723. NANCHOR(insert_node)->target = NCAR(np);
  2724. NCAR(np) = insert_node;
  2725. }
  2726. if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
  2727. np = node;
  2728. do {
  2729. SET_NTYPE(np, NT_LIST); /* alt -> list */
  2730. } while ((np = NCDR(np)) != NULL_NODE);
  2731. }
  2732. return 0;
  2733. }
  2734. static int
  2735. setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
  2736. {
  2737. int r, len;
  2738. AnchorNode* an = NANCHOR(node);
  2739. r = get_char_length_tree(an->target, reg, &len);
  2740. if (r == 0)
  2741. an->char_len = len;
  2742. else if (r == GET_CHAR_LEN_VARLEN)
  2743. r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  2744. else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
  2745. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
  2746. r = divide_look_behind_alternatives(node);
  2747. else
  2748. r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  2749. }
  2750. return r;
  2751. }
  2752. static int
  2753. next_setup(Node* node, Node* next_node, regex_t* reg)
  2754. {
  2755. int type;
  2756. retry:
  2757. type = NTYPE(node);
  2758. if (type == NT_QTFR) {
  2759. QtfrNode* qn = NQTFR(node);
  2760. if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
  2761. #ifdef USE_QTFR_PEEK_NEXT
  2762. Node* n = get_head_value_node(next_node, 1, reg);
  2763. /* '\0': for UTF-16BE etc... */
  2764. if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
  2765. qn->next_head_exact = n;
  2766. }
  2767. #endif
  2768. /* automatic posseivation a*b ==> (?>a*)b */
  2769. if (qn->lower <= 1) {
  2770. int ttype = NTYPE(qn->target);
  2771. if (IS_NODE_TYPE_SIMPLE(ttype)) {
  2772. Node *x, *y;
  2773. x = get_head_value_node(qn->target, 0, reg);
  2774. if (IS_NOT_NULL(x)) {
  2775. y = get_head_value_node(next_node, 0, reg);
  2776. if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
  2777. Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
  2778. CHECK_NULL_RETURN_MEMERR(en);
  2779. SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
  2780. swap_node(node, en);
  2781. NENCLOSE(node)->target = en;
  2782. }
  2783. }
  2784. }
  2785. }
  2786. }
  2787. }
  2788. else if (type == NT_ENCLOSE) {
  2789. EncloseNode* en = NENCLOSE(node);
  2790. if (en->type == ENCLOSE_MEMORY) {
  2791. node = en->target;
  2792. goto retry;
  2793. }
  2794. }
  2795. return 0;
  2796. }
  2797. static int
  2798. update_string_node_case_fold(regex_t* reg, Node *node)
  2799. {
  2800. UChar *p, *q, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
  2801. UChar *sbuf, *ebuf, *sp;
  2802. int r, i, len, sbuf_size;
  2803. StrNode* sn = NSTR(node);
  2804. end = sn->end;
  2805. sbuf_size = (end - sn->s) * 2;
  2806. sbuf = (UChar* )xmalloc(sbuf_size);
  2807. CHECK_NULL_RETURN_MEMERR(sbuf);
  2808. ebuf = sbuf + sbuf_size;
  2809. sp = sbuf;
  2810. p = sn->s;
  2811. while (p < end) {
  2812. len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
  2813. q = buf;
  2814. for (i = 0; i < len; i++) {
  2815. if (sp >= ebuf) {
  2816. sbuf = (UChar* )xrealloc(sbuf, sbuf_size * 2);
  2817. CHECK_NULL_RETURN_MEMERR(sbuf);
  2818. sp = sbuf + sbuf_size;
  2819. sbuf_size *= 2;
  2820. ebuf = sbuf + sbuf_size;
  2821. }
  2822. *sp++ = buf[i];
  2823. }
  2824. }
  2825. r = onig_node_str_set(node, sbuf, sp);
  2826. if (r != 0) {
  2827. xfree(sbuf);
  2828. return r;
  2829. }
  2830. xfree(sbuf);
  2831. return 0;
  2832. }
  2833. static int
  2834. expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
  2835. regex_t* reg)
  2836. {
  2837. int r;
  2838. Node *node;
  2839. node = onig_node_new_str(s, end);
  2840. if (IS_NULL(node)) return ONIGERR_MEMORY;
  2841. r = update_string_node_case_fold(reg, node);
  2842. if (r != 0) {
  2843. onig_node_free(node);
  2844. return r;
  2845. }
  2846. NSTRING_SET_AMBIG(node);
  2847. NSTRING_SET_DONT_GET_OPT_INFO(node);
  2848. *rnode = node;
  2849. return 0;
  2850. }
  2851. static int
  2852. expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
  2853. UChar *p, int slen, UChar *end,
  2854. regex_t* reg, Node **rnode)
  2855. {
  2856. int r, i, j, len, varlen;
  2857. Node *anode, *var_anode, *snode, *xnode, *an;
  2858. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  2859. *rnode = var_anode = NULL_NODE;
  2860. varlen = 0;
  2861. for (i = 0; i < item_num; i++) {
  2862. if (items[i].byte_len != slen) {
  2863. varlen = 1;
  2864. break;
  2865. }
  2866. }
  2867. if (varlen != 0) {
  2868. *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  2869. if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
  2870. xnode = onig_node_new_list(NULL, NULL);
  2871. if (IS_NULL(xnode)) goto mem_err;
  2872. NCAR(var_anode) = xnode;
  2873. anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  2874. if (IS_NULL(anode)) goto mem_err;
  2875. NCAR(xnode) = anode;
  2876. }
  2877. else {
  2878. *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  2879. if (IS_NULL(anode)) return ONIGERR_MEMORY;
  2880. }
  2881. snode = onig_node_new_str(p, p + slen);
  2882. if (IS_NULL(snode)) goto mem_err;
  2883. NCAR(anode) = snode;
  2884. for (i = 0; i < item_num; i++) {
  2885. snode = onig_node_new_str(NULL, NULL);
  2886. if (IS_NULL(snode)) goto mem_err;
  2887. for (j = 0; j < items[i].code_len; j++) {
  2888. len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
  2889. if (len < 0) {
  2890. r = len;
  2891. goto mem_err2;
  2892. }
  2893. r = onig_node_str_cat(snode, buf, buf + len);
  2894. if (r != 0) goto mem_err2;
  2895. }
  2896. an = onig_node_new_alt(NULL_NODE, NULL_NODE);
  2897. if (IS_NULL(an)) {
  2898. goto mem_err2;
  2899. }
  2900. if (items[i].byte_len != slen) {
  2901. Node *rem;
  2902. UChar *q = p + items[i].byte_len;
  2903. if (q < end) {
  2904. r = expand_case_fold_make_rem_string(&rem, q, end, reg);
  2905. if (r != 0) {
  2906. onig_node_free(an);
  2907. goto mem_err2;
  2908. }
  2909. xnode = onig_node_list_add(NULL_NODE, snode);
  2910. if (IS_NULL(xnode)) {
  2911. onig_node_free(an);
  2912. onig_node_free(rem);
  2913. goto mem_err2;
  2914. }
  2915. if (IS_NULL(onig_node_list_add(xnode, rem))) {
  2916. onig_node_free(an);
  2917. onig_node_free(xnode);
  2918. onig_node_free(rem);
  2919. goto mem_err;
  2920. }
  2921. NCAR(an) = xnode;
  2922. }
  2923. else {
  2924. NCAR(an) = snode;
  2925. }
  2926. NCDR(var_anode) = an;
  2927. var_anode = an;
  2928. }
  2929. else {
  2930. NCAR(an) = snode;
  2931. NCDR(anode) = an;
  2932. anode = an;
  2933. }
  2934. }
  2935. return varlen;
  2936. mem_err2:
  2937. onig_node_free(snode);
  2938. mem_err:
  2939. onig_node_free(*rnode);
  2940. return ONIGERR_MEMORY;
  2941. }
  2942. static int
  2943. expand_case_fold_string(Node* node, regex_t* reg)
  2944. {
  2945. #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
  2946. int r, n, len, alt_num;
  2947. UChar *start, *end, *p;
  2948. Node *top_root, *root, *snode, *prev_node;
  2949. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  2950. StrNode* sn = NSTR(node);
  2951. if (NSTRING_IS_AMBIG(node)) return 0;
  2952. start = sn->s;
  2953. end = sn->end;
  2954. if (start >= end) return 0;
  2955. r = 0;
  2956. top_root = root = prev_node = snode = NULL_NODE;
  2957. alt_num = 1;
  2958. p = start;
  2959. while (p < end) {
  2960. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
  2961. p, end, items);
  2962. if (n < 0) {
  2963. r = n;
  2964. goto err;
  2965. }
  2966. len = enclen(reg->enc, p, end);
  2967. if (n == 0) {
  2968. if (IS_NULL(snode)) {
  2969. if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
  2970. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  2971. if (IS_NULL(root)) {
  2972. onig_node_free(prev_node);
  2973. goto mem_err;
  2974. }
  2975. }
  2976. prev_node = snode = onig_node_new_str(NULL, NULL);
  2977. if (IS_NULL(snode)) goto mem_err;
  2978. if (IS_NOT_NULL(root)) {
  2979. if (IS_NULL(onig_node_list_add(root, snode))) {
  2980. onig_node_free(snode);
  2981. goto mem_err;
  2982. }
  2983. }
  2984. }
  2985. r = onig_node_str_cat(snode, p, p + len);
  2986. if (r != 0) goto err;
  2987. }
  2988. else {
  2989. alt_num *= (n + 1);
  2990. if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
  2991. if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
  2992. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  2993. if (IS_NULL(root)) {
  2994. onig_node_free(prev_node);
  2995. goto mem_err;
  2996. }
  2997. }
  2998. r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
  2999. if (r < 0) goto mem_err;
  3000. if (r == 1) {
  3001. if (IS_NULL(root)) {
  3002. top_root = prev_node;
  3003. }
  3004. else {
  3005. if (IS_NULL(onig_node_list_add(root, prev_node))) {
  3006. onig_node_free(prev_node);
  3007. goto mem_err;
  3008. }
  3009. }
  3010. root = NCAR(prev_node);
  3011. }
  3012. else { /* r == 0 */
  3013. if (IS_NOT_NULL(root)) {
  3014. if (IS_NULL(onig_node_list_add(root, prev_node))) {
  3015. onig_node_free(prev_node);
  3016. goto mem_err;
  3017. }
  3018. }
  3019. }
  3020. snode = NULL_NODE;
  3021. }
  3022. p += len;
  3023. }
  3024. if (p < end) {
  3025. Node *srem;
  3026. r = expand_case_fold_make_rem_string(&srem, p, end, reg);
  3027. if (r != 0) goto mem_err;
  3028. if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
  3029. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  3030. if (IS_NULL(root)) {
  3031. onig_node_free(srem);
  3032. onig_node_free(prev_node);
  3033. goto mem_err;
  3034. }
  3035. }
  3036. if (IS_NULL(root)) {
  3037. prev_node = srem;
  3038. }
  3039. else {
  3040. if (IS_NULL(onig_node_list_add(root, srem))) {
  3041. onig_node_free(srem);
  3042. goto mem_err;
  3043. }
  3044. }
  3045. }
  3046. /* ending */
  3047. top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
  3048. swap_node(node, top_root);
  3049. onig_node_free(top_root);
  3050. return 0;
  3051. mem_err:
  3052. r = ONIGERR_MEMORY;
  3053. err:
  3054. onig_node_free(top_root);
  3055. return r;
  3056. }
  #ifdef USE_COMBINATION_EXPLOSION_CHECK

  #define CEC_THRES_NUM_BIG_REPEAT         512
  #define CEC_INFINITE_NUM          0x7fffffff

  #define CEC_IN_INFINITE_REPEAT    (1<<0)
  #define CEC_IN_FINITE_REPEAT      (1<<1)
  #define CEC_CONT_BIG_REPEAT       (1<<2)

  /*
   * Assign combination-explosion check numbers to quantifiers.
   *
   * `state` is a set of CEC_* flags describing the context of `node`.
   * A quantifier gets a fresh check number (env->num_comb_exp_check is
   * incremented) when it is variable and sits inside an infinite repeat,
   * or when it is a big repeat following another big repeat; quantifiers
   * inside a finite repeat get -1.
   *
   * Returns the state after the node: for a list the state is threaded
   * from element to element, for an alternation the branch results are
   * OR-ed together.
   */
  static int
  setup_comb_exp_check(Node* node, int state, ScanEnv* env)
  {
    int r = state;

    switch (NTYPE(node)) {
    case NT_LIST:
      do {
        r = setup_comb_exp_check(NCAR(node), r, env);
      } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
      break;

    case NT_ALT:
      {
        int branch;

        do {
          branch = setup_comb_exp_check(NCAR(node), state, env);
          r |= branch;
        } while (branch >= 0 && IS_NOT_NULL(node = NCDR(node)));
      }
      break;

    case NT_QTFR:
      {
        QtfrNode* qn = NQTFR(node);
        Node* body = qn->target;
        int child_state = state;
        int add_state = 0;
        int var_num;

        if (!IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1) {
          /* {0,1}, {1,1} are allowed */
          child_state |= CEC_IN_FINITE_REPEAT;

          /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
          if (env->backrefed_mem == 0 && NTYPE(body) == NT_ENCLOSE) {
            EncloseNode* en = NENCLOSE(body);

            if (en->type == ENCLOSE_MEMORY && NTYPE(en->target) == NT_QTFR) {
              QtfrNode* inner = NQTFR(en->target);

              if (IS_REPEAT_INFINITE(inner->upper)
                  && inner->greedy == qn->greedy) {
                qn->upper = (qn->lower == 0 ? 1 : qn->lower);
                if (qn->upper == 1)
                  child_state = state;
              }
            }
          }
        }

        if ((state & CEC_IN_FINITE_REPEAT) != 0) {
          qn->comb_exp_check_num = -1;
        }
        else {
          int needs_check;

          if (IS_REPEAT_INFINITE(qn->upper)) {
            var_num = CEC_INFINITE_NUM;
            child_state |= CEC_IN_INFINITE_REPEAT;
          }
          else {
            var_num = qn->upper - qn->lower;
          }

          if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
            add_state |= CEC_CONT_BIG_REPEAT;

          needs_check =
            ((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
            ((state & CEC_CONT_BIG_REPEAT) != 0 &&
             var_num >= CEC_THRES_NUM_BIG_REPEAT);

          if (needs_check && qn->comb_exp_check_num == 0) {
            env->num_comb_exp_check++;
            qn->comb_exp_check_num = env->num_comb_exp_check;
            if (env->curr_max_regnum > env->comb_exp_max_regnum)
              env->comb_exp_max_regnum = env->curr_max_regnum;
          }
        }

        r = setup_comb_exp_check(body, child_state, env);
        r |= add_state;
      }
      break;

    case NT_ENCLOSE:
      {
        EncloseNode* en = NENCLOSE(node);

        /* track the highest capture group number seen so far */
        if (en->type == ENCLOSE_MEMORY &&
            env->curr_max_regnum < en->regnum)
          env->curr_max_regnum = en->regnum;

        r = setup_comb_exp_check(en->target, state, env);
      }
      break;

  #ifdef USE_SUBEXP_CALL
    case NT_CALL:
      if (IS_CALL_RECURSION(NCALL(node)))
        env->has_recursion = 1;
      else
        r = setup_comb_exp_check(NCALL(node)->target, state, env);
      break;
  #endif

    default:
      break;
    }

    return r;
  }
  #endif
  /* Bit flags for the `state` argument that setup_tree() passes down
     while it walks the pattern tree. */
  /* IN_ALT: OR-ed into state when setup_tree() descends into the branches
     of an NT_ALT node. */
  3177. #define IN_ALT (1<<0)
  /* IN_NOT: NOTE(review): setter not visible in this part of the file;
     presumably marks a negative context -- confirm. */
  3178. #define IN_NOT (1<<1)
  /* IN_REPEAT: when set, setup_tree() flags a quantifier node with
     NST_IN_REPEAT. */
  3179. #define IN_REPEAT (1<<2)
  /* IN_VAR_REPEAT: NOTE(review): use not visible in this part of the file;
     presumably "inside a variable-count repeat" -- confirm. */
  3180. #define IN_VAR_REPEAT (1<<3)
  /* IN_LAST: cleared by setup_tree() for every NT_LIST element that has a
     successor, so only the final element keeps it. */
  3181. #define IN_LAST (1<<4)
  3182. /* setup_tree does the following work.
  3183. 1. check empty loop. (set qn->target_empty_info)
  3184. 2. expand ignore-case in char class.
  3185. 3. set memory status bit flags. (reg->mem_stats)
  3186. 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
  3187. 5. find invalid patterns in look-behind.
  3188. 6. expand repeated string.
  3189. */
  /* Recursively walks the parse tree rooted at `node` and performs the
     per-node-type setup work selected by the switch below.
     `reg`   - regex being compiled (its options are read and, inside
               ENCLOSE_OPTION, temporarily replaced);
     `state` - bit set of IN_* flags describing the enclosing context;
     `env`   - scan environment (group table and memory status bits).
     Returns 0 on success, otherwise the first non-zero error code produced
     by a helper or a recursive call (e.g. ONIGERR_INVALID_BACKREF,
     ONIGERR_INVALID_LOOK_BEHIND_PATTERN). */
  3190. static int
  3191. setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
  3192. {
  3193. int type;
  3194. int r = 0;
  3195. type = NTYPE(node);
  3196. switch (type) {
  3197. case NT_LIST:
  3198. {
  3199. Node* prev = NULL_NODE;
  3200. do {
  /* Only the final element of the list keeps IN_LAST. */
  3201. int s = IS_NOT_NULL(NCDR(node)) ? (state & ~IN_LAST) : state;
  3202. r = setup_tree(NCAR(node), reg, s, env);
  /* From the second element on, next_setup() is given the (previous,
     current) pair of siblings. */
  3203. if (IS_NOT_NULL(prev) && r == 0) {
  3204. r = next_setup(prev, NCAR(node), reg);
  3205. }
  3206. prev = NCAR(node);
  3207. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  3208. }
  3209. break;
  3210. case NT_ALT:
  /* Every branch is visited with IN_ALT added to the state. */
  3211. do {
  3212. r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
  3213. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  3214. break;
  3215. case NT_CCLASS:
  3216. break;
  3217. case NT_STR:
  /* Case-fold expansion applies only under the ignore-case option and only
     to strings that are not flagged raw. */
  3218. if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
  3219. r = expand_case_fold_string(node, reg);
  3220. }
  3221. break;
  3222. case NT_CTYPE:
  3223. case NT_CANY:
  3224. break;
  3225. #ifdef USE_SUBEXP_CALL
  3226. case NT_CALL:
  3227. break;
  3228. #endif
  3229. case NT_BREF:
  3230. {
  3231. int i;
  3232. int* p;
  3233. Node** nodes = SCANENV_MEM_NODES(env);
  3234. BRefNode* br = NBREF(node);
  3235. p = BACKREFS_P(br);
  /* For each referenced group: reject numbers above env->num_mem, then set
     the group's bit in env->backrefed_mem and env->bt_mem_start and flag its
     enclose node as NST_MEM_BACKREFED. */
  3236. for (i = 0; i < br->back_num; i++) {
  3237. if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  3238. BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
  3239. BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
  3240. #ifdef USE_BACKREF_WITH_LEVEL
  3241. if (IS_BACKREF_NEST_LEVEL(br)) {
  3242. BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
  3243. }
  3244. #endif
  3245. SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
  3246. }
  3247. }
  3248. break;
  3249. case NT_QTFR:
  3250. {
  3251. OnigDistance d;
  3252. QtfrNode* qn = NQTFR(node);
  3253. Node* target = qn->target;
  3254. if ((state & IN_REPEAT) != 0) {
  3255. qn->state |= NST_IN_REPEAT;
  3256. }
  /* Empty-loop check: a target whose minimum match length is 0 is recorded
     in qn->target_empty_info. */
  3257. if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
  3258. r = get_min_match_length(target, &d, env);
  3259. if (r) break;
  3260. if (d == 0) {
  3261. qn->target_empty_info = NQ_TARGET_IS_EMPTY;
  3262. #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
  3263. r = quantifiers_memory_node_info(target);
  3264. if (r < 0) break;
  3265. if (r > 0) {
  3266. qn->target_empty_info = r;
  3267. }
  3268. #endif
  3269. #if 0
  3270. r = get_max_match_length(target, &d, env);
  3271. if (r == 0 && d == 0) {
  3272. /* ()* ==> ()?, ()+ ==> () */
  3273. qn->upper = 1;
  3274. if (qn->lower > 1) qn->lower = 1;
  3275. if (NTYPE(target) == NT_STR) {
  3276. qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
  3277. }
  3278. }
  3279. #endif
  3280. }
  3281. }
  /* The target is always visited with IN_REPEAT; IN_VAR_REPEAT is added
     when the repeat count is not fixed. */
  3282. state |= IN_REPEAT;
  3283. if (qn->lower != qn->upper)
  3284. state |= IN_VAR_REPEAT;
  3285. r = setup_tree(target, reg, state, env);
  3286. if (r) break;
  3287. /* expand string */
  3288. #define EXPAND_STRING_MAX_LENGTH 100
  /* A string repeated a fixed number of times (2..100) is replaced by one
     string node holding the concatenated copies, provided the expanded
     length stays within EXPAND_STRING_MAX_LENGTH. The quantifier node itself
     is converted in place and the old target is freed. */
  3289. if (NTYPE(target) == NT_STR) {
  3290. if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper &&
  3291. qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) {
  3292. int len = NSTRING_LEN(target);
  3293. StrNode* sn = NSTR(target);
  3294. if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) {
  3295. int i, n = qn->lower;
  3296. onig_node_conv_to_str_node(node, NSTR(target)->flag);
  3297. for (i = 0; i < n; i++) {
  3298. r = onig_node_str_cat(node, sn->s, sn->end);
  3299. if (r) break;
  3300. }
  3301. onig_node_free(target);
  3302. break; /* break case NT_QTFR: */
  3303. }
  3304. }
  3305. }
  3306. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  /* head_exact is either taken over from a nested quantifier target (whose
     own head_exact is then cleared) or obtained from
     get_head_value_node(). */
  3307. if (qn->greedy && (qn->target_empty_info != 0)) {
  3308. if (NTYPE(target) == NT_QTFR) {
  3309. QtfrNode* tqn = NQTFR(target);
  3310. if (IS_NOT_NULL(tqn->head_exact)) {
  3311. qn->head_exact = tqn->head_exact;
  3312. tqn->head_exact = NULL;
  3313. }
  3314. }
  3315. else {
  3316. qn->head_exact = get_head_value_node(qn->target, 1, reg);
  3317. }
  3318. }
  3319. #endif
  3320. if ((state & IN_LAST) != 0 && qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
  3321. /* automatic possessification: a* (at last) ==> (?>a*) */
  3322. if (qn->lower <= 1) {
  3323. int ttype = NTYPE(qn->target);
  3324. if (IS_NODE_TYPE_SIMPLE(ttype)) {
  /* The new stop-backtrack node and this quantifier exchange contents, so
     `node` becomes the enclose and `en` (now holding the quantifier) becomes
     its target. */
  3325. Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
  3326. CHECK_NULL_RETURN_MEMERR(en);
  3327. SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
  3328. swap_node(node, en);
  3329. NENCLOSE(node)->target = en;
  3330. }
  3331. }
  3332. }
  3333. }
  3334. break;
  3335. case NT_ENCLOSE:
  3336. {
  3337. EncloseNode* en = NENCLOSE(node);
  3338. switch (en->type) {
  3339. case ENCLOSE_OPTION:
  3340. {
  /* reg->options is replaced by the group's option set for the duration of
     the recursive call and restored afterwards. */
  3341. OnigOptionType options = reg->options;
  3342. reg->options = NENCLOSE(node)->option;
  3343. r = setup_tree(NENCLOSE(node)->target, reg, state, env);
  3344. reg->options = options;
  3345. }
  3346. break;
  3347. case ENCLOSE_MEMORY:
  /* A capture group inside an alternation, a negative assertion or a
     variable-count repeat gets its bit set in env->bt_mem_start. */
  3348. if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
  3349. BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
  3350. /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
  3351. }
  3352. r = setup_tree(en->target, reg, state, env);
  3353. break;
  3354. case ENCLOSE_STOP_BACKTRACK:
  3355. {
  3356. Node* target = en->target;
  3357. r = setup_tree(target, reg, state, env);
  3358. if (NTYPE(target) == NT_QTFR) {
  3359. QtfrNode* tqn = NQTFR(target);
  3360. if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
  3361. tqn->greedy != 0) { /* (?>a*), a*+ etc... */
  3362. int qtype = NTYPE(tqn->target);
  3363. if (IS_NODE_TYPE_SIMPLE(qtype))
  3364. SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
  3365. }
  3366. }
  3367. }
  3368. break;
  3369. }
  3370. }
  3371. break;
  3372. case NT_ANCHOR:
  3373. {
  3374. AnchorNode* an = NANCHOR(node);
  3375. switch (an->type) {
  3376. case ANCHOR_PREC_READ:
  3377. r = setup_tree(an->target, reg, state, env);
  3378. break;
  3379. case ANCHOR_PREC_READ_NOT:
  3380. r = setup_tree(an->target, reg, (state | IN_NOT), env);
  3381. break;
  3382. /* allowed node types in look-behind */
  3383. #define ALLOWED_TYPE_IN_LB \
  3384. ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
  3385. BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
  3386. #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY )
  3387. #define ALLOWED_ENCLOSE_IN_LB_NOT 0
  3388. #define ALLOWED_ANCHOR_IN_LB \
  3389. ( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
  3390. #define ALLOWED_ANCHOR_IN_LB_NOT \
  3391. ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION )
  3392. case ANCHOR_LOOK_BEHIND:
  3393. {
  /* check_type_tree(): a negative result is returned as an error code, a
     positive result is reported as an invalid look-behind pattern. */
  3394. r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
  3395. ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
  3396. if (r < 0) return r;
  3397. if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  3398. r = setup_look_behind(node, reg, env);
  3399. if (r != 0) return r;
  3400. r = setup_tree(an->target, reg, state, env);
  3401. }
  3402. break;
  3403. case ANCHOR_LOOK_BEHIND_NOT:
  3404. {
  /* Same sequence as ANCHOR_LOOK_BEHIND, but with the *_NOT allow-lists and
     with IN_NOT added for the recursive call. */
  3405. r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
  3406. ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
  3407. if (r < 0) return r;
  3408. if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  3409. r = setup_look_behind(node, reg, env);
  3410. if (r != 0) return r;
  3411. r = setup_tree(an->target, reg, (state | IN_NOT), env);
  3412. }
  3413. break;
  3414. }
  3415. }
  3416. break;
  3417. default:
  3418. break;
  3419. }
  3420. return r;
  3421. }
  3422. /* set skip map for Boyer-Moor search */
  3423. static int
  3424. set_bm_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED,
  3425. UChar skip[], int** int_skip)
  3426. {
  3427. int i, len;
  3428. len = end - s;
  3429. if (len < ONIG_CHAR_TABLE_SIZE) {
  3430. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
  3431. for (i = 0; i < len - 1; i++)
  3432. skip[s[i]] = len - 1 - i;
  3433. }
  3434. else {
  3435. if (IS_NULL(*int_skip)) {
  3436. *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
  3437. if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
  3438. }
  3439. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
  3440. for (i = 0; i < len - 1; i++)
  3441. (*int_skip)[s[i]] = len - 1 - i;
  3442. }
  3443. return 0;
  3444. }
  3445. #define OPT_EXACT_MAXLEN 24
  3446. typedef struct {
  3447. OnigDistance min; /* min byte length */
  3448. OnigDistance max; /* max byte length */
  3449. } MinMaxLen;
  3450. typedef struct {
  3451. MinMaxLen mmd;
  3452. OnigEncoding enc;
  3453. OnigOptionType options;
  3454. OnigCaseFoldType case_fold_flag;
  3455. ScanEnv* scan_env;
  3456. } OptEnv;
  3457. typedef struct {
  3458. int left_anchor;
  3459. int right_anchor;
  3460. } OptAncInfo;
  3461. typedef struct {
  3462. MinMaxLen mmd; /* info position */
  3463. OptAncInfo anc;
  3464. int reach_end;
  3465. int ignore_case;
  3466. int len;
  3467. UChar s[OPT_EXACT_MAXLEN];
  3468. } OptExactInfo;
  3469. typedef struct {
  3470. MinMaxLen mmd; /* info position */
  3471. OptAncInfo anc;
  3472. int value; /* weighted value */
  3473. UChar map[ONIG_CHAR_TABLE_SIZE];
  3474. } OptMapInfo;
  3475. typedef struct {
  3476. MinMaxLen len;
  3477. OptAncInfo anc;
  3478. OptExactInfo exb; /* boundary */
  3479. OptExactInfo exm; /* middle */
  3480. OptExactInfo expr; /* prec read (?=...) */
  3481. OptMapInfo map; /* boundary */
  3482. } NodeOptInfo;
  3483. static int
  3484. map_position_value(OnigEncoding enc, int i)
  3485. {
  3486. static const short int ByteValTable[] = {
  3487. 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
  3488. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  3489. 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
  3490. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
  3491. 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  3492. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
  3493. 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  3494. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
  3495. };
  3496. if (i < (int )(sizeof(ByteValTable)/sizeof(ByteValTable[0]))) {
  3497. if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
  3498. return 20;
  3499. else
  3500. return (int )ByteValTable[i];
  3501. }
  3502. else
  3503. return 4; /* Take it easy. */
  3504. }
  3505. static int
  3506. distance_value(MinMaxLen* mm)
  3507. {
  3508. /* 1000 / (min-max-dist + 1) */
  3509. static const short int dist_vals[] = {
  3510. 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
  3511. 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
  3512. 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
  3513. 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
  3514. 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
  3515. 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
  3516. 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
  3517. 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
  3518. 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
  3519. 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
  3520. };
  3521. int d;
  3522. if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
  3523. d = mm->max - mm->min;
  3524. if (d < (int )(sizeof(dist_vals)/sizeof(dist_vals[0])))
  3525. /* return dist_vals[d] * 16 / (mm->min + 12); */
  3526. return (int )dist_vals[d];
  3527. else
  3528. return 1;
  3529. }
  3530. static int
  3531. comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
  3532. {
  3533. if (v2 <= 0) return -1;
  3534. if (v1 <= 0) return 1;
  3535. v1 *= distance_value(d1);
  3536. v2 *= distance_value(d2);
  3537. if (v2 > v1) return 1;
  3538. if (v2 < v1) return -1;
  3539. if (d2->min < d1->min) return 1;
  3540. if (d2->min > d1->min) return -1;
  3541. return 0;
  3542. }
  3543. static int
  3544. is_equal_mml(MinMaxLen* a, MinMaxLen* b)
  3545. {
  3546. return (a->min == b->min && a->max == b->max) ? 1 : 0;
  3547. }
  3548. static void
  3549. set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
  3550. {
  3551. mml->min = min;
  3552. mml->max = max;
  3553. }
  3554. static void
  3555. clear_mml(MinMaxLen* mml)
  3556. {
  3557. mml->min = mml->max = 0;
  3558. }
  3559. static void
  3560. copy_mml(MinMaxLen* to, MinMaxLen* from)
  3561. {
  3562. to->min = from->min;
  3563. to->max = from->max;
  3564. }
  3565. static void
  3566. add_mml(MinMaxLen* to, MinMaxLen* from)
  3567. {
  3568. to->min = distance_add(to->min, from->min);
  3569. to->max = distance_add(to->max, from->max);
  3570. }
  3571. #if 0
  3572. static void
  3573. add_len_mml(MinMaxLen* to, OnigDistance len)
  3574. {
  3575. to->min = distance_add(to->min, len);
  3576. to->max = distance_add(to->max, len);
  3577. }
  3578. #endif
  3579. static void
  3580. alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
  3581. {
  3582. if (to->min > from->min) to->min = from->min;
  3583. if (to->max < from->max) to->max = from->max;
  3584. }
  3585. static void
  3586. copy_opt_env(OptEnv* to, OptEnv* from)
  3587. {
  3588. *to = *from;
  3589. }
  3590. static void
  3591. clear_opt_anc_info(OptAncInfo* anc)
  3592. {
  3593. anc->left_anchor = 0;
  3594. anc->right_anchor = 0;
  3595. }
  3596. static void
  3597. copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
  3598. {
  3599. *to = *from;
  3600. }
  3601. static void
  3602. concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
  3603. OnigDistance left_len, OnigDistance right_len)
  3604. {
  3605. clear_opt_anc_info(to);
  3606. to->left_anchor = left->left_anchor;
  3607. if (left_len == 0) {
  3608. to->left_anchor |= right->left_anchor;
  3609. }
  3610. to->right_anchor = right->right_anchor;
  3611. if (right_len == 0) {
  3612. to->right_anchor |= left->right_anchor;
  3613. }
  3614. }
  3615. static int
  3616. is_left_anchor(int anc)
  3617. {
  3618. if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
  3619. anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
  3620. anc == ANCHOR_PREC_READ_NOT)
  3621. return 0;
  3622. return 1;
  3623. }
  3624. static int
  3625. is_set_opt_anc_info(OptAncInfo* to, int anc)
  3626. {
  3627. if ((to->left_anchor & anc) != 0) return 1;
  3628. return ((to->right_anchor & anc) != 0 ? 1 : 0);
  3629. }
  3630. static void
  3631. add_opt_anc_info(OptAncInfo* to, int anc)
  3632. {
  3633. if (is_left_anchor(anc))
  3634. to->left_anchor |= anc;
  3635. else
  3636. to->right_anchor |= anc;
  3637. }
  3638. static void
  3639. remove_opt_anc_info(OptAncInfo* to, int anc)
  3640. {
  3641. if (is_left_anchor(anc))
  3642. to->left_anchor &= ~anc;
  3643. else
  3644. to->right_anchor &= ~anc;
  3645. }
  3646. static void
  3647. alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
  3648. {
  3649. to->left_anchor &= add->left_anchor;
  3650. to->right_anchor &= add->right_anchor;
  3651. }
  3652. static int
  3653. is_full_opt_exact_info(OptExactInfo* ex)
  3654. {
  3655. return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
  3656. }
  3657. static void
  3658. clear_opt_exact_info(OptExactInfo* ex)
  3659. {
  3660. clear_mml(&ex->mmd);
  3661. clear_opt_anc_info(&ex->anc);
  3662. ex->reach_end = 0;
  3663. ex->ignore_case = 0;
  3664. ex->len = 0;
  3665. ex->s[0] = '\0';
  3666. }
  3667. static void
  3668. copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
  3669. {
  3670. *to = *from;
  3671. }
  3672. static void
  3673. concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
  3674. {
  3675. int i, j, len;
  3676. UChar *p, *end;
  3677. OptAncInfo tanc;
  3678. if (! to->ignore_case && add->ignore_case) {
  3679. if (to->len >= add->len) return ; /* avoid */
  3680. to->ignore_case = 1;
  3681. }
  3682. p = add->s;
  3683. end = p + add->len;
  3684. for (i = to->len; p < end; ) {
  3685. len = enclen(enc, p, end);
  3686. if (i + len > OPT_EXACT_MAXLEN) break;
  3687. for (j = 0; j < len && p < end; j++)
  3688. to->s[i++] = *p++;
  3689. }
  3690. to->len = i;
  3691. to->reach_end = (p == end ? add->reach_end : 0);
  3692. concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
  3693. if (! to->reach_end) tanc.right_anchor = 0;
  3694. copy_opt_anc_info(&to->anc, &tanc);
  3695. }
  3696. static void
  3697. concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
  3698. int raw ARG_UNUSED, OnigEncoding enc)
  3699. {
  3700. int i, j, len;
  3701. UChar *p;
  3702. for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
  3703. len = enclen(enc, p, end);
  3704. if (i + len > OPT_EXACT_MAXLEN) break;
  3705. for (j = 0; j < len && p < end; j++)
  3706. to->s[i++] = *p++;
  3707. }
  3708. to->len = i;
  3709. }
  3710. static void
  3711. alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
  3712. {
  3713. int i, j, len;
  3714. if (add->len == 0 || to->len == 0) {
  3715. clear_opt_exact_info(to);
  3716. return ;
  3717. }
  3718. if (! is_equal_mml(&to->mmd, &add->mmd)) {
  3719. clear_opt_exact_info(to);
  3720. return ;
  3721. }
  3722. for (i = 0; i < to->len && i < add->len; ) {
  3723. if (to->s[i] != add->s[i]) break;
  3724. len = enclen(env->enc, to->s + i, to->s + to->len);
  3725. for (j = 1; j < len; j++) {
  3726. if (to->s[i+j] != add->s[i+j]) break;
  3727. }
  3728. if (j < len) break;
  3729. i += len;
  3730. }
  3731. if (! add->reach_end || i < add->len || i < to->len) {
  3732. to->reach_end = 0;
  3733. }
  3734. to->len = i;
  3735. to->ignore_case |= add->ignore_case;
  3736. alt_merge_opt_anc_info(&to->anc, &add->anc);
  3737. if (! to->reach_end) to->anc.right_anchor = 0;
  3738. }
  3739. static void
  3740. select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
  3741. {
  3742. int v1, v2;
  3743. v1 = now->len;
  3744. v2 = alt->len;
  3745. if (v2 == 0) {
  3746. return ;
  3747. }
  3748. else if (v1 == 0) {
  3749. copy_opt_exact_info(now, alt);
  3750. return ;
  3751. }
  3752. else if (v1 <= 2 && v2 <= 2) {
  3753. /* ByteValTable[x] is big value --> low price */
  3754. v2 = map_position_value(enc, now->s[0]);
  3755. v1 = map_position_value(enc, alt->s[0]);
  3756. if (now->len > 1) v1 += 5;
  3757. if (alt->len > 1) v2 += 5;
  3758. }
  3759. if (now->ignore_case == 0) v1 *= 2;
  3760. if (alt->ignore_case == 0) v2 *= 2;
  3761. if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
  3762. copy_opt_exact_info(now, alt);
  3763. }
  3764. static void
  3765. clear_opt_map_info(OptMapInfo* map)
  3766. {
  3767. static const OptMapInfo clean_info = {
  3768. {0, 0}, {0, 0}, 0,
  3769. {
  3770. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3771. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3772. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3773. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3774. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3775. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3776. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3777. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3778. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3779. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3780. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3781. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3782. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3783. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3784. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  3785. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  3786. }
  3787. };
  3788. xmemcpy(map, &clean_info, sizeof(OptMapInfo));
  3789. }
  3790. static void
  3791. copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
  3792. {
  3793. *to = *from;
  3794. }
  3795. static void
  3796. add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
  3797. {
  3798. if (map->map[c] == 0) {
  3799. map->map[c] = 1;
  3800. map->value += map_position_value(enc, c);
  3801. }
  3802. }
  3803. static int
  3804. add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
  3805. OnigEncoding enc, OnigCaseFoldType case_fold_flag)
  3806. {
  3807. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  3808. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  3809. int i, n;
  3810. add_char_opt_map_info(map, p[0], enc);
  3811. case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
  3812. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
  3813. if (n < 0) return n;
  3814. for (i = 0; i < n; i++) {
  3815. ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
  3816. add_char_opt_map_info(map, buf[0], enc);
  3817. }
  3818. return 0;
  3819. }
  3820. static void
  3821. select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
  3822. {
  3823. const int z = 1<<15; /* 32768: something big value */
  3824. int v1, v2;
  3825. if (alt->value == 0) return ;
  3826. if (now->value == 0) {
  3827. copy_opt_map_info(now, alt);
  3828. return ;
  3829. }
  3830. v1 = z / now->value;
  3831. v2 = z / alt->value;
  3832. if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
  3833. copy_opt_map_info(now, alt);
  3834. }
  3835. static int
  3836. comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
  3837. {
  3838. #define COMP_EM_BASE 20
  3839. int ve, vm;
  3840. if (m->value <= 0) return -1;
  3841. ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2);
  3842. vm = COMP_EM_BASE * 5 * 2 / m->value;
  3843. return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
  3844. }
  3845. static void
  3846. alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
  3847. {
  3848. int i, val;
  3849. /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
  3850. if (to->value == 0) return ;
  3851. if (add->value == 0 || to->mmd.max < add->mmd.min) {
  3852. clear_opt_map_info(to);
  3853. return ;
  3854. }
  3855. alt_merge_mml(&to->mmd, &add->mmd);
  3856. val = 0;
  3857. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
  3858. if (add->map[i])
  3859. to->map[i] = 1;
  3860. if (to->map[i])
  3861. val += map_position_value(enc, i);
  3862. }
  3863. to->value = val;
  3864. alt_merge_opt_anc_info(&to->anc, &add->anc);
  3865. }
  3866. static void
  3867. set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
  3868. {
  3869. copy_mml(&(opt->exb.mmd), mmd);
  3870. copy_mml(&(opt->expr.mmd), mmd);
  3871. copy_mml(&(opt->map.mmd), mmd);
  3872. }
  3873. static void
  3874. clear_node_opt_info(NodeOptInfo* opt)
  3875. {
  3876. clear_mml(&opt->len);
  3877. clear_opt_anc_info(&opt->anc);
  3878. clear_opt_exact_info(&opt->exb);
  3879. clear_opt_exact_info(&opt->exm);
  3880. clear_opt_exact_info(&opt->expr);
  3881. clear_opt_map_info(&opt->map);
  3882. }
  3883. static void
  3884. copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
  3885. {
  3886. *to = *from;
  3887. }
  3888. static void
  3889. concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
  3890. {
  3891. int exb_reach, exm_reach;
  3892. OptAncInfo tanc;
  3893. concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
  3894. copy_opt_anc_info(&to->anc, &tanc);
  3895. if (add->exb.len > 0 && to->len.max == 0) {
  3896. concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
  3897. to->len.max, add->len.max);
  3898. copy_opt_anc_info(&add->exb.anc, &tanc);
  3899. }
  3900. if (add->map.value > 0 && to->len.max == 0) {
  3901. if (add->map.mmd.max == 0)
  3902. add->map.anc.left_anchor |= to->anc.left_anchor;
  3903. }
  3904. exb_reach = to->exb.reach_end;
  3905. exm_reach = to->exm.reach_end;
  3906. if (add->len.max != 0)
  3907. to->exb.reach_end = to->exm.reach_end = 0;
  3908. if (add->exb.len > 0) {
  3909. if (exb_reach) {
  3910. concat_opt_exact_info(&to->exb, &add->exb, enc);
  3911. clear_opt_exact_info(&add->exb);
  3912. }
  3913. else if (exm_reach) {
  3914. concat_opt_exact_info(&to->exm, &add->exb, enc);
  3915. clear_opt_exact_info(&add->exb);
  3916. }
  3917. }
  3918. select_opt_exact_info(enc, &to->exm, &add->exb);
  3919. select_opt_exact_info(enc, &to->exm, &add->exm);
  3920. if (to->expr.len > 0) {
  3921. if (add->len.max > 0) {
  3922. if (to->expr.len > (int )add->len.max)
  3923. to->expr.len = add->len.max;
  3924. if (to->expr.mmd.max == 0)
  3925. select_opt_exact_info(enc, &to->exb, &to->expr);
  3926. else
  3927. select_opt_exact_info(enc, &to->exm, &to->expr);
  3928. }
  3929. }
  3930. else if (add->expr.len > 0) {
  3931. copy_opt_exact_info(&to->expr, &add->expr);
  3932. }
  3933. select_opt_map_info(&to->map, &add->map);
  3934. add_mml(&to->len, &add->len);
  3935. }
  3936. static void
  3937. alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
  3938. {
  3939. alt_merge_opt_anc_info (&to->anc, &add->anc);
  3940. alt_merge_opt_exact_info(&to->exb, &add->exb, env);
  3941. alt_merge_opt_exact_info(&to->exm, &add->exm, env);
  3942. alt_merge_opt_exact_info(&to->expr, &add->expr, env);
  3943. alt_merge_opt_map_info(env->enc, &to->map, &add->map);
  3944. alt_merge_mml(&to->len, &add->len);
  3945. }
  /* Limit on how often the target of one ENCLOSE_MEMORY node is analysed
     (counted in en->opt_count); past the limit only the node's recorded
     fixed min/max lengths are used. */
  3946. #define MAX_NODE_OPT_INFO_REF_COUNT 5
  /* Computes the optimization info for the subtree rooted at `node` and
     stores it in *opt: length range, anchors, exact-string candidates and
     byte map. *opt is cleared first and stamped with the position env->mmd.
     Returns 0 on success or a non-zero error code from a helper or a
     recursive call; an unknown node type yields ONIGERR_TYPE_BUG. */
  3947. static int
  3948. optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
  3949. {
  3950. int type;
  3951. int r = 0;
  3952. clear_node_opt_info(opt);
  3953. set_bound_node_opt_info(opt, &env->mmd);
  3954. type = NTYPE(node);
  3955. switch (type) {
  3956. case NT_LIST:
  3957. {
  3958. OptEnv nenv;
  3959. NodeOptInfo nopt;
  3960. Node* nd = node;
  /* Elements are analysed left to right with a private copy of the
     environment whose position advances by each element's length. */
  3961. copy_opt_env(&nenv, env);
  3962. do {
  3963. r = optimize_node_left(NCAR(nd), &nopt, &nenv);
  3964. if (r == 0) {
  3965. add_mml(&nenv.mmd, &nopt.len);
  3966. concat_left_node_opt_info(env->enc, opt, &nopt);
  3967. }
  3968. } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
  3969. }
  3970. break;
  3971. case NT_ALT:
  3972. {
  3973. NodeOptInfo nopt;
  3974. Node* nd = node;
  /* The first branch initialises *opt; each further branch is alt-merged
     into it. */
  3975. do {
  3976. r = optimize_node_left(NCAR(nd), &nopt, env);
  3977. if (r == 0) {
  3978. if (nd == node) copy_node_opt_info(opt, &nopt);
  3979. else alt_merge_node_opt_info(opt, &nopt, env);
  3980. }
  3981. } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
  3982. }
  3983. break;
  3984. case NT_STR:
  3985. {
  3986. StrNode* sn = NSTR(node);
  3987. int slen = sn->end - sn->s;
  3988. int is_raw = NSTRING_IS_RAW(node);
  /* Non-ambiguous string: it becomes the boundary exact string, its first
     byte goes into the map, and the length is exactly slen. */
  3989. if (! NSTRING_IS_AMBIG(node)) {
  3990. concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
  3991. NSTRING_IS_RAW(node), env->enc);
  3992. if (slen > 0) {
  3993. add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
  3994. }
  3995. set_mml(&opt->len, slen, slen);
  3996. }
  3997. else {
  3998. int max;
  /* Ambiguous string flagged "don't get opt info": only a length range is
     produced, with the maximum taken as the character count times the
     encoding's ONIGENC_MBC_MAXLEN_DIST. */
  3999. if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
  4000. int n = onigenc_strlen(env->enc, sn->s, sn->end);
  4001. max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
  4002. }
  4003. else {
  4004. concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
  4005. is_raw, env->enc);
  4006. opt->exb.ignore_case = 1;
  4007. if (slen > 0) {
  4008. r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
  4009. env->enc, env->case_fold_flag);
  4010. if (r != 0) break;
  4011. }
  4012. max = slen;
  4013. }
  4014. set_mml(&opt->len, slen, max);
  4015. }
  /* reach_end is set only if the whole string fit into the exact buffer. */
  4016. if (opt->exb.len == slen)
  4017. opt->exb.reach_end = 1;
  4018. }
  4019. break;
  4020. case NT_CCLASS:
  4021. {
  4022. int i, z;
  4023. CClassNode* cc = NCCLASS(node);
  4024. /* no need to check ignore case. (set in setup_tree()) */
  /* A class with a multibyte buffer or a negated class contributes only a
     length range; otherwise every byte set in the bitset is added to the
     map and the length is exactly 1. */
  4025. if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
  4026. OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
  4027. OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  4028. set_mml(&opt->len, min, max);
  4029. }
  4030. else {
  4031. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  4032. z = BITSET_AT(cc->bs, i);
  4033. if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
  4034. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4035. }
  4036. }
  4037. set_mml(&opt->len, 1, 1);
  4038. }
  4039. }
  4040. break;
  4041. case NT_CTYPE:
  4042. {
  4043. int i, min, max;
  4044. max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  /* A map is built only when max is 1 and only for the WORD ctype (or its
     negation). */
  4045. if (max == 1) {
  4046. min = 1;
  4047. switch (NCTYPE(node)->ctype) {
  4048. case ONIGENC_CTYPE_WORD:
  4049. if (NCTYPE(node)->not != 0) {
  4050. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  4051. if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
  4052. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4053. }
  4054. }
  4055. }
  4056. else {
  4057. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  4058. if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
  4059. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4060. }
  4061. }
  4062. }
  4063. break;
  4064. }
  4065. }
  4066. else {
  4067. min = ONIGENC_MBC_MINLEN(env->enc);
  4068. }
  4069. set_mml(&opt->len, min, max);
  4070. }
  4071. break;
  4072. case NT_CANY:
  4073. {
  4074. OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
  4075. OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  4076. set_mml(&opt->len, min, max);
  4077. }
  4078. break;
  4079. case NT_ANCHOR:
  4080. switch (NANCHOR(node)->type) {
  4081. case ANCHOR_BEGIN_BUF:
  4082. case ANCHOR_BEGIN_POSITION:
  4083. case ANCHOR_BEGIN_LINE:
  4084. case ANCHOR_END_BUF:
  4085. case ANCHOR_SEMI_END_BUF:
  4086. case ANCHOR_END_LINE:
  4087. add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
  4088. break;
  4089. case ANCHOR_PREC_READ:
  4090. {
  /* Look-ahead: the target's boundary (or else middle) exact string is kept
     as the prec-read candidate with reach_end cleared, and its map, if any,
     is taken over. No length is contributed. */
  4091. NodeOptInfo nopt;
  4092. r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
  4093. if (r == 0) {
  4094. if (nopt.exb.len > 0)
  4095. copy_opt_exact_info(&opt->expr, &nopt.exb);
  4096. else if (nopt.exm.len > 0)
  4097. copy_opt_exact_info(&opt->expr, &nopt.exm);
  4098. opt->expr.reach_end = 0;
  4099. if (nopt.map.value > 0)
  4100. copy_opt_map_info(&opt->map, &nopt.map);
  4101. }
  4102. }
  4103. break;
  4104. case ANCHOR_PREC_READ_NOT:
  4105. case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */
  4106. case ANCHOR_LOOK_BEHIND_NOT:
  4107. break;
  4108. }
  4109. break;
  4110. case NT_BREF:
  4111. {
  4112. int i;
  4113. int* backs;
  4114. OnigDistance min, max, tmin, tmax;
  4115. Node** nodes = SCANENV_MEM_NODES(env->scan_env);
  4116. BRefNode* br = NBREF(node);
  4117. if (br->state & NST_RECURSION) {
  4118. set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
  4119. break;
  4120. }
  /* The length range spans all referenced groups: the smallest minimum and
     the largest maximum match length among them. */
  4121. backs = BACKREFS_P(br);
  4122. r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
  4123. if (r != 0) break;
  4124. r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
  4125. if (r != 0) break;
  4126. for (i = 1; i < br->back_num; i++) {
  4127. r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
  4128. if (r != 0) break;
  4129. r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
  4130. if (r != 0) break;
  4131. if (min > tmin) min = tmin;
  4132. if (max < tmax) max = tmax;
  4133. }
  4134. if (r == 0) set_mml(&opt->len, min, max);
  4135. }
  4136. break;
  4137. #ifdef USE_SUBEXP_CALL
  4138. case NT_CALL:
  4139. if (IS_CALL_RECURSION(NCALL(node)))
  4140. set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
  4141. else {
  /* The called group is analysed under its own option set; the caller's
     options are restored afterwards. */
  4142. OnigOptionType save = env->options;
  4143. env->options = NENCLOSE(NCALL(node)->target)->option;
  4144. r = optimize_node_left(NCALL(node)->target, opt, env);
  4145. env->options = save;
  4146. }
  4147. break;
  4148. #endif
  4149. case NT_QTFR:
  4150. {
  4151. int i;
  4152. OnigDistance min, max;
  4153. NodeOptInfo nopt;
  4154. QtfrNode* qn = NQTFR(node);
  4155. r = optimize_node_left(qn->target, &nopt, env);
  4156. if (r) break;
  /* {0,infinite}: nothing from the target is kept; a greedy any-char repeat
     at position 0 only adds an ANCHOR_ANYCHAR_STAR(_ML) flag. */
  4157. if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
  4158. if (env->mmd.max == 0 &&
  4159. NTYPE(qn->target) == NT_CANY && qn->greedy) {
  4160. if (IS_MULTILINE(env->options))
  4161. add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
  4162. else
  4163. add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
  4164. }
  4165. }
  4166. else {
  /* With a lower bound of at least 1 the target's info is adopted, and a
     boundary string that reaches the end is repeated while copies fit. */
  4167. if (qn->lower > 0) {
  4168. copy_node_opt_info(opt, &nopt);
  4169. if (nopt.exb.len > 0) {
  4170. if (nopt.exb.reach_end) {
  4171. for (i = 2; i <= qn->lower &&
  4172. ! is_full_opt_exact_info(&opt->exb); i++) {
  4173. concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
  4174. }
  4175. if (i < qn->lower) {
  4176. opt->exb.reach_end = 0;
  4177. }
  4178. }
  4179. }
  4180. if (qn->lower != qn->upper) {
  4181. opt->exb.reach_end = 0;
  4182. opt->exm.reach_end = 0;
  4183. }
  4184. if (qn->lower > 1)
  4185. opt->exm.reach_end = 0;
  4186. }
  4187. }
  /* Length range: target min times lower; target max times upper, or, for
     an infinite upper bound, infinite unless the target max is 0. */
  4188. min = distance_multiply(nopt.len.min, qn->lower);
  4189. if (IS_REPEAT_INFINITE(qn->upper))
  4190. max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
  4191. else
  4192. max = distance_multiply(nopt.len.max, qn->upper);
  4193. set_mml(&opt->len, min, max);
  4194. }
  4195. break;
  4196. case NT_ENCLOSE:
  4197. {
  4198. EncloseNode* en = NENCLOSE(node);
  4199. switch (en->type) {
  4200. case ENCLOSE_OPTION:
  4201. {
  4202. OnigOptionType save = env->options;
  4203. env->options = en->option;
  4204. r = optimize_node_left(en->target, opt, env);
  4205. env->options = save;
  4206. }
  4207. break;
  4208. case ENCLOSE_MEMORY:
  4209. #ifdef USE_SUBEXP_CALL
  /* Once this group has been visited more than
     MAX_NODE_OPT_INFO_REF_COUNT times, only its fixed min/max lengths (when
     flagged as fixed) are reported instead of recursing again. */
  4210. en->opt_count++;
  4211. if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
  4212. OnigDistance min, max;
  4213. min = 0;
  4214. max = ONIG_INFINITE_DISTANCE;
  4215. if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
  4216. if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
  4217. set_mml(&opt->len, min, max);
  4218. }
  4219. else
  4220. #endif
  4221. {
  /* The any-char-star anchor flags are removed again when this group is the
     target of a back-reference. */
  4222. r = optimize_node_left(en->target, opt, env);
  4223. if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
  4224. if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
  4225. remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
  4226. }
  4227. }
  4228. break;
  4229. case ENCLOSE_STOP_BACKTRACK:
  4230. r = optimize_node_left(en->target, opt, env);
  4231. break;
  4232. }
  4233. }
  4234. break;
  4235. default:
  4236. #ifdef ONIG_DEBUG
  4237. fprintf(stderr, "optimize_node_left: undefined node type %d\n",
  4238. NTYPE(node));
  4239. #endif
  4240. r = ONIGERR_TYPE_BUG;
  4241. break;
  4242. }
  4243. return r;
  4244. }
  4245. static int
  4246. set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
  4247. {
  4248. int r;
  4249. if (e->len == 0) return 0;
  4250. if (e->ignore_case) {
  4251. reg->exact = (UChar* )xmalloc(e->len);
  4252. CHECK_NULL_RETURN_MEMERR(reg->exact);
  4253. xmemcpy(reg->exact, e->s, e->len);
  4254. reg->exact_end = reg->exact + e->len;
  4255. reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
  4256. }
  4257. else {
  4258. int allow_reverse;
  4259. reg->exact = str_dup(e->s, e->s + e->len);
  4260. CHECK_NULL_RETURN_MEMERR(reg->exact);
  4261. reg->exact_end = reg->exact + e->len;
  4262. allow_reverse =
  4263. ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
  4264. if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
  4265. r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
  4266. reg->map, &(reg->int_map));
  4267. if (r) return r;
  4268. reg->optimize = (allow_reverse != 0
  4269. ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
  4270. }
  4271. else {
  4272. reg->optimize = ONIG_OPTIMIZE_EXACT;
  4273. }
  4274. }
  4275. reg->dmin = e->mmd.min;
  4276. reg->dmax = e->mmd.max;
  4277. if (reg->dmin != ONIG_INFINITE_DISTANCE) {
  4278. reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact);
  4279. }
  4280. return 0;
  4281. }
  4282. static void
  4283. set_optimize_map_info(regex_t* reg, OptMapInfo* m)
  4284. {
  4285. int i;
  4286. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
  4287. reg->map[i] = m->map[i];
  4288. reg->optimize = ONIG_OPTIMIZE_MAP;
  4289. reg->dmin = m->mmd.min;
  4290. reg->dmax = m->mmd.max;
  4291. if (reg->dmin != ONIG_INFINITE_DISTANCE) {
  4292. reg->threshold_len = reg->dmin + 1;
  4293. }
  4294. }
  4295. static void
  4296. set_sub_anchor(regex_t* reg, OptAncInfo* anc)
  4297. {
  4298. reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
  4299. reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
  4300. }
  4301. #ifdef ONIG_DEBUG
  4302. static void print_optimize_info(FILE* f, regex_t* reg);
  4303. #endif
  4304. static int
  4305. set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
  4306. {
  4307. int r;
  4308. NodeOptInfo opt;
  4309. OptEnv env;
  4310. env.enc = reg->enc;
  4311. env.options = reg->options;
  4312. env.case_fold_flag = reg->case_fold_flag;
  4313. env.scan_env = scan_env;
  4314. clear_mml(&env.mmd);
  4315. r = optimize_node_left(node, &opt, &env);
  4316. if (r) return r;
  4317. reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
  4318. ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML);
  4319. reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF);
  4320. if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
  4321. reg->anchor_dmin = opt.len.min;
  4322. reg->anchor_dmax = opt.len.max;
  4323. }
  4324. if (opt.exb.len > 0 || opt.exm.len > 0) {
  4325. select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
  4326. if (opt.map.value > 0 &&
  4327. comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
  4328. goto set_map;
  4329. }
  4330. else {
  4331. r = set_optimize_exact_info(reg, &opt.exb);
  4332. set_sub_anchor(reg, &opt.exb.anc);
  4333. }
  4334. }
  4335. else if (opt.map.value > 0) {
  4336. set_map:
  4337. set_optimize_map_info(reg, &opt.map);
  4338. set_sub_anchor(reg, &opt.map.anc);
  4339. }
  4340. else {
  4341. reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
  4342. if (opt.len.max == 0)
  4343. reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
  4344. }
  4345. #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
  4346. print_optimize_info(stderr, reg);
  4347. #endif
  4348. return r;
  4349. }
  4350. static void
  4351. clear_optimize_info(regex_t* reg)
  4352. {
  4353. reg->optimize = ONIG_OPTIMIZE_NONE;
  4354. reg->anchor = 0;
  4355. reg->anchor_dmin = 0;
  4356. reg->anchor_dmax = 0;
  4357. reg->sub_anchor = 0;
  4358. reg->exact_end = (UChar* )NULL;
  4359. reg->threshold_len = 0;
  4360. if (IS_NOT_NULL(reg->exact)) {
  4361. xfree(reg->exact);
  4362. reg->exact = (UChar* )NULL;
  4363. }
  4364. }
  4365. #ifdef ONIG_DEBUG
  4366. static void print_enc_string(FILE* fp, OnigEncoding enc,
  4367. const UChar *s, const UChar *end)
  4368. {
  4369. fprintf(fp, "\nPATTERN: /");
  4370. if (ONIGENC_MBC_MINLEN(enc) > 1) {
  4371. const UChar *p;
  4372. OnigCodePoint code;
  4373. p = s;
  4374. while (p < end) {
  4375. code = ONIGENC_MBC_TO_CODE(enc, p, end);
  4376. if (code >= 0x80) {
  4377. fprintf(fp, " 0x%04x ", (int )code);
  4378. }
  4379. else {
  4380. fputc((int )code, fp);
  4381. }
  4382. p += enclen(enc, p, end);
  4383. }
  4384. }
  4385. else {
  4386. while (s < end) {
  4387. fputc((int )*s, fp);
  4388. s++;
  4389. }
  4390. }
  4391. fprintf(fp, "/\n");
  4392. }
  4393. static void
  4394. print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
  4395. {
  4396. if (a == ONIG_INFINITE_DISTANCE)
  4397. fputs("inf", f);
  4398. else
  4399. fprintf(f, "(%u)", a);
  4400. fputs("-", f);
  4401. if (b == ONIG_INFINITE_DISTANCE)
  4402. fputs("inf", f);
  4403. else
  4404. fprintf(f, "(%u)", b);
  4405. }
  4406. static void
  4407. print_anchor(FILE* f, int anchor)
  4408. {
  4409. int q = 0;
  4410. fprintf(f, "[");
  4411. if (anchor & ANCHOR_BEGIN_BUF) {
  4412. fprintf(f, "begin-buf");
  4413. q = 1;
  4414. }
  4415. if (anchor & ANCHOR_BEGIN_LINE) {
  4416. if (q) fprintf(f, ", ");
  4417. q = 1;
  4418. fprintf(f, "begin-line");
  4419. }
  4420. if (anchor & ANCHOR_BEGIN_POSITION) {
  4421. if (q) fprintf(f, ", ");
  4422. q = 1;
  4423. fprintf(f, "begin-pos");
  4424. }
  4425. if (anchor & ANCHOR_END_BUF) {
  4426. if (q) fprintf(f, ", ");
  4427. q = 1;
  4428. fprintf(f, "end-buf");
  4429. }
  4430. if (anchor & ANCHOR_SEMI_END_BUF) {
  4431. if (q) fprintf(f, ", ");
  4432. q = 1;
  4433. fprintf(f, "semi-end-buf");
  4434. }
  4435. if (anchor & ANCHOR_END_LINE) {
  4436. if (q) fprintf(f, ", ");
  4437. q = 1;
  4438. fprintf(f, "end-line");
  4439. }
  4440. if (anchor & ANCHOR_ANYCHAR_STAR) {
  4441. if (q) fprintf(f, ", ");
  4442. q = 1;
  4443. fprintf(f, "anychar-star");
  4444. }
  4445. if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
  4446. if (q) fprintf(f, ", ");
  4447. fprintf(f, "anychar-star-pl");
  4448. }
  4449. fprintf(f, "]");
  4450. }
  4451. static void
  4452. print_optimize_info(FILE* f, regex_t* reg)
  4453. {
  4454. static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
  4455. "EXACT_IC", "MAP" };
  4456. fprintf(f, "optimize: %s\n", on[reg->optimize]);
  4457. fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
  4458. if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
  4459. print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
  4460. fprintf(f, "\n");
  4461. if (reg->optimize) {
  4462. fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
  4463. fprintf(f, "\n");
  4464. }
  4465. fprintf(f, "\n");
  4466. if (reg->exact) {
  4467. UChar *p;
  4468. fprintf(f, "exact: [");
  4469. for (p = reg->exact; p < reg->exact_end; p++) {
  4470. fputc(*p, f);
  4471. }
  4472. fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
  4473. }
  4474. else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
  4475. int c, i, n = 0;
  4476. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
  4477. if (reg->map[i]) n++;
  4478. fprintf(f, "map: n=%d\n", n);
  4479. if (n > 0) {
  4480. c = 0;
  4481. fputc('[', f);
  4482. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
  4483. if (reg->map[i] != 0) {
  4484. if (c > 0) fputs(", ", f);
  4485. c++;
  4486. if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
  4487. ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
  4488. fputc(i, f);
  4489. else
  4490. fprintf(f, "%d", i);
  4491. }
  4492. }
  4493. fprintf(f, "]\n");
  4494. }
  4495. }
  4496. }
  4497. #endif /* ONIG_DEBUG */
  4498. extern void
  4499. onig_free_body(regex_t* reg)
  4500. {
  4501. if (IS_NOT_NULL(reg)) {
  4502. if (IS_NOT_NULL(reg->p)) xfree(reg->p);
  4503. if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
  4504. if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
  4505. if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
  4506. if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
  4507. if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
  4508. #ifdef USE_NAMED_GROUP
  4509. onig_names_free(reg);
  4510. #endif
  4511. }
  4512. }
  4513. extern void
  4514. onig_free(regex_t* reg)
  4515. {
  4516. if (IS_NOT_NULL(reg)) {
  4517. onig_free_body(reg);
  4518. xfree(reg);
  4519. }
  4520. }
  4521. size_t
  4522. onig_memsize(regex_t *reg)
  4523. {
  4524. size_t size = sizeof(regex_t);
  4525. if (IS_NOT_NULL(reg->p)) size += reg->alloc;
  4526. if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
  4527. if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  4528. if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  4529. if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
  4530. if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
  4531. return size;
  4532. }
  4533. #define REGEX_TRANSFER(to,from) do {\
  4534. (to)->state = ONIG_STATE_MODIFY;\
  4535. onig_free_body(to);\
  4536. xmemcpy(to, from, sizeof(regex_t));\
  4537. xfree(from);\
  4538. } while (0)
  4539. extern void
  4540. onig_transfer(regex_t* to, regex_t* from)
  4541. {
  4542. THREAD_ATOMIC_START;
  4543. REGEX_TRANSFER(to, from);
  4544. THREAD_ATOMIC_END;
  4545. }
  4546. #define REGEX_CHAIN_HEAD(reg) do {\
  4547. while (IS_NOT_NULL((reg)->chain)) {\
  4548. (reg) = (reg)->chain;\
  4549. }\
  4550. } while (0)
  4551. extern void
  4552. onig_chain_link_add(regex_t* to, regex_t* add)
  4553. {
  4554. THREAD_ATOMIC_START;
  4555. REGEX_CHAIN_HEAD(to);
  4556. to->chain = add;
  4557. THREAD_ATOMIC_END;
  4558. }
  4559. extern void
  4560. onig_chain_reduce(regex_t* reg)
  4561. {
  4562. regex_t *head, *prev;
  4563. prev = reg;
  4564. head = prev->chain;
  4565. if (IS_NOT_NULL(head)) {
  4566. reg->state = ONIG_STATE_MODIFY;
  4567. while (IS_NOT_NULL(head->chain)) {
  4568. prev = head;
  4569. head = head->chain;
  4570. }
  4571. prev->chain = (regex_t* )NULL;
  4572. REGEX_TRANSFER(reg, head);
  4573. }
  4574. }
  4575. #ifdef ONIG_DEBUG
  4576. static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg));
  4577. #endif
  4578. #ifdef ONIG_DEBUG_PARSE_TREE
  4579. static void print_tree P_((FILE* f, Node* node));
  4580. #endif
  4581. extern int
  4582. onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
  4583. OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
  4584. {
  4585. #define COMPILE_INIT_SIZE 20
  4586. int r, init_size;
  4587. Node* root;
  4588. ScanEnv scan_env = {0};
  4589. #ifdef USE_SUBEXP_CALL
  4590. UnsetAddrList uslist;
  4591. #endif
  4592. if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
  4593. scan_env.sourcefile = sourcefile;
  4594. scan_env.sourceline = sourceline;
  4595. reg->state = ONIG_STATE_COMPILING;
  4596. #ifdef ONIG_DEBUG
  4597. print_enc_string(stderr, reg->enc, pattern, pattern_end);
  4598. #endif
  4599. if (reg->alloc == 0) {
  4600. init_size = (pattern_end - pattern) * 2;
  4601. if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
  4602. r = BBUF_INIT(reg, init_size);
  4603. if (r != 0) goto end;
  4604. }
  4605. else
  4606. reg->used = 0;
  4607. reg->num_mem = 0;
  4608. reg->num_repeat = 0;
  4609. reg->num_null_check = 0;
  4610. reg->repeat_range_alloc = 0;
  4611. reg->repeat_range = (OnigRepeatRange* )NULL;
  4612. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  4613. reg->num_comb_exp_check = 0;
  4614. #endif
  4615. r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
  4616. if (r != 0) goto err;
  4617. #ifdef USE_NAMED_GROUP
  4618. /* mixed use named group and no-named group */
  4619. if (scan_env.num_named > 0 &&
  4620. IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  4621. !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
  4622. if (scan_env.num_named != scan_env.num_mem)
  4623. r = disable_noname_group_capture(&root, reg, &scan_env);
  4624. else
  4625. r = numbered_ref_check(root);
  4626. if (r != 0) goto err;
  4627. }
  4628. #endif
  4629. #ifdef USE_SUBEXP_CALL
  4630. if (scan_env.num_call > 0) {
  4631. r = unset_addr_list_init(&uslist, scan_env.num_call);
  4632. if (r != 0) goto err;
  4633. scan_env.unset_addr_list = &uslist;
  4634. r = setup_subexp_call(root, &scan_env);
  4635. if (r != 0) goto err_unset;
  4636. r = subexp_recursive_check_trav(root, &scan_env);
  4637. if (r < 0) goto err_unset;
  4638. r = subexp_inf_recursive_check_trav(root, &scan_env);
  4639. if (r != 0) goto err_unset;
  4640. reg->num_call = scan_env.num_call;
  4641. }
  4642. else
  4643. reg->num_call = 0;
  4644. #endif
  4645. r = setup_tree(root, reg, IN_LAST, &scan_env);
  4646. if (r != 0) goto err_unset;
  4647. #ifdef ONIG_DEBUG_PARSE_TREE
  4648. print_tree(stderr, root);
  4649. #endif
  4650. reg->capture_history = scan_env.capture_history;
  4651. reg->bt_mem_start = scan_env.bt_mem_start;
  4652. reg->bt_mem_start |= reg->capture_history;
  4653. if (IS_FIND_CONDITION(reg->options))
  4654. BIT_STATUS_ON_ALL(reg->bt_mem_end);
  4655. else {
  4656. reg->bt_mem_end = scan_env.bt_mem_end;
  4657. reg->bt_mem_end |= reg->capture_history;
  4658. }
  4659. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  4660. if (scan_env.backrefed_mem == 0
  4661. #ifdef USE_SUBEXP_CALL
  4662. || scan_env.num_call == 0
  4663. #endif
  4664. ) {
  4665. setup_comb_exp_check(root, 0, &scan_env);
  4666. #ifdef USE_SUBEXP_CALL
  4667. if (scan_env.has_recursion != 0) {
  4668. scan_env.num_comb_exp_check = 0;
  4669. }
  4670. else
  4671. #endif
  4672. if (scan_env.comb_exp_max_regnum > 0) {
  4673. int i;
  4674. for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
  4675. if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
  4676. scan_env.num_comb_exp_check = 0;
  4677. break;
  4678. }
  4679. }
  4680. }
  4681. }
  4682. reg->num_comb_exp_check = scan_env.num_comb_exp_check;
  4683. #endif
  4684. clear_optimize_info(reg);
  4685. #ifndef ONIG_DONT_OPTIMIZE
  4686. r = set_optimize_info_from_tree(root, reg, &scan_env);
  4687. if (r != 0) goto err_unset;
  4688. #endif
  4689. if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
  4690. xfree(scan_env.mem_nodes_dynamic);
  4691. scan_env.mem_nodes_dynamic = (Node** )NULL;
  4692. }
  4693. r = compile_tree(root, reg);
  4694. if (r == 0) {
  4695. r = add_opcode(reg, OP_END);
  4696. #ifdef USE_SUBEXP_CALL
  4697. if (scan_env.num_call > 0) {
  4698. r = unset_addr_list_fix(&uslist, reg);
  4699. unset_addr_list_end(&uslist);
  4700. if (r) goto err;
  4701. }
  4702. #endif
  4703. if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
  4704. reg->stack_pop_level = STACK_POP_LEVEL_ALL;
  4705. else {
  4706. if (reg->bt_mem_start != 0)
  4707. reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
  4708. else
  4709. reg->stack_pop_level = STACK_POP_LEVEL_FREE;
  4710. }
  4711. }
  4712. #ifdef USE_SUBEXP_CALL
  4713. else if (scan_env.num_call > 0) {
  4714. unset_addr_list_end(&uslist);
  4715. }
  4716. #endif
  4717. onig_node_free(root);
  4718. #ifdef ONIG_DEBUG_COMPILE
  4719. #ifdef USE_NAMED_GROUP
  4720. onig_print_names(stderr, reg);
  4721. #endif
  4722. print_compiled_byte_code_list(stderr, reg);
  4723. #endif
  4724. end:
  4725. reg->state = ONIG_STATE_NORMAL;
  4726. return r;
  4727. err_unset:
  4728. #ifdef USE_SUBEXP_CALL
  4729. if (scan_env.num_call > 0) {
  4730. unset_addr_list_end(&uslist);
  4731. }
  4732. #endif
  4733. err:
  4734. if (IS_NOT_NULL(scan_env.error)) {
  4735. if (IS_NOT_NULL(einfo)) {
  4736. einfo->enc = scan_env.enc;
  4737. einfo->par = scan_env.error;
  4738. einfo->par_end = scan_env.error_end;
  4739. }
  4740. }
  4741. onig_node_free(root);
  4742. if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
  4743. xfree(scan_env.mem_nodes_dynamic);
  4744. return r;
  4745. }
  4746. #ifdef USE_RECOMPILE_API
  4747. extern int
  4748. onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
  4749. OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax,
  4750. OnigErrorInfo* einfo)
  4751. {
  4752. int r;
  4753. regex_t *new_reg;
  4754. r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo);
  4755. if (r) return r;
  4756. if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
  4757. onig_transfer(reg, new_reg);
  4758. }
  4759. else {
  4760. onig_chain_link_add(reg, new_reg);
  4761. }
  4762. return 0;
  4763. }
  4764. #endif
  4765. static int onig_inited = 0;
  4766. extern int
  4767. onig_reg_init(regex_t* reg, OnigOptionType option,
  4768. OnigCaseFoldType case_fold_flag,
  4769. OnigEncoding enc, const OnigSyntaxType* syntax)
  4770. {
  4771. if (! onig_inited)
  4772. onig_init();
  4773. if (IS_NULL(reg))
  4774. return ONIGERR_INVALID_ARGUMENT;
  4775. if (ONIGENC_IS_UNDEF(enc))
  4776. return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED;
  4777. if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
  4778. == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
  4779. return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
  4780. }
  4781. (reg)->state = ONIG_STATE_MODIFY;
  4782. if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
  4783. option |= syntax->options;
  4784. option &= ~ONIG_OPTION_SINGLELINE;
  4785. }
  4786. else
  4787. option |= syntax->options;
  4788. (reg)->enc = enc;
  4789. (reg)->options = option;
  4790. (reg)->syntax = syntax;
  4791. (reg)->optimize = 0;
  4792. (reg)->exact = (UChar* )NULL;
  4793. (reg)->int_map = (int* )NULL;
  4794. (reg)->int_map_backward = (int* )NULL;
  4795. (reg)->chain = (regex_t* )NULL;
  4796. (reg)->p = (UChar* )NULL;
  4797. (reg)->alloc = 0;
  4798. (reg)->used = 0;
  4799. (reg)->name_table = (void* )NULL;
  4800. (reg)->case_fold_flag = case_fold_flag;
  4801. return 0;
  4802. }
  4803. extern int
  4804. onig_new_without_alloc(regex_t* reg, const UChar* pattern,
  4805. const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
  4806. OnigSyntaxType* syntax, OnigErrorInfo* einfo)
  4807. {
  4808. int r;
  4809. r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
  4810. if (r) return r;
  4811. r = onig_compile(reg, pattern, pattern_end, einfo, NULL, 0);
  4812. return r;
  4813. }
  4814. extern int
  4815. onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
  4816. OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
  4817. OnigErrorInfo* einfo)
  4818. {
  4819. int r;
  4820. *reg = (regex_t* )xmalloc(sizeof(regex_t));
  4821. if (IS_NULL(*reg)) return ONIGERR_MEMORY;
  4822. r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
  4823. if (r) goto err;
  4824. r = onig_compile(*reg, pattern, pattern_end, einfo, NULL, 0);
  4825. if (r) {
  4826. err:
  4827. onig_free(*reg);
  4828. *reg = NULL;
  4829. }
  4830. return r;
  4831. }
  4832. extern int
  4833. onig_init(void)
  4834. {
  4835. if (onig_inited != 0)
  4836. return 0;
  4837. THREAD_SYSTEM_INIT;
  4838. THREAD_ATOMIC_START;
  4839. onig_inited = 1;
  4840. onigenc_init();
  4841. /* onigenc_set_default_caseconv_table((UChar* )0); */
  4842. #ifdef ONIG_DEBUG_STATISTICS
  4843. onig_statistics_init();
  4844. #endif
  4845. THREAD_ATOMIC_END;
  4846. return 0;
  4847. }
  4848. extern int
  4849. onig_end(void)
  4850. {
  4851. THREAD_ATOMIC_START;
  4852. #ifdef ONIG_DEBUG_STATISTICS
  4853. onig_print_statistics(stderr);
  4854. #endif
  4855. #ifdef USE_SHARED_CCLASS_TABLE
  4856. onig_free_shared_cclass_table();
  4857. #endif
  4858. #ifdef USE_PARSE_TREE_NODE_RECYCLE
  4859. onig_free_node_list();
  4860. #endif
  4861. onig_inited = 0;
  4862. THREAD_ATOMIC_END;
  4863. THREAD_SYSTEM_END;
  4864. return 0;
  4865. }
  4866. extern int
  4867. onig_is_in_code_range(const UChar* p, OnigCodePoint code)
  4868. {
  4869. OnigCodePoint n, *data;
  4870. OnigCodePoint low, high, x;
  4871. GET_CODE_POINT(n, p);
  4872. data = (OnigCodePoint* )p;
  4873. data++;
  4874. for (low = 0, high = n; low < high; ) {
  4875. x = (low + high) >> 1;
  4876. if (code > data[x * 2 + 1])
  4877. low = x + 1;
  4878. else
  4879. high = x;
  4880. }
  4881. return ((low < n && code >= data[low * 2]) ? 1 : 0);
  4882. }
  4883. extern int
  4884. onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
  4885. {
  4886. int found;
  4887. if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
  4888. if (IS_NULL(cc->mbuf)) {
  4889. found = 0;
  4890. }
  4891. else {
  4892. found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
  4893. }
  4894. }
  4895. else {
  4896. found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
  4897. }
  4898. if (IS_NCCLASS_NOT(cc))
  4899. return !found;
  4900. else
  4901. return found;
  4902. }
  4903. extern int
  4904. onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
  4905. {
  4906. int len;
  4907. if (ONIGENC_MBC_MINLEN(enc) > 1) {
  4908. len = 2;
  4909. }
  4910. else {
  4911. len = ONIGENC_CODE_TO_MBCLEN(enc, code);
  4912. }
  4913. return onig_is_code_in_cc_len(len, code, cc);
  4914. }
  4915. #ifdef ONIG_DEBUG
  4916. /* arguments type */
  4917. #define ARG_SPECIAL -1
  4918. #define ARG_NON 0
  4919. #define ARG_RELADDR 1
  4920. #define ARG_ABSADDR 2
  4921. #define ARG_LENGTH 3
  4922. #define ARG_MEMNUM 4
  4923. #define ARG_OPTION 5
  4924. #define ARG_STATE_CHECK 6
  4925. OnigOpInfoType OnigOpInfo[] = {
  4926. { OP_FINISH, "finish", ARG_NON },
  4927. { OP_END, "end", ARG_NON },
  4928. { OP_EXACT1, "exact1", ARG_SPECIAL },
  4929. { OP_EXACT2, "exact2", ARG_SPECIAL },
  4930. { OP_EXACT3, "exact3", ARG_SPECIAL },
  4931. { OP_EXACT4, "exact4", ARG_SPECIAL },
  4932. { OP_EXACT5, "exact5", ARG_SPECIAL },
  4933. { OP_EXACTN, "exactn", ARG_SPECIAL },
  4934. { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
  4935. { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
  4936. { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
  4937. { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
  4938. { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
  4939. { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
  4940. { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
  4941. { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
  4942. { OP_CCLASS, "cclass", ARG_SPECIAL },
  4943. { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
  4944. { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
  4945. { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
  4946. { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
  4947. { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
  4948. { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL },
  4949. { OP_ANYCHAR, "anychar", ARG_NON },
  4950. { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
  4951. { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
  4952. { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
  4953. { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
  4954. { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
  4955. { OP_WORD, "word", ARG_NON },
  4956. { OP_NOT_WORD, "not-word", ARG_NON },
  4957. { OP_WORD_BOUND, "word-bound", ARG_NON },
  4958. { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
  4959. { OP_WORD_BEGIN, "word-begin", ARG_NON },
  4960. { OP_WORD_END, "word-end", ARG_NON },
  4961. { OP_BEGIN_BUF, "begin-buf", ARG_NON },
  4962. { OP_END_BUF, "end-buf", ARG_NON },
  4963. { OP_BEGIN_LINE, "begin-line", ARG_NON },
  4964. { OP_END_LINE, "end-line", ARG_NON },
  4965. { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
  4966. { OP_BEGIN_POSITION, "begin-position", ARG_NON },
  4967. { OP_BACKREF1, "backref1", ARG_NON },
  4968. { OP_BACKREF2, "backref2", ARG_NON },
  4969. { OP_BACKREFN, "backrefn", ARG_MEMNUM },
  4970. { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
  4971. { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
  4972. { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
  4973. { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
  4974. { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
  4975. { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
  4976. { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
  4977. { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
  4978. { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
  4979. { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
  4980. { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
  4981. { OP_SET_OPTION, "set-option", ARG_OPTION },
  4982. { OP_FAIL, "fail", ARG_NON },
  4983. { OP_JUMP, "jump", ARG_RELADDR },
  4984. { OP_PUSH, "push", ARG_RELADDR },
  4985. { OP_POP, "pop", ARG_NON },
  4986. { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
  4987. { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
  4988. { OP_REPEAT, "repeat", ARG_SPECIAL },
  4989. { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
  4990. { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
  4991. { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
  4992. { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
  4993. { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
  4994. { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
  4995. { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
  4996. { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
  4997. { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
  4998. { OP_PUSH_POS, "push-pos", ARG_NON },
  4999. { OP_POP_POS, "pop-pos", ARG_NON },
  5000. { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
  5001. { OP_FAIL_POS, "fail-pos", ARG_NON },
  5002. { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
  5003. { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
  5004. { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
  5005. { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
  5006. { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
  5007. { OP_CALL, "call", ARG_ABSADDR },
  5008. { OP_RETURN, "return", ARG_NON },
  5009. { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
  5010. { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
  5011. { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
  5012. { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
  5013. { OP_STATE_CHECK_ANYCHAR_ML_STAR,
  5014. "state-check-anychar-ml*", ARG_STATE_CHECK },
  5015. { -1, "", ARG_NON }
  5016. };
  5017. static char*
  5018. op2name(int opcode)
  5019. {
  5020. int i;
  5021. for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
  5022. if (opcode == OnigOpInfo[i].opcode)
  5023. return OnigOpInfo[i].name;
  5024. }
  5025. return "";
  5026. }
  5027. static int
  5028. op2arg_type(int opcode)
  5029. {
  5030. int i;
  5031. for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
  5032. if (opcode == OnigOpInfo[i].opcode)
  5033. return OnigOpInfo[i].arg_type;
  5034. }
  5035. return ARG_SPECIAL;
  5036. }
  /*
   * Write `indent` space characters to `f`.  Nothing is written when
   * `indent` is zero or negative.
   */
  static void
  Indent(FILE* f, int indent)
  {
    int remaining = indent;

    while (remaining > 0) {
      putc(' ', f);
      remaining--;
    }
  }
  5043. static void
  5044. p_string(FILE* f, int len, UChar* s)
  5045. {
  5046. fputs(":", f);
  5047. while (len-- > 0) { fputc(*s++, f); }
  5048. }
  5049. static void
  5050. p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
  5051. {
  5052. int x = len * mb_len;
  5053. fprintf(f, ":%d:", len);
  5054. while (x-- > 0) { fputc(*s++, f); }
  5055. }
  5056. extern void
  5057. onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
  5058. OnigEncoding enc)
  5059. {
  5060. int i, n, arg_type;
  5061. RelAddrType addr;
  5062. LengthType len;
  5063. MemNumType mem;
  5064. StateCheckNumType scn;
  5065. OnigCodePoint code;
  5066. UChar *q;
  5067. fprintf(f, "[%s", op2name(*bp));
  5068. arg_type = op2arg_type(*bp);
  5069. if (arg_type != ARG_SPECIAL) {
  5070. bp++;
  5071. switch (arg_type) {
  5072. case ARG_NON:
  5073. break;
  5074. case ARG_RELADDR:
  5075. GET_RELADDR_INC(addr, bp);
  5076. fprintf(f, ":(%d)", addr);
  5077. break;
  5078. case ARG_ABSADDR:
  5079. GET_ABSADDR_INC(addr, bp);
  5080. fprintf(f, ":(%d)", addr);
  5081. break;
  5082. case ARG_LENGTH:
  5083. GET_LENGTH_INC(len, bp);
  5084. fprintf(f, ":%d", len);
  5085. break;
  5086. case ARG_MEMNUM:
  5087. mem = *((MemNumType* )bp);
  5088. bp += SIZE_MEMNUM;
  5089. fprintf(f, ":%d", mem);
  5090. break;
  5091. case ARG_OPTION:
  5092. {
  5093. OnigOptionType option = *((OnigOptionType* )bp);
  5094. bp += SIZE_OPTION;
  5095. fprintf(f, ":%d", option);
  5096. }
  5097. break;
  5098. case ARG_STATE_CHECK:
  5099. scn = *((StateCheckNumType* )bp);
  5100. bp += SIZE_STATE_CHECK_NUM;
  5101. fprintf(f, ":%d", scn);
  5102. break;
  5103. }
  5104. }
  5105. else {
  5106. switch (*bp++) {
  5107. case OP_EXACT1:
  5108. case OP_ANYCHAR_STAR_PEEK_NEXT:
  5109. case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
  5110. p_string(f, 1, bp++); break;
  5111. case OP_EXACT2:
  5112. p_string(f, 2, bp); bp += 2; break;
  5113. case OP_EXACT3:
  5114. p_string(f, 3, bp); bp += 3; break;
  5115. case OP_EXACT4:
  5116. p_string(f, 4, bp); bp += 4; break;
  5117. case OP_EXACT5:
  5118. p_string(f, 5, bp); bp += 5; break;
  5119. case OP_EXACTN:
  5120. GET_LENGTH_INC(len, bp);
  5121. p_len_string(f, len, 1, bp);
  5122. bp += len;
  5123. break;
  5124. case OP_EXACTMB2N1:
  5125. p_string(f, 2, bp); bp += 2; break;
  5126. case OP_EXACTMB2N2:
  5127. p_string(f, 4, bp); bp += 4; break;
  5128. case OP_EXACTMB2N3:
  5129. p_string(f, 6, bp); bp += 6; break;
  5130. case OP_EXACTMB2N:
  5131. GET_LENGTH_INC(len, bp);
  5132. p_len_string(f, len, 2, bp);
  5133. bp += len * 2;
  5134. break;
  5135. case OP_EXACTMB3N:
  5136. GET_LENGTH_INC(len, bp);
  5137. p_len_string(f, len, 3, bp);
  5138. bp += len * 3;
  5139. break;
  5140. case OP_EXACTMBN:
  5141. {
  5142. int mb_len;
  5143. GET_LENGTH_INC(mb_len, bp);
  5144. GET_LENGTH_INC(len, bp);
  5145. fprintf(f, ":%d:%d:", mb_len, len);
  5146. n = len * mb_len;
  5147. while (n-- > 0) { fputc(*bp++, f); }
  5148. }
  5149. break;
  5150. case OP_EXACT1_IC:
  5151. len = enclen(enc, bp, bpend);
  5152. p_string(f, len, bp);
  5153. bp += len;
  5154. break;
  5155. case OP_EXACTN_IC:
  5156. GET_LENGTH_INC(len, bp);
  5157. p_len_string(f, len, 1, bp);
  5158. bp += len;
  5159. break;
  5160. case OP_CCLASS:
  5161. n = bitset_on_num((BitSetRef )bp);
  5162. bp += SIZE_BITSET;
  5163. fprintf(f, ":%d", n);
  5164. break;
  5165. case OP_CCLASS_NOT:
  5166. n = bitset_on_num((BitSetRef )bp);
  5167. bp += SIZE_BITSET;
  5168. fprintf(f, ":%d", n);
  5169. break;
  5170. case OP_CCLASS_MB:
  5171. case OP_CCLASS_MB_NOT:
  5172. GET_LENGTH_INC(len, bp);
  5173. q = bp;
  5174. #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  5175. ALIGNMENT_RIGHT(q);
  5176. #endif
  5177. GET_CODE_POINT(code, q);
  5178. bp += len;
  5179. fprintf(f, ":%d:%d", (int )code, len);
  5180. break;
  5181. case OP_CCLASS_MIX:
  5182. case OP_CCLASS_MIX_NOT:
  5183. n = bitset_on_num((BitSetRef )bp);
  5184. bp += SIZE_BITSET;
  5185. GET_LENGTH_INC(len, bp);
  5186. q = bp;
  5187. #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  5188. ALIGNMENT_RIGHT(q);
  5189. #endif
  5190. GET_CODE_POINT(code, q);
  5191. bp += len;
  5192. fprintf(f, ":%d:%d:%d", n, (int )code, len);
  5193. break;
  5194. case OP_CCLASS_NODE:
  5195. {
  5196. CClassNode *cc;
  5197. GET_POINTER_INC(cc, bp);
  5198. n = bitset_on_num(cc->bs);
  5199. fprintf(f, ":%u:%d", (unsigned int )cc, n);
  5200. }
  5201. break;
  5202. case OP_BACKREFN_IC:
  5203. mem = *((MemNumType* )bp);
  5204. bp += SIZE_MEMNUM;
  5205. fprintf(f, ":%d", mem);
  5206. break;
  5207. case OP_BACKREF_MULTI_IC:
  5208. case OP_BACKREF_MULTI:
  5209. fputs(" ", f);
  5210. GET_LENGTH_INC(len, bp);
  5211. for (i = 0; i < len; i++) {
  5212. GET_MEMNUM_INC(mem, bp);
  5213. if (i > 0) fputs(", ", f);
  5214. fprintf(f, "%d", mem);
  5215. }
  5216. break;
  5217. case OP_BACKREF_WITH_LEVEL:
  5218. {
  5219. OnigOptionType option;
  5220. LengthType level;
  5221. GET_OPTION_INC(option, bp);
  5222. fprintf(f, ":%d", option);
  5223. GET_LENGTH_INC(level, bp);
  5224. fprintf(f, ":%d", level);
  5225. fputs(" ", f);
  5226. GET_LENGTH_INC(len, bp);
  5227. for (i = 0; i < len; i++) {
  5228. GET_MEMNUM_INC(mem, bp);
  5229. if (i > 0) fputs(", ", f);
  5230. fprintf(f, "%d", mem);
  5231. }
  5232. }
  5233. break;
  5234. case OP_REPEAT:
  5235. case OP_REPEAT_NG:
  5236. {
  5237. mem = *((MemNumType* )bp);
  5238. bp += SIZE_MEMNUM;
  5239. addr = *((RelAddrType* )bp);
  5240. bp += SIZE_RELADDR;
  5241. fprintf(f, ":%d:%d", mem, addr);
  5242. }
  5243. break;
  5244. case OP_PUSH_OR_JUMP_EXACT1:
  5245. case OP_PUSH_IF_PEEK_NEXT:
  5246. addr = *((RelAddrType* )bp);
  5247. bp += SIZE_RELADDR;
  5248. fprintf(f, ":(%d)", addr);
  5249. p_string(f, 1, bp);
  5250. bp += 1;
  5251. break;
  5252. case OP_LOOK_BEHIND:
  5253. GET_LENGTH_INC(len, bp);
  5254. fprintf(f, ":%d", len);
  5255. break;
  5256. case OP_PUSH_LOOK_BEHIND_NOT:
  5257. GET_RELADDR_INC(addr, bp);
  5258. GET_LENGTH_INC(len, bp);
  5259. fprintf(f, ":%d:(%d)", len, addr);
  5260. break;
  5261. case OP_STATE_CHECK_PUSH:
  5262. case OP_STATE_CHECK_PUSH_OR_JUMP:
  5263. scn = *((StateCheckNumType* )bp);
  5264. bp += SIZE_STATE_CHECK_NUM;
  5265. addr = *((RelAddrType* )bp);
  5266. bp += SIZE_RELADDR;
  5267. fprintf(f, ":%d:(%d)", scn, addr);
  5268. break;
  5269. default:
  5270. fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
  5271. *--bp);
  5272. }
  5273. }
  5274. fputs("]", f);
  5275. if (nextp) *nextp = bp;
  5276. }
  5277. static void
  5278. print_compiled_byte_code_list(FILE* f, regex_t* reg)
  5279. {
  5280. int ncode;
  5281. UChar* bp = reg->p;
  5282. UChar* end = reg->p + reg->used;
  5283. fprintf(f, "code length: %d\n", reg->used);
  5284. ncode = 0;
  5285. while (bp < end) {
  5286. ncode++;
  5287. if (bp > reg->p) {
  5288. if (ncode % 5 == 0)
  5289. fprintf(f, "\n");
  5290. else
  5291. fputs(" ", f);
  5292. }
  5293. onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
  5294. }
  5295. fprintf(f, "\n");
  5296. }
  5297. static void
  5298. print_indent_tree(FILE* f, Node* node, int indent)
  5299. {
  5300. int i, type;
  5301. int add = 3;
  5302. UChar* p;
  5303. Indent(f, indent);
  5304. if (IS_NULL(node)) {
  5305. fprintf(f, "ERROR: null node!!!\n");
  5306. exit (0);
  5307. }
  5308. type = NTYPE(node);
  5309. switch (type) {
  5310. case NT_LIST:
  5311. case NT_ALT:
  5312. if (NTYPE(node) == NT_LIST)
  5313. fprintf(f, "<list:%x>\n", (int )node);
  5314. else
  5315. fprintf(f, "<alt:%x>\n", (int )node);
  5316. print_indent_tree(f, NCAR(node), indent + add);
  5317. while (IS_NOT_NULL(node = NCDR(node))) {
  5318. if (NTYPE(node) != type) {
  5319. fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
  5320. exit(0);
  5321. }
  5322. print_indent_tree(f, NCAR(node), indent + add);
  5323. }
  5324. break;
  5325. case NT_STR:
  5326. fprintf(f, "<string%s:%x>",
  5327. (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node);
  5328. for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
  5329. if (*p >= 0x20 && *p < 0x7f)
  5330. fputc(*p, f);
  5331. else {
  5332. fprintf(f, " 0x%02x", *p);
  5333. }
  5334. }
  5335. break;
  5336. case NT_CCLASS:
  5337. fprintf(f, "<cclass:%x>", (int )node);
  5338. if (IS_NCCLASS_NOT(NCCLASS(node))) fputs(" not", f);
  5339. if (NCCLASS(node)->mbuf) {
  5340. BBuf* bbuf = NCCLASS(node)->mbuf;
  5341. for (i = 0; i < bbuf->used; i++) {
  5342. if (i > 0) fprintf(f, ",");
  5343. fprintf(f, "%0x", bbuf->p[i]);
  5344. }
  5345. }
  5346. break;
  5347. case NT_CTYPE:
  5348. fprintf(f, "<ctype:%x> ", (int )node);
  5349. switch (NCTYPE(node)->ctype) {
  5350. case ONIGENC_CTYPE_WORD:
  5351. if (NCTYPE(node)->not != 0)
  5352. fputs("not word", f);
  5353. else
  5354. fputs("word", f);
  5355. break;
  5356. default:
  5357. fprintf(f, "ERROR: undefined ctype.\n");
  5358. exit(0);
  5359. }
  5360. break;
  5361. case NT_CANY:
  5362. fprintf(f, "<anychar:%x>", (int )node);
  5363. break;
  5364. case NT_ANCHOR:
  5365. fprintf(f, "<anchor:%x> ", (int )node);
  5366. switch (NANCHOR(node)->type) {
  5367. case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
  5368. case ANCHOR_END_BUF: fputs("end buf", f); break;
  5369. case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
  5370. case ANCHOR_END_LINE: fputs("end line", f); break;
  5371. case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
  5372. case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
  5373. case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
  5374. case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
  5375. #ifdef USE_WORD_BEGIN_END
  5376. case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
  5377. case ANCHOR_WORD_END: fputs("word end", f); break;
  5378. #endif
  5379. case ANCHOR_PREC_READ: fputs("prec read", f); break;
  5380. case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break;
  5381. case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break;
  5382. case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break;
  5383. default:
  5384. fprintf(f, "ERROR: undefined anchor type.\n");
  5385. break;
  5386. }
  5387. break;
  5388. case NT_BREF:
  5389. {
  5390. int* p;
  5391. BRefNode* br = NBREF(node);
  5392. p = BACKREFS_P(br);
  5393. fprintf(f, "<backref:%x>", (int )node);
  5394. for (i = 0; i < br->back_num; i++) {
  5395. if (i > 0) fputs(", ", f);
  5396. fprintf(f, "%d", p[i]);
  5397. }
  5398. }
  5399. break;
  5400. #ifdef USE_SUBEXP_CALL
  5401. case NT_CALL:
  5402. {
  5403. CallNode* cn = NCALL(node);
  5404. fprintf(f, "<call:%x>", (int )node);
  5405. p_string(f, cn->name_end - cn->name, cn->name);
  5406. }
  5407. break;
  5408. #endif
  5409. case NT_QTFR:
  5410. fprintf(f, "<quantifier:%x>{%d,%d}%s\n", (int )node,
  5411. NQTFR(node)->lower, NQTFR(node)->upper,
  5412. (NQTFR(node)->greedy ? "" : "?"));
  5413. print_indent_tree(f, NQTFR(node)->target, indent + add);
  5414. break;
  5415. case NT_ENCLOSE:
  5416. fprintf(f, "<enclose:%x> ", (int )node);
  5417. switch (NENCLOSE(node)->type) {
  5418. case ENCLOSE_OPTION:
  5419. fprintf(f, "option:%d\n", NENCLOSE(node)->option);
  5420. print_indent_tree(f, NENCLOSE(node)->target, indent + add);
  5421. break;
  5422. case ENCLOSE_MEMORY:
  5423. fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
  5424. break;
  5425. case ENCLOSE_STOP_BACKTRACK:
  5426. fprintf(f, "stop-bt");
  5427. break;
  5428. default:
  5429. break;
  5430. }
  5431. fprintf(f, "\n");
  5432. print_indent_tree(f, NENCLOSE(node)->target, indent + add);
  5433. break;
  5434. default:
  5435. fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
  5436. break;
  5437. }
  5438. if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
  5439. type != NT_ENCLOSE)
  5440. fprintf(f, "\n");
  5441. fflush(f);
  5442. }
  5443. #endif /* ONIG_DEBUG */
  5444. #ifdef ONIG_DEBUG_PARSE_TREE
  5445. static void
  5446. print_tree(FILE* f, Node* node)
  5447. {
  5448. print_indent_tree(f, node, 0);
  5449. }
  5450. #endif