PageRenderTime 48ms CodeModel.GetById 12ms RepoModel.GetById 0ms app.codeStats 1ms

/regcomp.c

https://github.com/wanabe/ruby
C | 6764 lines | 5799 code | 894 blank | 71 comment | 1548 complexity | 1db447b23f7c522e48fb2db082a273af MD5 | raw file
Possible License(s): LGPL-2.1, AGPL-3.0, 0BSD, Unlicense, GPL-2.0, BSD-3-Clause
  1. /**********************************************************************
  2. regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
  3. **********************************************************************/
  4. /*-
  5. * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
  6. * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
  7. * All rights reserved.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in the
  16. * documentation and/or other materials provided with the distribution.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  19. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  22. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28. * SUCH DAMAGE.
  29. */
  30. #include "regparse.h"
  31. OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN;
  32. extern OnigCaseFoldType
  33. onig_get_default_case_fold_flag(void)
  34. {
  35. return OnigDefaultCaseFoldFlag;
  36. }
  37. extern int
  38. onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag)
  39. {
  40. OnigDefaultCaseFoldFlag = case_fold_flag;
  41. return 0;
  42. }
  43. #ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  44. static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
  45. #endif
  #if 0
  /* (Disabled.) Duplicate the bytes in [s, end) into a freshly allocated,
     NUL-terminated buffer.  Returns NULL when the range is empty or inverted;
     an allocation failure is handled by CHECK_NULL_RETURN. */
  static UChar*
  str_dup(UChar* s, UChar* end)
  {
    UChar* copy;
    ptrdiff_t nbytes = end - s;

    if (nbytes <= 0) return NULL;

    copy = (UChar* )xmalloc(nbytes + 1);
    CHECK_NULL_RETURN(copy);
    xmemcpy(copy, s, nbytes);
    copy[nbytes] = (UChar )0;
    return copy;
  }
  #endif
  61. static void
  62. swap_node(Node* a, Node* b)
  63. {
  64. Node c;
  65. c = *a; *a = *b; *b = c;
  66. if (NTYPE(a) == NT_STR) {
  67. StrNode* sn = NSTR(a);
  68. if (sn->capa == 0) {
  69. size_t len = sn->end - sn->s;
  70. sn->s = sn->buf;
  71. sn->end = sn->s + len;
  72. }
  73. }
  74. if (NTYPE(b) == NT_STR) {
  75. StrNode* sn = NSTR(b);
  76. if (sn->capa == 0) {
  77. size_t len = sn->end - sn->s;
  78. sn->s = sn->buf;
  79. sn->end = sn->s + len;
  80. }
  81. }
  82. }
  83. static OnigDistance
  84. distance_add(OnigDistance d1, OnigDistance d2)
  85. {
  86. if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE)
  87. return ONIG_INFINITE_DISTANCE;
  88. else {
  89. if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2;
  90. else return ONIG_INFINITE_DISTANCE;
  91. }
  92. }
  93. static OnigDistance
  94. distance_multiply(OnigDistance d, int m)
  95. {
  96. if (m == 0) return 0;
  97. if (d < ONIG_INFINITE_DISTANCE / m)
  98. return d * m;
  99. else
  100. return ONIG_INFINITE_DISTANCE;
  101. }
  102. static int
  103. bitset_is_empty(BitSetRef bs)
  104. {
  105. int i;
  106. for (i = 0; i < BITSET_SIZE; i++) {
  107. if (bs[i] != 0) return 0;
  108. }
  109. return 1;
  110. }
  #ifdef ONIG_DEBUG
  /* Debug helper: count the positions in [0, SINGLE_BYTE_SIZE) for which
     BITSET_AT(bs, pos) is non-zero. */
  static int
  bitset_on_num(BitSetRef bs)
  {
    int count = 0;
    int pos;

    for (pos = 0; pos < SINGLE_BYTE_SIZE; pos++) {
      if (BITSET_AT(bs, pos)) count += 1;
    }
    return count;
  }
  #endif
  123. // Attempt to right size allocated buffers for a regex post compile
  124. static void
  125. onig_reg_resize(regex_t *reg)
  126. {
  127. resize:
  128. if (reg->alloc > reg->used) {
  129. unsigned char *new_ptr = xrealloc(reg->p, reg->used);
  130. // Skip the right size optimization if memory allocation fails
  131. if (new_ptr) {
  132. reg->alloc = reg->used;
  133. reg->p = new_ptr;
  134. }
  135. }
  136. if (reg->chain) {
  137. reg = reg->chain;
  138. goto resize;
  139. }
  140. }
  141. extern int
  142. onig_bbuf_init(BBuf* buf, OnigDistance size)
  143. {
  144. if (size <= 0) {
  145. size = 0;
  146. buf->p = NULL;
  147. }
  148. else {
  149. buf->p = (UChar* )xmalloc(size);
  150. if (IS_NULL(buf->p)) return(ONIGERR_MEMORY);
  151. }
  152. buf->alloc = (unsigned int )size;
  153. buf->used = 0;
  154. return 0;
  155. }
  156. #ifdef USE_SUBEXP_CALL
  157. static int
  158. unset_addr_list_init(UnsetAddrList* uslist, int size)
  159. {
  160. UnsetAddr* p;
  161. p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size);
  162. CHECK_NULL_RETURN_MEMERR(p);
  163. uslist->num = 0;
  164. uslist->alloc = size;
  165. uslist->us = p;
  166. return 0;
  167. }
  168. static void
  169. unset_addr_list_end(UnsetAddrList* uslist)
  170. {
  171. if (IS_NOT_NULL(uslist->us))
  172. xfree(uslist->us);
  173. }
  174. static int
  175. unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
  176. {
  177. UnsetAddr* p;
  178. int size;
  179. if (uslist->num >= uslist->alloc) {
  180. size = uslist->alloc * 2;
  181. p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size);
  182. CHECK_NULL_RETURN_MEMERR(p);
  183. uslist->alloc = size;
  184. uslist->us = p;
  185. }
  186. uslist->us[uslist->num].offset = offset;
  187. uslist->us[uslist->num].target = node;
  188. uslist->num++;
  189. return 0;
  190. }
  191. #endif /* USE_SUBEXP_CALL */
  /* Append `opcode` to reg through BBUF_ADD1 and return 0.
     NOTE(review): BBUF_ADD1 is defined outside this chunk; callers test the
     result for non-zero, so it presumably can return an error code from this
     function (e.g. when the buffer cannot grow) -- confirm. */
  192. static int
  193. add_opcode(regex_t* reg, int opcode)
  194. {
  195. BBUF_ADD1(reg, opcode);
  196. return 0;
  197. }
  #ifdef USE_COMBINATION_EXPLOSION_CHECK
  /* Append `num`, converted to StateCheckNumType, to reg as
     SIZE_STATE_CHECK_NUM bytes.  Returns 0 once BBUF_ADD has completed. */
  static int
  add_state_check_num(regex_t* reg, int num)
  {
    StateCheckNumType check_num = (StateCheckNumType )num;

    BBUF_ADD(reg, &check_num, SIZE_STATE_CHECK_NUM);
    return 0;
  }
  #endif
  207. static int
  208. add_rel_addr(regex_t* reg, int addr)
  209. {
  210. RelAddrType ra = (RelAddrType )addr;
  211. BBUF_ADD(reg, &ra, SIZE_RELADDR);
  212. return 0;
  213. }
  214. static int
  215. add_abs_addr(regex_t* reg, int addr)
  216. {
  217. AbsAddrType ra = (AbsAddrType )addr;
  218. BBUF_ADD(reg, &ra, SIZE_ABSADDR);
  219. return 0;
  220. }
  221. static int
  222. add_length(regex_t* reg, OnigDistance len)
  223. {
  224. LengthType l = (LengthType )len;
  225. BBUF_ADD(reg, &l, SIZE_LENGTH);
  226. return 0;
  227. }
  228. static int
  229. add_mem_num(regex_t* reg, int num)
  230. {
  231. MemNumType n = (MemNumType )num;
  232. BBUF_ADD(reg, &n, SIZE_MEMNUM);
  233. return 0;
  234. }
  #if 0
  /* (Disabled.) Append `addr`, converted to PointerType, to reg as
     SIZE_POINTER bytes. */
  static int
  add_pointer(regex_t* reg, void* addr)
  {
    PointerType raw = (PointerType )addr;

    BBUF_ADD(reg, &raw, SIZE_POINTER);
    return 0;
  }
  #endif
  /* Append the SIZE_OPTION bytes of `option` to reg through BBUF_ADD and
     return 0.  The bytes are taken from the by-value parameter itself.
     NOTE(review): BBUF_ADD is defined outside this chunk and presumably can
     return an error from this function -- confirm. */
  244. static int
  245. add_option(regex_t* reg, OnigOptionType option)
  246. {
  247. BBUF_ADD(reg, &option, SIZE_OPTION);
  248. return 0;
  249. }
  250. static int
  251. add_opcode_rel_addr(regex_t* reg, int opcode, int addr)
  252. {
  253. int r;
  254. r = add_opcode(reg, opcode);
  255. if (r) return r;
  256. r = add_rel_addr(reg, addr);
  257. return r;
  258. }
  /* Append `len` bytes starting at `bytes` to reg through BBUF_ADD and
     return 0.
     NOTE(review): BBUF_ADD is defined outside this chunk and presumably can
     return an error from this function -- confirm. */
  259. static int
  260. add_bytes(regex_t* reg, UChar* bytes, OnigDistance len)
  261. {
  262. BBUF_ADD(reg, bytes, len);
  263. return 0;
  264. }
  /* Append the SIZE_BITSET bytes of the bit set `bs` to reg through BBUF_ADD
     and return 0.
     NOTE(review): BBUF_ADD is defined outside this chunk and presumably can
     return an error from this function -- confirm. */
  265. static int
  266. add_bitset(regex_t* reg, BitSetRef bs)
  267. {
  268. BBUF_ADD(reg, bs, SIZE_BITSET);
  269. return 0;
  270. }
  271. static int
  272. add_opcode_option(regex_t* reg, int opcode, OnigOptionType option)
  273. {
  274. int r;
  275. r = add_opcode(reg, opcode);
  276. if (r) return r;
  277. r = add_option(reg, option);
  278. return r;
  279. }
  280. static int compile_length_tree(Node* node, regex_t* reg);
  281. static int compile_tree(Node* node, regex_t* reg);
  282. #define IS_NEED_STR_LEN_OP_EXACT(op) \
  283. ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\
  284. (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC)
  285. static int
  286. select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case)
  287. {
  288. int op;
  289. OnigDistance str_len = (byte_len + mb_len - 1) / mb_len;
  290. if (ignore_case) {
  291. switch (str_len) {
  292. case 1: op = OP_EXACT1_IC; break;
  293. default: op = OP_EXACTN_IC; break;
  294. }
  295. }
  296. else {
  297. switch (mb_len) {
  298. case 1:
  299. switch (str_len) {
  300. case 1: op = OP_EXACT1; break;
  301. case 2: op = OP_EXACT2; break;
  302. case 3: op = OP_EXACT3; break;
  303. case 4: op = OP_EXACT4; break;
  304. case 5: op = OP_EXACT5; break;
  305. default: op = OP_EXACTN; break;
  306. }
  307. break;
  308. case 2:
  309. switch (str_len) {
  310. case 1: op = OP_EXACTMB2N1; break;
  311. case 2: op = OP_EXACTMB2N2; break;
  312. case 3: op = OP_EXACTMB2N3; break;
  313. default: op = OP_EXACTMB2N; break;
  314. }
  315. break;
  316. case 3:
  317. op = OP_EXACTMB3N;
  318. break;
  319. default:
  320. op = OP_EXACTMBN;
  321. break;
  322. }
  323. }
  324. return op;
  325. }
  326. static int
  327. compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
  328. {
  329. int r;
  330. int saved_num_null_check = reg->num_null_check;
  331. if (empty_info != 0) {
  332. r = add_opcode(reg, OP_NULL_CHECK_START);
  333. if (r) return r;
  334. r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
  335. if (r) return r;
  336. reg->num_null_check++;
  337. }
  338. r = compile_tree(node, reg);
  339. if (r) return r;
  340. if (empty_info != 0) {
  341. if (empty_info == NQ_TARGET_IS_EMPTY)
  342. r = add_opcode(reg, OP_NULL_CHECK_END);
  343. else if (empty_info == NQ_TARGET_IS_EMPTY_MEM)
  344. r = add_opcode(reg, OP_NULL_CHECK_END_MEMST);
  345. else if (empty_info == NQ_TARGET_IS_EMPTY_REC)
  346. r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH);
  347. if (r) return r;
  348. r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */
  349. }
  350. return r;
  351. }
  #ifdef USE_SUBEXP_CALL
  /* Emit OP_CALL with a placeholder absolute address of 0.
     The buffer offset of that placeholder and the call target are recorded
     in node->unset_addr_list before the placeholder is written. */
  static int
  compile_call(CallNode* node, regex_t* reg)
  {
    int r = add_opcode(reg, OP_CALL);

    if (r != 0) return r;

    r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
                            node->target);
    if (r != 0) return r;

    return add_abs_addr(reg, 0 /*dummy addr.*/);
  }
  #endif
  366. static int
  367. compile_tree_n_times(Node* node, int n, regex_t* reg)
  368. {
  369. int i, r;
  370. for (i = 0; i < n; i++) {
  371. r = compile_tree(node, reg);
  372. if (r) return r;
  373. }
  374. return 0;
  375. }
  376. static int
  377. add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len,
  378. regex_t* reg ARG_UNUSED, int ignore_case)
  379. {
  380. int len;
  381. int op = select_str_opcode(mb_len, byte_len, ignore_case);
  382. len = SIZE_OPCODE;
  383. if (op == OP_EXACTMBN) len += SIZE_LENGTH;
  384. if (IS_NEED_STR_LEN_OP_EXACT(op))
  385. len += SIZE_LENGTH;
  386. len += (int )byte_len;
  387. return len;
  388. }
  389. static int
  390. add_compile_string(UChar* s, int mb_len, OnigDistance byte_len,
  391. regex_t* reg, int ignore_case)
  392. {
  393. int op = select_str_opcode(mb_len, byte_len, ignore_case);
  394. add_opcode(reg, op);
  395. if (op == OP_EXACTMBN)
  396. add_length(reg, mb_len);
  397. if (IS_NEED_STR_LEN_OP_EXACT(op)) {
  398. if (op == OP_EXACTN_IC)
  399. add_length(reg, byte_len);
  400. else
  401. add_length(reg, byte_len / mb_len);
  402. }
  403. add_bytes(reg, s, byte_len);
  404. return 0;
  405. }
  406. static int
  407. compile_length_string_node(Node* node, regex_t* reg)
  408. {
  409. int rlen, r, len, prev_len, blen, ambig;
  410. OnigEncoding enc = reg->enc;
  411. UChar *p, *prev;
  412. StrNode* sn;
  413. sn = NSTR(node);
  414. if (sn->end <= sn->s)
  415. return 0;
  416. ambig = NSTRING_IS_AMBIG(node);
  417. p = prev = sn->s;
  418. prev_len = enclen(enc, p, sn->end);
  419. p += prev_len;
  420. blen = prev_len;
  421. rlen = 0;
  422. for (; p < sn->end; ) {
  423. len = enclen(enc, p, sn->end);
  424. if (len == prev_len || ambig) {
  425. blen += len;
  426. }
  427. else {
  428. r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
  429. rlen += r;
  430. prev = p;
  431. blen = len;
  432. prev_len = len;
  433. }
  434. p += len;
  435. }
  436. r = add_compile_string_length(prev, prev_len, blen, reg, ambig);
  437. rlen += r;
  438. return rlen;
  439. }
  440. static int
  441. compile_length_string_raw_node(StrNode* sn, regex_t* reg)
  442. {
  443. if (sn->end <= sn->s)
  444. return 0;
  445. return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
  446. }
  447. static int
  448. compile_string_node(Node* node, regex_t* reg)
  449. {
  450. int r, len, prev_len, blen, ambig;
  451. OnigEncoding enc = reg->enc;
  452. UChar *p, *prev, *end;
  453. StrNode* sn;
  454. sn = NSTR(node);
  455. if (sn->end <= sn->s)
  456. return 0;
  457. end = sn->end;
  458. ambig = NSTRING_IS_AMBIG(node);
  459. p = prev = sn->s;
  460. prev_len = enclen(enc, p, end);
  461. p += prev_len;
  462. blen = prev_len;
  463. for (; p < end; ) {
  464. len = enclen(enc, p, end);
  465. if (len == prev_len || ambig) {
  466. blen += len;
  467. }
  468. else {
  469. r = add_compile_string(prev, prev_len, blen, reg, ambig);
  470. if (r) return r;
  471. prev = p;
  472. blen = len;
  473. prev_len = len;
  474. }
  475. p += len;
  476. }
  477. return add_compile_string(prev, prev_len, blen, reg, ambig);
  478. }
  479. static int
  480. compile_string_raw_node(StrNode* sn, regex_t* reg)
  481. {
  482. if (sn->end <= sn->s)
  483. return 0;
  484. return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
  485. }
  486. static int
  487. add_multi_byte_cclass(BBuf* mbuf, regex_t* reg)
  488. {
  489. #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  490. add_length(reg, mbuf->used);
  491. return add_bytes(reg, mbuf->p, mbuf->used);
  492. #else
  493. int r, pad_size;
  494. UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH;
  495. GET_ALIGNMENT_PAD_SIZE(p, pad_size);
  496. add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1));
  497. if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
  498. r = add_bytes(reg, mbuf->p, mbuf->used);
  499. /* padding for return value from compile_length_cclass_node() to be fix. */
  500. pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size;
  501. if (pad_size != 0) add_bytes(reg, PadBuf, pad_size);
  502. return r;
  503. #endif
  504. }
  505. static int
  506. compile_length_cclass_node(CClassNode* cc, regex_t* reg)
  507. {
  508. int len;
  509. if (IS_NULL(cc->mbuf)) {
  510. len = SIZE_OPCODE + SIZE_BITSET;
  511. }
  512. else {
  513. if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
  514. len = SIZE_OPCODE;
  515. }
  516. else {
  517. len = SIZE_OPCODE + SIZE_BITSET;
  518. }
  519. #ifdef PLATFORM_UNALIGNED_WORD_ACCESS
  520. len += SIZE_LENGTH + cc->mbuf->used;
  521. #else
  522. len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
  523. #endif
  524. }
  525. return len;
  526. }
  527. static int
  528. compile_cclass_node(CClassNode* cc, regex_t* reg)
  529. {
  530. int r;
  531. if (IS_NULL(cc->mbuf)) {
  532. if (IS_NCCLASS_NOT(cc))
  533. add_opcode(reg, OP_CCLASS_NOT);
  534. else
  535. add_opcode(reg, OP_CCLASS);
  536. r = add_bitset(reg, cc->bs);
  537. }
  538. else {
  539. if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
  540. if (IS_NCCLASS_NOT(cc))
  541. add_opcode(reg, OP_CCLASS_MB_NOT);
  542. else
  543. add_opcode(reg, OP_CCLASS_MB);
  544. r = add_multi_byte_cclass(cc->mbuf, reg);
  545. }
  546. else {
  547. if (IS_NCCLASS_NOT(cc))
  548. add_opcode(reg, OP_CCLASS_MIX_NOT);
  549. else
  550. add_opcode(reg, OP_CCLASS_MIX);
  551. r = add_bitset(reg, cc->bs);
  552. if (r) return r;
  553. r = add_multi_byte_cclass(cc->mbuf, reg);
  554. }
  555. }
  556. return r;
  557. }
  558. static int
  559. entry_repeat_range(regex_t* reg, int id, int lower, int upper)
  560. {
  561. #define REPEAT_RANGE_ALLOC 4
  562. OnigRepeatRange* p;
  563. if (reg->repeat_range_alloc == 0) {
  564. p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
  565. CHECK_NULL_RETURN_MEMERR(p);
  566. reg->repeat_range = p;
  567. reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
  568. }
  569. else if (reg->repeat_range_alloc <= id) {
  570. int n;
  571. n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
  572. p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
  573. sizeof(OnigRepeatRange) * n);
  574. CHECK_NULL_RETURN_MEMERR(p);
  575. reg->repeat_range = p;
  576. reg->repeat_range_alloc = n;
  577. }
  578. else {
  579. p = reg->repeat_range;
  580. }
  581. p[id].lower = lower;
  582. p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper);
  583. return 0;
  584. }
  585. static int
  586. compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
  587. regex_t* reg)
  588. {
  589. int r;
  590. int num_repeat = reg->num_repeat;
  591. r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG);
  592. if (r) return r;
  593. r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
  594. reg->num_repeat++;
  595. if (r) return r;
  596. r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
  597. if (r) return r;
  598. r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
  599. if (r) return r;
  600. r = compile_tree_empty_check(qn->target, reg, empty_info);
  601. if (r) return r;
  602. if (
  603. #ifdef USE_SUBEXP_CALL
  604. reg->num_call > 0 ||
  605. #endif
  606. IS_QUANTIFIER_IN_REPEAT(qn)) {
  607. r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
  608. }
  609. else {
  610. r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
  611. }
  612. if (r) return r;
  613. r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
  614. return r;
  615. }
  616. static int
  617. is_anychar_star_quantifier(QtfrNode* qn)
  618. {
  619. if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) &&
  620. NTYPE(qn->target) == NT_CANY)
  621. return 1;
  622. else
  623. return 0;
  624. }
  /* Size limit the quantifier compilers compare against when deciding
     whether to repeat the target's code inline. */
  625. #define QUANTIFIER_EXPAND_LIMIT_SIZE 50
  /* True when the local variable `ckn` of the enclosing function is
     positive; only usable where an integer `ckn` is in scope. */
  626. #define CKN_ON (ckn > 0)
  627. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  628. static int
  629. compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
  630. {
  631. int len, mod_tlen, cklen;
  632. int ckn;
  633. int infinite = IS_REPEAT_INFINITE(qn->upper);
  634. int empty_info = qn->target_empty_info;
  635. int tlen = compile_length_tree(qn->target, reg);
  636. if (tlen < 0) return tlen;
  637. ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
  638. cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0);
  639. /* anychar repeat */
  640. if (NTYPE(qn->target) == NT_CANY) {
  641. if (qn->greedy && infinite) {
  642. if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON)
  643. return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen;
  644. else
  645. return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen;
  646. }
  647. }
  648. if (empty_info != 0)
  649. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  650. else
  651. mod_tlen = tlen;
  652. if (infinite && qn->lower <= 1) {
  653. if (qn->greedy) {
  654. if (qn->lower == 1)
  655. len = SIZE_OP_JUMP;
  656. else
  657. len = 0;
  658. len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP;
  659. }
  660. else {
  661. if (qn->lower == 0)
  662. len = SIZE_OP_JUMP;
  663. else
  664. len = 0;
  665. len += mod_tlen + SIZE_OP_PUSH + cklen;
  666. }
  667. }
  668. else if (qn->upper == 0) {
  669. if (qn->is_referred != 0) /* /(?<n>..){0}/ */
  670. len = SIZE_OP_JUMP + tlen;
  671. else
  672. len = 0;
  673. }
  674. else if (qn->upper == 1 && qn->greedy) {
  675. if (qn->lower == 0) {
  676. if (CKN_ON) {
  677. len = SIZE_OP_STATE_CHECK_PUSH + tlen;
  678. }
  679. else {
  680. len = SIZE_OP_PUSH + tlen;
  681. }
  682. }
  683. else {
  684. len = tlen;
  685. }
  686. }
  687. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  688. len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen;
  689. }
  690. else {
  691. len = SIZE_OP_REPEAT_INC
  692. + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
  693. if (CKN_ON)
  694. len += SIZE_OP_STATE_CHECK;
  695. }
  696. return len;
  697. }
  698. static int
  699. compile_quantifier_node(QtfrNode* qn, regex_t* reg)
  700. {
  701. int r, mod_tlen;
  702. int ckn;
  703. int infinite = IS_REPEAT_INFINITE(qn->upper);
  704. int empty_info = qn->target_empty_info;
  705. int tlen = compile_length_tree(qn->target, reg);
  706. if (tlen < 0) return tlen;
  707. ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0);
  708. if (is_anychar_star_quantifier(qn)) {
  709. r = compile_tree_n_times(qn->target, qn->lower, reg);
  710. if (r) return r;
  711. if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) {
  712. if (IS_MULTILINE(reg->options))
  713. r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
  714. else
  715. r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
  716. if (r) return r;
  717. if (CKN_ON) {
  718. r = add_state_check_num(reg, ckn);
  719. if (r) return r;
  720. }
  721. return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  722. }
  723. else {
  724. if (IS_MULTILINE(reg->options)) {
  725. r = add_opcode(reg, (CKN_ON ?
  726. OP_STATE_CHECK_ANYCHAR_ML_STAR
  727. : OP_ANYCHAR_ML_STAR));
  728. }
  729. else {
  730. r = add_opcode(reg, (CKN_ON ?
  731. OP_STATE_CHECK_ANYCHAR_STAR
  732. : OP_ANYCHAR_STAR));
  733. }
  734. if (r) return r;
  735. if (CKN_ON)
  736. r = add_state_check_num(reg, ckn);
  737. return r;
  738. }
  739. }
  740. if (empty_info != 0)
  741. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  742. else
  743. mod_tlen = tlen;
  744. if (infinite && qn->lower <= 1) {
  745. if (qn->greedy) {
  746. if (qn->lower == 1) {
  747. r = add_opcode_rel_addr(reg, OP_JUMP,
  748. (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH));
  749. if (r) return r;
  750. }
  751. if (CKN_ON) {
  752. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  753. if (r) return r;
  754. r = add_state_check_num(reg, ckn);
  755. if (r) return r;
  756. r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP);
  757. }
  758. else {
  759. r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
  760. }
  761. if (r) return r;
  762. r = compile_tree_empty_check(qn->target, reg, empty_info);
  763. if (r) return r;
  764. r = add_opcode_rel_addr(reg, OP_JUMP,
  765. -(mod_tlen + (int )SIZE_OP_JUMP
  766. + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)));
  767. }
  768. else {
  769. if (qn->lower == 0) {
  770. r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
  771. if (r) return r;
  772. }
  773. r = compile_tree_empty_check(qn->target, reg, empty_info);
  774. if (r) return r;
  775. if (CKN_ON) {
  776. r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP);
  777. if (r) return r;
  778. r = add_state_check_num(reg, ckn);
  779. if (r) return r;
  780. r = add_rel_addr(reg,
  781. -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP));
  782. }
  783. else
  784. r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
  785. }
  786. }
  787. else if (qn->upper == 0) {
  788. if (qn->is_referred != 0) { /* /(?<n>..){0}/ */
  789. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  790. if (r) return r;
  791. r = compile_tree(qn->target, reg);
  792. }
  793. else
  794. r = 0;
  795. }
  796. else if (qn->upper == 1 && qn->greedy) {
  797. if (qn->lower == 0) {
  798. if (CKN_ON) {
  799. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  800. if (r) return r;
  801. r = add_state_check_num(reg, ckn);
  802. if (r) return r;
  803. r = add_rel_addr(reg, tlen);
  804. }
  805. else {
  806. r = add_opcode_rel_addr(reg, OP_PUSH, tlen);
  807. }
  808. if (r) return r;
  809. }
  810. r = compile_tree(qn->target, reg);
  811. }
  812. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  813. if (CKN_ON) {
  814. r = add_opcode(reg, OP_STATE_CHECK_PUSH);
  815. if (r) return r;
  816. r = add_state_check_num(reg, ckn);
  817. if (r) return r;
  818. r = add_rel_addr(reg, SIZE_OP_JUMP);
  819. }
  820. else {
  821. r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
  822. }
  823. if (r) return r;
  824. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  825. if (r) return r;
  826. r = compile_tree(qn->target, reg);
  827. }
  828. else {
  829. r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
  830. if (CKN_ON) {
  831. if (r) return r;
  832. r = add_opcode(reg, OP_STATE_CHECK);
  833. if (r) return r;
  834. r = add_state_check_num(reg, ckn);
  835. }
  836. }
  837. return r;
  838. }
  839. #else /* USE_COMBINATION_EXPLOSION_CHECK */
  840. static int
  841. compile_length_quantifier_node(QtfrNode* qn, regex_t* reg)
  842. {
  843. int len, mod_tlen;
  844. int infinite = IS_REPEAT_INFINITE(qn->upper);
  845. int empty_info = qn->target_empty_info;
  846. int tlen = compile_length_tree(qn->target, reg);
  847. if (tlen < 0) return tlen;
  848. /* anychar repeat */
  849. if (NTYPE(qn->target) == NT_CANY) {
  850. if (qn->greedy && infinite) {
  851. if (IS_NOT_NULL(qn->next_head_exact))
  852. return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
  853. else
  854. return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
  855. }
  856. }
  857. if (empty_info != 0)
  858. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  859. else
  860. mod_tlen = tlen;
  861. if (infinite &&
  862. (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  863. if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
  864. len = SIZE_OP_JUMP;
  865. }
  866. else {
  867. len = tlen * qn->lower;
  868. }
  869. if (qn->greedy) {
  870. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  871. if (IS_NOT_NULL(qn->head_exact))
  872. len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
  873. else
  874. #endif
  875. if (IS_NOT_NULL(qn->next_head_exact))
  876. len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
  877. else
  878. len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
  879. }
  880. else
  881. len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
  882. }
  883. else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
  884. len = SIZE_OP_JUMP + tlen;
  885. }
  886. else if (!infinite && qn->greedy &&
  887. (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
  888. <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  889. len = tlen * qn->lower;
  890. len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
  891. }
  892. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  893. len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen;
  894. }
  895. else {
  896. len = SIZE_OP_REPEAT_INC
  897. + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM;
  898. }
  899. return len;
  900. }
  901. static int
  902. compile_quantifier_node(QtfrNode* qn, regex_t* reg)
  903. {
  904. int i, r, mod_tlen;
  905. int infinite = IS_REPEAT_INFINITE(qn->upper);
  906. int empty_info = qn->target_empty_info;
  907. int tlen = compile_length_tree(qn->target, reg);
  908. if (tlen < 0) return tlen;
  909. if (is_anychar_star_quantifier(qn)) {
  910. r = compile_tree_n_times(qn->target, qn->lower, reg);
  911. if (r) return r;
  912. if (IS_NOT_NULL(qn->next_head_exact)) {
  913. if (IS_MULTILINE(reg->options))
  914. r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT);
  915. else
  916. r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT);
  917. if (r) return r;
  918. return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  919. }
  920. else {
  921. if (IS_MULTILINE(reg->options))
  922. return add_opcode(reg, OP_ANYCHAR_ML_STAR);
  923. else
  924. return add_opcode(reg, OP_ANYCHAR_STAR);
  925. }
  926. }
  927. if (empty_info != 0)
  928. mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
  929. else
  930. mod_tlen = tlen;
  931. if (infinite &&
  932. (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  933. if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) {
  934. if (qn->greedy) {
  935. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  936. if (IS_NOT_NULL(qn->head_exact))
  937. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1);
  938. else
  939. #endif
  940. if (IS_NOT_NULL(qn->next_head_exact))
  941. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT);
  942. else
  943. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH);
  944. }
  945. else {
  946. r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP);
  947. }
  948. if (r) return r;
  949. }
  950. else {
  951. r = compile_tree_n_times(qn->target, qn->lower, reg);
  952. if (r) return r;
  953. }
  954. if (qn->greedy) {
  955. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  956. if (IS_NOT_NULL(qn->head_exact)) {
  957. r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
  958. mod_tlen + SIZE_OP_JUMP);
  959. if (r) return r;
  960. add_bytes(reg, NSTR(qn->head_exact)->s, 1);
  961. r = compile_tree_empty_check(qn->target, reg, empty_info);
  962. if (r) return r;
  963. r = add_opcode_rel_addr(reg, OP_JUMP,
  964. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
  965. }
  966. else
  967. #endif
  968. if (IS_NOT_NULL(qn->next_head_exact)) {
  969. r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
  970. mod_tlen + SIZE_OP_JUMP);
  971. if (r) return r;
  972. add_bytes(reg, NSTR(qn->next_head_exact)->s, 1);
  973. r = compile_tree_empty_check(qn->target, reg, empty_info);
  974. if (r) return r;
  975. r = add_opcode_rel_addr(reg, OP_JUMP,
  976. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
  977. }
  978. else {
  979. r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
  980. if (r) return r;
  981. r = compile_tree_empty_check(qn->target, reg, empty_info);
  982. if (r) return r;
  983. r = add_opcode_rel_addr(reg, OP_JUMP,
  984. -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
  985. }
  986. }
  987. else {
  988. r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
  989. if (r) return r;
  990. r = compile_tree_empty_check(qn->target, reg, empty_info);
  991. if (r) return r;
  992. r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
  993. }
  994. }
  995. else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */
  996. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  997. if (r) return r;
  998. r = compile_tree(qn->target, reg);
  999. }
  1000. else if (!infinite && qn->greedy &&
  1001. (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
  1002. <= QUANTIFIER_EXPAND_LIMIT_SIZE)) {
  1003. int n = qn->upper - qn->lower;
  1004. r = compile_tree_n_times(qn->target, qn->lower, reg);
  1005. if (r) return r;
  1006. for (i = 0; i < n; i++) {
  1007. r = add_opcode_rel_addr(reg, OP_PUSH,
  1008. (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
  1009. if (r) return r;
  1010. r = compile_tree(qn->target, reg);
  1011. if (r) return r;
  1012. }
  1013. }
  1014. else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */
  1015. r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP);
  1016. if (r) return r;
  1017. r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
  1018. if (r) return r;
  1019. r = compile_tree(qn->target, reg);
  1020. }
  1021. else {
  1022. r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg);
  1023. }
  1024. return r;
  1025. }
  1026. #endif /* USE_COMBINATION_EXPLOSION_CHECK */
  1027. static int
  1028. compile_length_option_node(EncloseNode* node, regex_t* reg)
  1029. {
  1030. int tlen;
  1031. OnigOptionType prev = reg->options;
  1032. reg->options = node->option;
  1033. tlen = compile_length_tree(node->target, reg);
  1034. reg->options = prev;
  1035. if (tlen < 0) return tlen;
  1036. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1037. return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL
  1038. + tlen + SIZE_OP_SET_OPTION;
  1039. }
  1040. else
  1041. return tlen;
  1042. }
  1043. static int
  1044. compile_option_node(EncloseNode* node, regex_t* reg)
  1045. {
  1046. int r;
  1047. OnigOptionType prev = reg->options;
  1048. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1049. r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option);
  1050. if (r) return r;
  1051. r = add_opcode_option(reg, OP_SET_OPTION, prev);
  1052. if (r) return r;
  1053. r = add_opcode(reg, OP_FAIL);
  1054. if (r) return r;
  1055. }
  1056. reg->options = node->option;
  1057. r = compile_tree(node->target, reg);
  1058. reg->options = prev;
  1059. if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
  1060. if (r) return r;
  1061. r = add_opcode_option(reg, OP_SET_OPTION, prev);
  1062. }
  1063. return r;
  1064. }
  1065. static int
  1066. compile_length_enclose_node(EncloseNode* node, regex_t* reg)
  1067. {
  1068. int len;
  1069. int tlen;
  1070. if (node->type == ENCLOSE_OPTION)
  1071. return compile_length_option_node(node, reg);
  1072. if (node->target) {
  1073. tlen = compile_length_tree(node->target, reg);
  1074. if (tlen < 0) return tlen;
  1075. }
  1076. else
  1077. tlen = 0;
  1078. switch (node->type) {
  1079. case ENCLOSE_MEMORY:
  1080. #ifdef USE_SUBEXP_CALL
  1081. if (IS_ENCLOSE_CALLED(node)) {
  1082. len = SIZE_OP_MEMORY_START_PUSH + tlen
  1083. + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN;
  1084. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1085. len += (IS_ENCLOSE_RECURSION(node)
  1086. ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
  1087. else
  1088. len += (IS_ENCLOSE_RECURSION(node)
  1089. ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
  1090. }
  1091. else if (IS_ENCLOSE_RECURSION(node)) {
  1092. len = SIZE_OP_MEMORY_START_PUSH;
  1093. len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
  1094. ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC);
  1095. }
  1096. else
  1097. #endif
  1098. {
  1099. if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
  1100. len = SIZE_OP_MEMORY_START_PUSH;
  1101. else
  1102. len = SIZE_OP_MEMORY_START;
  1103. len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)
  1104. ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END);
  1105. }
  1106. break;
  1107. case ENCLOSE_STOP_BACKTRACK:
  1108. if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
  1109. QtfrNode* qn = NQTFR(node->target);
  1110. tlen = compile_length_tree(qn->target, reg);
  1111. if (tlen < 0) return tlen;
  1112. len = tlen * qn->lower
  1113. + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
  1114. }
  1115. else {
  1116. len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
  1117. }
  1118. break;
  1119. case ENCLOSE_CONDITION:
  1120. len = SIZE_OP_CONDITION;
  1121. if (NTYPE(node->target) == NT_ALT) {
  1122. Node* x = node->target;
  1123. tlen = compile_length_tree(NCAR(x), reg); /* yes-node */
  1124. if (tlen < 0) return tlen;
  1125. len += tlen + SIZE_OP_JUMP;
  1126. if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
  1127. x = NCDR(x);
  1128. tlen = compile_length_tree(NCAR(x), reg); /* no-node */
  1129. if (tlen < 0) return tlen;
  1130. len += tlen;
  1131. if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
  1132. }
  1133. else {
  1134. return ONIGERR_PARSER_BUG;
  1135. }
  1136. break;
  1137. case ENCLOSE_ABSENT:
  1138. len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END;
  1139. break;
  1140. default:
  1141. return ONIGERR_TYPE_BUG;
  1142. break;
  1143. }
  1144. return len;
  1145. }
  1146. static int get_char_length_tree(Node* node, regex_t* reg, int* len);
  1147. static int
  1148. compile_enclose_node(EncloseNode* node, regex_t* reg)
  1149. {
  1150. int r, len;
  1151. if (node->type == ENCLOSE_OPTION)
  1152. return compile_option_node(node, reg);
  1153. switch (node->type) {
  1154. case ENCLOSE_MEMORY:
  1155. #ifdef USE_SUBEXP_CALL
  1156. if (IS_ENCLOSE_CALLED(node)) {
  1157. r = add_opcode(reg, OP_CALL);
  1158. if (r) return r;
  1159. node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP;
  1160. node->state |= NST_ADDR_FIXED;
  1161. r = add_abs_addr(reg, (int )node->call_addr);
  1162. if (r) return r;
  1163. len = compile_length_tree(node->target, reg);
  1164. len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN);
  1165. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1166. len += (IS_ENCLOSE_RECURSION(node)
  1167. ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH);
  1168. else
  1169. len += (IS_ENCLOSE_RECURSION(node)
  1170. ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END);
  1171. r = add_opcode_rel_addr(reg, OP_JUMP, len);
  1172. if (r) return r;
  1173. }
  1174. #endif
  1175. if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum))
  1176. r = add_opcode(reg, OP_MEMORY_START_PUSH);
  1177. else
  1178. r = add_opcode(reg, OP_MEMORY_START);
  1179. if (r) return r;
  1180. r = add_mem_num(reg, node->regnum);
  1181. if (r) return r;
  1182. r = compile_tree(node->target, reg);
  1183. if (r) return r;
  1184. #ifdef USE_SUBEXP_CALL
  1185. if (IS_ENCLOSE_CALLED(node)) {
  1186. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1187. r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
  1188. ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH));
  1189. else
  1190. r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node)
  1191. ? OP_MEMORY_END_REC : OP_MEMORY_END));
  1192. if (r) return r;
  1193. r = add_mem_num(reg, node->regnum);
  1194. if (r) return r;
  1195. r = add_opcode(reg, OP_RETURN);
  1196. }
  1197. else if (IS_ENCLOSE_RECURSION(node)) {
  1198. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1199. r = add_opcode(reg, OP_MEMORY_END_PUSH_REC);
  1200. else
  1201. r = add_opcode(reg, OP_MEMORY_END_REC);
  1202. if (r) return r;
  1203. r = add_mem_num(reg, node->regnum);
  1204. }
  1205. else
  1206. #endif
  1207. {
  1208. if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum))
  1209. r = add_opcode(reg, OP_MEMORY_END_PUSH);
  1210. else
  1211. r = add_opcode(reg, OP_MEMORY_END);
  1212. if (r) return r;
  1213. r = add_mem_num(reg, node->regnum);
  1214. }
  1215. break;
  1216. case ENCLOSE_STOP_BACKTRACK:
  1217. if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
  1218. QtfrNode* qn = NQTFR(node->target);
  1219. r = compile_tree_n_times(qn->target, qn->lower, reg);
  1220. if (r) return r;
  1221. len = compile_length_tree(qn->target, reg);
  1222. if (len < 0) return len;
  1223. r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP);
  1224. if (r) return r;
  1225. r = compile_tree(qn->target, reg);
  1226. if (r) return r;
  1227. r = add_opcode(reg, OP_POP);
  1228. if (r) return r;
  1229. r = add_opcode_rel_addr(reg, OP_JUMP,
  1230. -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
  1231. }
  1232. else {
  1233. r = add_opcode(reg, OP_PUSH_STOP_BT);
  1234. if (r) return r;
  1235. r = compile_tree(node->target, reg);
  1236. if (r) return r;
  1237. r = add_opcode(reg, OP_POP_STOP_BT);
  1238. }
  1239. break;
  1240. case ENCLOSE_CONDITION:
  1241. r = add_opcode(reg, OP_CONDITION);
  1242. if (r) return r;
  1243. r = add_mem_num(reg, node->regnum);
  1244. if (r) return r;
  1245. if (NTYPE(node->target) == NT_ALT) {
  1246. Node* x = node->target;
  1247. int len2;
  1248. len = compile_length_tree(NCAR(x), reg); /* yes-node */
  1249. if (len < 0) return len;
  1250. if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG;
  1251. x = NCDR(x);
  1252. len2 = compile_length_tree(NCAR(x), reg); /* no-node */
  1253. if (len2 < 0) return len2;
  1254. if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN;
  1255. x = node->target;
  1256. r = add_rel_addr(reg, len + SIZE_OP_JUMP);
  1257. if (r) return r;
  1258. r = compile_tree(NCAR(x), reg); /* yes-node */
  1259. if (r) return r;
  1260. r = add_opcode_rel_addr(reg, OP_JUMP, len2);
  1261. if (r) return r;
  1262. x = NCDR(x);
  1263. r = compile_tree(NCAR(x), reg); /* no-node */
  1264. }
  1265. else {
  1266. return ONIGERR_PARSER_BUG;
  1267. }
  1268. break;
  1269. case ENCLOSE_ABSENT:
  1270. len = compile_length_tree(node->target, reg);
  1271. if (len < 0) return len;
  1272. r = add_opcode(reg, OP_PUSH_ABSENT_POS);
  1273. if (r) return r;
  1274. r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END);
  1275. if (r) return r;
  1276. r = compile_tree(node->target, reg);
  1277. if (r) return r;
  1278. r = add_opcode(reg, OP_ABSENT_END);
  1279. break;
  1280. default:
  1281. return ONIGERR_TYPE_BUG;
  1282. break;
  1283. }
  1284. return r;
  1285. }
  1286. static int
  1287. compile_length_anchor_node(AnchorNode* node, regex_t* reg)
  1288. {
  1289. int len;
  1290. int tlen = 0;
  1291. if (node->target) {
  1292. tlen = compile_length_tree(node->target, reg);
  1293. if (tlen < 0) return tlen;
  1294. }
  1295. switch (node->type) {
  1296. case ANCHOR_PREC_READ:
  1297. len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS;
  1298. break;
  1299. case ANCHOR_PREC_READ_NOT:
  1300. len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS;
  1301. break;
  1302. case ANCHOR_LOOK_BEHIND:
  1303. len = SIZE_OP_LOOK_BEHIND + tlen;
  1304. break;
  1305. case ANCHOR_LOOK_BEHIND_NOT:
  1306. len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT;
  1307. break;
  1308. default:
  1309. len = SIZE_OPCODE;
  1310. break;
  1311. }
  1312. return len;
  1313. }
  1314. static int
  1315. compile_anchor_node(AnchorNode* node, regex_t* reg)
  1316. {
  1317. int r, len;
  1318. switch (node->type) {
  1319. case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break;
  1320. case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break;
  1321. case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break;
  1322. case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break;
  1323. case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
  1324. case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;
  1325. case ANCHOR_WORD_BOUND:
  1326. if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
  1327. else r = add_opcode(reg, OP_WORD_BOUND);
  1328. break;
  1329. case ANCHOR_NOT_WORD_BOUND:
  1330. if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND);
  1331. else r = add_opcode(reg, OP_NOT_WORD_BOUND);
  1332. break;
  1333. #ifdef USE_WORD_BEGIN_END
  1334. case ANCHOR_WORD_BEGIN:
  1335. if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN);
  1336. else r = add_opcode(reg, OP_WORD_BEGIN);
  1337. break;
  1338. case ANCHOR_WORD_END:
  1339. if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END);
  1340. else r = add_opcode(reg, OP_WORD_END);
  1341. break;
  1342. #endif
  1343. case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break;
  1344. case ANCHOR_PREC_READ:
  1345. r = add_opcode(reg, OP_PUSH_POS);
  1346. if (r) return r;
  1347. r = compile_tree(node->target, reg);
  1348. if (r) return r;
  1349. r = add_opcode(reg, OP_POP_POS);
  1350. break;
  1351. case ANCHOR_PREC_READ_NOT:
  1352. len = compile_length_tree(node->target, reg);
  1353. if (len < 0) return len;
  1354. r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS);
  1355. if (r) return r;
  1356. r = compile_tree(node->target, reg);
  1357. if (r) return r;
  1358. r = add_opcode(reg, OP_FAIL_POS);
  1359. break;
  1360. case ANCHOR_LOOK_BEHIND:
  1361. {
  1362. int n;
  1363. r = add_opcode(reg, OP_LOOK_BEHIND);
  1364. if (r) return r;
  1365. if (node->char_len < 0) {
  1366. r = get_char_length_tree(node->target, reg, &n);
  1367. if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  1368. }
  1369. else
  1370. n = node->char_len;
  1371. r = add_length(reg, n);
  1372. if (r) return r;
  1373. r = compile_tree(node->target, reg);
  1374. }
  1375. break;
  1376. case ANCHOR_LOOK_BEHIND_NOT:
  1377. {
  1378. int n;
  1379. len = compile_length_tree(node->target, reg);
  1380. r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT,
  1381. len + SIZE_OP_FAIL_LOOK_BEHIND_NOT);
  1382. if (r) return r;
  1383. if (node->char_len < 0) {
  1384. r = get_char_length_tree(node->target, reg, &n);
  1385. if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  1386. }
  1387. else
  1388. n = node->char_len;
  1389. r = add_length(reg, n);
  1390. if (r) return r;
  1391. r = compile_tree(node->target, reg);
  1392. if (r) return r;
  1393. r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT);
  1394. }
  1395. break;
  1396. default:
  1397. return ONIGERR_TYPE_BUG;
  1398. break;
  1399. }
  1400. return r;
  1401. }
  1402. static int
  1403. compile_length_tree(Node* node, regex_t* reg)
  1404. {
  1405. int len, type, r;
  1406. type = NTYPE(node);
  1407. switch (type) {
  1408. case NT_LIST:
  1409. len = 0;
  1410. do {
  1411. r = compile_length_tree(NCAR(node), reg);
  1412. if (r < 0) return r;
  1413. len += r;
  1414. } while (IS_NOT_NULL(node = NCDR(node)));
  1415. r = len;
  1416. break;
  1417. case NT_ALT:
  1418. {
  1419. int n = 0;
  1420. len = 0;
  1421. do {
  1422. r = compile_length_tree(NCAR(node), reg);
  1423. if (r < 0) return r;
  1424. len += r;
  1425. n++;
  1426. } while (IS_NOT_NULL(node = NCDR(node)));
  1427. r = len;
  1428. r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1);
  1429. }
  1430. break;
  1431. case NT_STR:
  1432. if (NSTRING_IS_RAW(node))
  1433. r = compile_length_string_raw_node(NSTR(node), reg);
  1434. else
  1435. r = compile_length_string_node(node, reg);
  1436. break;
  1437. case NT_CCLASS:
  1438. r = compile_length_cclass_node(NCCLASS(node), reg);
  1439. break;
  1440. case NT_CTYPE:
  1441. case NT_CANY:
  1442. r = SIZE_OPCODE;
  1443. break;
  1444. case NT_BREF:
  1445. {
  1446. BRefNode* br = NBREF(node);
  1447. #ifdef USE_BACKREF_WITH_LEVEL
  1448. if (IS_BACKREF_NEST_LEVEL(br)) {
  1449. r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH +
  1450. SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
  1451. }
  1452. else
  1453. #endif
  1454. if (br->back_num == 1) {
  1455. r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2)
  1456. ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM));
  1457. }
  1458. else {
  1459. r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num);
  1460. }
  1461. }
  1462. break;
  1463. #ifdef USE_SUBEXP_CALL
  1464. case NT_CALL:
  1465. r = SIZE_OP_CALL;
  1466. break;
  1467. #endif
  1468. case NT_QTFR:
  1469. r = compile_length_quantifier_node(NQTFR(node), reg);
  1470. break;
  1471. case NT_ENCLOSE:
  1472. r = compile_length_enclose_node(NENCLOSE(node), reg);
  1473. break;
  1474. case NT_ANCHOR:
  1475. r = compile_length_anchor_node(NANCHOR(node), reg);
  1476. break;
  1477. default:
  1478. return ONIGERR_TYPE_BUG;
  1479. break;
  1480. }
  1481. return r;
  1482. }
  1483. static int
  1484. compile_tree(Node* node, regex_t* reg)
  1485. {
  1486. int n, type, len, pos, r = 0;
  1487. type = NTYPE(node);
  1488. switch (type) {
  1489. case NT_LIST:
  1490. do {
  1491. r = compile_tree(NCAR(node), reg);
  1492. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1493. break;
  1494. case NT_ALT:
  1495. {
  1496. Node* x = node;
  1497. len = 0;
  1498. do {
  1499. len += compile_length_tree(NCAR(x), reg);
  1500. if (NCDR(x) != NULL) {
  1501. len += SIZE_OP_PUSH + SIZE_OP_JUMP;
  1502. }
  1503. } while (IS_NOT_NULL(x = NCDR(x)));
  1504. pos = reg->used + len; /* goal position */
  1505. do {
  1506. len = compile_length_tree(NCAR(node), reg);
  1507. if (IS_NOT_NULL(NCDR(node))) {
  1508. r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP);
  1509. if (r) break;
  1510. }
  1511. r = compile_tree(NCAR(node), reg);
  1512. if (r) break;
  1513. if (IS_NOT_NULL(NCDR(node))) {
  1514. len = pos - (reg->used + SIZE_OP_JUMP);
  1515. r = add_opcode_rel_addr(reg, OP_JUMP, len);
  1516. if (r) break;
  1517. }
  1518. } while (IS_NOT_NULL(node = NCDR(node)));
  1519. }
  1520. break;
  1521. case NT_STR:
  1522. if (NSTRING_IS_RAW(node))
  1523. r = compile_string_raw_node(NSTR(node), reg);
  1524. else
  1525. r = compile_string_node(node, reg);
  1526. break;
  1527. case NT_CCLASS:
  1528. r = compile_cclass_node(NCCLASS(node), reg);
  1529. break;
  1530. case NT_CTYPE:
  1531. {
  1532. int op;
  1533. switch (NCTYPE(node)->ctype) {
  1534. case ONIGENC_CTYPE_WORD:
  1535. if (NCTYPE(node)->ascii_range != 0) {
  1536. if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD;
  1537. else op = OP_ASCII_WORD;
  1538. }
  1539. else {
  1540. if (NCTYPE(node)->not != 0) op = OP_NOT_WORD;
  1541. else op = OP_WORD;
  1542. }
  1543. break;
  1544. default:
  1545. return ONIGERR_TYPE_BUG;
  1546. break;
  1547. }
  1548. r = add_opcode(reg, op);
  1549. }
  1550. break;
  1551. case NT_CANY:
  1552. if (IS_MULTILINE(reg->options))
  1553. r = add_opcode(reg, OP_ANYCHAR_ML);
  1554. else
  1555. r = add_opcode(reg, OP_ANYCHAR);
  1556. break;
  1557. case NT_BREF:
  1558. {
  1559. BRefNode* br = NBREF(node);
  1560. #ifdef USE_BACKREF_WITH_LEVEL
  1561. if (IS_BACKREF_NEST_LEVEL(br)) {
  1562. r = add_opcode(reg, OP_BACKREF_WITH_LEVEL);
  1563. if (r) return r;
  1564. r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE));
  1565. if (r) return r;
  1566. r = add_length(reg, br->nest_level);
  1567. if (r) return r;
  1568. goto add_bacref_mems;
  1569. }
  1570. else
  1571. #endif
  1572. if (br->back_num == 1) {
  1573. n = br->back_static[0];
  1574. if (IS_IGNORECASE(reg->options)) {
  1575. r = add_opcode(reg, OP_BACKREFN_IC);
  1576. if (r) return r;
  1577. r = add_mem_num(reg, n);
  1578. }
  1579. else {
  1580. switch (n) {
  1581. case 1: r = add_opcode(reg, OP_BACKREF1); break;
  1582. case 2: r = add_opcode(reg, OP_BACKREF2); break;
  1583. default:
  1584. r = add_opcode(reg, OP_BACKREFN);
  1585. if (r) return r;
  1586. r = add_mem_num(reg, n);
  1587. break;
  1588. }
  1589. }
  1590. }
  1591. else {
  1592. int i;
  1593. int* p;
  1594. if (IS_IGNORECASE(reg->options)) {
  1595. r = add_opcode(reg, OP_BACKREF_MULTI_IC);
  1596. }
  1597. else {
  1598. r = add_opcode(reg, OP_BACKREF_MULTI);
  1599. }
  1600. if (r) return r;
  1601. #ifdef USE_BACKREF_WITH_LEVEL
  1602. add_bacref_mems:
  1603. #endif
  1604. r = add_length(reg, br->back_num);
  1605. if (r) return r;
  1606. p = BACKREFS_P(br);
  1607. for (i = br->back_num - 1; i >= 0; i--) {
  1608. r = add_mem_num(reg, p[i]);
  1609. if (r) return r;
  1610. }
  1611. }
  1612. }
  1613. break;
  1614. #ifdef USE_SUBEXP_CALL
  1615. case NT_CALL:
  1616. r = compile_call(NCALL(node), reg);
  1617. break;
  1618. #endif
  1619. case NT_QTFR:
  1620. r = compile_quantifier_node(NQTFR(node), reg);
  1621. break;
  1622. case NT_ENCLOSE:
  1623. r = compile_enclose_node(NENCLOSE(node), reg);
  1624. break;
  1625. case NT_ANCHOR:
  1626. r = compile_anchor_node(NANCHOR(node), reg);
  1627. break;
  1628. default:
  1629. #ifdef ONIG_DEBUG
  1630. fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node));
  1631. #endif
  1632. break;
  1633. }
  1634. return r;
  1635. }
  1636. #ifdef USE_NAMED_GROUP
  1637. static int
  1638. noname_disable_map(Node** plink, GroupNumRemap* map, int* counter)
  1639. {
  1640. int r = 0;
  1641. Node* node = *plink;
  1642. switch (NTYPE(node)) {
  1643. case NT_LIST:
  1644. case NT_ALT:
  1645. do {
  1646. r = noname_disable_map(&(NCAR(node)), map, counter);
  1647. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1648. break;
  1649. case NT_QTFR:
  1650. {
  1651. Node** ptarget = &(NQTFR(node)->target);
  1652. Node* old = *ptarget;
  1653. r = noname_disable_map(ptarget, map, counter);
  1654. if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) {
  1655. onig_reduce_nested_quantifier(node, *ptarget);
  1656. }
  1657. }
  1658. break;
  1659. case NT_ENCLOSE:
  1660. {
  1661. EncloseNode* en = NENCLOSE(node);
  1662. if (en->type == ENCLOSE_MEMORY) {
  1663. if (IS_ENCLOSE_NAMED_GROUP(en)) {
  1664. (*counter)++;
  1665. map[en->regnum].new_val = *counter;
  1666. en->regnum = *counter;
  1667. }
  1668. else if (en->regnum != 0) {
  1669. *plink = en->target;
  1670. en->target = NULL_NODE;
  1671. onig_node_free(node);
  1672. r = noname_disable_map(plink, map, counter);
  1673. break;
  1674. }
  1675. }
  1676. r = noname_disable_map(&(en->target), map, counter);
  1677. }
  1678. break;
  1679. case NT_ANCHOR:
  1680. if (NANCHOR(node)->target)
  1681. r = noname_disable_map(&(NANCHOR(node)->target), map, counter);
  1682. break;
  1683. default:
  1684. break;
  1685. }
  1686. return r;
  1687. }
  1688. static int
  1689. renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem)
  1690. {
  1691. int i, pos, n, old_num;
  1692. int *backs;
  1693. BRefNode* bn = NBREF(node);
  1694. if (! IS_BACKREF_NAME_REF(bn))
  1695. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  1696. old_num = bn->back_num;
  1697. if (IS_NULL(bn->back_dynamic))
  1698. backs = bn->back_static;
  1699. else
  1700. backs = bn->back_dynamic;
  1701. for (i = 0, pos = 0; i < old_num; i++) {
  1702. if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF;
  1703. n = map[backs[i]].new_val;
  1704. if (n > 0) {
  1705. backs[pos] = n;
  1706. pos++;
  1707. }
  1708. }
  1709. bn->back_num = pos;
  1710. return 0;
  1711. }
  1712. static int
  1713. renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem)
  1714. {
  1715. int r = 0;
  1716. switch (NTYPE(node)) {
  1717. case NT_LIST:
  1718. case NT_ALT:
  1719. do {
  1720. r = renumber_by_map(NCAR(node), map, num_mem);
  1721. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1722. break;
  1723. case NT_QTFR:
  1724. r = renumber_by_map(NQTFR(node)->target, map, num_mem);
  1725. break;
  1726. case NT_ENCLOSE:
  1727. {
  1728. EncloseNode* en = NENCLOSE(node);
  1729. if (en->type == ENCLOSE_CONDITION) {
  1730. if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF;
  1731. en->regnum = map[en->regnum].new_val;
  1732. }
  1733. r = renumber_by_map(en->target, map, num_mem);
  1734. }
  1735. break;
  1736. case NT_BREF:
  1737. r = renumber_node_backref(node, map, num_mem);
  1738. break;
  1739. case NT_ANCHOR:
  1740. if (NANCHOR(node)->target)
  1741. r = renumber_by_map(NANCHOR(node)->target, map, num_mem);
  1742. break;
  1743. default:
  1744. break;
  1745. }
  1746. return r;
  1747. }
  1748. static int
  1749. numbered_ref_check(Node* node)
  1750. {
  1751. int r = 0;
  1752. switch (NTYPE(node)) {
  1753. case NT_LIST:
  1754. case NT_ALT:
  1755. do {
  1756. r = numbered_ref_check(NCAR(node));
  1757. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1758. break;
  1759. case NT_QTFR:
  1760. r = numbered_ref_check(NQTFR(node)->target);
  1761. break;
  1762. case NT_ENCLOSE:
  1763. r = numbered_ref_check(NENCLOSE(node)->target);
  1764. break;
  1765. case NT_BREF:
  1766. if (! IS_BACKREF_NAME_REF(NBREF(node)))
  1767. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  1768. break;
  1769. case NT_ANCHOR:
  1770. if (NANCHOR(node)->target)
  1771. r = numbered_ref_check(NANCHOR(node)->target);
  1772. break;
  1773. default:
  1774. break;
  1775. }
  1776. return r;
  1777. }
  1778. static int
  1779. disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env)
  1780. {
  1781. int r, i, pos, counter;
  1782. BitStatusType loc;
  1783. GroupNumRemap* map;
  1784. map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1));
  1785. CHECK_NULL_RETURN_MEMERR(map);
  1786. for (i = 1; i <= env->num_mem; i++) {
  1787. map[i].new_val = 0;
  1788. }
  1789. counter = 0;
  1790. r = noname_disable_map(root, map, &counter);
  1791. if (r != 0) return r;
  1792. r = renumber_by_map(*root, map, env->num_mem);
  1793. if (r != 0) return r;
  1794. for (i = 1, pos = 1; i <= env->num_mem; i++) {
  1795. if (map[i].new_val > 0) {
  1796. SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i];
  1797. pos++;
  1798. }
  1799. }
  1800. loc = env->capture_history;
  1801. BIT_STATUS_CLEAR(env->capture_history);
  1802. for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
  1803. if (BIT_STATUS_AT(loc, i)) {
  1804. BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val);
  1805. }
  1806. }
  1807. env->num_mem = env->num_named;
  1808. reg->num_mem = env->num_named;
  1809. return onig_renumber_name_table(reg, map);
  1810. }
  1811. #endif /* USE_NAMED_GROUP */
  1812. #ifdef USE_SUBEXP_CALL
  1813. static int
  1814. unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg)
  1815. {
  1816. int i, offset;
  1817. EncloseNode* en;
  1818. AbsAddrType addr;
  1819. for (i = 0; i < uslist->num; i++) {
  1820. en = NENCLOSE(uslist->us[i].target);
  1821. if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG;
  1822. addr = en->call_addr;
  1823. offset = uslist->us[i].offset;
  1824. BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR);
  1825. }
  1826. return 0;
  1827. }
  1828. #endif
  1829. #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
  1830. static int
  1831. quantifiers_memory_node_info(Node* node)
  1832. {
  1833. int r = 0;
  1834. switch (NTYPE(node)) {
  1835. case NT_LIST:
  1836. case NT_ALT:
  1837. {
  1838. int v;
  1839. do {
  1840. v = quantifiers_memory_node_info(NCAR(node));
  1841. if (v > r) r = v;
  1842. } while (v >= 0 && IS_NOT_NULL(node = NCDR(node)));
  1843. }
  1844. break;
  1845. # ifdef USE_SUBEXP_CALL
  1846. case NT_CALL:
  1847. if (IS_CALL_RECURSION(NCALL(node))) {
  1848. return NQ_TARGET_IS_EMPTY_REC; /* tiny version */
  1849. }
  1850. else
  1851. r = quantifiers_memory_node_info(NCALL(node)->target);
  1852. break;
  1853. # endif
  1854. case NT_QTFR:
  1855. {
  1856. QtfrNode* qn = NQTFR(node);
  1857. if (qn->upper != 0) {
  1858. r = quantifiers_memory_node_info(qn->target);
  1859. }
  1860. }
  1861. break;
  1862. case NT_ENCLOSE:
  1863. {
  1864. EncloseNode* en = NENCLOSE(node);
  1865. switch (en->type) {
  1866. case ENCLOSE_MEMORY:
  1867. return NQ_TARGET_IS_EMPTY_MEM;
  1868. break;
  1869. case ENCLOSE_OPTION:
  1870. case ENCLOSE_STOP_BACKTRACK:
  1871. case ENCLOSE_CONDITION:
  1872. case ENCLOSE_ABSENT:
  1873. r = quantifiers_memory_node_info(en->target);
  1874. break;
  1875. default:
  1876. break;
  1877. }
  1878. }
  1879. break;
  1880. case NT_BREF:
  1881. case NT_STR:
  1882. case NT_CTYPE:
  1883. case NT_CCLASS:
  1884. case NT_CANY:
  1885. case NT_ANCHOR:
  1886. default:
  1887. break;
  1888. }
  1889. return r;
  1890. }
  1891. #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */
  1892. static int
  1893. get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env)
  1894. {
  1895. OnigDistance tmin;
  1896. int r = 0;
  1897. *min = 0;
  1898. switch (NTYPE(node)) {
  1899. case NT_BREF:
  1900. {
  1901. int i;
  1902. int* backs;
  1903. Node** nodes = SCANENV_MEM_NODES(env);
  1904. BRefNode* br = NBREF(node);
  1905. if (br->state & NST_RECURSION) break;
  1906. backs = BACKREFS_P(br);
  1907. if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  1908. r = get_min_match_length(nodes[backs[0]], min, env);
  1909. if (r != 0) break;
  1910. for (i = 1; i < br->back_num; i++) {
  1911. if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  1912. r = get_min_match_length(nodes[backs[i]], &tmin, env);
  1913. if (r != 0) break;
  1914. if (*min > tmin) *min = tmin;
  1915. }
  1916. }
  1917. break;
  1918. #ifdef USE_SUBEXP_CALL
  1919. case NT_CALL:
  1920. if (IS_CALL_RECURSION(NCALL(node))) {
  1921. EncloseNode* en = NENCLOSE(NCALL(node)->target);
  1922. if (IS_ENCLOSE_MIN_FIXED(en))
  1923. *min = en->min_len;
  1924. }
  1925. else
  1926. r = get_min_match_length(NCALL(node)->target, min, env);
  1927. break;
  1928. #endif
  1929. case NT_LIST:
  1930. do {
  1931. r = get_min_match_length(NCAR(node), &tmin, env);
  1932. if (r == 0) *min += tmin;
  1933. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  1934. break;
  1935. case NT_ALT:
  1936. {
  1937. Node *x, *y;
  1938. y = node;
  1939. do {
  1940. x = NCAR(y);
  1941. r = get_min_match_length(x, &tmin, env);
  1942. if (r != 0) break;
  1943. if (y == node) *min = tmin;
  1944. else if (*min > tmin) *min = tmin;
  1945. } while (r == 0 && IS_NOT_NULL(y = NCDR(y)));
  1946. }
  1947. break;
  1948. case NT_STR:
  1949. {
  1950. StrNode* sn = NSTR(node);
  1951. *min = sn->end - sn->s;
  1952. }
  1953. break;
  1954. case NT_CTYPE:
  1955. *min = 1;
  1956. break;
  1957. case NT_CCLASS:
  1958. case NT_CANY:
  1959. *min = 1;
  1960. break;
  1961. case NT_QTFR:
  1962. {
  1963. QtfrNode* qn = NQTFR(node);
  1964. if (qn->lower > 0) {
  1965. r = get_min_match_length(qn->target, min, env);
  1966. if (r == 0)
  1967. *min = distance_multiply(*min, qn->lower);
  1968. }
  1969. }
  1970. break;
  1971. case NT_ENCLOSE:
  1972. {
  1973. EncloseNode* en = NENCLOSE(node);
  1974. switch (en->type) {
  1975. case ENCLOSE_MEMORY:
  1976. if (IS_ENCLOSE_MIN_FIXED(en))
  1977. *min = en->min_len;
  1978. else {
  1979. if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  1980. *min = 0; /* recursive */
  1981. else {
  1982. SET_ENCLOSE_STATUS(node, NST_MARK1);
  1983. r = get_min_match_length(en->target, min, env);
  1984. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  1985. if (r == 0) {
  1986. en->min_len = *min;
  1987. SET_ENCLOSE_STATUS(node, NST_MIN_FIXED);
  1988. }
  1989. }
  1990. }
  1991. break;
  1992. case ENCLOSE_OPTION:
  1993. case ENCLOSE_STOP_BACKTRACK:
  1994. case ENCLOSE_CONDITION:
  1995. r = get_min_match_length(en->target, min, env);
  1996. break;
  1997. case ENCLOSE_ABSENT:
  1998. break;
  1999. }
  2000. }
  2001. break;
  2002. case NT_ANCHOR:
  2003. default:
  2004. break;
  2005. }
  2006. return r;
  2007. }
  2008. static int
  2009. get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env)
  2010. {
  2011. OnigDistance tmax;
  2012. int r = 0;
  2013. *max = 0;
  2014. switch (NTYPE(node)) {
  2015. case NT_LIST:
  2016. do {
  2017. r = get_max_match_length(NCAR(node), &tmax, env);
  2018. if (r == 0)
  2019. *max = distance_add(*max, tmax);
  2020. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2021. break;
  2022. case NT_ALT:
  2023. do {
  2024. r = get_max_match_length(NCAR(node), &tmax, env);
  2025. if (r == 0 && *max < tmax) *max = tmax;
  2026. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2027. break;
  2028. case NT_STR:
  2029. {
  2030. StrNode* sn = NSTR(node);
  2031. *max = sn->end - sn->s;
  2032. }
  2033. break;
  2034. case NT_CTYPE:
  2035. *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  2036. break;
  2037. case NT_CCLASS:
  2038. case NT_CANY:
  2039. *max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  2040. break;
  2041. case NT_BREF:
  2042. {
  2043. int i;
  2044. int* backs;
  2045. Node** nodes = SCANENV_MEM_NODES(env);
  2046. BRefNode* br = NBREF(node);
  2047. if (br->state & NST_RECURSION) {
  2048. *max = ONIG_INFINITE_DISTANCE;
  2049. break;
  2050. }
  2051. backs = BACKREFS_P(br);
  2052. for (i = 0; i < br->back_num; i++) {
  2053. if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  2054. r = get_max_match_length(nodes[backs[i]], &tmax, env);
  2055. if (r != 0) break;
  2056. if (*max < tmax) *max = tmax;
  2057. }
  2058. }
  2059. break;
  2060. #ifdef USE_SUBEXP_CALL
  2061. case NT_CALL:
  2062. if (! IS_CALL_RECURSION(NCALL(node)))
  2063. r = get_max_match_length(NCALL(node)->target, max, env);
  2064. else
  2065. *max = ONIG_INFINITE_DISTANCE;
  2066. break;
  2067. #endif
  2068. case NT_QTFR:
  2069. {
  2070. QtfrNode* qn = NQTFR(node);
  2071. if (qn->upper != 0) {
  2072. r = get_max_match_length(qn->target, max, env);
  2073. if (r == 0 && *max != 0) {
  2074. if (! IS_REPEAT_INFINITE(qn->upper))
  2075. *max = distance_multiply(*max, qn->upper);
  2076. else
  2077. *max = ONIG_INFINITE_DISTANCE;
  2078. }
  2079. }
  2080. }
  2081. break;
  2082. case NT_ENCLOSE:
  2083. {
  2084. EncloseNode* en = NENCLOSE(node);
  2085. switch (en->type) {
  2086. case ENCLOSE_MEMORY:
  2087. if (IS_ENCLOSE_MAX_FIXED(en))
  2088. *max = en->max_len;
  2089. else {
  2090. if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  2091. *max = ONIG_INFINITE_DISTANCE;
  2092. else {
  2093. SET_ENCLOSE_STATUS(node, NST_MARK1);
  2094. r = get_max_match_length(en->target, max, env);
  2095. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  2096. if (r == 0) {
  2097. en->max_len = *max;
  2098. SET_ENCLOSE_STATUS(node, NST_MAX_FIXED);
  2099. }
  2100. }
  2101. }
  2102. break;
  2103. case ENCLOSE_OPTION:
  2104. case ENCLOSE_STOP_BACKTRACK:
  2105. case ENCLOSE_CONDITION:
  2106. r = get_max_match_length(en->target, max, env);
  2107. break;
  2108. case ENCLOSE_ABSENT:
  2109. break;
  2110. }
  2111. }
  2112. break;
  2113. case NT_ANCHOR:
  2114. default:
  2115. break;
  2116. }
  2117. return r;
  2118. }
  2119. #define GET_CHAR_LEN_VARLEN -1
  2120. #define GET_CHAR_LEN_TOP_ALT_VARLEN -2
  2121. /* fixed size pattern node only */
  2122. static int
  2123. get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
  2124. {
  2125. int tlen;
  2126. int r = 0;
  2127. level++;
  2128. *len = 0;
  2129. switch (NTYPE(node)) {
  2130. case NT_LIST:
  2131. do {
  2132. r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
  2133. if (r == 0)
  2134. *len = (int )distance_add(*len, tlen);
  2135. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2136. break;
  2137. case NT_ALT:
  2138. {
  2139. int tlen2;
  2140. int varlen = 0;
  2141. r = get_char_length_tree1(NCAR(node), reg, &tlen, level);
  2142. while (r == 0 && IS_NOT_NULL(node = NCDR(node))) {
  2143. r = get_char_length_tree1(NCAR(node), reg, &tlen2, level);
  2144. if (r == 0) {
  2145. if (tlen != tlen2)
  2146. varlen = 1;
  2147. }
  2148. }
  2149. if (r == 0) {
  2150. if (varlen != 0) {
  2151. if (level == 1)
  2152. r = GET_CHAR_LEN_TOP_ALT_VARLEN;
  2153. else
  2154. r = GET_CHAR_LEN_VARLEN;
  2155. }
  2156. else
  2157. *len = tlen;
  2158. }
  2159. }
  2160. break;
  2161. case NT_STR:
  2162. {
  2163. StrNode* sn = NSTR(node);
  2164. UChar *s = sn->s;
  2165. while (s < sn->end) {
  2166. s += enclen(reg->enc, s, sn->end);
  2167. (*len)++;
  2168. }
  2169. }
  2170. break;
  2171. case NT_QTFR:
  2172. {
  2173. QtfrNode* qn = NQTFR(node);
  2174. if (qn->lower == qn->upper) {
  2175. r = get_char_length_tree1(qn->target, reg, &tlen, level);
  2176. if (r == 0)
  2177. *len = (int )distance_multiply(tlen, qn->lower);
  2178. }
  2179. else
  2180. r = GET_CHAR_LEN_VARLEN;
  2181. }
  2182. break;
  2183. #ifdef USE_SUBEXP_CALL
  2184. case NT_CALL:
  2185. if (! IS_CALL_RECURSION(NCALL(node)))
  2186. r = get_char_length_tree1(NCALL(node)->target, reg, len, level);
  2187. else
  2188. r = GET_CHAR_LEN_VARLEN;
  2189. break;
  2190. #endif
  2191. case NT_CTYPE:
  2192. *len = 1;
  2193. break;
  2194. case NT_CCLASS:
  2195. case NT_CANY:
  2196. *len = 1;
  2197. break;
  2198. case NT_ENCLOSE:
  2199. {
  2200. EncloseNode* en = NENCLOSE(node);
  2201. switch (en->type) {
  2202. case ENCLOSE_MEMORY:
  2203. #ifdef USE_SUBEXP_CALL
  2204. if (IS_ENCLOSE_CLEN_FIXED(en))
  2205. *len = en->char_len;
  2206. else {
  2207. r = get_char_length_tree1(en->target, reg, len, level);
  2208. if (r == 0) {
  2209. en->char_len = *len;
  2210. SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED);
  2211. }
  2212. }
  2213. break;
  2214. #endif
  2215. case ENCLOSE_OPTION:
  2216. case ENCLOSE_STOP_BACKTRACK:
  2217. case ENCLOSE_CONDITION:
  2218. r = get_char_length_tree1(en->target, reg, len, level);
  2219. break;
  2220. case ENCLOSE_ABSENT:
  2221. default:
  2222. break;
  2223. }
  2224. }
  2225. break;
  2226. case NT_ANCHOR:
  2227. break;
  2228. default:
  2229. r = GET_CHAR_LEN_VARLEN;
  2230. break;
  2231. }
  2232. return r;
  2233. }
  2234. static int
  2235. get_char_length_tree(Node* node, regex_t* reg, int* len)
  2236. {
  2237. return get_char_length_tree1(node, reg, len, 0);
  2238. }
  2239. /* x is not included y ==> 1 : 0 */
  2240. static int
  2241. is_not_included(Node* x, Node* y, regex_t* reg)
  2242. {
  2243. int i;
  2244. OnigDistance len;
  2245. OnigCodePoint code;
  2246. UChar *p;
  2247. int ytype;
  2248. retry:
  2249. ytype = NTYPE(y);
  2250. switch (NTYPE(x)) {
  2251. case NT_CTYPE:
  2252. {
  2253. switch (ytype) {
  2254. case NT_CTYPE:
  2255. if (NCTYPE(y)->ctype == NCTYPE(x)->ctype &&
  2256. NCTYPE(y)->not != NCTYPE(x)->not &&
  2257. NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range)
  2258. return 1;
  2259. else
  2260. return 0;
  2261. break;
  2262. case NT_CCLASS:
  2263. swap:
  2264. {
  2265. Node* tmp;
  2266. tmp = x; x = y; y = tmp;
  2267. goto retry;
  2268. }
  2269. break;
  2270. case NT_STR:
  2271. goto swap;
  2272. break;
  2273. default:
  2274. break;
  2275. }
  2276. }
  2277. break;
  2278. case NT_CCLASS:
  2279. {
  2280. CClassNode* xc = NCCLASS(x);
  2281. switch (ytype) {
  2282. case NT_CTYPE:
  2283. switch (NCTYPE(y)->ctype) {
  2284. case ONIGENC_CTYPE_WORD:
  2285. if (NCTYPE(y)->not == 0) {
  2286. if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) {
  2287. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2288. if (BITSET_AT(xc->bs, i)) {
  2289. if (NCTYPE(y)->ascii_range) {
  2290. if (IS_CODE_SB_WORD(reg->enc, i)) return 0;
  2291. }
  2292. else {
  2293. if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0;
  2294. }
  2295. }
  2296. }
  2297. return 1;
  2298. }
  2299. return 0;
  2300. }
  2301. else {
  2302. if (IS_NOT_NULL(xc->mbuf)) return 0;
  2303. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2304. int is_word;
  2305. if (NCTYPE(y)->ascii_range)
  2306. is_word = IS_CODE_SB_WORD(reg->enc, i);
  2307. else
  2308. is_word = ONIGENC_IS_CODE_WORD(reg->enc, i);
  2309. if (! is_word) {
  2310. if (!IS_NCCLASS_NOT(xc)) {
  2311. if (BITSET_AT(xc->bs, i))
  2312. return 0;
  2313. }
  2314. else {
  2315. if (! BITSET_AT(xc->bs, i))
  2316. return 0;
  2317. }
  2318. }
  2319. }
  2320. return 1;
  2321. }
  2322. break;
  2323. default:
  2324. break;
  2325. }
  2326. break;
  2327. case NT_CCLASS:
  2328. {
  2329. int v;
  2330. CClassNode* yc = NCCLASS(y);
  2331. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  2332. v = BITSET_AT(xc->bs, i);
  2333. if ((v != 0 && !IS_NCCLASS_NOT(xc)) ||
  2334. (v == 0 && IS_NCCLASS_NOT(xc))) {
  2335. v = BITSET_AT(yc->bs, i);
  2336. if ((v != 0 && !IS_NCCLASS_NOT(yc)) ||
  2337. (v == 0 && IS_NCCLASS_NOT(yc)))
  2338. return 0;
  2339. }
  2340. }
  2341. if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) ||
  2342. (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc)))
  2343. return 1;
  2344. return 0;
  2345. }
  2346. break;
  2347. case NT_STR:
  2348. goto swap;
  2349. break;
  2350. default:
  2351. break;
  2352. }
  2353. }
  2354. break;
  2355. case NT_STR:
  2356. {
  2357. StrNode* xs = NSTR(x);
  2358. if (NSTRING_LEN(x) == 0)
  2359. break;
  2360. switch (ytype) {
  2361. case NT_CTYPE:
  2362. switch (NCTYPE(y)->ctype) {
  2363. case ONIGENC_CTYPE_WORD:
  2364. if (NCTYPE(y)->ascii_range) {
  2365. if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end))
  2366. return NCTYPE(y)->not;
  2367. else
  2368. return !(NCTYPE(y)->not);
  2369. }
  2370. else {
  2371. if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end))
  2372. return NCTYPE(y)->not;
  2373. else
  2374. return !(NCTYPE(y)->not);
  2375. }
  2376. break;
  2377. default:
  2378. break;
  2379. }
  2380. break;
  2381. case NT_CCLASS:
  2382. {
  2383. CClassNode* cc = NCCLASS(y);
  2384. code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
  2385. xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
  2386. return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
  2387. }
  2388. break;
  2389. case NT_STR:
  2390. {
  2391. UChar *q;
  2392. StrNode* ys = NSTR(y);
  2393. len = NSTRING_LEN(x);
  2394. if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
  2395. if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
  2396. /* tiny version */
  2397. return 0;
  2398. }
  2399. else {
  2400. for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) {
  2401. if (*p != *q) return 1;
  2402. }
  2403. }
  2404. }
  2405. break;
  2406. default:
  2407. break;
  2408. }
  2409. }
  2410. break;
  2411. default:
  2412. break;
  2413. }
  2414. return 0;
  2415. }
  2416. static Node*
  2417. get_head_value_node(Node* node, int exact, regex_t* reg)
  2418. {
  2419. Node* n = NULL_NODE;
  2420. switch (NTYPE(node)) {
  2421. case NT_BREF:
  2422. case NT_ALT:
  2423. case NT_CANY:
  2424. #ifdef USE_SUBEXP_CALL
  2425. case NT_CALL:
  2426. #endif
  2427. break;
  2428. case NT_CTYPE:
  2429. case NT_CCLASS:
  2430. if (exact == 0) {
  2431. n = node;
  2432. }
  2433. break;
  2434. case NT_LIST:
  2435. n = get_head_value_node(NCAR(node), exact, reg);
  2436. break;
  2437. case NT_STR:
  2438. {
  2439. StrNode* sn = NSTR(node);
  2440. if (sn->end <= sn->s)
  2441. break;
  2442. if (exact != 0 &&
  2443. !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
  2444. }
  2445. else {
  2446. n = node;
  2447. }
  2448. }
  2449. break;
  2450. case NT_QTFR:
  2451. {
  2452. QtfrNode* qn = NQTFR(node);
  2453. if (qn->lower > 0) {
  2454. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  2455. if (IS_NOT_NULL(qn->head_exact))
  2456. n = qn->head_exact;
  2457. else
  2458. #endif
  2459. n = get_head_value_node(qn->target, exact, reg);
  2460. }
  2461. }
  2462. break;
  2463. case NT_ENCLOSE:
  2464. {
  2465. EncloseNode* en = NENCLOSE(node);
  2466. switch (en->type) {
  2467. case ENCLOSE_OPTION:
  2468. {
  2469. OnigOptionType options = reg->options;
  2470. reg->options = NENCLOSE(node)->option;
  2471. n = get_head_value_node(NENCLOSE(node)->target, exact, reg);
  2472. reg->options = options;
  2473. }
  2474. break;
  2475. case ENCLOSE_MEMORY:
  2476. case ENCLOSE_STOP_BACKTRACK:
  2477. case ENCLOSE_CONDITION:
  2478. n = get_head_value_node(en->target, exact, reg);
  2479. break;
  2480. case ENCLOSE_ABSENT:
  2481. break;
  2482. }
  2483. }
  2484. break;
  2485. case NT_ANCHOR:
  2486. if (NANCHOR(node)->type == ANCHOR_PREC_READ)
  2487. n = get_head_value_node(NANCHOR(node)->target, exact, reg);
  2488. break;
  2489. default:
  2490. break;
  2491. }
  2492. return n;
  2493. }
  2494. static int
  2495. check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask)
  2496. {
  2497. int type, r = 0;
  2498. type = NTYPE(node);
  2499. if ((NTYPE2BIT(type) & type_mask) == 0)
  2500. return 1;
  2501. switch (type) {
  2502. case NT_LIST:
  2503. case NT_ALT:
  2504. do {
  2505. r = check_type_tree(NCAR(node), type_mask, enclose_mask,
  2506. anchor_mask);
  2507. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2508. break;
  2509. case NT_QTFR:
  2510. r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask,
  2511. anchor_mask);
  2512. break;
  2513. case NT_ENCLOSE:
  2514. {
  2515. EncloseNode* en = NENCLOSE(node);
  2516. if ((en->type & enclose_mask) == 0)
  2517. return 1;
  2518. r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask);
  2519. }
  2520. break;
  2521. case NT_ANCHOR:
  2522. type = NANCHOR(node)->type;
  2523. if ((type & anchor_mask) == 0)
  2524. return 1;
  2525. if (NANCHOR(node)->target)
  2526. r = check_type_tree(NANCHOR(node)->target,
  2527. type_mask, enclose_mask, anchor_mask);
  2528. break;
  2529. default:
  2530. break;
  2531. }
  2532. return r;
  2533. }
  2534. #ifdef USE_SUBEXP_CALL
  2535. # define RECURSION_EXIST 1
  2536. # define RECURSION_INFINITE 2
  2537. static int
  2538. subexp_inf_recursive_check(Node* node, ScanEnv* env, int head)
  2539. {
  2540. int type;
  2541. int r = 0;
  2542. type = NTYPE(node);
  2543. switch (type) {
  2544. case NT_LIST:
  2545. {
  2546. Node *x;
  2547. OnigDistance min;
  2548. int ret;
  2549. x = node;
  2550. do {
  2551. ret = subexp_inf_recursive_check(NCAR(x), env, head);
  2552. if (ret < 0 || ret == RECURSION_INFINITE) return ret;
  2553. r |= ret;
  2554. if (head) {
  2555. ret = get_min_match_length(NCAR(x), &min, env);
  2556. if (ret != 0) return ret;
  2557. if (min != 0) head = 0;
  2558. }
  2559. } while (IS_NOT_NULL(x = NCDR(x)));
  2560. }
  2561. break;
  2562. case NT_ALT:
  2563. {
  2564. int ret;
  2565. r = RECURSION_EXIST;
  2566. do {
  2567. ret = subexp_inf_recursive_check(NCAR(node), env, head);
  2568. if (ret < 0 || ret == RECURSION_INFINITE) return ret;
  2569. r &= ret;
  2570. } while (IS_NOT_NULL(node = NCDR(node)));
  2571. }
  2572. break;
  2573. case NT_QTFR:
  2574. r = subexp_inf_recursive_check(NQTFR(node)->target, env, head);
  2575. if (r == RECURSION_EXIST) {
  2576. if (NQTFR(node)->lower == 0) r = 0;
  2577. }
  2578. break;
  2579. case NT_ANCHOR:
  2580. {
  2581. AnchorNode* an = NANCHOR(node);
  2582. switch (an->type) {
  2583. case ANCHOR_PREC_READ:
  2584. case ANCHOR_PREC_READ_NOT:
  2585. case ANCHOR_LOOK_BEHIND:
  2586. case ANCHOR_LOOK_BEHIND_NOT:
  2587. r = subexp_inf_recursive_check(an->target, env, head);
  2588. break;
  2589. }
  2590. }
  2591. break;
  2592. case NT_CALL:
  2593. r = subexp_inf_recursive_check(NCALL(node)->target, env, head);
  2594. break;
  2595. case NT_ENCLOSE:
  2596. if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
  2597. return 0;
  2598. else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  2599. return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE);
  2600. else {
  2601. SET_ENCLOSE_STATUS(node, NST_MARK2);
  2602. r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head);
  2603. CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
  2604. }
  2605. break;
  2606. default:
  2607. break;
  2608. }
  2609. return r;
  2610. }
  2611. static int
  2612. subexp_inf_recursive_check_trav(Node* node, ScanEnv* env)
  2613. {
  2614. int type;
  2615. int r = 0;
  2616. type = NTYPE(node);
  2617. switch (type) {
  2618. case NT_LIST:
  2619. case NT_ALT:
  2620. do {
  2621. r = subexp_inf_recursive_check_trav(NCAR(node), env);
  2622. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2623. break;
  2624. case NT_QTFR:
  2625. r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env);
  2626. break;
  2627. case NT_ANCHOR:
  2628. {
  2629. AnchorNode* an = NANCHOR(node);
  2630. switch (an->type) {
  2631. case ANCHOR_PREC_READ:
  2632. case ANCHOR_PREC_READ_NOT:
  2633. case ANCHOR_LOOK_BEHIND:
  2634. case ANCHOR_LOOK_BEHIND_NOT:
  2635. r = subexp_inf_recursive_check_trav(an->target, env);
  2636. break;
  2637. }
  2638. }
  2639. break;
  2640. case NT_ENCLOSE:
  2641. {
  2642. EncloseNode* en = NENCLOSE(node);
  2643. if (IS_ENCLOSE_RECURSION(en)) {
  2644. SET_ENCLOSE_STATUS(node, NST_MARK1);
  2645. r = subexp_inf_recursive_check(en->target, env, 1);
  2646. if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION;
  2647. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  2648. }
  2649. r = subexp_inf_recursive_check_trav(en->target, env);
  2650. }
  2651. break;
  2652. default:
  2653. break;
  2654. }
  2655. return r;
  2656. }
  2657. static int
  2658. subexp_recursive_check(Node* node)
  2659. {
  2660. int r = 0;
  2661. switch (NTYPE(node)) {
  2662. case NT_LIST:
  2663. case NT_ALT:
  2664. do {
  2665. r |= subexp_recursive_check(NCAR(node));
  2666. } while (IS_NOT_NULL(node = NCDR(node)));
  2667. break;
  2668. case NT_QTFR:
  2669. r = subexp_recursive_check(NQTFR(node)->target);
  2670. break;
  2671. case NT_ANCHOR:
  2672. {
  2673. AnchorNode* an = NANCHOR(node);
  2674. switch (an->type) {
  2675. case ANCHOR_PREC_READ:
  2676. case ANCHOR_PREC_READ_NOT:
  2677. case ANCHOR_LOOK_BEHIND:
  2678. case ANCHOR_LOOK_BEHIND_NOT:
  2679. r = subexp_recursive_check(an->target);
  2680. break;
  2681. }
  2682. }
  2683. break;
  2684. case NT_CALL:
  2685. r = subexp_recursive_check(NCALL(node)->target);
  2686. if (r != 0) SET_CALL_RECURSION(node);
  2687. break;
  2688. case NT_ENCLOSE:
  2689. if (IS_ENCLOSE_MARK2(NENCLOSE(node)))
  2690. return 0;
  2691. else if (IS_ENCLOSE_MARK1(NENCLOSE(node)))
  2692. return 1; /* recursion */
  2693. else {
  2694. SET_ENCLOSE_STATUS(node, NST_MARK2);
  2695. r = subexp_recursive_check(NENCLOSE(node)->target);
  2696. CLEAR_ENCLOSE_STATUS(node, NST_MARK2);
  2697. }
  2698. break;
  2699. default:
  2700. break;
  2701. }
  2702. return r;
  2703. }
  2704. static int
  2705. subexp_recursive_check_trav(Node* node, ScanEnv* env)
  2706. {
  2707. # define FOUND_CALLED_NODE 1
  2708. int type;
  2709. int r = 0;
  2710. type = NTYPE(node);
  2711. switch (type) {
  2712. case NT_LIST:
  2713. case NT_ALT:
  2714. {
  2715. int ret;
  2716. do {
  2717. ret = subexp_recursive_check_trav(NCAR(node), env);
  2718. if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE;
  2719. else if (ret < 0) return ret;
  2720. } while (IS_NOT_NULL(node = NCDR(node)));
  2721. }
  2722. break;
  2723. case NT_QTFR:
  2724. r = subexp_recursive_check_trav(NQTFR(node)->target, env);
  2725. if (NQTFR(node)->upper == 0) {
  2726. if (r == FOUND_CALLED_NODE)
  2727. NQTFR(node)->is_referred = 1;
  2728. }
  2729. break;
  2730. case NT_ANCHOR:
  2731. {
  2732. AnchorNode* an = NANCHOR(node);
  2733. switch (an->type) {
  2734. case ANCHOR_PREC_READ:
  2735. case ANCHOR_PREC_READ_NOT:
  2736. case ANCHOR_LOOK_BEHIND:
  2737. case ANCHOR_LOOK_BEHIND_NOT:
  2738. r = subexp_recursive_check_trav(an->target, env);
  2739. break;
  2740. }
  2741. }
  2742. break;
  2743. case NT_ENCLOSE:
  2744. {
  2745. EncloseNode* en = NENCLOSE(node);
  2746. if (! IS_ENCLOSE_RECURSION(en)) {
  2747. if (IS_ENCLOSE_CALLED(en)) {
  2748. SET_ENCLOSE_STATUS(node, NST_MARK1);
  2749. r = subexp_recursive_check(en->target);
  2750. if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION);
  2751. CLEAR_ENCLOSE_STATUS(node, NST_MARK1);
  2752. }
  2753. }
  2754. r = subexp_recursive_check_trav(en->target, env);
  2755. if (IS_ENCLOSE_CALLED(en))
  2756. r |= FOUND_CALLED_NODE;
  2757. }
  2758. break;
  2759. default:
  2760. break;
  2761. }
  2762. return r;
  2763. }
  2764. static int
  2765. setup_subexp_call(Node* node, ScanEnv* env)
  2766. {
  2767. int type;
  2768. int r = 0;
  2769. type = NTYPE(node);
  2770. switch (type) {
  2771. case NT_LIST:
  2772. do {
  2773. r = setup_subexp_call(NCAR(node), env);
  2774. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2775. break;
  2776. case NT_ALT:
  2777. do {
  2778. r = setup_subexp_call(NCAR(node), env);
  2779. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  2780. break;
  2781. case NT_QTFR:
  2782. r = setup_subexp_call(NQTFR(node)->target, env);
  2783. break;
  2784. case NT_ENCLOSE:
  2785. r = setup_subexp_call(NENCLOSE(node)->target, env);
  2786. break;
  2787. case NT_CALL:
  2788. {
  2789. CallNode* cn = NCALL(node);
  2790. Node** nodes = SCANENV_MEM_NODES(env);
  2791. if (cn->group_num != 0) {
  2792. int gnum = cn->group_num;
  2793. # ifdef USE_NAMED_GROUP
  2794. if (env->num_named > 0 &&
  2795. IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  2796. !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
  2797. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  2798. }
  2799. # endif
  2800. if (gnum > env->num_mem) {
  2801. onig_scan_env_set_error_string(env,
  2802. ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end);
  2803. return ONIGERR_UNDEFINED_GROUP_REFERENCE;
  2804. }
  2805. # ifdef USE_NAMED_GROUP
  2806. set_call_attr:
  2807. # endif
  2808. cn->target = nodes[cn->group_num];
  2809. if (IS_NULL(cn->target)) {
  2810. onig_scan_env_set_error_string(env,
  2811. ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
  2812. return ONIGERR_UNDEFINED_NAME_REFERENCE;
  2813. }
  2814. SET_ENCLOSE_STATUS(cn->target, NST_CALLED);
  2815. BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num);
  2816. cn->unset_addr_list = env->unset_addr_list;
  2817. }
  2818. # ifdef USE_NAMED_GROUP
  2819. # ifdef USE_PERL_SUBEXP_CALL
  2820. else if (cn->name == cn->name_end) {
  2821. goto set_call_attr;
  2822. }
  2823. # endif
  2824. else {
  2825. int *refs;
  2826. int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end,
  2827. &refs);
  2828. if (n <= 0) {
  2829. onig_scan_env_set_error_string(env,
  2830. ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end);
  2831. return ONIGERR_UNDEFINED_NAME_REFERENCE;
  2832. }
  2833. else if (n > 1 &&
  2834. ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) {
  2835. onig_scan_env_set_error_string(env,
  2836. ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end);
  2837. return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL;
  2838. }
  2839. else {
  2840. cn->group_num = refs[0];
  2841. goto set_call_attr;
  2842. }
  2843. }
  2844. # endif
  2845. }
  2846. break;
  2847. case NT_ANCHOR:
  2848. {
  2849. AnchorNode* an = NANCHOR(node);
  2850. switch (an->type) {
  2851. case ANCHOR_PREC_READ:
  2852. case ANCHOR_PREC_READ_NOT:
  2853. case ANCHOR_LOOK_BEHIND:
  2854. case ANCHOR_LOOK_BEHIND_NOT:
  2855. r = setup_subexp_call(an->target, env);
  2856. break;
  2857. }
  2858. }
  2859. break;
  2860. default:
  2861. break;
  2862. }
  2863. return r;
  2864. }
  2865. #endif
  2866. /* divide different length alternatives in look-behind.
  2867. (?<=A|B) ==> (?<=A)|(?<=B)
  2868. (?<!A|B) ==> (?<!A)(?<!B)
  2869. */
  2870. static int
  2871. divide_look_behind_alternatives(Node* node)
  2872. {
  2873. Node *head, *np, *insert_node;
  2874. AnchorNode* an = NANCHOR(node);
  2875. int anc_type = an->type;
  2876. head = an->target;
  2877. np = NCAR(head);
  2878. swap_node(node, head);
  2879. NCAR(node) = head;
  2880. NANCHOR(head)->target = np;
  2881. np = node;
  2882. while ((np = NCDR(np)) != NULL_NODE) {
  2883. insert_node = onig_node_new_anchor(anc_type);
  2884. CHECK_NULL_RETURN_MEMERR(insert_node);
  2885. NANCHOR(insert_node)->target = NCAR(np);
  2886. NCAR(np) = insert_node;
  2887. }
  2888. if (anc_type == ANCHOR_LOOK_BEHIND_NOT) {
  2889. np = node;
  2890. do {
  2891. SET_NTYPE(np, NT_LIST); /* alt -> list */
  2892. } while ((np = NCDR(np)) != NULL_NODE);
  2893. }
  2894. return 0;
  2895. }
  2896. static int
  2897. setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
  2898. {
  2899. int r, len;
  2900. AnchorNode* an = NANCHOR(node);
  2901. r = get_char_length_tree(an->target, reg, &len);
  2902. if (r == 0)
  2903. an->char_len = len;
  2904. else if (r == GET_CHAR_LEN_VARLEN)
  2905. r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  2906. else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) {
  2907. if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND))
  2908. r = divide_look_behind_alternatives(node);
  2909. else
  2910. r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  2911. }
  2912. return r;
  2913. }
  2914. static int
  2915. next_setup(Node* node, Node* next_node, regex_t* reg)
  2916. {
  2917. int type;
  2918. retry:
  2919. type = NTYPE(node);
  2920. if (type == NT_QTFR) {
  2921. QtfrNode* qn = NQTFR(node);
  2922. if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) {
  2923. #ifdef USE_QTFR_PEEK_NEXT
  2924. Node* n = get_head_value_node(next_node, 1, reg);
  2925. /* '\0': for UTF-16BE etc... */
  2926. if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') {
  2927. qn->next_head_exact = n;
  2928. }
  2929. #endif
  2930. /* automatic possessification a*b ==> (?>a*)b */
  2931. if (qn->lower <= 1) {
  2932. int ttype = NTYPE(qn->target);
  2933. if (IS_NODE_TYPE_SIMPLE(ttype)) {
  2934. Node *x, *y;
  2935. x = get_head_value_node(qn->target, 0, reg);
  2936. if (IS_NOT_NULL(x)) {
  2937. y = get_head_value_node(next_node, 0, reg);
  2938. if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
  2939. Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK);
  2940. CHECK_NULL_RETURN_MEMERR(en);
  2941. SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
  2942. swap_node(node, en);
  2943. NENCLOSE(node)->target = en;
  2944. }
  2945. }
  2946. }
  2947. }
  2948. }
  2949. }
  2950. else if (type == NT_ENCLOSE) {
  2951. EncloseNode* en = NENCLOSE(node);
  2952. if (en->type == ENCLOSE_MEMORY) {
  2953. node = en->target;
  2954. goto retry;
  2955. }
  2956. }
  2957. return 0;
  2958. }
  2959. static int
  2960. update_string_node_case_fold(regex_t* reg, Node *node)
  2961. {
  2962. UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
  2963. UChar *sbuf, *ebuf, *sp;
  2964. int r, i, len;
  2965. OnigDistance sbuf_size;
  2966. StrNode* sn = NSTR(node);
  2967. end = sn->end;
  2968. sbuf_size = (end - sn->s) * 2;
  2969. sbuf = (UChar* )xmalloc(sbuf_size);
  2970. CHECK_NULL_RETURN_MEMERR(sbuf);
  2971. ebuf = sbuf + sbuf_size;
  2972. sp = sbuf;
  2973. p = sn->s;
  2974. while (p < end) {
  2975. len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf);
  2976. for (i = 0; i < len; i++) {
  2977. if (sp >= ebuf) {
  2978. UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2);
  2979. if (IS_NULL(p)) {
  2980. xfree(sbuf);
  2981. return ONIGERR_MEMORY;
  2982. }
  2983. sbuf = p;
  2984. sp = sbuf + sbuf_size;
  2985. sbuf_size *= 2;
  2986. ebuf = sbuf + sbuf_size;
  2987. }
  2988. *sp++ = buf[i];
  2989. }
  2990. }
  2991. r = onig_node_str_set(node, sbuf, sp);
  2992. xfree(sbuf);
  2993. return r;
  2994. }
  2995. static int
  2996. expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end,
  2997. regex_t* reg)
  2998. {
  2999. int r;
  3000. Node *node;
  3001. node = onig_node_new_str(s, end);
  3002. if (IS_NULL(node)) return ONIGERR_MEMORY;
  3003. r = update_string_node_case_fold(reg, node);
  3004. if (r != 0) {
  3005. onig_node_free(node);
  3006. return r;
  3007. }
  3008. NSTRING_SET_AMBIG(node);
  3009. NSTRING_SET_DONT_GET_OPT_INFO(node);
  3010. *rnode = node;
  3011. return 0;
  3012. }
  3013. static int
  3014. is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[],
  3015. int slen)
  3016. {
  3017. int i;
  3018. for (i = 0; i < item_num; i++) {
  3019. if (items[i].byte_len != slen) {
  3020. return 1;
  3021. }
  3022. if (items[i].code_len != 1) {
  3023. return 1;
  3024. }
  3025. }
  3026. return 0;
  3027. }
  3028. static int
  3029. expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
  3030. UChar *p, int slen, UChar *end,
  3031. regex_t* reg, Node **rnode)
  3032. {
  3033. int r, i, j, len, varlen;
  3034. Node *anode, *var_anode, *snode, *xnode, *an;
  3035. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  3036. *rnode = var_anode = NULL_NODE;
  3037. varlen = 0;
  3038. for (i = 0; i < item_num; i++) {
  3039. if (items[i].byte_len != slen) {
  3040. varlen = 1;
  3041. break;
  3042. }
  3043. }
  3044. if (varlen != 0) {
  3045. *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  3046. if (IS_NULL(var_anode)) return ONIGERR_MEMORY;
  3047. xnode = onig_node_new_list(NULL, NULL);
  3048. if (IS_NULL(xnode)) goto mem_err;
  3049. NCAR(var_anode) = xnode;
  3050. anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  3051. if (IS_NULL(anode)) goto mem_err;
  3052. NCAR(xnode) = anode;
  3053. }
  3054. else {
  3055. *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE);
  3056. if (IS_NULL(anode)) return ONIGERR_MEMORY;
  3057. }
  3058. snode = onig_node_new_str(p, p + slen);
  3059. if (IS_NULL(snode)) goto mem_err;
  3060. NCAR(anode) = snode;
  3061. for (i = 0; i < item_num; i++) {
  3062. snode = onig_node_new_str(NULL, NULL);
  3063. if (IS_NULL(snode)) goto mem_err;
  3064. for (j = 0; j < items[i].code_len; j++) {
  3065. len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf);
  3066. if (len < 0) {
  3067. r = len;
  3068. goto mem_err2;
  3069. }
  3070. r = onig_node_str_cat(snode, buf, buf + len);
  3071. if (r != 0) goto mem_err2;
  3072. }
  3073. an = onig_node_new_alt(NULL_NODE, NULL_NODE);
  3074. if (IS_NULL(an)) {
  3075. goto mem_err2;
  3076. }
  3077. if (items[i].byte_len != slen) {
  3078. Node *rem;
  3079. UChar *q = p + items[i].byte_len;
  3080. if (q < end) {
  3081. r = expand_case_fold_make_rem_string(&rem, q, end, reg);
  3082. if (r != 0) {
  3083. onig_node_free(an);
  3084. goto mem_err2;
  3085. }
  3086. xnode = onig_node_list_add(NULL_NODE, snode);
  3087. if (IS_NULL(xnode)) {
  3088. onig_node_free(an);
  3089. onig_node_free(rem);
  3090. goto mem_err2;
  3091. }
  3092. if (IS_NULL(onig_node_list_add(xnode, rem))) {
  3093. onig_node_free(an);
  3094. onig_node_free(xnode);
  3095. onig_node_free(rem);
  3096. goto mem_err;
  3097. }
  3098. NCAR(an) = xnode;
  3099. }
  3100. else {
  3101. NCAR(an) = snode;
  3102. }
  3103. NCDR(var_anode) = an;
  3104. var_anode = an;
  3105. }
  3106. else {
  3107. NCAR(an) = snode;
  3108. NCDR(anode) = an;
  3109. anode = an;
  3110. }
  3111. }
  3112. return varlen;
  3113. mem_err2:
  3114. onig_node_free(snode);
  3115. mem_err:
  3116. onig_node_free(*rnode);
  3117. return ONIGERR_MEMORY;
  3118. }
  3119. static int
  3120. expand_case_fold_string(Node* node, regex_t* reg)
  3121. {
  3122. #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
  3123. int r, n, len, alt_num;
  3124. int varlen = 0;
  3125. UChar *start, *end, *p;
  3126. Node *top_root, *root, *snode, *prev_node;
  3127. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  3128. StrNode* sn = NSTR(node);
  3129. if (NSTRING_IS_AMBIG(node)) return 0;
  3130. start = sn->s;
  3131. end = sn->end;
  3132. if (start >= end) return 0;
  3133. r = 0;
  3134. top_root = root = prev_node = snode = NULL_NODE;
  3135. alt_num = 1;
  3136. p = start;
  3137. while (p < end) {
  3138. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag,
  3139. p, end, items);
  3140. if (n < 0) {
  3141. r = n;
  3142. goto err;
  3143. }
  3144. len = enclen(reg->enc, p, end);
  3145. varlen = is_case_fold_variable_len(n, items, len);
  3146. if (n == 0 || varlen == 0) {
  3147. if (IS_NULL(snode)) {
  3148. if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
  3149. onig_node_free(top_root);
  3150. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  3151. if (IS_NULL(root)) {
  3152. onig_node_free(prev_node);
  3153. goto mem_err;
  3154. }
  3155. }
  3156. prev_node = snode = onig_node_new_str(NULL, NULL);
  3157. if (IS_NULL(snode)) goto mem_err;
  3158. if (IS_NOT_NULL(root)) {
  3159. if (IS_NULL(onig_node_list_add(root, snode))) {
  3160. onig_node_free(snode);
  3161. goto mem_err;
  3162. }
  3163. }
  3164. }
  3165. r = onig_node_str_cat(snode, p, p + len);
  3166. if (r != 0) goto err;
  3167. }
  3168. else {
  3169. alt_num *= (n + 1);
  3170. if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break;
  3171. if (IS_NOT_NULL(snode)) {
  3172. r = update_string_node_case_fold(reg, snode);
  3173. if (r == 0) {
  3174. NSTRING_SET_AMBIG(snode);
  3175. }
  3176. }
  3177. if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
  3178. onig_node_free(top_root);
  3179. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  3180. if (IS_NULL(root)) {
  3181. onig_node_free(prev_node);
  3182. goto mem_err;
  3183. }
  3184. }
  3185. r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node);
  3186. if (r < 0) goto mem_err;
  3187. if (r == 1) {
  3188. if (IS_NULL(root)) {
  3189. top_root = prev_node;
  3190. }
  3191. else {
  3192. if (IS_NULL(onig_node_list_add(root, prev_node))) {
  3193. onig_node_free(prev_node);
  3194. goto mem_err;
  3195. }
  3196. }
  3197. root = NCAR(prev_node);
  3198. }
  3199. else { /* r == 0 */
  3200. if (IS_NOT_NULL(root)) {
  3201. if (IS_NULL(onig_node_list_add(root, prev_node))) {
  3202. onig_node_free(prev_node);
  3203. goto mem_err;
  3204. }
  3205. }
  3206. }
  3207. snode = NULL_NODE;
  3208. }
  3209. p += len;
  3210. }
  3211. if (IS_NOT_NULL(snode)) {
  3212. r = update_string_node_case_fold(reg, snode);
  3213. if (r == 0) {
  3214. NSTRING_SET_AMBIG(snode);
  3215. }
  3216. }
  3217. if (p < end) {
  3218. Node *srem;
  3219. r = expand_case_fold_make_rem_string(&srem, p, end, reg);
  3220. if (r != 0) goto mem_err;
  3221. if (IS_NOT_NULL(prev_node) && IS_NULL(root)) {
  3222. onig_node_free(top_root);
  3223. top_root = root = onig_node_list_add(NULL_NODE, prev_node);
  3224. if (IS_NULL(root)) {
  3225. onig_node_free(srem);
  3226. onig_node_free(prev_node);
  3227. goto mem_err;
  3228. }
  3229. }
  3230. if (IS_NULL(root)) {
  3231. prev_node = srem;
  3232. }
  3233. else {
  3234. if (IS_NULL(onig_node_list_add(root, srem))) {
  3235. onig_node_free(srem);
  3236. goto mem_err;
  3237. }
  3238. }
  3239. }
  3240. /* ending */
  3241. top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node);
  3242. swap_node(node, top_root);
  3243. onig_node_free(top_root);
  3244. return 0;
  3245. mem_err:
  3246. r = ONIGERR_MEMORY;
  3247. err:
  3248. onig_node_free(top_root);
  3249. return r;
  3250. }
  3251. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  3252. # define CEC_THRES_NUM_BIG_REPEAT 512
  3253. # define CEC_INFINITE_NUM 0x7fffffff
  3254. # define CEC_IN_INFINITE_REPEAT (1<<0)
  3255. # define CEC_IN_FINITE_REPEAT (1<<1)
  3256. # define CEC_CONT_BIG_REPEAT (1<<2)
  3257. static int
  3258. setup_comb_exp_check(Node* node, int state, ScanEnv* env)
  3259. {
  3260. int type;
  3261. int r = state;
  3262. type = NTYPE(node);
  3263. switch (type) {
  3264. case NT_LIST:
  3265. {
  3266. do {
  3267. r = setup_comb_exp_check(NCAR(node), r, env);
  3268. } while (r >= 0 && IS_NOT_NULL(node = NCDR(node)));
  3269. }
  3270. break;
  3271. case NT_ALT:
  3272. {
  3273. int ret;
  3274. do {
  3275. ret = setup_comb_exp_check(NCAR(node), state, env);
  3276. r |= ret;
  3277. } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node)));
  3278. }
  3279. break;
  3280. case NT_QTFR:
  3281. {
  3282. int child_state = state;
  3283. int add_state = 0;
  3284. QtfrNode* qn = NQTFR(node);
  3285. Node* target = qn->target;
  3286. int var_num;
  3287. if (! IS_REPEAT_INFINITE(qn->upper)) {
  3288. if (qn->upper > 1) {
  3289. /* {0,1}, {1,1} are allowed */
  3290. child_state |= CEC_IN_FINITE_REPEAT;
  3291. /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */
  3292. if (env->backrefed_mem == 0) {
  3293. if (NTYPE(qn->target) == NT_ENCLOSE) {
  3294. EncloseNode* en = NENCLOSE(qn->target);
  3295. if (en->type == ENCLOSE_MEMORY) {
  3296. if (NTYPE(en->target) == NT_QTFR) {
  3297. QtfrNode* q = NQTFR(en->target);
  3298. if (IS_REPEAT_INFINITE(q->upper)
  3299. && q->greedy == qn->greedy) {
  3300. qn->upper = (qn->lower == 0 ? 1 : qn->lower);
  3301. if (qn->upper == 1)
  3302. child_state = state;
  3303. }
  3304. }
  3305. }
  3306. }
  3307. }
  3308. }
  3309. }
  3310. if (state & CEC_IN_FINITE_REPEAT) {
  3311. qn->comb_exp_check_num = -1;
  3312. }
  3313. else {
  3314. if (IS_REPEAT_INFINITE(qn->upper)) {
  3315. var_num = CEC_INFINITE_NUM;
  3316. child_state |= CEC_IN_INFINITE_REPEAT;
  3317. }
  3318. else {
  3319. var_num = qn->upper - qn->lower;
  3320. }
  3321. if (var_num >= CEC_THRES_NUM_BIG_REPEAT)
  3322. add_state |= CEC_CONT_BIG_REPEAT;
  3323. if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) ||
  3324. ((state & CEC_CONT_BIG_REPEAT) != 0 &&
  3325. var_num >= CEC_THRES_NUM_BIG_REPEAT)) {
  3326. if (qn->comb_exp_check_num == 0) {
  3327. env->num_comb_exp_check++;
  3328. qn->comb_exp_check_num = env->num_comb_exp_check;
  3329. if (env->curr_max_regnum > env->comb_exp_max_regnum)
  3330. env->comb_exp_max_regnum = env->curr_max_regnum;
  3331. }
  3332. }
  3333. }
  3334. r = setup_comb_exp_check(target, child_state, env);
  3335. r |= add_state;
  3336. }
  3337. break;
  3338. case NT_ENCLOSE:
  3339. {
  3340. EncloseNode* en = NENCLOSE(node);
  3341. switch (en->type) {
  3342. case ENCLOSE_MEMORY:
  3343. {
  3344. if (env->curr_max_regnum < en->regnum)
  3345. env->curr_max_regnum = en->regnum;
  3346. r = setup_comb_exp_check(en->target, state, env);
  3347. }
  3348. break;
  3349. default:
  3350. r = setup_comb_exp_check(en->target, state, env);
  3351. break;
  3352. }
  3353. }
  3354. break;
  3355. # ifdef USE_SUBEXP_CALL
  3356. case NT_CALL:
  3357. if (IS_CALL_RECURSION(NCALL(node)))
  3358. env->has_recursion = 1;
  3359. else
  3360. r = setup_comb_exp_check(NCALL(node)->target, state, env);
  3361. break;
  3362. # endif
  3363. default:
  3364. break;
  3365. }
  3366. return r;
  3367. }
  3368. #endif
  3369. #define IN_ALT (1<<0)
  3370. #define IN_NOT (1<<1)
  3371. #define IN_REPEAT (1<<2)
  3372. #define IN_VAR_REPEAT (1<<3)
  3373. #define IN_CALL (1<<4)
  3374. #define IN_RECCALL (1<<5)
  3375. /* setup_tree does the following work.
  3376. 1. check empty loop. (set qn->target_empty_info)
  3377. 2. expand ignore-case in char class.
  3378. 3. set memory status bit flags. (reg->mem_stats)
  3379. 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact].
  3380. 5. find invalid patterns in look-behind.
  3381. 6. expand repeated string.
  3382. */
  3383. static int
  3384. setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
  3385. {
  3386. int type;
  3387. int r = 0;
  3388. restart:
  3389. type = NTYPE(node);
  3390. switch (type) {
  3391. case NT_LIST:
  3392. {
  3393. Node* prev = NULL_NODE;
  3394. do {
  3395. r = setup_tree(NCAR(node), reg, state, env);
  3396. if (IS_NOT_NULL(prev) && r == 0) {
  3397. r = next_setup(prev, NCAR(node), reg);
  3398. }
  3399. prev = NCAR(node);
  3400. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  3401. }
  3402. break;
  3403. case NT_ALT:
  3404. do {
  3405. r = setup_tree(NCAR(node), reg, (state | IN_ALT), env);
  3406. } while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
  3407. break;
  3408. case NT_CCLASS:
  3409. break;
  3410. case NT_STR:
  3411. if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
  3412. r = expand_case_fold_string(node, reg);
  3413. }
  3414. break;
  3415. case NT_CTYPE:
  3416. case NT_CANY:
  3417. break;
  3418. #ifdef USE_SUBEXP_CALL
  3419. case NT_CALL:
  3420. break;
  3421. #endif
  3422. case NT_BREF:
  3423. {
  3424. int i;
  3425. int* p;
  3426. Node** nodes = SCANENV_MEM_NODES(env);
  3427. BRefNode* br = NBREF(node);
  3428. p = BACKREFS_P(br);
  3429. for (i = 0; i < br->back_num; i++) {
  3430. if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF;
  3431. BIT_STATUS_ON_AT(env->backrefed_mem, p[i]);
  3432. BIT_STATUS_ON_AT(env->bt_mem_start, p[i]);
  3433. #ifdef USE_BACKREF_WITH_LEVEL
  3434. if (IS_BACKREF_NEST_LEVEL(br)) {
  3435. BIT_STATUS_ON_AT(env->bt_mem_end, p[i]);
  3436. }
  3437. #endif
  3438. SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED);
  3439. }
  3440. }
  3441. break;
  3442. case NT_QTFR:
  3443. {
  3444. OnigDistance d;
  3445. QtfrNode* qn = NQTFR(node);
  3446. Node* target = qn->target;
  3447. if ((state & IN_REPEAT) != 0) {
  3448. qn->state |= NST_IN_REPEAT;
  3449. }
  3450. if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
  3451. r = get_min_match_length(target, &d, env);
  3452. if (r) break;
  3453. if (d == 0) {
  3454. qn->target_empty_info = NQ_TARGET_IS_EMPTY;
  3455. #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT
  3456. r = quantifiers_memory_node_info(target);
  3457. if (r < 0) break;
  3458. if (r > 0) {
  3459. qn->target_empty_info = r;
  3460. }
  3461. #endif
  3462. #if 0
  3463. r = get_max_match_length(target, &d, env);
  3464. if (r == 0 && d == 0) {
  3465. /* ()* ==> ()?, ()+ ==> () */
  3466. qn->upper = 1;
  3467. if (qn->lower > 1) qn->lower = 1;
  3468. if (NTYPE(target) == NT_STR) {
  3469. qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */
  3470. }
  3471. }
  3472. #endif
  3473. }
  3474. }
  3475. state |= IN_REPEAT;
  3476. if (qn->lower != qn->upper)
  3477. state |= IN_VAR_REPEAT;
  3478. r = setup_tree(target, reg, state, env);
  3479. if (r) break;
  3480. /* expand string */
  3481. #define EXPAND_STRING_MAX_LENGTH 100
  3482. if (NTYPE(target) == NT_STR) {
  3483. if (qn->lower > 1) {
  3484. int i, n = qn->lower;
  3485. OnigDistance len = NSTRING_LEN(target);
  3486. StrNode* sn = NSTR(target);
  3487. Node* np;
  3488. np = onig_node_new_str(sn->s, sn->end);
  3489. if (IS_NULL(np)) return ONIGERR_MEMORY;
  3490. NSTR(np)->flag = sn->flag;
  3491. for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) {
  3492. r = onig_node_str_cat(np, sn->s, sn->end);
  3493. if (r) {
  3494. onig_node_free(np);
  3495. return r;
  3496. }
  3497. }
  3498. if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) {
  3499. Node *np1, *np2;
  3500. qn->lower -= i;
  3501. if (! IS_REPEAT_INFINITE(qn->upper))
  3502. qn->upper -= i;
  3503. np1 = onig_node_new_list(np, NULL);
  3504. if (IS_NULL(np1)) {
  3505. onig_node_free(np);
  3506. return ONIGERR_MEMORY;
  3507. }
  3508. swap_node(np1, node);
  3509. np2 = onig_node_list_add(node, np1);
  3510. if (IS_NULL(np2)) {
  3511. onig_node_free(np1);
  3512. return ONIGERR_MEMORY;
  3513. }
  3514. }
  3515. else {
  3516. swap_node(np, node);
  3517. onig_node_free(np);
  3518. }
  3519. break; /* break case NT_QTFR: */
  3520. }
  3521. }
  3522. #ifdef USE_OP_PUSH_OR_JUMP_EXACT
  3523. if (qn->greedy && (qn->target_empty_info != 0)) {
  3524. if (NTYPE(target) == NT_QTFR) {
  3525. QtfrNode* tqn = NQTFR(target);
  3526. if (IS_NOT_NULL(tqn->head_exact)) {
  3527. qn->head_exact = tqn->head_exact;
  3528. tqn->head_exact = NULL;
  3529. }
  3530. }
  3531. else {
  3532. qn->head_exact = get_head_value_node(qn->target, 1, reg);
  3533. }
  3534. }
  3535. #endif
  3536. }
  3537. break;
  3538. case NT_ENCLOSE:
  3539. {
  3540. EncloseNode* en = NENCLOSE(node);
  3541. switch (en->type) {
  3542. case ENCLOSE_OPTION:
  3543. {
  3544. OnigOptionType options = reg->options;
  3545. reg->options = NENCLOSE(node)->option;
  3546. r = setup_tree(NENCLOSE(node)->target, reg, state, env);
  3547. reg->options = options;
  3548. }
  3549. break;
  3550. case ENCLOSE_MEMORY:
  3551. if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) {
  3552. BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
  3553. /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */
  3554. }
  3555. if (IS_ENCLOSE_CALLED(en))
  3556. state |= IN_CALL;
  3557. if (IS_ENCLOSE_RECURSION(en))
  3558. state |= IN_RECCALL;
  3559. else if ((state & IN_RECCALL) != 0)
  3560. SET_CALL_RECURSION(node);
  3561. r = setup_tree(en->target, reg, state, env);
  3562. break;
  3563. case ENCLOSE_STOP_BACKTRACK:
  3564. {
  3565. Node* target = en->target;
  3566. r = setup_tree(target, reg, state, env);
  3567. if (NTYPE(target) == NT_QTFR) {
  3568. QtfrNode* tqn = NQTFR(target);
  3569. if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 &&
  3570. tqn->greedy != 0) { /* (?>a*), a*+ etc... */
  3571. int qtype = NTYPE(tqn->target);
  3572. if (IS_NODE_TYPE_SIMPLE(qtype))
  3573. SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
  3574. }
  3575. }
  3576. }
  3577. break;
  3578. case ENCLOSE_CONDITION:
  3579. #ifdef USE_NAMED_GROUP
  3580. if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) &&
  3581. env->num_named > 0 &&
  3582. IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  3583. !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) {
  3584. return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED;
  3585. }
  3586. #endif
  3587. if (NENCLOSE(node)->regnum > env->num_mem)
  3588. return ONIGERR_INVALID_BACKREF;
  3589. r = setup_tree(NENCLOSE(node)->target, reg, state, env);
  3590. break;
  3591. case ENCLOSE_ABSENT:
  3592. r = setup_tree(NENCLOSE(node)->target, reg, state, env);
  3593. break;
  3594. }
  3595. }
  3596. break;
  3597. case NT_ANCHOR:
  3598. {
  3599. AnchorNode* an = NANCHOR(node);
  3600. switch (an->type) {
  3601. case ANCHOR_PREC_READ:
  3602. r = setup_tree(an->target, reg, state, env);
  3603. break;
  3604. case ANCHOR_PREC_READ_NOT:
  3605. r = setup_tree(an->target, reg, (state | IN_NOT), env);
  3606. break;
  3607. /* allowed node types in look-behind */
  3608. #define ALLOWED_TYPE_IN_LB \
  3609. ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \
  3610. BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL )
  3611. #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION )
  3612. #define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION
  3613. #define ALLOWED_ANCHOR_IN_LB \
  3614. ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
  3615. ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
  3616. ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
  3617. ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
  3618. #define ALLOWED_ANCHOR_IN_LB_NOT \
  3619. ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \
  3620. ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \
  3621. ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \
  3622. ANCHOR_WORD_BEGIN | ANCHOR_WORD_END )
  3623. case ANCHOR_LOOK_BEHIND:
  3624. {
  3625. r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
  3626. ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB);
  3627. if (r < 0) return r;
  3628. if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  3629. if (NTYPE(node) != NT_ANCHOR) goto restart;
  3630. r = setup_tree(an->target, reg, state, env);
  3631. if (r != 0) return r;
  3632. r = setup_look_behind(node, reg, env);
  3633. }
  3634. break;
  3635. case ANCHOR_LOOK_BEHIND_NOT:
  3636. {
  3637. r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB,
  3638. ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT);
  3639. if (r < 0) return r;
  3640. if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
  3641. if (NTYPE(node) != NT_ANCHOR) goto restart;
  3642. r = setup_tree(an->target, reg, (state | IN_NOT), env);
  3643. if (r != 0) return r;
  3644. r = setup_look_behind(node, reg, env);
  3645. }
  3646. break;
  3647. }
  3648. }
  3649. break;
  3650. default:
  3651. break;
  3652. }
  3653. return r;
  3654. }
  3655. #ifndef USE_SUNDAY_QUICK_SEARCH
  3656. /* set skip map for Boyer-Moore search */
  3657. static int
  3658. set_bm_skip(UChar* s, UChar* end, regex_t* reg,
  3659. UChar skip[], int** int_skip, int ignore_case)
  3660. {
  3661. OnigDistance i, len;
  3662. int clen, flen, n, j, k;
  3663. UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
  3664. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  3665. OnigEncoding enc = reg->enc;
  3666. len = end - s;
  3667. if (len < ONIG_CHAR_TABLE_SIZE) {
  3668. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len;
  3669. n = 0;
  3670. for (i = 0; i < len - 1; i += clen) {
  3671. p = s + i;
  3672. if (ignore_case)
  3673. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
  3674. p, end, items);
  3675. clen = enclen(enc, p, end);
  3676. if (p + clen > end)
  3677. clen = (int )(end - p);
  3678. for (j = 0; j < n; j++) {
  3679. if ((items[j].code_len != 1) || (items[j].byte_len != clen))
  3680. return 1; /* different length isn't supported. */
  3681. flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
  3682. if (flen != clen)
  3683. return 1; /* different length isn't supported. */
  3684. }
  3685. for (j = 0; j < clen; j++) {
  3686. skip[s[i + j]] = (UChar )(len - 1 - i - j);
  3687. for (k = 0; k < n; k++) {
  3688. skip[buf[k][j]] = (UChar )(len - 1 - i - j);
  3689. }
  3690. }
  3691. }
  3692. }
  3693. else {
  3694. # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
  3695. /* This should not happen. */
  3696. return ONIGERR_TYPE_BUG;
  3697. # else
  3698. if (IS_NULL(*int_skip)) {
  3699. *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
  3700. if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
  3701. }
  3702. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len;
  3703. n = 0;
  3704. for (i = 0; i < len - 1; i += clen) {
  3705. p = s + i;
  3706. if (ignore_case)
  3707. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
  3708. p, end, items);
  3709. clen = enclen(enc, p, end);
  3710. if (p + clen > end)
  3711. clen = (int )(end - p);
  3712. for (j = 0; j < n; j++) {
  3713. if ((items[j].code_len != 1) || (items[j].byte_len != clen))
  3714. return 1; /* different length isn't supported. */
  3715. flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
  3716. if (flen != clen)
  3717. return 1; /* different length isn't supported. */
  3718. }
  3719. for (j = 0; j < clen; j++) {
  3720. (*int_skip)[s[i + j]] = (int )(len - 1 - i - j);
  3721. for (k = 0; k < n; k++) {
  3722. (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j);
  3723. }
  3724. }
  3725. }
  3726. # endif
  3727. }
  3728. return 0;
  3729. }
  3730. #else /* USE_SUNDAY_QUICK_SEARCH */
  3731. /* set skip map for Sunday's quick search */
  3732. static int
  3733. set_bm_skip(UChar* s, UChar* end, regex_t* reg,
  3734. UChar skip[], int** int_skip, int ignore_case)
  3735. {
  3736. OnigDistance i, len;
  3737. int clen, flen, n, j, k;
  3738. UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
  3739. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  3740. OnigEncoding enc = reg->enc;
  3741. len = end - s;
  3742. if (len < ONIG_CHAR_TABLE_SIZE) {
  3743. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
  3744. n = 0;
  3745. for (i = 0; i < len; i += clen) {
  3746. p = s + i;
  3747. if (ignore_case)
  3748. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
  3749. p, end, items);
  3750. clen = enclen(enc, p, end);
  3751. if (p + clen > end)
  3752. clen = (int )(end - p);
  3753. for (j = 0; j < n; j++) {
  3754. if ((items[j].code_len != 1) || (items[j].byte_len != clen))
  3755. return 1; /* different length isn't supported. */
  3756. flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
  3757. if (flen != clen)
  3758. return 1; /* different length isn't supported. */
  3759. }
  3760. for (j = 0; j < clen; j++) {
  3761. skip[s[i + j]] = (UChar )(len - i - j);
  3762. for (k = 0; k < n; k++) {
  3763. skip[buf[k][j]] = (UChar )(len - i - j);
  3764. }
  3765. }
  3766. }
  3767. }
  3768. else {
  3769. # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
  3770. /* This should not happen. */
  3771. return ONIGERR_TYPE_BUG;
  3772. # else
  3773. if (IS_NULL(*int_skip)) {
  3774. *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
  3775. if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
  3776. }
  3777. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1);
  3778. n = 0;
  3779. for (i = 0; i < len; i += clen) {
  3780. p = s + i;
  3781. if (ignore_case)
  3782. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
  3783. p, end, items);
  3784. clen = enclen(enc, p, end);
  3785. if (p + clen > end)
  3786. clen = (int )(end - p);
  3787. for (j = 0; j < n; j++) {
  3788. if ((items[j].code_len != 1) || (items[j].byte_len != clen))
  3789. return 1; /* different length isn't supported. */
  3790. flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
  3791. if (flen != clen)
  3792. return 1; /* different length isn't supported. */
  3793. }
  3794. for (j = 0; j < clen; j++) {
  3795. (*int_skip)[s[i + j]] = (int )(len - i - j);
  3796. for (k = 0; k < n; k++) {
  3797. (*int_skip)[buf[k][j]] = (int )(len - i - j);
  3798. }
  3799. }
  3800. }
  3801. # endif
  3802. }
  3803. return 0;
  3804. }
  3805. #endif /* USE_SUNDAY_QUICK_SEARCH */
/* A pair of lower/upper byte-length bounds. */
typedef struct {
  OnigDistance min;  /* min byte length */
  OnigDistance max;  /* max byte length */
} MinMaxLen;
/* Environment handed to the optimization-info helpers; copied as a
   whole by copy_opt_env(). */
typedef struct {
  MinMaxLen        mmd;   /* NOTE(review): presumably the distance from the
                             pattern start - confirm against callers */
  OnigEncoding     enc;   /* encoding used to step over characters */
  OnigOptionType   options;
  OnigCaseFoldType case_fold_flag;
  ScanEnv*         scan_env;
} OptEnv;
/* Anchor flags collected for a node.  add_opt_anc_info() ORs a flag
   into left_anchor or right_anchor depending on is_left_anchor(). */
typedef struct {
  int left_anchor;
  int right_anchor;
} OptAncInfo;
/* An exact byte string (at most OPT_EXACT_MAXLEN bytes) gathered for
   search optimization. */
typedef struct {
  MinMaxLen mmd;  /* info position */
  OptAncInfo anc;

  int   reach_end;    /* cleared by concat_opt_exact_info() when the
                         appended string did not fit completely */
  int   ignore_case;  /* -1: unset, 0: case sensitive, 1: ignore case */
  int   len;          /* number of bytes stored in s[] */
  UChar s[OPT_EXACT_MAXLEN];
} OptExactInfo;
/* A byte map gathered for search optimization. */
typedef struct {
  MinMaxLen mmd;  /* info position */
  OptAncInfo anc;

  int   value;      /* weighted value */
  UChar map[ONIG_CHAR_TABLE_SIZE];  /* map[c] is set to 1 by
                                       add_char_opt_map_info() */
} OptMapInfo;
/* All optimization information kept for one node: its length bounds,
   anchors, three exact-string candidates and one byte map. */
typedef struct {
  MinMaxLen    len;

  OptAncInfo   anc;
  OptExactInfo exb;    /* boundary */
  OptExactInfo exm;    /* middle */
  OptExactInfo expr;   /* prec read (?=...) */

  OptMapInfo   map;   /* boundary */
} NodeOptInfo;
  3843. static int
  3844. map_position_value(OnigEncoding enc, int i)
  3845. {
  3846. static const short int ByteValTable[] = {
  3847. 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
  3848. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  3849. 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
  3850. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
  3851. 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  3852. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
  3853. 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  3854. 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
  3855. };
  3856. if (i < numberof(ByteValTable)) {
  3857. if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1)
  3858. return 20;
  3859. else
  3860. return (int )ByteValTable[i];
  3861. }
  3862. else
  3863. return 4; /* Take it easy. */
  3864. }
  3865. static int
  3866. distance_value(MinMaxLen* mm)
  3867. {
  3868. /* 1000 / (min-max-dist + 1) */
  3869. static const short int dist_vals[] = {
  3870. 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
  3871. 91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
  3872. 48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
  3873. 32, 31, 30, 29, 29, 28, 27, 26, 26, 25,
  3874. 24, 24, 23, 23, 22, 22, 21, 21, 20, 20,
  3875. 20, 19, 19, 19, 18, 18, 18, 17, 17, 17,
  3876. 16, 16, 16, 16, 15, 15, 15, 15, 14, 14,
  3877. 14, 14, 14, 14, 13, 13, 13, 13, 13, 13,
  3878. 12, 12, 12, 12, 12, 12, 11, 11, 11, 11,
  3879. 11, 11, 11, 11, 11, 10, 10, 10, 10, 10
  3880. };
  3881. OnigDistance d;
  3882. if (mm->max == ONIG_INFINITE_DISTANCE) return 0;
  3883. d = mm->max - mm->min;
  3884. if (d < numberof(dist_vals))
  3885. /* return dist_vals[d] * 16 / (mm->min + 12); */
  3886. return (int )dist_vals[d];
  3887. else
  3888. return 1;
  3889. }
  3890. static int
  3891. comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2)
  3892. {
  3893. if (v2 <= 0) return -1;
  3894. if (v1 <= 0) return 1;
  3895. v1 *= distance_value(d1);
  3896. v2 *= distance_value(d2);
  3897. if (v2 > v1) return 1;
  3898. if (v2 < v1) return -1;
  3899. if (d2->min < d1->min) return 1;
  3900. if (d2->min > d1->min) return -1;
  3901. return 0;
  3902. }
  3903. static int
  3904. is_equal_mml(MinMaxLen* a, MinMaxLen* b)
  3905. {
  3906. return (a->min == b->min && a->max == b->max) ? 1 : 0;
  3907. }
  3908. static void
  3909. set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max)
  3910. {
  3911. mml->min = min;
  3912. mml->max = max;
  3913. }
  3914. static void
  3915. clear_mml(MinMaxLen* mml)
  3916. {
  3917. mml->min = mml->max = 0;
  3918. }
  3919. static void
  3920. copy_mml(MinMaxLen* to, MinMaxLen* from)
  3921. {
  3922. to->min = from->min;
  3923. to->max = from->max;
  3924. }
  3925. static void
  3926. add_mml(MinMaxLen* to, MinMaxLen* from)
  3927. {
  3928. to->min = distance_add(to->min, from->min);
  3929. to->max = distance_add(to->max, from->max);
  3930. }
  3931. #if 0
  3932. static void
  3933. add_len_mml(MinMaxLen* to, OnigDistance len)
  3934. {
  3935. to->min = distance_add(to->min, len);
  3936. to->max = distance_add(to->max, len);
  3937. }
  3938. #endif
  3939. static void
  3940. alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
  3941. {
  3942. if (to->min > from->min) to->min = from->min;
  3943. if (to->max < from->max) to->max = from->max;
  3944. }
  3945. static void
  3946. copy_opt_env(OptEnv* to, OptEnv* from)
  3947. {
  3948. *to = *from;
  3949. }
  3950. static void
  3951. clear_opt_anc_info(OptAncInfo* anc)
  3952. {
  3953. anc->left_anchor = 0;
  3954. anc->right_anchor = 0;
  3955. }
  3956. static void
  3957. copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from)
  3958. {
  3959. *to = *from;
  3960. }
  3961. static void
  3962. concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right,
  3963. OnigDistance left_len, OnigDistance right_len)
  3964. {
  3965. clear_opt_anc_info(to);
  3966. to->left_anchor = left->left_anchor;
  3967. if (left_len == 0) {
  3968. to->left_anchor |= right->left_anchor;
  3969. }
  3970. to->right_anchor = right->right_anchor;
  3971. if (right_len == 0) {
  3972. to->right_anchor |= left->right_anchor;
  3973. }
  3974. else {
  3975. to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT);
  3976. }
  3977. }
  3978. static int
  3979. is_left_anchor(int anc)
  3980. {
  3981. if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF ||
  3982. anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ ||
  3983. anc == ANCHOR_PREC_READ_NOT)
  3984. return 0;
  3985. return 1;
  3986. }
  3987. static int
  3988. is_set_opt_anc_info(OptAncInfo* to, int anc)
  3989. {
  3990. if ((to->left_anchor & anc) != 0) return 1;
  3991. return ((to->right_anchor & anc) != 0 ? 1 : 0);
  3992. }
  3993. static void
  3994. add_opt_anc_info(OptAncInfo* to, int anc)
  3995. {
  3996. if (is_left_anchor(anc))
  3997. to->left_anchor |= anc;
  3998. else
  3999. to->right_anchor |= anc;
  4000. }
  4001. static void
  4002. remove_opt_anc_info(OptAncInfo* to, int anc)
  4003. {
  4004. if (is_left_anchor(anc))
  4005. to->left_anchor &= ~anc;
  4006. else
  4007. to->right_anchor &= ~anc;
  4008. }
  4009. static void
  4010. alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add)
  4011. {
  4012. to->left_anchor &= add->left_anchor;
  4013. to->right_anchor &= add->right_anchor;
  4014. }
  4015. static int
  4016. is_full_opt_exact_info(OptExactInfo* ex)
  4017. {
  4018. return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0);
  4019. }
  4020. static void
  4021. clear_opt_exact_info(OptExactInfo* ex)
  4022. {
  4023. clear_mml(&ex->mmd);
  4024. clear_opt_anc_info(&ex->anc);
  4025. ex->reach_end = 0;
  4026. ex->ignore_case = -1; /* unset */
  4027. ex->len = 0;
  4028. ex->s[0] = '\0';
  4029. }
  4030. static void
  4031. copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from)
  4032. {
  4033. *to = *from;
  4034. }
  4035. static void
  4036. concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc)
  4037. {
  4038. int i, j, len;
  4039. UChar *p, *end;
  4040. OptAncInfo tanc;
  4041. if (to->ignore_case < 0)
  4042. to->ignore_case = add->ignore_case;
  4043. else if (to->ignore_case != add->ignore_case)
  4044. return ; /* avoid */
  4045. p = add->s;
  4046. end = p + add->len;
  4047. for (i = to->len; p < end; ) {
  4048. len = enclen(enc, p, end);
  4049. if (i + len > OPT_EXACT_MAXLEN) break;
  4050. for (j = 0; j < len && p < end; j++)
  4051. to->s[i++] = *p++;
  4052. }
  4053. to->len = i;
  4054. to->reach_end = (p == end ? add->reach_end : 0);
  4055. concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1);
  4056. if (! to->reach_end) tanc.right_anchor = 0;
  4057. copy_opt_anc_info(&to->anc, &tanc);
  4058. }
  4059. static void
  4060. concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end,
  4061. int raw ARG_UNUSED, OnigEncoding enc)
  4062. {
  4063. int i, j, len;
  4064. UChar *p;
  4065. for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) {
  4066. len = enclen(enc, p, end);
  4067. if (i + len > OPT_EXACT_MAXLEN) break;
  4068. for (j = 0; j < len && p < end; j++)
  4069. to->s[i++] = *p++;
  4070. }
  4071. to->len = i;
  4072. }
  4073. static void
  4074. alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
  4075. {
  4076. int i, j, len;
  4077. if (add->len == 0 || to->len == 0) {
  4078. clear_opt_exact_info(to);
  4079. return ;
  4080. }
  4081. if (! is_equal_mml(&to->mmd, &add->mmd)) {
  4082. clear_opt_exact_info(to);
  4083. return ;
  4084. }
  4085. for (i = 0; i < to->len && i < add->len; ) {
  4086. if (to->s[i] != add->s[i]) break;
  4087. len = enclen(env->enc, to->s + i, to->s + to->len);
  4088. for (j = 1; j < len; j++) {
  4089. if (to->s[i+j] != add->s[i+j]) break;
  4090. }
  4091. if (j < len) break;
  4092. i += len;
  4093. }
  4094. if (! add->reach_end || i < add->len || i < to->len) {
  4095. to->reach_end = 0;
  4096. }
  4097. to->len = i;
  4098. if (to->ignore_case < 0)
  4099. to->ignore_case = add->ignore_case;
  4100. else if (add->ignore_case >= 0)
  4101. to->ignore_case |= add->ignore_case;
  4102. alt_merge_opt_anc_info(&to->anc, &add->anc);
  4103. if (! to->reach_end) to->anc.right_anchor = 0;
  4104. }
  4105. static void
  4106. select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt)
  4107. {
  4108. int v1, v2;
  4109. v1 = now->len;
  4110. v2 = alt->len;
  4111. if (v2 == 0) {
  4112. return ;
  4113. }
  4114. else if (v1 == 0) {
  4115. copy_opt_exact_info(now, alt);
  4116. return ;
  4117. }
  4118. else if (v1 <= 2 && v2 <= 2) {
  4119. /* ByteValTable[x] is big value --> low price */
  4120. v2 = map_position_value(enc, now->s[0]);
  4121. v1 = map_position_value(enc, alt->s[0]);
  4122. if (now->len > 1) v1 += 5;
  4123. if (alt->len > 1) v2 += 5;
  4124. }
  4125. if (now->ignore_case <= 0) v1 *= 2;
  4126. if (alt->ignore_case <= 0) v2 *= 2;
  4127. if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
  4128. copy_opt_exact_info(now, alt);
  4129. }
  4130. static void
  4131. clear_opt_map_info(OptMapInfo* map)
  4132. {
  4133. static const OptMapInfo clean_info = {
  4134. {0, 0}, {0, 0}, 0,
  4135. {
  4136. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4137. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4138. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4139. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4140. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4141. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4142. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4143. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4144. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4145. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4146. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4147. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4148. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4149. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4150. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  4151. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  4152. }
  4153. };
  4154. xmemcpy(map, &clean_info, sizeof(OptMapInfo));
  4155. }
  4156. static void
  4157. copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
  4158. {
  4159. *to = *from;
  4160. }
  4161. static void
  4162. add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc)
  4163. {
  4164. if (map->map[c] == 0) {
  4165. map->map[c] = 1;
  4166. map->value += map_position_value(enc, c);
  4167. }
  4168. }
  4169. static int
  4170. add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
  4171. OnigEncoding enc, OnigCaseFoldType case_fold_flag)
  4172. {
  4173. OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
  4174. UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
  4175. int i, n;
  4176. add_char_opt_map_info(map, p[0], enc);
  4177. case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag);
  4178. n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items);
  4179. if (n < 0) return n;
  4180. for (i = 0; i < n; i++) {
  4181. ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf);
  4182. add_char_opt_map_info(map, buf[0], enc);
  4183. }
  4184. return 0;
  4185. }
  4186. static void
  4187. select_opt_map_info(OptMapInfo* now, OptMapInfo* alt)
  4188. {
  4189. const int z = 1<<15; /* 32768: something big value */
  4190. int v1, v2;
  4191. if (alt->value == 0) return ;
  4192. if (now->value == 0) {
  4193. copy_opt_map_info(now, alt);
  4194. return ;
  4195. }
  4196. v1 = z / now->value;
  4197. v2 = z / alt->value;
  4198. if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
  4199. copy_opt_map_info(now, alt);
  4200. }
  4201. static int
  4202. comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m)
  4203. {
  4204. #define COMP_EM_BASE 20
  4205. int ve, vm;
  4206. if (m->value <= 0) return -1;
  4207. ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2);
  4208. vm = COMP_EM_BASE * 5 * 2 / m->value;
  4209. return comp_distance_value(&e->mmd, &m->mmd, ve, vm);
  4210. }
  4211. static void
  4212. alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add)
  4213. {
  4214. int i, val;
  4215. /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */
  4216. if (to->value == 0) return ;
  4217. if (add->value == 0 || to->mmd.max < add->mmd.min) {
  4218. clear_opt_map_info(to);
  4219. return ;
  4220. }
  4221. alt_merge_mml(&to->mmd, &add->mmd);
  4222. val = 0;
  4223. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
  4224. if (add->map[i])
  4225. to->map[i] = 1;
  4226. if (to->map[i])
  4227. val += map_position_value(enc, i);
  4228. }
  4229. to->value = val;
  4230. alt_merge_opt_anc_info(&to->anc, &add->anc);
  4231. }
  4232. static void
  4233. set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd)
  4234. {
  4235. copy_mml(&(opt->exb.mmd), mmd);
  4236. copy_mml(&(opt->expr.mmd), mmd);
  4237. copy_mml(&(opt->map.mmd), mmd);
  4238. }
  4239. static void
  4240. clear_node_opt_info(NodeOptInfo* opt)
  4241. {
  4242. clear_mml(&opt->len);
  4243. clear_opt_anc_info(&opt->anc);
  4244. clear_opt_exact_info(&opt->exb);
  4245. clear_opt_exact_info(&opt->exm);
  4246. clear_opt_exact_info(&opt->expr);
  4247. clear_opt_map_info(&opt->map);
  4248. }
  4249. static void
  4250. copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from)
  4251. {
  4252. *to = *from;
  4253. }
  4254. static void
  4255. concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add)
  4256. {
  4257. int exb_reach, exm_reach;
  4258. OptAncInfo tanc;
  4259. concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max);
  4260. copy_opt_anc_info(&to->anc, &tanc);
  4261. if (add->exb.len > 0 && to->len.max == 0) {
  4262. concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc,
  4263. to->len.max, add->len.max);
  4264. copy_opt_anc_info(&add->exb.anc, &tanc);
  4265. }
  4266. if (add->map.value > 0 && to->len.max == 0) {
  4267. if (add->map.mmd.max == 0)
  4268. add->map.anc.left_anchor |= to->anc.left_anchor;
  4269. }
  4270. exb_reach = to->exb.reach_end;
  4271. exm_reach = to->exm.reach_end;
  4272. if (add->len.max != 0)
  4273. to->exb.reach_end = to->exm.reach_end = 0;
  4274. if (add->exb.len > 0) {
  4275. if (exb_reach) {
  4276. concat_opt_exact_info(&to->exb, &add->exb, enc);
  4277. clear_opt_exact_info(&add->exb);
  4278. }
  4279. else if (exm_reach) {
  4280. concat_opt_exact_info(&to->exm, &add->exb, enc);
  4281. clear_opt_exact_info(&add->exb);
  4282. }
  4283. }
  4284. select_opt_exact_info(enc, &to->exm, &add->exb);
  4285. select_opt_exact_info(enc, &to->exm, &add->exm);
  4286. if (to->expr.len > 0) {
  4287. if (add->len.max > 0) {
  4288. if (to->expr.len > (int )add->len.max)
  4289. to->expr.len = (int )add->len.max;
  4290. if (to->expr.mmd.max == 0)
  4291. select_opt_exact_info(enc, &to->exb, &to->expr);
  4292. else
  4293. select_opt_exact_info(enc, &to->exm, &to->expr);
  4294. }
  4295. }
  4296. else if (add->expr.len > 0) {
  4297. copy_opt_exact_info(&to->expr, &add->expr);
  4298. }
  4299. select_opt_map_info(&to->map, &add->map);
  4300. add_mml(&to->len, &add->len);
  4301. }
  4302. static void
  4303. alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env)
  4304. {
  4305. alt_merge_opt_anc_info (&to->anc, &add->anc);
  4306. alt_merge_opt_exact_info(&to->exb, &add->exb, env);
  4307. alt_merge_opt_exact_info(&to->exm, &add->exm, env);
  4308. alt_merge_opt_exact_info(&to->expr, &add->expr, env);
  4309. alt_merge_opt_map_info(env->enc, &to->map, &add->map);
  4310. alt_merge_mml(&to->len, &add->len);
  4311. }
  4312. #define MAX_NODE_OPT_INFO_REF_COUNT 5
  4313. static int
  4314. optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
  4315. {
  4316. int type;
  4317. int r = 0;
  4318. clear_node_opt_info(opt);
  4319. set_bound_node_opt_info(opt, &env->mmd);
  4320. type = NTYPE(node);
  4321. switch (type) {
  4322. case NT_LIST:
  4323. {
  4324. OptEnv nenv;
  4325. NodeOptInfo nopt;
  4326. Node* nd = node;
  4327. copy_opt_env(&nenv, env);
  4328. do {
  4329. r = optimize_node_left(NCAR(nd), &nopt, &nenv);
  4330. if (r == 0) {
  4331. add_mml(&nenv.mmd, &nopt.len);
  4332. concat_left_node_opt_info(env->enc, opt, &nopt);
  4333. }
  4334. } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd)));
  4335. }
  4336. break;
  4337. case NT_ALT:
  4338. {
  4339. NodeOptInfo nopt;
  4340. Node* nd = node;
  4341. do {
  4342. r = optimize_node_left(NCAR(nd), &nopt, env);
  4343. if (r == 0) {
  4344. if (nd == node) copy_node_opt_info(opt, &nopt);
  4345. else alt_merge_node_opt_info(opt, &nopt, env);
  4346. }
  4347. } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd)));
  4348. }
  4349. break;
  4350. case NT_STR:
  4351. {
  4352. StrNode* sn = NSTR(node);
  4353. OnigDistance slen = sn->end - sn->s;
  4354. int is_raw = NSTRING_IS_RAW(node);
  4355. if (! NSTRING_IS_AMBIG(node)) {
  4356. concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
  4357. is_raw, env->enc);
  4358. opt->exb.ignore_case = 0;
  4359. if (slen > 0) {
  4360. add_char_opt_map_info(&opt->map, *(sn->s), env->enc);
  4361. }
  4362. set_mml(&opt->len, slen, slen);
  4363. }
  4364. else {
  4365. OnigDistance max;
  4366. if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
  4367. int n = onigenc_strlen(env->enc, sn->s, sn->end);
  4368. max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
  4369. }
  4370. else {
  4371. concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
  4372. is_raw, env->enc);
  4373. opt->exb.ignore_case = 1;
  4374. if (slen > 0) {
  4375. r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
  4376. env->enc, env->case_fold_flag);
  4377. if (r != 0) break;
  4378. }
  4379. max = slen;
  4380. }
  4381. set_mml(&opt->len, slen, max);
  4382. }
  4383. if ((OnigDistance )opt->exb.len == slen)
  4384. opt->exb.reach_end = 1;
  4385. }
  4386. break;
  4387. case NT_CCLASS:
  4388. {
  4389. int i, z;
  4390. CClassNode* cc = NCCLASS(node);
  4391. /* no need to check ignore case. (set in setup_tree()) */
  4392. if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) {
  4393. OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
  4394. OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  4395. set_mml(&opt->len, min, max);
  4396. }
  4397. else {
  4398. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  4399. z = BITSET_AT(cc->bs, i);
  4400. if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) {
  4401. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4402. }
  4403. }
  4404. set_mml(&opt->len, 1, 1);
  4405. }
  4406. }
  4407. break;
  4408. case NT_CTYPE:
  4409. {
  4410. int i, min, max;
  4411. int maxcode;
  4412. max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  4413. if (max == 1) {
  4414. min = 1;
  4415. maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE;
  4416. switch (NCTYPE(node)->ctype) {
  4417. case ONIGENC_CTYPE_WORD:
  4418. if (NCTYPE(node)->not != 0) {
  4419. for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
  4420. if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) {
  4421. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4422. }
  4423. }
  4424. }
  4425. else {
  4426. for (i = 0; i < maxcode; i++) {
  4427. if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
  4428. add_char_opt_map_info(&opt->map, (UChar )i, env->enc);
  4429. }
  4430. }
  4431. }
  4432. break;
  4433. }
  4434. }
  4435. else {
  4436. min = ONIGENC_MBC_MINLEN(env->enc);
  4437. }
  4438. set_mml(&opt->len, min, max);
  4439. }
  4440. break;
  4441. case NT_CANY:
  4442. {
  4443. OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
  4444. OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
  4445. set_mml(&opt->len, min, max);
  4446. }
  4447. break;
  4448. case NT_ANCHOR:
  4449. switch (NANCHOR(node)->type) {
  4450. case ANCHOR_BEGIN_BUF:
  4451. case ANCHOR_BEGIN_POSITION:
  4452. case ANCHOR_BEGIN_LINE:
  4453. case ANCHOR_END_BUF:
  4454. case ANCHOR_SEMI_END_BUF:
  4455. case ANCHOR_END_LINE:
  4456. case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */
  4457. case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */
  4458. add_opt_anc_info(&opt->anc, NANCHOR(node)->type);
  4459. break;
  4460. case ANCHOR_PREC_READ:
  4461. {
  4462. NodeOptInfo nopt;
  4463. r = optimize_node_left(NANCHOR(node)->target, &nopt, env);
  4464. if (r == 0) {
  4465. if (nopt.exb.len > 0)
  4466. copy_opt_exact_info(&opt->expr, &nopt.exb);
  4467. else if (nopt.exm.len > 0)
  4468. copy_opt_exact_info(&opt->expr, &nopt.exm);
  4469. opt->expr.reach_end = 0;
  4470. if (nopt.map.value > 0)
  4471. copy_opt_map_info(&opt->map, &nopt.map);
  4472. }
  4473. }
  4474. break;
  4475. case ANCHOR_LOOK_BEHIND_NOT:
  4476. break;
  4477. }
  4478. break;
  4479. case NT_BREF:
  4480. {
  4481. int i;
  4482. int* backs;
  4483. OnigDistance min, max, tmin, tmax;
  4484. Node** nodes = SCANENV_MEM_NODES(env->scan_env);
  4485. BRefNode* br = NBREF(node);
  4486. if (br->state & NST_RECURSION) {
  4487. set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
  4488. break;
  4489. }
  4490. backs = BACKREFS_P(br);
  4491. r = get_min_match_length(nodes[backs[0]], &min, env->scan_env);
  4492. if (r != 0) break;
  4493. r = get_max_match_length(nodes[backs[0]], &max, env->scan_env);
  4494. if (r != 0) break;
  4495. for (i = 1; i < br->back_num; i++) {
  4496. r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env);
  4497. if (r != 0) break;
  4498. r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env);
  4499. if (r != 0) break;
  4500. if (min > tmin) min = tmin;
  4501. if (max < tmax) max = tmax;
  4502. }
  4503. if (r == 0) set_mml(&opt->len, min, max);
  4504. }
  4505. break;
  4506. #ifdef USE_SUBEXP_CALL
  4507. case NT_CALL:
  4508. if (IS_CALL_RECURSION(NCALL(node)))
  4509. set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
  4510. else {
  4511. OnigOptionType save = env->options;
  4512. env->options = NENCLOSE(NCALL(node)->target)->option;
  4513. r = optimize_node_left(NCALL(node)->target, opt, env);
  4514. env->options = save;
  4515. }
  4516. break;
  4517. #endif
  4518. case NT_QTFR:
  4519. {
  4520. int i;
  4521. OnigDistance min, max;
  4522. NodeOptInfo nopt;
  4523. QtfrNode* qn = NQTFR(node);
  4524. r = optimize_node_left(qn->target, &nopt, env);
  4525. if (r) break;
  4526. if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) {
  4527. if (env->mmd.max == 0 &&
  4528. NTYPE(qn->target) == NT_CANY && qn->greedy) {
  4529. if (IS_MULTILINE(env->options))
  4530. /* implicit anchor: /.*a/ ==> /\A.*a/ */
  4531. add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML);
  4532. else
  4533. add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR);
  4534. }
  4535. }
  4536. else {
  4537. if (qn->lower > 0) {
  4538. copy_node_opt_info(opt, &nopt);
  4539. if (nopt.exb.len > 0) {
  4540. if (nopt.exb.reach_end) {
  4541. for (i = 2; i <= qn->lower &&
  4542. ! is_full_opt_exact_info(&opt->exb); i++) {
  4543. concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc);
  4544. }
  4545. if (i < qn->lower) {
  4546. opt->exb.reach_end = 0;
  4547. }
  4548. }
  4549. }
  4550. if (qn->lower != qn->upper) {
  4551. opt->exb.reach_end = 0;
  4552. opt->exm.reach_end = 0;
  4553. }
  4554. if (qn->lower > 1)
  4555. opt->exm.reach_end = 0;
  4556. }
  4557. }
  4558. min = distance_multiply(nopt.len.min, qn->lower);
  4559. if (IS_REPEAT_INFINITE(qn->upper))
  4560. max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0);
  4561. else
  4562. max = distance_multiply(nopt.len.max, qn->upper);
  4563. set_mml(&opt->len, min, max);
  4564. }
  4565. break;
  4566. case NT_ENCLOSE:
  4567. {
  4568. EncloseNode* en = NENCLOSE(node);
  4569. switch (en->type) {
  4570. case ENCLOSE_OPTION:
  4571. {
  4572. OnigOptionType save = env->options;
  4573. env->options = en->option;
  4574. r = optimize_node_left(en->target, opt, env);
  4575. env->options = save;
  4576. }
  4577. break;
  4578. case ENCLOSE_MEMORY:
  4579. #ifdef USE_SUBEXP_CALL
  4580. en->opt_count++;
  4581. if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) {
  4582. OnigDistance min, max;
  4583. min = 0;
  4584. max = ONIG_INFINITE_DISTANCE;
  4585. if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len;
  4586. if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len;
  4587. set_mml(&opt->len, min, max);
  4588. }
  4589. else
  4590. #endif
  4591. {
  4592. r = optimize_node_left(en->target, opt, env);
  4593. if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) {
  4594. if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum))
  4595. remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK);
  4596. }
  4597. }
  4598. break;
  4599. case ENCLOSE_STOP_BACKTRACK:
  4600. case ENCLOSE_CONDITION:
  4601. r = optimize_node_left(en->target, opt, env);
  4602. break;
  4603. case ENCLOSE_ABSENT:
  4604. set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE);
  4605. break;
  4606. }
  4607. }
  4608. break;
  4609. default:
  4610. #ifdef ONIG_DEBUG
  4611. fprintf(stderr, "optimize_node_left: undefined node type %d\n",
  4612. NTYPE(node));
  4613. #endif
  4614. r = ONIGERR_TYPE_BUG;
  4615. break;
  4616. }
  4617. return r;
  4618. }
  4619. static int
  4620. set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
  4621. {
  4622. int r;
  4623. int allow_reverse;
  4624. if (e->len == 0) return 0;
  4625. reg->exact = (UChar* )xmalloc(e->len);
  4626. CHECK_NULL_RETURN_MEMERR(reg->exact);
  4627. xmemcpy(reg->exact, e->s, e->len);
  4628. reg->exact_end = reg->exact + e->len;
  4629. allow_reverse =
  4630. ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
  4631. if (e->ignore_case > 0) {
  4632. if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
  4633. r = set_bm_skip(reg->exact, reg->exact_end, reg,
  4634. reg->map, &(reg->int_map), 1);
  4635. if (r == 0) {
  4636. reg->optimize = (allow_reverse != 0
  4637. ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
  4638. }
  4639. else {
  4640. reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
  4641. }
  4642. }
  4643. else {
  4644. reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
  4645. }
  4646. }
  4647. else {
  4648. if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
  4649. r = set_bm_skip(reg->exact, reg->exact_end, reg,
  4650. reg->map, &(reg->int_map), 0);
  4651. if (r == 0) {
  4652. reg->optimize = (allow_reverse != 0
  4653. ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
  4654. }
  4655. else {
  4656. reg->optimize = ONIG_OPTIMIZE_EXACT;
  4657. }
  4658. }
  4659. else {
  4660. reg->optimize = ONIG_OPTIMIZE_EXACT;
  4661. }
  4662. }
  4663. reg->dmin = e->mmd.min;
  4664. reg->dmax = e->mmd.max;
  4665. if (reg->dmin != ONIG_INFINITE_DISTANCE) {
  4666. reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact));
  4667. }
  4668. return 0;
  4669. }
  4670. static void
  4671. set_optimize_map_info(regex_t* reg, OptMapInfo* m)
  4672. {
  4673. int i;
  4674. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
  4675. reg->map[i] = m->map[i];
  4676. reg->optimize = ONIG_OPTIMIZE_MAP;
  4677. reg->dmin = m->mmd.min;
  4678. reg->dmax = m->mmd.max;
  4679. if (reg->dmin != ONIG_INFINITE_DISTANCE) {
  4680. reg->threshold_len = (int )(reg->dmin + 1);
  4681. }
  4682. }
  4683. static void
  4684. set_sub_anchor(regex_t* reg, OptAncInfo* anc)
  4685. {
  4686. reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE;
  4687. reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE;
  4688. }
  4689. #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
  4690. static void print_optimize_info(FILE* f, regex_t* reg);
  4691. #endif
  4692. static int
  4693. set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
  4694. {
  4695. int r;
  4696. NodeOptInfo opt;
  4697. OptEnv env;
  4698. env.enc = reg->enc;
  4699. env.options = reg->options;
  4700. env.case_fold_flag = reg->case_fold_flag;
  4701. env.scan_env = scan_env;
  4702. clear_mml(&env.mmd);
  4703. r = optimize_node_left(node, &opt, &env);
  4704. if (r) return r;
  4705. reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF |
  4706. ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML |
  4707. ANCHOR_LOOK_BEHIND);
  4708. if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0)
  4709. reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML;
  4710. reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF |
  4711. ANCHOR_PREC_READ_NOT);
  4712. if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) {
  4713. reg->anchor_dmin = opt.len.min;
  4714. reg->anchor_dmax = opt.len.max;
  4715. }
  4716. if (opt.exb.len > 0 || opt.exm.len > 0) {
  4717. select_opt_exact_info(reg->enc, &opt.exb, &opt.exm);
  4718. if (opt.map.value > 0 &&
  4719. comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) {
  4720. goto set_map;
  4721. }
  4722. else {
  4723. r = set_optimize_exact_info(reg, &opt.exb);
  4724. set_sub_anchor(reg, &opt.exb.anc);
  4725. }
  4726. }
  4727. else if (opt.map.value > 0) {
  4728. set_map:
  4729. set_optimize_map_info(reg, &opt.map);
  4730. set_sub_anchor(reg, &opt.map.anc);
  4731. }
  4732. else {
  4733. reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE;
  4734. if (opt.len.max == 0)
  4735. reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE;
  4736. }
  4737. #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
  4738. print_optimize_info(stderr, reg);
  4739. #endif
  4740. return r;
  4741. }
  4742. static void
  4743. clear_optimize_info(regex_t* reg)
  4744. {
  4745. reg->optimize = ONIG_OPTIMIZE_NONE;
  4746. reg->anchor = 0;
  4747. reg->anchor_dmin = 0;
  4748. reg->anchor_dmax = 0;
  4749. reg->sub_anchor = 0;
  4750. reg->exact_end = (UChar* )NULL;
  4751. reg->threshold_len = 0;
  4752. if (IS_NOT_NULL(reg->exact)) {
  4753. xfree(reg->exact);
  4754. reg->exact = (UChar* )NULL;
  4755. }
  4756. }
  4757. #ifdef ONIG_DEBUG
  4758. static void print_enc_string(FILE* fp, OnigEncoding enc,
  4759. const UChar *s, const UChar *end)
  4760. {
  4761. fprintf(fp, "\nPATTERN: /");
  4762. if (ONIGENC_MBC_MINLEN(enc) > 1) {
  4763. const UChar *p;
  4764. OnigCodePoint code;
  4765. p = s;
  4766. while (p < end) {
  4767. code = ONIGENC_MBC_TO_CODE(enc, p, end);
  4768. if (code >= 0x80) {
  4769. fprintf(fp, " 0x%04x ", (int )code);
  4770. }
  4771. else {
  4772. fputc((int )code, fp);
  4773. }
  4774. p += enclen(enc, p, end);
  4775. }
  4776. }
  4777. else {
  4778. while (s < end) {
  4779. fputc((int )*s, fp);
  4780. s++;
  4781. }
  4782. }
  4783. fprintf(fp, "/ (%s)\n", enc->name);
  4784. }
  4785. #endif /* ONIG_DEBUG */
  4786. #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH)
  4787. static void
  4788. print_distance_range(FILE* f, OnigDistance a, OnigDistance b)
  4789. {
  4790. if (a == ONIG_INFINITE_DISTANCE)
  4791. fputs("inf", f);
  4792. else
  4793. fprintf(f, "(%"PRIuPTR")", a);
  4794. fputs("-", f);
  4795. if (b == ONIG_INFINITE_DISTANCE)
  4796. fputs("inf", f);
  4797. else
  4798. fprintf(f, "(%"PRIuPTR")", b);
  4799. }
  4800. static void
  4801. print_anchor(FILE* f, int anchor)
  4802. {
  4803. int q = 0;
  4804. fprintf(f, "[");
  4805. if (anchor & ANCHOR_BEGIN_BUF) {
  4806. fprintf(f, "begin-buf");
  4807. q = 1;
  4808. }
  4809. if (anchor & ANCHOR_BEGIN_LINE) {
  4810. if (q) fprintf(f, ", ");
  4811. q = 1;
  4812. fprintf(f, "begin-line");
  4813. }
  4814. if (anchor & ANCHOR_BEGIN_POSITION) {
  4815. if (q) fprintf(f, ", ");
  4816. q = 1;
  4817. fprintf(f, "begin-pos");
  4818. }
  4819. if (anchor & ANCHOR_END_BUF) {
  4820. if (q) fprintf(f, ", ");
  4821. q = 1;
  4822. fprintf(f, "end-buf");
  4823. }
  4824. if (anchor & ANCHOR_SEMI_END_BUF) {
  4825. if (q) fprintf(f, ", ");
  4826. q = 1;
  4827. fprintf(f, "semi-end-buf");
  4828. }
  4829. if (anchor & ANCHOR_END_LINE) {
  4830. if (q) fprintf(f, ", ");
  4831. q = 1;
  4832. fprintf(f, "end-line");
  4833. }
  4834. if (anchor & ANCHOR_ANYCHAR_STAR) {
  4835. if (q) fprintf(f, ", ");
  4836. q = 1;
  4837. fprintf(f, "anychar-star");
  4838. }
  4839. if (anchor & ANCHOR_ANYCHAR_STAR_ML) {
  4840. if (q) fprintf(f, ", ");
  4841. fprintf(f, "anychar-star-ml");
  4842. }
  4843. fprintf(f, "]");
  4844. }
  4845. static void
  4846. print_optimize_info(FILE* f, regex_t* reg)
  4847. {
  4848. static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV",
  4849. "EXACT_IC", "MAP",
  4850. "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" };
  4851. fprintf(f, "optimize: %s\n", on[reg->optimize]);
  4852. fprintf(f, " anchor: "); print_anchor(f, reg->anchor);
  4853. if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0)
  4854. print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax);
  4855. fprintf(f, "\n");
  4856. if (reg->optimize) {
  4857. fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor);
  4858. fprintf(f, "\n");
  4859. }
  4860. fprintf(f, "\n");
  4861. if (reg->exact) {
  4862. UChar *p;
  4863. fprintf(f, "exact: [");
  4864. for (p = reg->exact; p < reg->exact_end; p++) {
  4865. fputc(*p, f);
  4866. }
  4867. fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact));
  4868. }
  4869. else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
  4870. int c, i, n = 0;
  4871. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
  4872. if (reg->map[i]) n++;
  4873. fprintf(f, "map: n=%d\n", n);
  4874. if (n > 0) {
  4875. c = 0;
  4876. fputc('[', f);
  4877. for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
  4878. if (reg->map[i] != 0) {
  4879. if (c > 0) fputs(", ", f);
  4880. c++;
  4881. if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
  4882. ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
  4883. fputc(i, f);
  4884. else
  4885. fprintf(f, "%d", i);
  4886. }
  4887. }
  4888. fprintf(f, "]\n");
  4889. }
  4890. }
  4891. }
  4892. #endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */
  4893. extern void
  4894. onig_free_body(regex_t* reg)
  4895. {
  4896. if (IS_NOT_NULL(reg)) {
  4897. if (IS_NOT_NULL(reg->p)) xfree(reg->p);
  4898. if (IS_NOT_NULL(reg->exact)) xfree(reg->exact);
  4899. if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map);
  4900. if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward);
  4901. if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range);
  4902. if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain);
  4903. #ifdef USE_NAMED_GROUP
  4904. onig_names_free(reg);
  4905. #endif
  4906. }
  4907. }
  4908. extern void
  4909. onig_free(regex_t* reg)
  4910. {
  4911. if (IS_NOT_NULL(reg)) {
  4912. onig_free_body(reg);
  4913. xfree(reg);
  4914. }
  4915. }
  4916. #ifdef RUBY
  4917. size_t
  4918. onig_memsize(const regex_t *reg)
  4919. {
  4920. size_t size = sizeof(regex_t);
  4921. if (IS_NULL(reg)) return 0;
  4922. if (IS_NOT_NULL(reg->p)) size += reg->alloc;
  4923. if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
  4924. if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  4925. if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
  4926. if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
  4927. if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
  4928. return size;
  4929. }
  4930. size_t
  4931. onig_region_memsize(const OnigRegion *regs)
  4932. {
  4933. size_t size = sizeof(*regs);
  4934. if (IS_NULL(regs)) return 0;
  4935. size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end));
  4936. return size;
  4937. }
  4938. #endif
  4939. #define REGEX_TRANSFER(to,from) do {\
  4940. onig_free_body(to);\
  4941. xmemcpy(to, from, sizeof(regex_t));\
  4942. xfree(from);\
  4943. } while (0)
  4944. #if 0
  4945. extern void
  4946. onig_transfer(regex_t* to, regex_t* from)
  4947. {
  4948. REGEX_TRANSFER(to, from);
  4949. }
  4950. #endif
  4951. #ifdef ONIG_DEBUG_COMPILE
  4952. static void print_compiled_byte_code_list(FILE* f, regex_t* reg);
  4953. #endif
  4954. #ifdef ONIG_DEBUG_PARSE_TREE
  4955. static void print_tree(FILE* f, Node* node);
  4956. #endif
  4957. #ifdef RUBY
  4958. extern int
  4959. onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
  4960. OnigErrorInfo* einfo)
  4961. {
  4962. return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0);
  4963. }
  4964. #endif
  4965. #ifdef RUBY
  4966. extern int
  4967. onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
  4968. OnigErrorInfo* einfo, const char *sourcefile, int sourceline)
  4969. #else
  4970. extern int
  4971. onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
  4972. OnigErrorInfo* einfo)
  4973. #endif
  4974. {
  4975. #define COMPILE_INIT_SIZE 20
  4976. int r;
  4977. OnigDistance init_size;
  4978. Node* root;
  4979. ScanEnv scan_env = {0};
  4980. #ifdef USE_SUBEXP_CALL
  4981. UnsetAddrList uslist;
  4982. #endif
  4983. if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
  4984. #ifdef RUBY
  4985. scan_env.sourcefile = sourcefile;
  4986. scan_env.sourceline = sourceline;
  4987. #endif
  4988. #ifdef ONIG_DEBUG
  4989. print_enc_string(stderr, reg->enc, pattern, pattern_end);
  4990. #endif
  4991. if (reg->alloc == 0) {
  4992. init_size = (pattern_end - pattern) * 2;
  4993. if (init_size <= 0) init_size = COMPILE_INIT_SIZE;
  4994. r = BBUF_INIT(reg, init_size);
  4995. if (r != 0) goto end;
  4996. }
  4997. else
  4998. reg->used = 0;
  4999. reg->num_mem = 0;
  5000. reg->num_repeat = 0;
  5001. reg->num_null_check = 0;
  5002. reg->repeat_range_alloc = 0;
  5003. reg->repeat_range = (OnigRepeatRange* )NULL;
  5004. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  5005. reg->num_comb_exp_check = 0;
  5006. #endif
  5007. r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env);
  5008. if (r != 0) goto err;
  5009. #ifdef ONIG_DEBUG_PARSE_TREE
  5010. # if 0
  5011. fprintf(stderr, "ORIGINAL PARSE TREE:\n");
  5012. print_tree(stderr, root);
  5013. # endif
  5014. #endif
  5015. #ifdef USE_NAMED_GROUP
  5016. /* mixed use named group and no-named group */
  5017. if (scan_env.num_named > 0 &&
  5018. IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) &&
  5019. !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) {
  5020. if (scan_env.num_named != scan_env.num_mem)
  5021. r = disable_noname_group_capture(&root, reg, &scan_env);
  5022. else
  5023. r = numbered_ref_check(root);
  5024. if (r != 0) goto err;
  5025. }
  5026. #endif
  5027. #ifdef USE_SUBEXP_CALL
  5028. if (scan_env.num_call > 0) {
  5029. r = unset_addr_list_init(&uslist, scan_env.num_call);
  5030. if (r != 0) goto err;
  5031. scan_env.unset_addr_list = &uslist;
  5032. r = setup_subexp_call(root, &scan_env);
  5033. if (r != 0) goto err_unset;
  5034. r = subexp_recursive_check_trav(root, &scan_env);
  5035. if (r < 0) goto err_unset;
  5036. r = subexp_inf_recursive_check_trav(root, &scan_env);
  5037. if (r != 0) goto err_unset;
  5038. reg->num_call = scan_env.num_call;
  5039. }
  5040. else
  5041. reg->num_call = 0;
  5042. #endif
  5043. r = setup_tree(root, reg, 0, &scan_env);
  5044. if (r != 0) goto err_unset;
  5045. #ifdef ONIG_DEBUG_PARSE_TREE
  5046. print_tree(stderr, root);
  5047. #endif
  5048. reg->capture_history = scan_env.capture_history;
  5049. reg->bt_mem_start = scan_env.bt_mem_start;
  5050. reg->bt_mem_start |= reg->capture_history;
  5051. if (IS_FIND_CONDITION(reg->options))
  5052. BIT_STATUS_ON_ALL(reg->bt_mem_end);
  5053. else {
  5054. reg->bt_mem_end = scan_env.bt_mem_end;
  5055. reg->bt_mem_end |= reg->capture_history;
  5056. }
  5057. #ifdef USE_COMBINATION_EXPLOSION_CHECK
  5058. if (scan_env.backrefed_mem == 0
  5059. # ifdef USE_SUBEXP_CALL
  5060. || scan_env.num_call == 0
  5061. # endif
  5062. ) {
  5063. setup_comb_exp_check(root, 0, &scan_env);
  5064. # ifdef USE_SUBEXP_CALL
  5065. if (scan_env.has_recursion != 0) {
  5066. scan_env.num_comb_exp_check = 0;
  5067. }
  5068. else
  5069. # endif
  5070. if (scan_env.comb_exp_max_regnum > 0) {
  5071. int i;
  5072. for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) {
  5073. if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) {
  5074. scan_env.num_comb_exp_check = 0;
  5075. break;
  5076. }
  5077. }
  5078. }
  5079. }
  5080. reg->num_comb_exp_check = scan_env.num_comb_exp_check;
  5081. #endif
  5082. clear_optimize_info(reg);
  5083. #ifndef ONIG_DONT_OPTIMIZE
  5084. r = set_optimize_info_from_tree(root, reg, &scan_env);
  5085. if (r != 0) goto err_unset;
  5086. #endif
  5087. if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) {
  5088. xfree(scan_env.mem_nodes_dynamic);
  5089. scan_env.mem_nodes_dynamic = (Node** )NULL;
  5090. }
  5091. r = compile_tree(root, reg);
  5092. if (r == 0) {
  5093. r = add_opcode(reg, OP_END);
  5094. #ifdef USE_SUBEXP_CALL
  5095. if (scan_env.num_call > 0) {
  5096. r = unset_addr_list_fix(&uslist, reg);
  5097. unset_addr_list_end(&uslist);
  5098. if (r) goto err;
  5099. }
  5100. #endif
  5101. if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0))
  5102. reg->stack_pop_level = STACK_POP_LEVEL_ALL;
  5103. else {
  5104. if (reg->bt_mem_start != 0)
  5105. reg->stack_pop_level = STACK_POP_LEVEL_MEM_START;
  5106. else
  5107. reg->stack_pop_level = STACK_POP_LEVEL_FREE;
  5108. }
  5109. }
  5110. #ifdef USE_SUBEXP_CALL
  5111. else if (scan_env.num_call > 0) {
  5112. unset_addr_list_end(&uslist);
  5113. }
  5114. #endif
  5115. onig_node_free(root);
  5116. #ifdef ONIG_DEBUG_COMPILE
  5117. # ifdef USE_NAMED_GROUP
  5118. onig_print_names(stderr, reg);
  5119. # endif
  5120. print_compiled_byte_code_list(stderr, reg);
  5121. #endif
  5122. end:
  5123. onig_reg_resize(reg);
  5124. return r;
  5125. err_unset:
  5126. #ifdef USE_SUBEXP_CALL
  5127. if (scan_env.num_call > 0) {
  5128. unset_addr_list_end(&uslist);
  5129. }
  5130. #endif
  5131. err:
  5132. if (IS_NOT_NULL(scan_env.error)) {
  5133. if (IS_NOT_NULL(einfo)) {
  5134. einfo->enc = scan_env.enc;
  5135. einfo->par = scan_env.error;
  5136. einfo->par_end = scan_env.error_end;
  5137. }
  5138. }
  5139. onig_node_free(root);
  5140. if (IS_NOT_NULL(scan_env.mem_nodes_dynamic))
  5141. xfree(scan_env.mem_nodes_dynamic);
  5142. return r;
  5143. }
  5144. static int onig_inited = 0;
  5145. extern int
  5146. onig_reg_init(regex_t* reg, OnigOptionType option,
  5147. OnigCaseFoldType case_fold_flag,
  5148. OnigEncoding enc, const OnigSyntaxType* syntax)
  5149. {
  5150. if (! onig_inited)
  5151. onig_init();
  5152. if (IS_NULL(reg))
  5153. return ONIGERR_INVALID_ARGUMENT;
  5154. if (ONIGENC_IS_UNDEF(enc))
  5155. return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
  5156. if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP))
  5157. == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) {
  5158. return ONIGERR_INVALID_COMBINATION_OF_OPTIONS;
  5159. }
  5160. if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) {
  5161. option |= syntax->options;
  5162. option &= ~ONIG_OPTION_SINGLELINE;
  5163. }
  5164. else
  5165. option |= syntax->options;
  5166. (reg)->enc = enc;
  5167. (reg)->options = option;
  5168. (reg)->syntax = syntax;
  5169. (reg)->optimize = 0;
  5170. (reg)->exact = (UChar* )NULL;
  5171. (reg)->int_map = (int* )NULL;
  5172. (reg)->int_map_backward = (int* )NULL;
  5173. (reg)->chain = (regex_t* )NULL;
  5174. (reg)->p = (UChar* )NULL;
  5175. (reg)->alloc = 0;
  5176. (reg)->used = 0;
  5177. (reg)->name_table = (void* )NULL;
  5178. (reg)->case_fold_flag = case_fold_flag;
  5179. return 0;
  5180. }
  5181. extern int
  5182. onig_new_without_alloc(regex_t* reg, const UChar* pattern,
  5183. const UChar* pattern_end, OnigOptionType option, OnigEncoding enc,
  5184. const OnigSyntaxType* syntax, OnigErrorInfo* einfo)
  5185. {
  5186. int r;
  5187. r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
  5188. if (r) return r;
  5189. r = onig_compile(reg, pattern, pattern_end, einfo);
  5190. return r;
  5191. }
  5192. extern int
  5193. onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end,
  5194. OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax,
  5195. OnigErrorInfo* einfo)
  5196. {
  5197. int r;
  5198. *reg = (regex_t* )xmalloc(sizeof(regex_t));
  5199. if (IS_NULL(*reg)) return ONIGERR_MEMORY;
  5200. r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax);
  5201. if (r) goto err;
  5202. r = onig_compile(*reg, pattern, pattern_end, einfo);
  5203. if (r) {
  5204. err:
  5205. onig_free(*reg);
  5206. *reg = NULL;
  5207. }
  5208. return r;
  5209. }
  5210. extern int
  5211. onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED)
  5212. {
  5213. return onig_init();
  5214. }
  5215. extern int
  5216. onig_init(void)
  5217. {
  5218. if (onig_inited != 0)
  5219. return 0;
  5220. onig_inited = 1;
  5221. #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
  5222. _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
  5223. #endif
  5224. onigenc_init();
  5225. /* onigenc_set_default_caseconv_table((UChar* )0); */
  5226. #ifdef ONIG_DEBUG_STATISTICS
  5227. onig_statistics_init();
  5228. #endif
  5229. return 0;
  5230. }
  5231. static OnigEndCallListItemType* EndCallTop;
  5232. extern void onig_add_end_call(void (*func)(void))
  5233. {
  5234. OnigEndCallListItemType* item;
  5235. item = (OnigEndCallListItemType* )xmalloc(sizeof(*item));
  5236. if (item == 0) return ;
  5237. item->next = EndCallTop;
  5238. item->func = func;
  5239. EndCallTop = item;
  5240. }
  5241. static void
  5242. exec_end_call_list(void)
  5243. {
  5244. OnigEndCallListItemType* prev;
  5245. void (*func)(void);
  5246. while (EndCallTop != 0) {
  5247. func = EndCallTop->func;
  5248. (*func)();
  5249. prev = EndCallTop;
  5250. EndCallTop = EndCallTop->next;
  5251. xfree(prev);
  5252. }
  5253. }
  5254. extern int
  5255. onig_end(void)
  5256. {
  5257. exec_end_call_list();
  5258. #ifdef ONIG_DEBUG_STATISTICS
  5259. onig_print_statistics(stderr);
  5260. #endif
  5261. #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER)
  5262. _CrtDumpMemoryLeaks();
  5263. #endif
  5264. onig_inited = 0;
  5265. return 0;
  5266. }
  5267. extern int
  5268. onig_is_in_code_range(const UChar* p, OnigCodePoint code)
  5269. {
  5270. OnigCodePoint n, *data;
  5271. OnigCodePoint low, high, x;
  5272. GET_CODE_POINT(n, p);
  5273. data = (OnigCodePoint* )p;
  5274. data++;
  5275. for (low = 0, high = n; low < high; ) {
  5276. x = (low + high) >> 1;
  5277. if (code > data[x * 2 + 1])
  5278. low = x + 1;
  5279. else
  5280. high = x;
  5281. }
  5282. return ((low < n && code >= data[low * 2]) ? 1 : 0);
  5283. }
  5284. extern int
  5285. onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc)
  5286. {
  5287. int found;
  5288. if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) {
  5289. if (IS_NULL(cc->mbuf)) {
  5290. found = 0;
  5291. }
  5292. else {
  5293. found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0);
  5294. }
  5295. }
  5296. else {
  5297. found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1);
  5298. }
  5299. if (IS_NCCLASS_NOT(cc))
  5300. return !found;
  5301. else
  5302. return found;
  5303. }
  5304. extern int
  5305. onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
  5306. {
  5307. int len;
  5308. if (ONIGENC_MBC_MINLEN(enc) > 1) {
  5309. len = 2;
  5310. }
  5311. else {
  5312. len = ONIGENC_CODE_TO_MBCLEN(enc, code);
  5313. }
  5314. return onig_is_code_in_cc_len(len, code, cc);
  5315. }
  5316. #ifdef ONIG_DEBUG
/*
 * Operand-layout tags stored in OnigOpInfo[].arg_type.  They tell
 * onig_print_compiled_byte_code() how to decode the bytes that follow an
 * opcode.
 */
# define ARG_SPECIAL -1      /* decoded by a per-opcode switch; also the fallback of op2arg_type() */
# define ARG_NON 0           /* no operand is read */
# define ARG_RELADDR 1       /* one RelAddrType, printed with an explicit sign */
# define ARG_ABSADDR 2       /* one address read with GET_ABSADDR_INC */
# define ARG_LENGTH 3        /* one LengthType */
# define ARG_MEMNUM 4        /* one MemNumType */
# define ARG_OPTION 5        /* one OnigOptionType */
# define ARG_STATE_CHECK 6   /* one StateCheckNumType */
/*
 * Debug description of every bytecode opcode: { opcode, printable name,
 * operand-layout tag (ARG_*) }.  op2name() and op2arg_type() scan this
 * table linearly and stop at the final entry, whose opcode is -1.
 */
OnigOpInfoType OnigOpInfo[] = {
  { OP_FINISH, "finish", ARG_NON },
  { OP_END, "end", ARG_NON },
  /* literal string matches */
  { OP_EXACT1, "exact1", ARG_SPECIAL },
  { OP_EXACT2, "exact2", ARG_SPECIAL },
  { OP_EXACT3, "exact3", ARG_SPECIAL },
  { OP_EXACT4, "exact4", ARG_SPECIAL },
  { OP_EXACT5, "exact5", ARG_SPECIAL },
  { OP_EXACTN, "exactn", ARG_SPECIAL },
  { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL },
  { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL },
  { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL },
  { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL },
  { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL },
  { OP_EXACTMBN, "exactmbn", ARG_SPECIAL },
  { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL },
  { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL },
  /* character classes */
  { OP_CCLASS, "cclass", ARG_SPECIAL },
  { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL },
  { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL },
  { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL },
  { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL },
  { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL },
  /* any-character */
  { OP_ANYCHAR, "anychar", ARG_NON },
  { OP_ANYCHAR_ML, "anychar-ml", ARG_NON },
  { OP_ANYCHAR_STAR, "anychar*", ARG_NON },
  { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON },
  { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL },
  { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL },
  /* word tests and word boundaries */
  { OP_WORD, "word", ARG_NON },
  { OP_NOT_WORD, "not-word", ARG_NON },
  { OP_WORD_BOUND, "word-bound", ARG_NON },
  { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON },
  { OP_WORD_BEGIN, "word-begin", ARG_NON },
  { OP_WORD_END, "word-end", ARG_NON },
  { OP_ASCII_WORD, "ascii-word", ARG_NON },
  { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON },
  { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON },
  { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON },
  { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON },
  { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON },
  /* position anchors */
  { OP_BEGIN_BUF, "begin-buf", ARG_NON },
  { OP_END_BUF, "end-buf", ARG_NON },
  { OP_BEGIN_LINE, "begin-line", ARG_NON },
  { OP_END_LINE, "end-line", ARG_NON },
  { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
  { OP_BEGIN_POSITION, "begin-position", ARG_NON },
  /* back references */
  { OP_BACKREF1, "backref1", ARG_NON },
  { OP_BACKREF2, "backref2", ARG_NON },
  { OP_BACKREFN, "backrefn", ARG_MEMNUM },
  { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
  { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
  { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL },
  { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL },
  /* capture-group bookkeeping */
  { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
  { OP_MEMORY_START, "mem-start", ARG_MEMNUM },
  { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM },
  { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM },
  { OP_MEMORY_END, "mem-end", ARG_MEMNUM },
  { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM },
  { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION },
  { OP_SET_OPTION, "set-option", ARG_OPTION },
  /* control flow */
  { OP_KEEP, "keep", ARG_NON },
  { OP_FAIL, "fail", ARG_NON },
  { OP_JUMP, "jump", ARG_RELADDR },
  { OP_PUSH, "push", ARG_RELADDR },
  { OP_POP, "pop", ARG_NON },
  { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL },
  { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL },
  /* repetition */
  { OP_REPEAT, "repeat", ARG_SPECIAL },
  { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
  { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
  { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
  { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
  { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
  { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM },
  { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
  { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
  { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM },
  /* look-around, stop-backtrack and absent groups */
  { OP_PUSH_POS, "push-pos", ARG_NON },
  { OP_POP_POS, "pop-pos", ARG_NON },
  { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR },
  { OP_FAIL_POS, "fail-pos", ARG_NON },
  { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON },
  { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON },
  { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL },
  { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL },
  { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON },
  { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON },
  { OP_ABSENT, "absent", ARG_RELADDR },
  { OP_ABSENT_END, "absent-end", ARG_NON },
  /* subexpression calls and conditionals */
  { OP_CALL, "call", ARG_ABSADDR },
  { OP_RETURN, "return", ARG_NON },
  { OP_CONDITION, "condition", ARG_SPECIAL },
  /* state-check opcodes */
  { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL },
  { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL },
  { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK },
  { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK },
  { OP_STATE_CHECK_ANYCHAR_ML_STAR,
    "state-check-anychar-ml*", ARG_STATE_CHECK },
  /* end-of-table sentinel */
  { -1, "", ARG_NON }
};
  5428. static const char*
  5429. op2name(int opcode)
  5430. {
  5431. int i;
  5432. for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
  5433. if (opcode == OnigOpInfo[i].opcode)
  5434. return OnigOpInfo[i].name;
  5435. }
  5436. return "";
  5437. }
  5438. static int
  5439. op2arg_type(int opcode)
  5440. {
  5441. int i;
  5442. for (i = 0; OnigOpInfo[i].opcode >= 0; i++) {
  5443. if (opcode == OnigOpInfo[i].opcode)
  5444. return OnigOpInfo[i].arg_type;
  5445. }
  5446. return ARG_SPECIAL;
  5447. }
  5448. # ifdef ONIG_DEBUG_PARSE_TREE
  5449. static void
  5450. Indent(FILE* f, int indent)
  5451. {
  5452. int i;
  5453. for (i = 0; i < indent; i++) putc(' ', f);
  5454. }
  5455. # endif /* ONIG_DEBUG_PARSE_TREE */
  5456. static void
  5457. p_string(FILE* f, ptrdiff_t len, UChar* s)
  5458. {
  5459. fputs(":", f);
  5460. while (len-- > 0) { fputc(*s++, f); }
  5461. }
  5462. static void
  5463. p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
  5464. {
  5465. int x = len * mb_len;
  5466. fprintf(f, ":%d:", len);
  5467. while (x-- > 0) { fputc(*s++, f); }
  5468. }
  5469. extern void
  5470. onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp,
  5471. OnigEncoding enc)
  5472. {
  5473. int i, n, arg_type;
  5474. RelAddrType addr;
  5475. LengthType len;
  5476. MemNumType mem;
  5477. StateCheckNumType scn;
  5478. OnigCodePoint code;
  5479. UChar *q;
  5480. fprintf(f, "[%s", op2name(*bp));
  5481. arg_type = op2arg_type(*bp);
  5482. if (arg_type != ARG_SPECIAL) {
  5483. bp++;
  5484. switch (arg_type) {
  5485. case ARG_NON:
  5486. break;
  5487. case ARG_RELADDR:
  5488. GET_RELADDR_INC(addr, bp);
  5489. fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
  5490. break;
  5491. case ARG_ABSADDR:
  5492. GET_ABSADDR_INC(addr, bp);
  5493. fprintf(f, ":(%d)", addr);
  5494. break;
  5495. case ARG_LENGTH:
  5496. GET_LENGTH_INC(len, bp);
  5497. fprintf(f, ":%d", len);
  5498. break;
  5499. case ARG_MEMNUM:
  5500. mem = *((MemNumType* )bp);
  5501. bp += SIZE_MEMNUM;
  5502. fprintf(f, ":%d", mem);
  5503. break;
  5504. case ARG_OPTION:
  5505. {
  5506. OnigOptionType option = *((OnigOptionType* )bp);
  5507. bp += SIZE_OPTION;
  5508. fprintf(f, ":%d", option);
  5509. }
  5510. break;
  5511. case ARG_STATE_CHECK:
  5512. scn = *((StateCheckNumType* )bp);
  5513. bp += SIZE_STATE_CHECK_NUM;
  5514. fprintf(f, ":%d", scn);
  5515. break;
  5516. }
  5517. }
  5518. else {
  5519. switch (*bp++) {
  5520. case OP_EXACT1:
  5521. case OP_ANYCHAR_STAR_PEEK_NEXT:
  5522. case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
  5523. p_string(f, 1, bp++); break;
  5524. case OP_EXACT2:
  5525. p_string(f, 2, bp); bp += 2; break;
  5526. case OP_EXACT3:
  5527. p_string(f, 3, bp); bp += 3; break;
  5528. case OP_EXACT4:
  5529. p_string(f, 4, bp); bp += 4; break;
  5530. case OP_EXACT5:
  5531. p_string(f, 5, bp); bp += 5; break;
  5532. case OP_EXACTN:
  5533. GET_LENGTH_INC(len, bp);
  5534. p_len_string(f, len, 1, bp);
  5535. bp += len;
  5536. break;
  5537. case OP_EXACTMB2N1:
  5538. p_string(f, 2, bp); bp += 2; break;
  5539. case OP_EXACTMB2N2:
  5540. p_string(f, 4, bp); bp += 4; break;
  5541. case OP_EXACTMB2N3:
  5542. p_string(f, 6, bp); bp += 6; break;
  5543. case OP_EXACTMB2N:
  5544. GET_LENGTH_INC(len, bp);
  5545. p_len_string(f, len, 2, bp);
  5546. bp += len * 2;
  5547. break;
  5548. case OP_EXACTMB3N:
  5549. GET_LENGTH_INC(len, bp);
  5550. p_len_string(f, len, 3, bp);
  5551. bp += len * 3;
  5552. break;
  5553. case OP_EXACTMBN:
  5554. {
  5555. int mb_len;
  5556. GET_LENGTH_INC(mb_len, bp);
  5557. GET_LENGTH_INC(len, bp);
  5558. fprintf(f, ":%d:%d:", mb_len, len);
  5559. n = len * mb_len;
  5560. while (n-- > 0) { fputc(*bp++, f); }
  5561. }
  5562. break;
  5563. case OP_EXACT1_IC:
  5564. len = enclen(enc, bp, bpend);
  5565. p_string(f, len, bp);
  5566. bp += len;
  5567. break;
  5568. case OP_EXACTN_IC:
  5569. GET_LENGTH_INC(len, bp);
  5570. p_len_string(f, len, 1, bp);
  5571. bp += len;
  5572. break;
  5573. case OP_CCLASS:
  5574. n = bitset_on_num((BitSetRef )bp);
  5575. bp += SIZE_BITSET;
  5576. fprintf(f, ":%d", n);
  5577. break;
  5578. case OP_CCLASS_NOT:
  5579. n = bitset_on_num((BitSetRef )bp);
  5580. bp += SIZE_BITSET;
  5581. fprintf(f, ":%d", n);
  5582. break;
  5583. case OP_CCLASS_MB:
  5584. case OP_CCLASS_MB_NOT:
  5585. GET_LENGTH_INC(len, bp);
  5586. q = bp;
  5587. # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  5588. ALIGNMENT_RIGHT(q);
  5589. # endif
  5590. GET_CODE_POINT(code, q);
  5591. bp += len;
  5592. fprintf(f, ":%d:%d", (int )code, len);
  5593. break;
  5594. case OP_CCLASS_MIX:
  5595. case OP_CCLASS_MIX_NOT:
  5596. n = bitset_on_num((BitSetRef )bp);
  5597. bp += SIZE_BITSET;
  5598. GET_LENGTH_INC(len, bp);
  5599. q = bp;
  5600. # ifndef PLATFORM_UNALIGNED_WORD_ACCESS
  5601. ALIGNMENT_RIGHT(q);
  5602. # endif
  5603. GET_CODE_POINT(code, q);
  5604. bp += len;
  5605. fprintf(f, ":%d:%d:%d", n, (int )code, len);
  5606. break;
  5607. case OP_BACKREFN_IC:
  5608. mem = *((MemNumType* )bp);
  5609. bp += SIZE_MEMNUM;
  5610. fprintf(f, ":%d", mem);
  5611. break;
  5612. case OP_BACKREF_MULTI_IC:
  5613. case OP_BACKREF_MULTI:
  5614. fputs(" ", f);
  5615. GET_LENGTH_INC(len, bp);
  5616. for (i = 0; i < len; i++) {
  5617. GET_MEMNUM_INC(mem, bp);
  5618. if (i > 0) fputs(", ", f);
  5619. fprintf(f, "%d", mem);
  5620. }
  5621. break;
  5622. case OP_BACKREF_WITH_LEVEL:
  5623. {
  5624. OnigOptionType option;
  5625. LengthType level;
  5626. GET_OPTION_INC(option, bp);
  5627. fprintf(f, ":%d", option);
  5628. GET_LENGTH_INC(level, bp);
  5629. fprintf(f, ":%d", level);
  5630. fputs(" ", f);
  5631. GET_LENGTH_INC(len, bp);
  5632. for (i = 0; i < len; i++) {
  5633. GET_MEMNUM_INC(mem, bp);
  5634. if (i > 0) fputs(", ", f);
  5635. fprintf(f, "%d", mem);
  5636. }
  5637. }
  5638. break;
  5639. case OP_REPEAT:
  5640. case OP_REPEAT_NG:
  5641. {
  5642. mem = *((MemNumType* )bp);
  5643. bp += SIZE_MEMNUM;
  5644. addr = *((RelAddrType* )bp);
  5645. bp += SIZE_RELADDR;
  5646. fprintf(f, ":%d:%d", mem, addr);
  5647. }
  5648. break;
  5649. case OP_PUSH_OR_JUMP_EXACT1:
  5650. case OP_PUSH_IF_PEEK_NEXT:
  5651. addr = *((RelAddrType* )bp);
  5652. bp += SIZE_RELADDR;
  5653. fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr);
  5654. p_string(f, 1, bp);
  5655. bp += 1;
  5656. break;
  5657. case OP_LOOK_BEHIND:
  5658. GET_LENGTH_INC(len, bp);
  5659. fprintf(f, ":%d", len);
  5660. break;
  5661. case OP_PUSH_LOOK_BEHIND_NOT:
  5662. GET_RELADDR_INC(addr, bp);
  5663. GET_LENGTH_INC(len, bp);
  5664. fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr);
  5665. break;
  5666. case OP_STATE_CHECK_PUSH:
  5667. case OP_STATE_CHECK_PUSH_OR_JUMP:
  5668. scn = *((StateCheckNumType* )bp);
  5669. bp += SIZE_STATE_CHECK_NUM;
  5670. addr = *((RelAddrType* )bp);
  5671. bp += SIZE_RELADDR;
  5672. fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr);
  5673. break;
  5674. case OP_CONDITION:
  5675. GET_MEMNUM_INC(mem, bp);
  5676. GET_RELADDR_INC(addr, bp);
  5677. fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr);
  5678. break;
  5679. default:
  5680. fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n",
  5681. bp[-1]);
  5682. }
  5683. }
  5684. fputs("]", f);
  5685. if (nextp) *nextp = bp;
  5686. }
  5687. # ifdef ONIG_DEBUG_COMPILE
  5688. static void
  5689. print_compiled_byte_code_list(FILE* f, regex_t* reg)
  5690. {
  5691. int ncode;
  5692. UChar* bp = reg->p;
  5693. UChar* end = reg->p + reg->used;
  5694. fprintf(f, "code length: %d", reg->used);
  5695. ncode = -1;
  5696. while (bp < end) {
  5697. ncode++;
  5698. if (ncode % 5 == 0)
  5699. fprintf(f, "\n%ld:", bp - reg->p);
  5700. else
  5701. fprintf(f, " %ld:", bp - reg->p);
  5702. onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc);
  5703. }
  5704. fprintf(f, "\n");
  5705. }
  5706. # endif /* ONIG_DEBUG_COMPILE */
  5707. # ifdef ONIG_DEBUG_PARSE_TREE
  5708. static void
  5709. print_indent_tree(FILE* f, Node* node, int indent)
  5710. {
  5711. int i, type, container_p = 0;
  5712. int add = 3;
  5713. UChar* p;
  5714. Indent(f, indent);
  5715. if (IS_NULL(node)) {
  5716. fprintf(f, "ERROR: null node!!!\n");
  5717. exit (0);
  5718. }
  5719. type = NTYPE(node);
  5720. switch (type) {
  5721. case NT_LIST:
  5722. case NT_ALT:
  5723. if (NTYPE(node) == NT_LIST)
  5724. fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node);
  5725. else
  5726. fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node);
  5727. print_indent_tree(f, NCAR(node), indent + add);
  5728. while (IS_NOT_NULL(node = NCDR(node))) {
  5729. if (NTYPE(node) != type) {
  5730. fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node));
  5731. exit(0);
  5732. }
  5733. print_indent_tree(f, NCAR(node), indent + add);
  5734. }
  5735. break;
  5736. case NT_STR:
  5737. fprintf(f, "<string%s:%"PRIxPTR">",
  5738. (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node);
  5739. for (p = NSTR(node)->s; p < NSTR(node)->end; p++) {
  5740. if (*p >= 0x20 && *p < 0x7f)
  5741. fputc(*p, f);
  5742. else {
  5743. fprintf(f, " 0x%02x", *p);
  5744. }
  5745. }
  5746. break;
  5747. case NT_CCLASS:
  5748. fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node);
  5749. if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f);
  5750. if (NCCLASS(node)->mbuf) {
  5751. BBuf* bbuf = NCCLASS(node)->mbuf;
  5752. OnigCodePoint* data = (OnigCodePoint* )bbuf->p;
  5753. OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used);
  5754. fprintf(f, "%d", *data++);
  5755. for (; data < end; data+=2) {
  5756. fprintf(f, ",");
  5757. fprintf(f, "%04x-%04x", data[0], data[1]);
  5758. }
  5759. }
  5760. break;
  5761. case NT_CTYPE:
  5762. fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node);
  5763. switch (NCTYPE(node)->ctype) {
  5764. case ONIGENC_CTYPE_WORD:
  5765. if (NCTYPE(node)->not != 0)
  5766. fputs("not word", f);
  5767. else
  5768. fputs("word", f);
  5769. break;
  5770. default:
  5771. fprintf(f, "ERROR: undefined ctype.\n");
  5772. exit(0);
  5773. }
  5774. break;
  5775. case NT_CANY:
  5776. fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node);
  5777. break;
  5778. case NT_ANCHOR:
  5779. fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node);
  5780. switch (NANCHOR(node)->type) {
  5781. case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break;
  5782. case ANCHOR_END_BUF: fputs("end buf", f); break;
  5783. case ANCHOR_BEGIN_LINE: fputs("begin line", f); break;
  5784. case ANCHOR_END_LINE: fputs("end line", f); break;
  5785. case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
  5786. case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
  5787. case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
  5788. case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
  5789. # ifdef USE_WORD_BEGIN_END
  5790. case ANCHOR_WORD_BEGIN: fputs("word begin", f); break;
  5791. case ANCHOR_WORD_END: fputs("word end", f); break;
  5792. # endif
  5793. case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break;
  5794. case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break;
  5795. case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break;
  5796. case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break;
  5797. case ANCHOR_KEEP: fputs("keep",f); break;
  5798. default:
  5799. fprintf(f, "ERROR: undefined anchor type.\n");
  5800. break;
  5801. }
  5802. break;
  5803. case NT_BREF:
  5804. {
  5805. int* p;
  5806. BRefNode* br = NBREF(node);
  5807. p = BACKREFS_P(br);
  5808. fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node);
  5809. for (i = 0; i < br->back_num; i++) {
  5810. if (i > 0) fputs(", ", f);
  5811. fprintf(f, "%d", p[i]);
  5812. }
  5813. }
  5814. break;
  5815. # ifdef USE_SUBEXP_CALL
  5816. case NT_CALL:
  5817. {
  5818. CallNode* cn = NCALL(node);
  5819. fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node);
  5820. p_string(f, cn->name_end - cn->name, cn->name);
  5821. }
  5822. break;
  5823. # endif
  5824. case NT_QTFR:
  5825. fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node,
  5826. NQTFR(node)->lower, NQTFR(node)->upper,
  5827. (NQTFR(node)->greedy ? "" : "?"));
  5828. print_indent_tree(f, NQTFR(node)->target, indent + add);
  5829. break;
  5830. case NT_ENCLOSE:
  5831. fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node);
  5832. switch (NENCLOSE(node)->type) {
  5833. case ENCLOSE_OPTION:
  5834. fprintf(f, "option:%d", NENCLOSE(node)->option);
  5835. break;
  5836. case ENCLOSE_MEMORY:
  5837. fprintf(f, "memory:%d", NENCLOSE(node)->regnum);
  5838. break;
  5839. case ENCLOSE_STOP_BACKTRACK:
  5840. fprintf(f, "stop-bt");
  5841. break;
  5842. case ENCLOSE_CONDITION:
  5843. fprintf(f, "condition:%d", NENCLOSE(node)->regnum);
  5844. break;
  5845. case ENCLOSE_ABSENT:
  5846. fprintf(f, "absent");
  5847. break;
  5848. default:
  5849. break;
  5850. }
  5851. fprintf(f, "\n");
  5852. print_indent_tree(f, NENCLOSE(node)->target, indent + add);
  5853. break;
  5854. default:
  5855. fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node));
  5856. break;
  5857. }
  5858. if (type != NT_LIST && type != NT_ALT && type != NT_QTFR &&
  5859. type != NT_ENCLOSE)
  5860. fprintf(f, "\n");
  5861. if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add);
  5862. fflush(f);
  5863. }
  5864. static void
  5865. print_tree(FILE* f, Node* node)
  5866. {
  5867. print_indent_tree(f, node, 0);
  5868. }
  5869. # endif /* ONIG_DEBUG_PARSE_TREE */
  5870. #endif /* ONIG_DEBUG */