PageRenderTime 51ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/trunk/harbour/src/3rd/pcre/pcreexec.c

#
C | 1904 lines | 1079 code | 321 blank | 504 comment | 396 complexity | d80314669e22e9bcba3ac4a3ebca5239 MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1

Large files are truncated, but you can click here to view the full file

  1. /*************************************************
  2. * Perl-Compatible Regular Expressions *
  3. *************************************************/
  4. /* PCRE is a library of functions to support regular expressions whose syntax
  5. and semantics are as close as possible to those of the Perl 5 language.
  6. Written by Philip Hazel
  7. Copyright (c) 1997-2012 University of Cambridge
  8. -----------------------------------------------------------------------------
  9. Redistribution and use in source and binary forms, with or without
  10. modification, are permitted provided that the following conditions are met:
  11. * Redistributions of source code must retain the above copyright notice,
  12. this list of conditions and the following disclaimer.
  13. * Redistributions in binary form must reproduce the above copyright
  14. notice, this list of conditions and the following disclaimer in the
  15. documentation and/or other materials provided with the distribution.
  16. * Neither the name of the University of Cambridge nor the names of its
  17. contributors may be used to endorse or promote products derived from
  18. this software without specific prior written permission.
  19. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  23. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29. POSSIBILITY OF SUCH DAMAGE.
  30. -----------------------------------------------------------------------------
  31. */
  32. /* This module contains pcre_exec(), the externally visible function that does
  33. pattern matching using an NFA algorithm, trying to mimic Perl as closely as
  34. possible. There are also some static supporting functions. */
  35. #ifdef HAVE_CONFIG_H
  36. #include "config.h"
  37. #endif
  38. #define NLBLOCK md /* Block containing newline information */
  39. #define PSSTART start_subject /* Field containing processed string start */
  40. #define PSEND end_subject /* Field containing processed string end */
  41. #include "pcreinal.h"
  42. /* Undefine some potentially clashing cpp symbols */
  43. #undef min
  44. #undef max
  45. /* Values for setting in md->match_function_type to indicate two special types
  46. of call to match(). We do it this way to save on using another stack variable,
  47. as stack usage is to be discouraged. */
  48. #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
  49. #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
  50. /* Non-error returns from the match() function. Error returns are externally
  51. defined PCRE_ERROR_xxx codes, which are all negative. */
  52. #define MATCH_MATCH 1 /* The pattern matched */
  53. #define MATCH_NOMATCH 0 /* The pattern failed to match */
  54. /* Special internal returns from the match() function. Make them sufficiently
  55. negative to avoid the external error codes. */
  56. #define MATCH_ACCEPT (-999)
  57. #define MATCH_COMMIT (-998) /* From OP_COMMIT when what follows it fails */
  58. #define MATCH_KETRPOS (-997)
  59. #define MATCH_ONCE (-996)
  60. #define MATCH_PRUNE (-995) /* From OP_PRUNE and OP_PRUNE_ARG */
  61. #define MATCH_SKIP (-994) /* md->start_match_ptr holds the subject position */
  62. #define MATCH_SKIP_ARG (-993) /* md->start_match_ptr holds the SKIP name */
  63. #define MATCH_THEN (-992) /* md->start_match_ptr holds the THEN opcode address */
  64. /* Maximum number of ints of offset to save on the stack for recursive calls.
  65. If the offset vector is bigger, malloc is used. This should be a multiple of 3,
  66. because the offset vector is always a multiple of 3 long. */
  67. #define REC_STACK_SAVE_MAX 30
  68. /* Min and max values for the common repeats; for the maxima, 0 => infinity */
  69. static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
  70. static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  #ifdef PCRE_DEBUG
  /*************************************************
  *        Debugging function to print chars       *
  *************************************************/

  /* Print a sequence of characters in printable format. When is_subject is TRUE
  the count is first cut back so that printing stops at md->end_subject. A value
  for which isprint() is true is written as itself; anything else is written as
  a \x escape in hexadecimal.

  Arguments:
    p           points to characters
    length      number to print
    is_subject  TRUE if printing from within md->start_subject
    md          pointer to matching data block, if is_subject is TRUE
                (it is not dereferenced otherwise)

  Returns:      nothing
  */

  static void
  pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
  {
  unsigned int c;

  /* Never run past the end of the subject when printing from within it. */

  if (is_subject && length > md->end_subject - p)
    length = (int)(md->end_subject - p);

  while (length-- > 0)
    {
    c = *(p++);

    /* isprint() is only defined for arguments representable as unsigned char
    (or EOF). Anything larger - possible if pcre_uchar is wider than 8 bits;
    its definition is not visible here, TODO confirm - is sent to the escape
    branch instead of being handed to isprint(). */

    if (c <= 0xff && isprint((int)c)) printf("%c", c);
      else printf("\\x%02x", c);
    }
  }
  #endif
  93. /*************************************************
  94. * Match a back-reference *
  95. *************************************************/
  96. /* Normally, if a back reference hasn't been set, the length that is passed is
  97. negative, so the match always fails. However, in JavaScript compatibility mode,
  98. the length passed is zero. Note that in caseless UTF-8 mode, the number of
  99. subject bytes matched may be different to the number of reference bytes.
  100. Arguments:
  101. offset index into the offset vector
  102. eptr pointer into the subject
  103. length length of reference to be matched (number of bytes)
  104. md points to match data block
  105. caseless TRUE if caseless
  106. Returns: < 0 if not matched, otherwise the number of subject bytes matched
  107. */
  108. static int
  109. match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
  110. BOOL caseless)
  111. {
  112. PCRE_PUCHAR eptr_start = eptr;
  113. register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
  114. #ifdef PCRE_DEBUG
  115. if (eptr >= md->end_subject)
  116. printf("matching subject <null>");
  117. else
  118. {
  119. printf("matching subject ");
  120. pchars(eptr, length, TRUE, md);
  121. }
  122. printf(" against backref ");
  123. pchars(p, length, FALSE, md);
  124. printf("\n");
  125. #endif
  126. /* Always fail if reference not set (and not JavaScript compatible). */
  127. if (length < 0) return -1;
  128. /* Separate the caseless case for speed. In UTF-8 mode we can only do this
  129. properly if Unicode properties are supported. Otherwise, we can check only
  130. ASCII characters. */
  131. if (caseless)
  132. {
  133. #ifdef SUPPORT_UTF
  134. #ifdef SUPPORT_UCP
  135. if (md->utf)
  136. {
  137. /* Match characters up to the end of the reference. NOTE: the number of
  138. bytes matched may differ, because there are some characters whose upper and
  139. lower case versions code as different numbers of bytes. For example, U+023A
  140. (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
  141. a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
  142. the latter. It is important, therefore, to check the length along the
  143. reference, not along the subject (earlier code did this wrong). */
  144. PCRE_PUCHAR endptr = p + length;
  145. while (p < endptr)
  146. {
  147. int c, d;
  148. if (eptr >= md->end_subject) return -1;
  149. GETCHARINC(c, eptr);
  150. GETCHARINC(d, p);
  151. if (c != d && c != UCD_OTHERCASE(d)) return -1;
  152. }
  153. }
  154. else
  155. #endif
  156. #endif
  157. /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
  158. is no UCP support. */
  159. {
  160. if (eptr + length > md->end_subject) return -1;
  161. while (length-- > 0)
  162. {
  163. if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
  164. p++;
  165. eptr++;
  166. }
  167. }
  168. }
  169. /* In the caseful case, we can just compare the bytes, whether or not we
  170. are in UTF-8 mode. */
  171. else
  172. {
  173. if (eptr + length > md->end_subject) return -1;
  174. while (length-- > 0) if (*p++ != *eptr++) return -1;
  175. }
  176. return (int)(eptr - eptr_start);
  177. }
  178. /***************************************************************************
  179. ****************************************************************************
  180. RECURSION IN THE match() FUNCTION
  181. The match() function is highly recursive, though not every recursive call
  182. increases the recursive depth. Nevertheless, some regular expressions can cause
  183. it to recurse to a great depth. I was writing for Unix, so I just let it call
  184. itself recursively. This uses the stack for saving everything that has to be
  185. saved for a recursive call. On Unix, the stack can be large, and this works
  186. fine.
  187. It turns out that on some non-Unix-like systems there are problems with
  188. programs that use a lot of stack. (This despite the fact that every last chip
  189. has oodles of memory these days, and techniques for extending the stack have
  190. been known for decades.) So....
  191. There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
  192. calls by keeping local variables that need to be preserved in blocks of memory
  193. obtained from malloc() instead of on the stack. Macros are used to
  194. achieve this so that the actual code doesn't look very different to what it
  195. always used to.
  196. The original heap-recursive code used longjmp(). However, it seems that this
  197. can be very slow on some operating systems. Following a suggestion from Stan
  198. Switzer, the use of longjmp() has been abolished, at the cost of having to
  199. provide a unique number for each call to RMATCH. There is no way of generating
  200. a sequence of numbers at compile time in C. I have given them names, to make
  201. them stand out more clearly.
  202. Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
  203. FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
  204. tests. Furthermore, not using longjmp() means that local dynamic variables
  205. don't have indeterminate values; this has meant that the frame size can be
  206. reduced because the result can be "passed back" by straight setting of the
  207. variable instead of being passed in the frame.
  208. ****************************************************************************
  209. ***************************************************************************/
  /* Identifiers for the individual RMATCH call sites, numbered consecutively
  from 1. This list has to be kept in step with the code at HEAP_RETURN: if an
  entry is added or removed here, that code must be updated as well. */

  enum {
    RM1 = 1,
          RM2,  RM3,  RM4,  RM5,  RM6,
    RM7,  RM8,  RM9,  RM10, RM11, RM12,
    RM13, RM14, RM15, RM16, RM17, RM18,
    RM19, RM20, RM21, RM22, RM23, RM24,
    RM25, RM26, RM27, RM28, RM29, RM30,
    RM31, RM32, RM33, RM34, RM35, RM36,
    RM37, RM38, RM39, RM40, RM41, RM42,
    RM43, RM44, RM45, RM46, RM47, RM48,
    RM49, RM50, RM51, RM52, RM53, RM54,
    RM55, RM56, RM57, RM58, RM59, RM60,
    RM61, RM62, RM63, RM64, RM65, RM66
  };
  219. /* These versions of the macros use the stack, as normal. There are debugging
  220. versions and production versions. Note that the "rw" argument of RMATCH isn't
  221. actually used in this definition.
  In both versions RMATCH leaves the result of a genuinely recursive call to
  match() in rrc, always passing the caller's mstart and rdepth+1, and RRETURN
  is a plain "return"; the debugging versions additionally print trace lines
  with printf(). */
  222. #ifndef NO_RECURSE
  223. #define REGISTER register
  224. #ifdef PCRE_DEBUG
  225. #define RMATCH(ra,rb,rc,rd,re,rw) \
  226. { \
  227. printf("match() called in line %d\n", __LINE__); \
  228. rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
  229. printf("to line %d\n", __LINE__); \
  230. }
  231. #define RRETURN(ra) \
  232. { \
  233. printf("match() returned %d from line %d ", ra, __LINE__); \
  234. return ra; \
  235. }
  236. #else
  237. #define RMATCH(ra,rb,rc,rd,re,rw) \
  238. rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
  239. #define RRETURN(ra) return ra
  240. #endif
  241. #else
  242. /* These versions of the macros manage a private stack on the heap. Note that
  243. the "rd" argument of RMATCH isn't actually used in this definition. It's the md
  244. argument of match(), which never changes.
  RMATCH obtains a new heapframe from PUBL(stack_malloc), giving up with
  PCRE_ERROR_NOMEMORY (via RRETURN) if that fails. It records rw in the current
  frame's Xwhere, fills in the argument fields of the new frame, links the new
  frame to the current one through Xprevframe, makes it the current frame and
  jumps to HEAP_RECURSE. The label L_<rw> that it defines is the point at which
  execution continues afterwards (presumably reached from the HEAP_RETURN code,
  which is not part of this section).
  RRETURN pops the current frame, frees it unless it is frame_zero, and then
  either jumps to HEAP_RETURN with the result in rrc (when there is a previous
  frame) or returns from match(). Note that its argument is evaluated only after
  the frame has been popped. */
  245. #define REGISTER
  246. #define RMATCH(ra,rb,rc,rd,re,rw)\
  247. {\
  248. heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
  249. if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  250. frame->Xwhere = rw; \
  251. newframe->Xeptr = ra;\
  252. newframe->Xecode = rb;\
  253. newframe->Xmstart = mstart;\
  254. newframe->Xoffset_top = rc;\
  255. newframe->Xeptrb = re;\
  256. newframe->Xrdepth = frame->Xrdepth + 1;\
  257. newframe->Xprevframe = frame;\
  258. frame = newframe;\
  259. DPRINTF(("restarting from line %d\n", __LINE__));\
  260. goto HEAP_RECURSE;\
  261. L_##rw:\
  262. DPRINTF(("jumped back to line %d\n", __LINE__));\
  263. }
  264. #define RRETURN(ra)\
  265. {\
  266. heapframe *oldframe = frame;\
  267. frame = oldframe->Xprevframe;\
  268. if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
  269. if (frame != NULL)\
  270. {\
  271. rrc = ra;\
  272. goto HEAP_RETURN;\
  273. }\
  274. return ra;\
  275. }
  276. /* Structure for remembering the local variables in a private frame */
  277. typedef struct heapframe {
  278. struct heapframe *Xprevframe;
  279. /* Function arguments that may change */
  280. PCRE_PUCHAR Xeptr;
  281. const pcre_uchar *Xecode;
  282. PCRE_PUCHAR Xmstart;
  283. int Xoffset_top;
  284. eptrblock *Xeptrb;
  285. unsigned int Xrdepth;
  286. /* Function local variables */
  287. PCRE_PUCHAR Xcallpat;
  288. #ifdef SUPPORT_UTF
  289. PCRE_PUCHAR Xcharptr;
  290. #endif
  291. PCRE_PUCHAR Xdata;
  292. PCRE_PUCHAR Xnext;
  293. PCRE_PUCHAR Xpp;
  294. PCRE_PUCHAR Xprev;
  295. PCRE_PUCHAR Xsaved_eptr;
  296. recursion_info Xnew_recursive;
  297. BOOL Xcur_is_word;
  298. BOOL Xcondition;
  299. BOOL Xprev_is_word;
  300. #ifdef SUPPORT_UCP
  301. int Xprop_type;
  302. int Xprop_value;
  303. int Xprop_fail_result;
  304. int Xoclength;
  305. pcre_uchar Xocchars[6];
  306. #endif
  307. int Xcodelink;
  308. int Xctype;
  309. unsigned int Xfc;
  310. int Xfi;
  311. int Xlength;
  312. int Xmax;
  313. int Xmin;
  314. int Xnumber;
  315. int Xoffset;
  316. int Xop;
  317. int Xsave_capture_last;
  318. int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  319. int Xstacksave[REC_STACK_SAVE_MAX];
  320. eptrblock Xnewptrb;
  321. /* Where to jump back to */
  322. int Xwhere;
  323. } heapframe;
  324. #endif
  325. /***************************************************************************
  326. ***************************************************************************/
  327. /*************************************************
  328. * Match from current position *
  329. *************************************************/
  330. /* This function is called recursively in many circumstances. Whenever it
  331. returns a negative (error) response, the outer incarnation must also return the
  332. same response. */
  333. /* These macros pack up tests that are used for partial matching, and which
  334. appear several times in the code. We set the "hit end" flag if the pointer is
  335. at the end of the subject and also past the start of the subject (i.e.
  336. something has been matched). For hard partial matching, we then return
  337. immediately. The second one is used when we already know we are past the end of
  338. the subject.
  In the code, "past the start" is tested as eptr > md->start_used_ptr, the flag
  is md->hitend, and hard partial matching is md->partial > 1, for which
  PCRE_ERROR_PARTIAL is returned via RRETURN. Neither macro does anything when
  md->partial is zero. */
  339. #define CHECK_PARTIAL()\
  340. if (md->partial != 0 && eptr >= md->end_subject && \
  341. eptr > md->start_used_ptr) \
  342. { \
  343. md->hitend = TRUE; \
  344. if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
  345. }
  346. #define SCHECK_PARTIAL()\
  347. if (md->partial != 0 && eptr > md->start_used_ptr) \
  348. { \
  349. md->hitend = TRUE; \
  350. if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
  351. }
  352. /* Performance note: It might be tempting to extract commonly used fields from
  353. the md structure (e.g. utf, end_subject) into individual variables to improve
  354. performance. Tests using gcc on a SPARC disproved this; in the first case, it
  355. made performance worse.
  356. Arguments:
  357. eptr pointer to current character in subject
  358. ecode pointer to current position in compiled code
  359. mstart pointer to the current match start position (can be modified
  360. by encountering \K)
  361. offset_top current top pointer
  362. md pointer to "static" info for the match
  363. eptrb pointer to chain of blocks containing eptr at start of
  364. brackets - for testing for empty matches
  365. rdepth the recursion depth
  366. Returns: MATCH_MATCH if matched ) these values are >= 0
  367. MATCH_NOMATCH if failed to match )
  368. a negative MATCH_xxx value for PRUNE, SKIP, etc
  369. a negative PCRE_ERROR_xxx value if aborted by an error condition
  370. (e.g. stopped by repeated call or recursion limit)
  371. */
  372. static int
  373. match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
  374. PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
  375. unsigned int rdepth)
  376. {
  377. /* These variables do not need to be preserved over recursion in this function,
  378. so they can be ordinary variables in all cases. Mark some of them with
  379. "register" because they are used a lot in loops. */
  380. register int rrc; /* Returns from recursive calls */
  381. register int i; /* Used for loops not involving calls to RMATCH() */
  382. register unsigned int c; /* Character values not kept over RMATCH() calls */
  383. register BOOL utf; /* Local copy of UTF flag for speed */
  384. BOOL minimize, possessive; /* Quantifier options */
  385. BOOL caseless;
  386. int condcode;
  387. /* When recursion is not being used, all "local" variables that have to be
  388. preserved over calls to RMATCH() are part of a "frame". We set up the top-level
  389. frame on the stack here; subsequent instantiations are obtained from the heap
  390. whenever RMATCH() does a "recursion". See the macro definitions above. Putting
  391. the top-level on the stack rather than malloc-ing them all gives a performance
  392. boost in many cases where there is not much "recursion". */
  393. #ifdef NO_RECURSE
  394. heapframe frame_zero;
  395. heapframe *frame = &frame_zero;
  396. frame->Xprevframe = NULL; /* Marks the top level */
  397. /* Copy in the original argument variables */
  398. frame->Xeptr = eptr;
  399. frame->Xecode = ecode;
  400. frame->Xmstart = mstart;
  401. frame->Xoffset_top = offset_top;
  402. frame->Xeptrb = eptrb;
  403. frame->Xrdepth = rdepth;
  404. /* This is where control jumps back to in order to effect "recursion" */
  405. HEAP_RECURSE:
  406. /* Macros make the argument variables come from the current frame */
  407. #define eptr frame->Xeptr
  408. #define ecode frame->Xecode
  409. #define mstart frame->Xmstart
  410. #define offset_top frame->Xoffset_top
  411. #define eptrb frame->Xeptrb
  412. #define rdepth frame->Xrdepth
  413. /* Ditto for the local variables */
  414. #ifdef SUPPORT_UTF
  415. #define charptr frame->Xcharptr
  416. #endif
  417. #define callpat frame->Xcallpat
  418. #define codelink frame->Xcodelink
  419. #define data frame->Xdata
  420. #define next frame->Xnext
  421. #define pp frame->Xpp
  422. #define prev frame->Xprev
  423. #define saved_eptr frame->Xsaved_eptr
  424. #define new_recursive frame->Xnew_recursive
  425. #define cur_is_word frame->Xcur_is_word
  426. #define condition frame->Xcondition
  427. #define prev_is_word frame->Xprev_is_word
  428. #ifdef SUPPORT_UCP
  429. #define prop_type frame->Xprop_type
  430. #define prop_value frame->Xprop_value
  431. #define prop_fail_result frame->Xprop_fail_result
  432. #define oclength frame->Xoclength
  433. #define occhars frame->Xocchars
  434. #endif
  435. #define ctype frame->Xctype
  436. #define fc frame->Xfc
  437. #define fi frame->Xfi
  438. #define length frame->Xlength
  439. #define max frame->Xmax
  440. #define min frame->Xmin
  441. #define number frame->Xnumber
  442. #define offset frame->Xoffset
  443. #define op frame->Xop
  444. #define save_capture_last frame->Xsave_capture_last
  445. #define save_offset1 frame->Xsave_offset1
  446. #define save_offset2 frame->Xsave_offset2
  447. #define save_offset3 frame->Xsave_offset3
  448. #define stacksave frame->Xstacksave
  449. #define newptrb frame->Xnewptrb
  450. /* When recursion is being used, local variables are allocated on the stack and
  451. get preserved during recursion in the normal way. In this environment, fi and
  452. i, and fc and c, can be the same variables. */
  453. #else /* NO_RECURSE not defined */
  454. #define fi i
  455. #define fc c
  456. /* Many of the following variables are used only in small blocks of the code.
  457. My normal style of coding would have declared them within each of those blocks.
  458. However, in order to accommodate the version of this code that uses an external
  459. "stack" implemented on the heap, it is easier to declare them all here, so the
  460. declarations can be cut out in a block. The only declarations within blocks
  461. below are for variables that do not have to be preserved over a recursive call
  462. to RMATCH(). */
  463. #ifdef SUPPORT_UTF
  464. const pcre_uchar *charptr;
  465. #endif
  466. const pcre_uchar *callpat;
  467. const pcre_uchar *data;
  468. const pcre_uchar *next;
  469. PCRE_PUCHAR pp;
  470. const pcre_uchar *prev;
  471. PCRE_PUCHAR saved_eptr;
  472. recursion_info new_recursive;
  473. BOOL cur_is_word;
  474. BOOL condition;
  475. BOOL prev_is_word;
  476. #ifdef SUPPORT_UCP
  477. int prop_type;
  478. int prop_value;
  479. int prop_fail_result;
  480. int oclength;
  481. pcre_uchar occhars[6];
  482. #endif
  483. int codelink;
  484. int ctype;
  485. int length;
  486. int max;
  487. int min;
  488. int number;
  489. int offset;
  490. int op;
  491. int save_capture_last;
  492. int save_offset1, save_offset2, save_offset3;
  493. int stacksave[REC_STACK_SAVE_MAX];
  494. eptrblock newptrb;
  495. /* There is a special fudge for calling match() in a way that causes it to
  496. measure the size of its basic stack frame when the stack is being used for
  497. recursion. The second argument (ecode) being NULL triggers this behaviour. It
  498. cannot normally ever be NULL. The return is the negated value of the frame
  499. size. */
  500. if (ecode == NULL)
  501. {
  502. if (rdepth == 0)
  503. return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
  504. else
  505. {
  506. int len = (char *)&rdepth - (char *)eptr;
  507. return (len > 0)? -len : len;
  508. }
  509. }
  510. #endif /* NO_RECURSE */
  511. /* To save space on the stack and in the heap frame, I have doubled up on some
  512. of the local variables that are used only in localised parts of the code, but
  513. still need to be preserved over recursive calls of match(). These macros define
  514. the alternative names that are used. */
  515. #define allow_zero cur_is_word
  516. #define cbegroup condition
  517. #define code_offset codelink
  518. #define condassert condition
  519. #define matched_once prev_is_word
  520. #define foc number
  521. #define save_mark data
  522. /* These statements are here to stop the compiler complaining about uninitialized
  523. variables. */
  524. #ifdef SUPPORT_UCP
  525. prop_value = 0;
  526. prop_fail_result = 0;
  527. #endif
  528. /* This label is used for tail recursion, which is used in a few cases even
  529. when NO_RECURSE is not defined, in order to reduce the amount of stack that is
  530. used. Thanks to Ian Taylor for noticing this possibility and sending the
  531. original patch. */
  532. TAIL_RECURSE:
  533. /* OK, now we can get on with the real code of the function. Recursive calls
  534. are specified by the macro RMATCH and RRETURN is used to return. When
  535. NO_RECURSE is *not* defined, these just turn into a recursive call to match()
  536. and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
  537. defined). However, RMATCH isn't like a function call because it's quite a
  538. complicated macro. It has to be used in one particular way. This shouldn't,
  539. however, impact performance when true recursion is being used. */
  540. #ifdef SUPPORT_UTF
  541. utf = md->utf; /* Local copy of the flag */
  542. #else
  543. utf = FALSE;
  544. #endif
  545. /* First check that we haven't called match() too many times, or that we
  546. haven't exceeded the recursive call limit. */
  547. if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
  548. if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
  549. /* At the start of a group with an unlimited repeat that may match an empty
  550. string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
  551. done this way to save having to use another function argument, which would take
  552. up space on the stack. See also MATCH_CONDASSERT below.
  553. When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
  554. such remembered pointers, to be checked when we hit the closing ket, in order
  555. to break infinite loops that match no characters. When match() is called in
  556. other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
  557. NOT be used with tail recursion, because the memory block that is used is on
  558. the stack, so a new one may be required for each match(). */
  559. if (md->match_function_type == MATCH_CBEGROUP)
  560. {
  561. newptrb.epb_saved_eptr = eptr;
  562. newptrb.epb_prev = eptrb;
  563. eptrb = &newptrb;
  564. md->match_function_type = 0;
  565. }
  566. /* Now start processing the opcodes. */
  567. for (;;)
  568. {
  569. minimize = possessive = FALSE;
  570. op = *ecode;
  571. switch(op)
  572. {
  573. case OP_MARK:
  574. md->nomatch_mark = ecode + 2;
  575. md->mark = NULL; /* In case previously set by assertion */
  576. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
  577. eptrb, RM55);
  578. if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
  579. md->mark == NULL) md->mark = ecode + 2;
  580. /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
  581. argument, and we must check whether that argument matches this MARK's
  582. argument. It is passed back in md->start_match_ptr (an overloading of that
  583. variable). If it does match, we reset that variable to the current subject
  584. position and return MATCH_SKIP. Otherwise, pass back the return code
  585. unaltered. */
  586. else if (rrc == MATCH_SKIP_ARG &&
  587. STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
  588. {
  589. md->start_match_ptr = eptr;
  590. RRETURN(MATCH_SKIP);
  591. }
  592. RRETURN(rrc);
  593. case OP_FAIL:
  594. RRETURN(MATCH_NOMATCH);
  595. /* COMMIT overrides PRUNE, SKIP, and THEN */
  596. case OP_COMMIT:
  597. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  598. eptrb, RM52);
  599. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
  600. rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
  601. rrc != MATCH_THEN)
  602. RRETURN(rrc);
  603. RRETURN(MATCH_COMMIT);
  604. /* PRUNE overrides THEN */
  605. case OP_PRUNE:
  606. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  607. eptrb, RM51);
  608. if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
  609. RRETURN(MATCH_PRUNE);
  610. case OP_PRUNE_ARG:
  611. md->nomatch_mark = ecode + 2;
  612. md->mark = NULL; /* In case previously set by assertion */
  613. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
  614. eptrb, RM56);
  615. if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
  616. md->mark == NULL) md->mark = ecode + 2;
  617. if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
  618. RRETURN(MATCH_PRUNE);
  619. /* SKIP overrides PRUNE and THEN */
  620. case OP_SKIP:
  621. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  622. eptrb, RM53);
  623. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
  624. RRETURN(rrc);
  625. md->start_match_ptr = eptr; /* Pass back current position */
  626. RRETURN(MATCH_SKIP);
  627. /* Note that, for Perl compatibility, SKIP with an argument does NOT set
  628. nomatch_mark. There is a flag that disables this opcode when re-matching a
  629. pattern that ended with a SKIP for which there was not a matching MARK. */
  630. case OP_SKIP_ARG:
  631. if (md->ignore_skip_arg)
  632. {
  633. ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
  634. break;
  635. }
  636. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
  637. eptrb, RM57);
  638. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
  639. RRETURN(rrc);
  640. /* Pass back the current skip name by overloading md->start_match_ptr and
  641. returning the special MATCH_SKIP_ARG return code. This will either be
  642. caught by a matching MARK, or get to the top, where it causes a rematch
  643. with the md->ignore_skip_arg flag set. */
  644. md->start_match_ptr = ecode + 2;
  645. RRETURN(MATCH_SKIP_ARG);
  646. /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
  647. the branch in which it occurs can be determined. Overload the start of
  648. match pointer to do this. */
  649. case OP_THEN:
  650. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  651. eptrb, RM54);
  652. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  653. md->start_match_ptr = ecode;
  654. RRETURN(MATCH_THEN);
  655. case OP_THEN_ARG:
  656. md->nomatch_mark = ecode + 2;
  657. md->mark = NULL; /* In case previously set by assertion */
  658. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
  659. md, eptrb, RM58);
  660. if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
  661. md->mark == NULL) md->mark = ecode + 2;
  662. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  663. md->start_match_ptr = ecode;
  664. RRETURN(MATCH_THEN);
  665. /* Handle an atomic group that does not contain any capturing parentheses.
  666. This can be handled like an assertion. Prior to 8.13, all atomic groups
  667. were handled this way. In 8.13, the code was changed as below for ONCE, so
  668. that backups pass through the group and thereby reset captured values.
  669. However, this uses a lot more stack, so in 8.20, atomic groups that do not
  670. contain any captures generate OP_ONCE_NC, which can be handled in the old,
  671. less stack intensive way.
  672. Check the alternative branches in turn - the matching won't pass the KET
  673. for this kind of subpattern. If any one branch matches, we carry on as at
  674. the end of a normal bracket, leaving the subject pointer, but resetting
  675. the start-of-match value in case it was changed by \K. */
  676. case OP_ONCE_NC:
  677. prev = ecode;
  678. saved_eptr = eptr;
  679. save_mark = md->mark;
  680. do
  681. {
  682. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
  683. if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
  684. {
  685. mstart = md->start_match_ptr;
  686. break;
  687. }
  688. if (rrc == MATCH_THEN)
  689. {
  690. next = ecode + GET(ecode,1);
  691. if (md->start_match_ptr < next &&
  692. (*ecode == OP_ALT || *next == OP_ALT))
  693. rrc = MATCH_NOMATCH;
  694. }
  695. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  696. ecode += GET(ecode,1);
  697. md->mark = save_mark;
  698. }
  699. while (*ecode == OP_ALT);
  700. /* If hit the end of the group (which could be repeated), fail */
  701. if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
  702. /* Continue as from after the group, updating the offsets high water
  703. mark, since extracts may have been taken. */
  704. do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
  705. offset_top = md->end_offset_top;
  706. eptr = md->end_match_ptr;
  707. /* For a non-repeating ket, just continue at this level. This also
  708. happens for a repeating ket if no characters were matched in the group.
  709. This is the forcible breaking of infinite loops as implemented in Perl
  710. 5.005. */
  711. if (*ecode == OP_KET || eptr == saved_eptr)
  712. {
  713. ecode += 1+LINK_SIZE;
  714. break;
  715. }
  716. /* The repeating kets try the rest of the pattern or restart from the
  717. preceding bracket, in the appropriate order. The second "call" of match()
  718. uses tail recursion, to avoid using another stack frame. */
  719. if (*ecode == OP_KETRMIN)
  720. {
  721. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
  722. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  723. ecode = prev;
  724. goto TAIL_RECURSE;
  725. }
  726. else /* OP_KETRMAX */
  727. {
  728. md->match_function_type = MATCH_CBEGROUP;
  729. RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
  730. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  731. ecode += 1 + LINK_SIZE;
  732. goto TAIL_RECURSE;
  733. }
  734. /* Control never gets here */
  735. /* Handle a capturing bracket, other than those that are possessive with an
  736. unlimited repeat. If there is space in the offset vector, save the current
  737. subject position in the working slot at the top of the vector. We mustn't
  738. change the current values of the data slot, because they may be set from a
  739. previous iteration of this group, and be referred to by a reference inside
  740. the group. A failure to match might occur after the group has succeeded,
  741. if something later on doesn't match. For this reason, we need to restore
  742. the working value and also the values of the final offsets, in case they
  743. were set by a previous iteration of the same bracket.
  744. If there isn't enough space in the offset vector, treat this as if it were
  745. a non-capturing bracket. Don't worry about setting the flag for the error
  746. case here; that is handled in the code for KET. */
  747. case OP_CBRA:
  748. case OP_SCBRA:
  749. number = GET2(ecode, 1+LINK_SIZE);
  750. offset = number << 1;
  751. #ifdef PCRE_DEBUG
  752. printf("start bracket %d\n", number);
  753. printf("subject=");
  754. pchars(eptr, 16, TRUE, md);
  755. printf("\n");
  756. #endif
  757. if (offset < md->offset_max)
  758. {
  759. save_offset1 = md->offset_vector[offset];
  760. save_offset2 = md->offset_vector[offset+1];
  761. save_offset3 = md->offset_vector[md->offset_end - number];
  762. save_capture_last = md->capture_last;
  763. save_mark = md->mark;
  764. DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
  765. md->offset_vector[md->offset_end - number] =
  766. (int)(eptr - md->start_subject);
  767. for (;;)
  768. {
  769. if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
  770. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  771. eptrb, RM1);
  772. if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
  773. /* If we backed up to a THEN, check whether it is within the current
  774. branch by comparing the address of the THEN that is passed back with
  775. the end of the branch. If it is within the current branch, and the
  776. branch is one of two or more alternatives (it either starts or ends
  777. with OP_ALT), we have reached the limit of THEN's action, so convert
  778. the return code to NOMATCH, which will cause normal backtracking to
  779. happen from now on. Otherwise, THEN is passed back to an outer
  780. alternative. This implements Perl's treatment of parenthesized groups,
  781. where a group not containing | does not affect the current alternative,
  782. that is, (X) is NOT the same as (X|(*F)). */
  783. if (rrc == MATCH_THEN)
  784. {
  785. next = ecode + GET(ecode,1);
  786. if (md->start_match_ptr < next &&
  787. (*ecode == OP_ALT || *next == OP_ALT))
  788. rrc = MATCH_NOMATCH;
  789. }
  790. /* Anything other than NOMATCH is passed back. */
  791. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  792. md->capture_last = save_capture_last;
  793. ecode += GET(ecode, 1);
  794. md->mark = save_mark;
  795. if (*ecode != OP_ALT) break;
  796. }
  797. DPRINTF(("bracket %d failed\n", number));
  798. md->offset_vector[offset] = save_offset1;
  799. md->offset_vector[offset+1] = save_offset2;
  800. md->offset_vector[md->offset_end - number] = save_offset3;
  801. /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
  802. RRETURN(rrc);
  803. }
  804. /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
  805. as a non-capturing bracket. */
  806. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  807. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  808. DPRINTF(("insufficient capture room: treat as non-capturing\n"));
  809. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  810. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  811. /* Non-capturing or atomic group, except for possessive with unlimited
  812. repeat and ONCE group with no captures. Loop for all the alternatives.
  813. When we get to the final alternative within the brackets, we used to return
  814. the result of a recursive call to match() whatever happened so it was
  815. possible to reduce stack usage by turning this into a tail recursion,
  816. except in the case of a possibly empty group. However, now that there is
  817. the possibility of (*THEN) occurring in the final alternative, this
  818. optimization is no longer always possible.
  819. We can optimize if we know there are no (*THEN)s in the pattern; at present
  820. this is the best that can be done.
  821. MATCH_ONCE is returned when the end of an atomic group is successfully
  822. reached, but subsequent matching fails. It passes back up the tree (causing
  823. captured values to be reset) until the original atomic group level is
  824. reached. This is tested by comparing md->once_target with the start of the
  825. group. At this point, the return is converted into MATCH_NOMATCH so that
  826. previous backup points can be taken. */
  827. case OP_ONCE:
  828. case OP_BRA:
  829. case OP_SBRA:
  830. DPRINTF(("start non-capturing bracket\n"));
  831. for (;;)
  832. {
  833. if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
  834. /* If this is not a possibly empty group, and there are no (*THEN)s in
  835. the pattern, and this is the final alternative, optimize as described
  836. above. */
  837. else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
  838. {
  839. ecode += PRIV(OP_lengths)[*ecode];
  840. goto TAIL_RECURSE;
  841. }
  842. /* In all other cases, we have to make another call to match(). */
  843. save_mark = md->mark;
  844. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
  845. RM2);
  846. /* See comment in the code for capturing groups above about handling
  847. THEN. */
  848. if (rrc == MATCH_THEN)
  849. {
  850. next = ecode + GET(ecode,1);
  851. if (md->start_match_ptr < next &&
  852. (*ecode == OP_ALT || *next == OP_ALT))
  853. rrc = MATCH_NOMATCH;
  854. }
  855. if (rrc != MATCH_NOMATCH)
  856. {
  857. if (rrc == MATCH_ONCE)
  858. {
  859. const pcre_uchar *scode = ecode;
  860. if (*scode != OP_ONCE) /* If not at start, find it */
  861. {
  862. while (*scode == OP_ALT) scode += GET(scode, 1);
  863. scode -= GET(scode, 1);
  864. }
  865. if (md->once_target == scode) rrc = MATCH_NOMATCH;
  866. }
  867. RRETURN(rrc);
  868. }
  869. ecode += GET(ecode, 1);
  870. md->mark = save_mark;
  871. if (*ecode != OP_ALT) break;
  872. }
  873. RRETURN(MATCH_NOMATCH);
  874. /* Handle possessive capturing brackets with an unlimited repeat. We come
  875. here from BRAZERO with allow_zero set TRUE. The offset_vector values are
  876. handled similarly to the normal case above. However, the matching is
  877. different. The end of these brackets will always be OP_KETRPOS, which
  878. returns MATCH_KETRPOS without going further in the pattern. By this means
  879. we can handle the group by iteration rather than recursion, thereby
  880. reducing the amount of stack needed. */
  881. case OP_CBRAPOS:
  882. case OP_SCBRAPOS:
  883. allow_zero = FALSE;
  884. POSSESSIVE_CAPTURE:
  885. number = GET2(ecode, 1+LINK_SIZE);
  886. offset = number << 1;
  887. #ifdef PCRE_DEBUG
  888. printf("start possessive bracket %d\n", number);
  889. printf("subject=");
  890. pchars(eptr, 16, TRUE, md);
  891. printf("\n");
  892. #endif
  893. if (offset < md->offset_max)
  894. {
  895. matched_once = FALSE;
  896. code_offset = (int)(ecode - md->start_code);
  897. save_offset1 = md->offset_vector[offset];
  898. save_offset2 = md->offset_vector[offset+1];
  899. save_offset3 = md->offset_vector[md->offset_end - number];
  900. save_capture_last = md->capture_last;
  901. DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
  902. /* Each time round the loop, save the current subject position for use
  903. when the group matches. For MATCH_MATCH, the group has matched, so we
  904. restart it with a new subject starting position, remembering that we had
  905. at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
  906. usual. If we haven't matched any alternatives in any iteration, check to
  907. see if a previous iteration matched. If so, the group has matched;
  908. continue from afterwards. Otherwise it has failed; restore the previous
  909. capture values before returning NOMATCH. */
  910. for (;;)
  911. {
  912. md->offset_vector[md->offset_end - number] =
  913. (int)(eptr - md->start_subject);
  914. if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
  915. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  916. eptrb, RM63);
  917. if (rrc == MATCH_KETRPOS)
  918. {
  919. offset_top = md->end_offset_top;
  920. eptr = md->end_match_ptr;
  921. ecode = md->start_code + code_offset;
  922. save_capture_last = md->capture_last;
  923. matched_once = TRUE;
  924. continue;
  925. }
  926. /* See comment in the code for capturing groups above about handling
  927. THEN. */
  928. if (rrc == MATCH_THEN)
  929. {
  930. next = ecode + GET(ecode,1);
  931. if (md->start_match_ptr < next &&
  932. (*ecode == OP_ALT || *next == OP_ALT))
  933. rrc = MATCH_NOMATCH;
  934. }
  935. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  936. md->capture_last = save_capture_last;
  937. ecode += GET(ecode, 1);
  938. if (*ecode != OP_ALT) break;
  939. }
  940. if (!matched_once)
  941. {
  942. md->offset_vector[offset] = save_offset1;
  943. md->offset_vector[offset+1] = save_offset2;
  944. md->offset_vector[md->offset_end - number] = save_offset3;
  945. }
  946. if (allow_zero || matched_once)
  947. {
  948. ecode += 1 + LINK_SIZE;
  949. break;
  950. }
  951. RRETURN(MATCH_NOMATCH);
  952. }
  953. /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
  954. as a non-capturing bracket. */
  955. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  956. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  957. DPRINTF(("insufficient capture room: treat as non-capturing\n"));
  958. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  959. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  960. /* Non-capturing possessive bracket with unlimited repeat. We come here
  961. from BRAZERO with allow_zero = TRUE. The code is similar to the above,
  962. without the capturing complication. It is written out separately for speed
  963. and cleanliness. */
  964. case OP_BRAPOS:
  965. case OP_SBRAPOS:
  966. allow_zero = FALSE;
  967. POSSESSIVE_NON_CAPTURE:
  968. matched_once = FALSE;
  969. code_offset = (int)(ecode - md->start_code);
  970. for (;;)
  971. {
  972. if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
  973. RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
  974. eptrb, RM48);
  975. if (rrc == MATCH_KETRPOS)
  976. {
  977. offset_top = md->end_offset_top;
  978. eptr = md->end_match_ptr;
  979. ecode = md->start_code + code_offset;
  980. matched_once = TRUE;
  981. continue;
  982. }
  983. /* See comment in the code for capturing groups above about handling
  984. THEN. */
  985. if (rrc == MATCH_THEN)
  986. {
  987. next = ecode + GET(ecode,1);
  988. if (md->start_match_ptr < next &&
  989. (*ecode == OP_ALT || *next == OP_ALT))
  990. rrc = MATCH_NOMATCH;
  991. }
  992. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  993. ecode += GET(ecode, 1);
  994. if (*ecode != OP_ALT) break;
  995. }
  996. if (matched_once || allow_zero)
  997. {
  998. ecode += 1 + LINK_SIZE;
  999. break;
  1000. }
  1001. RRETURN(MATCH_NOMATCH);
  1002. /* Control never reaches here. */
  1003. /* Conditional group: compilation checked that there are no more than
  1004. two branches. If the condition is false, skipping the first branch takes us
  1005. past the end if there is only one branch, but that's OK because that is
  1006. exactly what going to the ket would do. */
  1007. case OP_COND:
  1008. case OP_SCOND:
  1009. codelink = GET(ecode, 1);
  1010. /* Because of the way auto-callout works during compile, a callout item is
  1011. inserted between OP_COND and an assertion condition. */
  1012. if (ecode[LINK_SIZE+1] == OP_CALLOUT)
  1013. {
  1014. if (PUBL(callout) != NULL)
  1015. {
  1016. PUBL(callout_block) cb;
  1017. cb.version = 2; /* Version 2 of the callout block */
  1018. cb.callout_number = ecode[LINK_SIZE+2];
  1019. cb.offset_vector = md->offset_vector;
  1020. #ifdef COMPILE_PCRE8
  1021. cb.subject = (PCRE_SPTR)md->start_subject;
  1022. #else
  1023. cb.subject = (PCRE_SPTR16)md->start_subject;
  1024. #endif
  1025. cb.subject_length = (int)(md->end_subject - md->start_subject);
  1026. cb.start_match = (int)(mstart - md->start_subject);
  1027. cb.current_position = (int)(eptr - md->start_subject);
  1028. cb.pattern_position = GET(ecode, LINK_SIZE + 3);
  1029. cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
  1030. cb.capture_top = offset_top/2;
  1031. cb.capture_last = md->capture_last;
  1032. cb.callout_data = md->callout_data;
  1033. cb.mark = md->nomatch_mark;
  1034. if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
  1035. if (rrc < 0) RRETURN(rrc);
  1036. }
  1037. ecode += PRIV(OP_lengths)[OP_CALLOUT];
  1038. }
  1039. condcode = ecode[LINK_SIZE+1];
  1040. /* Now see what the actual condition is */
  1041. if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
  1042. {
  1043. if (md->recursive == NULL) /* Not recursing => FALSE */
  1044. {
  1045. condition = FALSE;
  1046. ecode += GET(ecode, 1);
  1047. }
  1048. else
  1049. {
  1050. int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
  1051. condition = (recno == RREF_ANY || recno == md->recursive->group_num);
  1052. /* If the test is for recursion into a specific subpattern, and it is
  1053. false, but the test was set up by name, scan the table to see if the
  1054. name refers to any other numbers, and test them. The condition is true
  1055. if any one is set. */
  1056. if (!condition && condcode == OP_NRREF)
  1057. {
  1058. pcre_uchar *slotA = md->name_table;
  1059. for (i = 0; i < md->name_count; i++)
  1060. {
  1061. if (GET2(slotA, 0) == recno) break;
  1062. slotA += md->name_entry_size;
  1063. }
  1064. /* Found a name for the number - there can be only one; duplicate
  1065. names for different numbers are allowed, but not vice versa. First
  1066. scan down for duplicates. */
  1067. if (i < md->name_count)
  1068. {
  1069. pcre_uchar *slotB = slotA;
  1070. while (slotB > md->name_table)
  1071. {
  1072. slotB -= md->name_entry_size;
  1073. if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
  1074. {
  1075. condition = GET2(slotB, 0) == md->recursive->group_num;
  1076. if (condition) break;
  1077. }
  1078. else break;
  1079. }
  1080. /* Scan up for duplicates */
  1081. if (!condition)
  1082. {
  1083. slotB = slotA;
  1084. for (i++; i < md->name_count; i++)
  1085. {
  1086. slotB += md->name_entry_size;
  1087. if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
  1088. {
  1089. condition = GET2(slotB, 0) == md->recursive->group_num;
  1090. if (condition) break;
  1091. }
  1092. else break;
  1093. }
  1094. }
  1095. }
  1096. }
  1097. /* Choose branch according to the condition */
  1098. ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
  1099. }
  1100. }
  1101. else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
  1102. {
  1103. offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
  1104. condition = offset < offset_top && md->offset_vector[offset] >= 0;
  1105. /* If the numbered capture is unset, but the reference was by name,
  1106. scan the table to see if the name refers to any other numbers, and test
  1107. them. The condition is true if any one is set. This is tediously similar
  1108. to the code above, but not close enough to try to amalgamate. */
  1109. if (!condition && condcode == OP_NCREF)
  1110. {
  1111. int refno = offset >> 1;
  1112. pcre_uchar *slotA = md->name_table;
  1113. for (i = 0; i < md->name_count; i++)
  1114. {
  1115. if (GET2(slotA, 0) == refno) break;
  1116. slotA += md->name_entry_size;
  1117. }
  1118. /* Found a name for the number - there can be only one; duplicate names
  1119. for different numbers are allowed, but not vice versa. First scan down
  1120. for duplicates. */
  1121. if (i < md->name_count)
  1122. {
  1123. pcre_uchar *slotB = slotA;
  1124. while (slotB > md->name_table)
  1125. {
  1126. slotB -= md->name_entry_size;
  1127. if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
  1128. {
  1129. offset = GET2(slotB, 0) << 1;
  1130. condition = offset < offset_top &&
  1131. md->offset_vector[offset] >= 0;
  1132. if (condition) break;
  1133. }
  1134. else break;
  1135. }
  1136. /* Scan up for duplicates */
  1137. if (!condition)
  1138. {
  1139. slotB = slotA;
  1140. for (i++; i < md->name_count; i++)
  1141. {
  1142. slotB += md->name_entry_size;
  1143. if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
  1144. {
  1145. offset = GET2(slotB, 0) << 1;
  1146. condition = offset < offset_top &&
  1147. md->offset_vector[offset] >= 0;
  1148. if (condition) break;
  1149. }
  1150. else break;
  1151. }
  1152. }
  1153. }
  1154. }
  1155. /* Choose branch according to the condition */
  1156. ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
  1157. }
  1158. else if (condcode == OP_DEF) /* DEFINE - always false */
  1159. {
  1160. condition = FALSE;
  1161. ecode += GET(ecode, 1);
  1162. }
  1163. /* The condition is an assertion. Call match() to evaluate it - setting
  1164. md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
  1165. an assertion. */
  1166. else
  1167. {
  1168. md->match_function_type = MATCH_CONDASSERT;
  1169. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
  1170. if (rrc == MATCH_MATCH)
  1171. {
  1172. if (md->end_offset_top > offset_top)
  1173. offset_top = md->end_offset_top; /* Captures may have happened */
  1174. condition = TRUE;
  1175. ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
  1176. while (*ecode == OP_ALT) ecode += GET(ecode, 1);
  1177. }
  1178. /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
  1179. assertion; it is therefore treated as NOMATCH. */
  1180. else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
  1181. {
  1182. RRETURN(rrc); /* Need braces because of following else */
  1183. }
  1184. else
  1185. {
  1186. condition = FALSE;
  1187. ecode += codelink;
  1188. }
  1189. }
  1190. /* We are now at the branch that is to be obeyed. As there is only one, can
  1191. use tail recursion to avoid using another stack frame, except when there is
  1192. unlimited repeat of a possibly empty group. In the latter case, a recursive
  1193. call to match() is always required, unless the second alternative doesn't
  1194. exist, in which case we can just plough on. Note that…

Large files files are truncated, but you can click here to view the full file