PageRenderTime 53ms CodeModel.GetById 14ms RepoModel.GetById 0ms app.codeStats 1ms

/tags/harbour-3.0.0/src/3rd/pcre/pcreexec.c

#
C | 2042 lines | 1285 code | 321 blank | 436 comment | 434 complexity | 818c0b5795d40b7af0cbdcee1f996278 MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1

Large files are truncated, but you can click here to view the full file

  1. /*************************************************
  2. * Perl-Compatible Regular Expressions *
  3. *************************************************/
  4. /* PCRE is a library of functions to support regular expressions whose syntax
  5. and semantics are as close as possible to those of the Perl 5 language.
  6. Written by Philip Hazel
  7. Copyright (c) 1997-2010 University of Cambridge
  8. -----------------------------------------------------------------------------
  9. Redistribution and use in source and binary forms, with or without
  10. modification, are permitted provided that the following conditions are met:
  11. * Redistributions of source code must retain the above copyright notice,
  12. this list of conditions and the following disclaimer.
  13. * Redistributions in binary form must reproduce the above copyright
  14. notice, this list of conditions and the following disclaimer in the
  15. documentation and/or other materials provided with the distribution.
  16. * Neither the name of the University of Cambridge nor the names of its
  17. contributors may be used to endorse or promote products derived from
  18. this software without specific prior written permission.
  19. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  20. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  23. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  24. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  25. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  26. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  27. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  28. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  29. POSSIBILITY OF SUCH DAMAGE.
  30. -----------------------------------------------------------------------------
  31. */
  32. /* This module contains pcre_exec(), the externally visible function that does
  33. pattern matching using an NFA algorithm, trying to mimic Perl as closely as
  34. possible. There are also some static supporting functions. */
  35. #ifdef HAVE_CONFIG_H
  36. #include "config.h"
  37. #endif
  38. #define NLBLOCK md /* Block containing newline information */
  39. #define PSSTART start_subject /* Field containing processed string start */
  40. #define PSEND end_subject /* Field containing processed string end */
  41. #include "pcreinal.h"
  42. /* Undefine some potentially clashing cpp symbols */
  43. #undef min
  44. #undef max
  /* Flag bits for the "flags" argument of the match() function. They are
  tested with a bitwise AND, so each one must be a distinct bit. */
  #define match_condassert 0x01 /* Called to check a condition assertion */
  #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
  /* Non-error returns from the match() function. Error returns are externally
  defined PCRE_ERROR_xxx codes, which are all negative. Both of these values are
  therefore >= 0. */
  #define MATCH_MATCH 1
  #define MATCH_NOMATCH 0
  /* Special internal returns from the match() function. Make them sufficiently
  negative to avoid the external error codes. These values are passed back up
  through the calls of match() by the backtracking-control opcodes (COMMIT,
  PRUNE, SKIP, THEN), which test for them on the result of RMATCH. */
  #define MATCH_ACCEPT (-999)
  #define MATCH_COMMIT (-998)
  #define MATCH_PRUNE (-997)
  #define MATCH_SKIP (-996)
  #define MATCH_SKIP_ARG (-995)
  #define MATCH_THEN (-994)
  /* This is a convenience macro for code that occurs many times: it copies the
  current mark pointer into md->mark and then returns "ra" by means of RRETURN.
  It expands to a braced block that refers to the names "md" and "markptr", so it
  can be used only where both are in scope (that is, inside match()). */
  #define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }
  /* Maximum number of ints of offset to save on the stack for recursive calls.
  If the offset vector is bigger, malloc is used. This should be a multiple of 3,
  because the offset vector is always a multiple of 3 long. The value sizes the
  "stacksave" array that is declared in match() and in the heap frame. */
  #define REC_STACK_SAVE_MAX 30
  /* Min and max values for the common repeats; for the maxima, 0 => infinity.
  The two tables are parallel, with one entry per repeat kind.
  NOTE(review): the entries presumably follow the order *, *?, +, +?, ?, ?? and
  are indexed by the repeat opcode minus the first opcode of its group; the code
  that uses them is not in this part of the file - confirm at the use sites. */
  static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
  static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  73. #ifdef PCRE_DEBUG
  74. /*************************************************
  75. * Debugging function to print chars *
  76. *************************************************/
  77. /* Print a sequence of chars in printable format, stopping at the end of the
  78. subject if requested.
  79. Arguments:
  80. p points to characters
  81. length number to print
  82. is_subject TRUE if printing from within md->start_subject
  83. md pointer to matching data block, if is_subject is TRUE
  84. Returns: nothing
  85. */
  86. static void
  87. pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
  88. {
  89. unsigned int c;
  90. if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
  91. while (length-- > 0)
  92. if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
  93. }
  94. #endif
  95. /*************************************************
  96. * Match a back-reference *
  97. *************************************************/
  98. /* If a back reference hasn't been set, the length that is passed is greater
  99. than the number of characters left in the string, so the match fails.
  100. Arguments:
  101. offset index into the offset vector
  102. eptr points into the subject
  103. length length to be matched
  104. md points to match data block
  105. ims the ims flags
  106. Returns: TRUE if matched
  107. */
  108. static BOOL
  109. match_ref(int offset, register USPTR eptr, int length, match_data *md,
  110. unsigned long int ims)
  111. {
  112. USPTR p = md->start_subject + md->offset_vector[offset];
  113. #ifdef PCRE_DEBUG
  114. if (eptr >= md->end_subject)
  115. printf("matching subject <null>");
  116. else
  117. {
  118. printf("matching subject ");
  119. pchars(eptr, length, TRUE, md);
  120. }
  121. printf(" against backref ");
  122. pchars(p, length, FALSE, md);
  123. printf("\n");
  124. #endif
  125. /* Always fail if not enough characters left */
  126. if (length > md->end_subject - eptr) return FALSE;
  127. /* Separate the caseless case for speed. In UTF-8 mode we can only do this
  128. properly if Unicode properties are supported. Otherwise, we can check only
  129. ASCII characters. */
  130. if ((ims & PCRE_CASELESS) != 0)
  131. {
  132. #ifdef SUPPORT_UTF8
  133. #ifdef SUPPORT_UCP
  134. if (md->utf8)
  135. {
  136. USPTR endptr = eptr + length;
  137. while (eptr < endptr)
  138. {
  139. int c, d;
  140. GETCHARINC(c, eptr);
  141. GETCHARINC(d, p);
  142. if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
  143. }
  144. }
  145. else
  146. #endif
  147. #endif
  148. /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
  149. is no UCP support. */
  150. while (length-- > 0)
  151. { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
  152. }
  153. /* In the caseful case, we can just compare the bytes, whether or not we
  154. are in UTF-8 mode. */
  155. else
  156. { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
  157. return TRUE;
  158. }
  159. /***************************************************************************
  160. ****************************************************************************
  161. RECURSION IN THE match() FUNCTION
  162. The match() function is highly recursive, though not every recursive call
  163. increases the recursive depth. Nevertheless, some regular expressions can cause
  164. it to recurse to a great depth. I was writing for Unix, so I just let it call
  165. itself recursively. This uses the stack for saving everything that has to be
  166. saved for a recursive call. On Unix, the stack can be large, and this works
  167. fine.
  168. It turns out that on some non-Unix-like systems there are problems with
  169. programs that use a lot of stack. (This despite the fact that every last chip
  170. has oodles of memory these days, and techniques for extending the stack have
  171. been known for decades.) So....
  172. There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
  173. calls by keeping local variables that need to be preserved in blocks of memory
  174. obtained from malloc() instead of on the stack. Macros are used to
  175. achieve this so that the actual code doesn't look very different to what it
  176. always used to.
  177. The original heap-recursive code used longjmp(). However, it seems that this
  178. can be very slow on some operating systems. Following a suggestion from Stan
  179. Switzer, the use of longjmp() has been abolished, at the cost of having to
  180. provide a unique number for each call to RMATCH. There is no way of generating
  181. a sequence of numbers at compile time in C. I have given them names, to make
  182. them stand out more clearly.
  183. Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
  184. FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
  185. tests. Furthermore, not using longjmp() means that local dynamic variables
  186. don't have indeterminate values; this has meant that the frame size can be
  187. reduced because the result can be "passed back" by straight setting of the
  188. variable instead of being passed in the frame.
  189. ****************************************************************************
  190. ***************************************************************************/
  /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
  below must be updated in sync.

  Each use of RMATCH passes one of these names as its last argument. In the
  stack-recursive build the argument is ignored. In the NO_RECURSE build it is
  stored in the Xwhere field of the calling frame and is also pasted into the
  label L_RMnn that marks where that call resumes, so a given name must be used
  by only one RMATCH call. Numbering starts at 1. */
  enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
  RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
  RM61, RM62 };
  /* These versions of the macros use the stack, as normal. There are debugging
  versions and production versions. Note that the "rw" argument of RMATCH isn't
  actually used in this definition.

  Arguments of RMATCH, in order, as they are passed on to match():
    ra  subject pointer (eptr)
    rb  code pointer (ecode)
    rc  offset_top
    rd  md
    re  ims
    rf  eptrb
    rg  flags
    rw  the RMnn number of this call

  The mstart and markptr arguments of the call are always the caller's current
  values, and the depth is the caller's rdepth plus one. The result is left in
  the variable rrc. RRETURN(ra) returns ra from match(). Both macros rely on
  names that are local to match() (rrc, mstart, markptr, rdepth), so they can be
  used only inside that function. */
  #ifndef NO_RECURSE
  #define REGISTER register
  #ifdef PCRE_DEBUG
  /* Debugging versions: as the production versions, but tracing each call and
  return on stdout. Note that RRETURN evaluates "ra" twice here. */
  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
  #define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
  #else
  /* Production versions: a plain recursive call and a plain return. */
  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
  #define RRETURN(ra) return ra
  #endif
  #else
  /* These versions of the macros manage a private stack on the heap. Note that
  the "rd" argument of RMATCH isn't actually used in this definition. It's the md
  argument of match(), which never changes.

  RMATCH obtains a new frame with pcre_stack_malloc, returns
  PCRE_ERROR_NOMEMORY through RRETURN if that fails, records the resume point
  (rw) in the current frame, fills in the argument fields of the new frame,
  links it to the current one through Xprevframe, makes it the current frame,
  and jumps to HEAP_RECURSE. The label L_<rw> that follows the jump is where
  execution continues once the "call" has returned; the value is then in rrc.

  REGISTER is defined as nothing in this build. */
  #define REGISTER
  #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
  /* RRETURN releases the current frame with pcre_stack_free and makes its
  predecessor current. If there is a predecessor, the value is placed in rrc and
  control goes to HEAP_RETURN (not in this part of the file; presumably it
  dispatches on Xwhere to the matching L_RMnn label - confirm there). If there
  is no predecessor, this was the top-level frame and match() really returns.

  NOTE(review): "ra" is evaluated only after the frame has been switched and the
  old frame freed, so an argument that expands to a frame field would be read
  from the wrong frame.

  NOTE(review): the macro dereferences "frame" unconditionally, so it must not
  be invoked while frame is NULL; the allocation-failure check for the top-level
  frame in match() does exactly that. */
  #define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame != NULL)\
  {\
  rrc = ra;\
  goto HEAP_RETURN;\
  }\
  return ra;\
  }
  260. /* Structure for remembering the local variables in a private frame */
  261. typedef struct heapframe {
  262. struct heapframe *Xprevframe;
  263. /* Function arguments that may change */
  264. USPTR Xeptr;
  265. const uschar *Xecode;
  266. USPTR Xmstart;
  267. USPTR Xmarkptr;
  268. int Xoffset_top;
  269. long int Xims;
  270. eptrblock *Xeptrb;
  271. int Xflags;
  272. unsigned int Xrdepth;
  273. /* Function local variables */
  274. USPTR Xcallpat;
  275. #ifdef SUPPORT_UTF8
  276. USPTR Xcharptr;
  277. #endif
  278. USPTR Xdata;
  279. USPTR Xnext;
  280. USPTR Xpp;
  281. USPTR Xprev;
  282. USPTR Xsaved_eptr;
  283. recursion_info Xnew_recursive;
  284. BOOL Xcur_is_word;
  285. BOOL Xcondition;
  286. BOOL Xprev_is_word;
  287. unsigned long int Xoriginal_ims;
  288. #ifdef SUPPORT_UCP
  289. int Xprop_type;
  290. int Xprop_value;
  291. int Xprop_fail_result;
  292. int Xprop_category;
  293. int Xprop_chartype;
  294. int Xprop_script;
  295. int Xoclength;
  296. uschar Xocchars[8];
  297. #endif
  298. int Xcodelink;
  299. int Xctype;
  300. unsigned int Xfc;
  301. int Xfi;
  302. int Xlength;
  303. int Xmax;
  304. int Xmin;
  305. int Xnumber;
  306. int Xoffset;
  307. int Xop;
  308. int Xsave_capture_last;
  309. int Xsave_offset1, Xsave_offset2, Xsave_offset3;
  310. int Xstacksave[REC_STACK_SAVE_MAX];
  311. eptrblock Xnewptrb;
  312. /* Where to jump back to */
  313. int Xwhere;
  314. } heapframe;
  315. #endif
  316. /***************************************************************************
  317. ***************************************************************************/
  318. /*************************************************
  319. * Match from current position *
  320. *************************************************/
  321. /* This function is called recursively in many circumstances. Whenever it
  322. returns a negative (error) response, the outer incarnation must also return the
  323. same response. */
  /* These macros pack up tests that are used for partial matching, and which
  appear several times in the code. We set the "hit end" flag if the pointer is
  at the end of the subject and also past md->start_used_ptr (i.e. something
  has been matched). For hard partial matching (md->partial greater than 1), we
  then return PCRE_ERROR_PARTIAL immediately. The second one is used when we
  already know we are past the end of the subject, so it omits the comparison
  with md->end_subject.

  Both macros expand to a bare "if" statement with a braced body, not to a
  single expression, so they should not be followed by an "else". They refer to
  md and eptr, and through MRRETURN to markptr, so they can be used only inside
  match(). */
  #define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && \
  eptr > md->start_used_ptr) \
  { \
  md->hitend = TRUE; \
  if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
  }
  #define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > md->start_used_ptr) \
  { \
  md->hitend = TRUE; \
  if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
  }
  343. /* Performance note: It might be tempting to extract commonly used fields from
  344. the md structure (e.g. utf8, end_subject) into individual variables to improve
  345. performance. Tests using gcc on a SPARC disproved this; in the first case, it
  346. made performance worse.
  347. Arguments:
  348. eptr pointer to current character in subject
  349. ecode pointer to current position in compiled code
  350. mstart pointer to the current match start position (can be modified
  351. by encountering \K)
  352. markptr pointer to the most recent MARK name, or NULL
  353. offset_top current top pointer
  354. md pointer to "static" info for the match
  355. ims current /i, /m, and /s options
  356. eptrb pointer to chain of blocks containing eptr at start of
  357. brackets - for testing for empty matches
  358. flags can contain
  359. match_condassert - this is an assertion condition
  360. match_cbegroup - this is the start of an unlimited repeat
  361. group that can match an empty string
  362. rdepth the recursion depth
  363. Returns: MATCH_MATCH if matched ) these values are >= 0
  364. MATCH_NOMATCH if failed to match )
  365. a negative MATCH_xxx value for PRUNE, SKIP, etc
  366. a negative PCRE_ERROR_xxx value if aborted by an error condition
  367. (e.g. stopped by repeated call or recursion limit)
  368. */
  369. static int
  370. match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
  371. const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
  372. eptrblock *eptrb, int flags, unsigned int rdepth)
  373. {
  374. /* These variables do not need to be preserved over recursion in this function,
  375. so they can be ordinary variables in all cases. Mark some of them with
  376. "register" because they are used a lot in loops. */
  377. register int rrc; /* Returns from recursive calls */
  378. register int i; /* Used for loops not involving calls to RMATCH() */
  379. register unsigned int c; /* Character values not kept over RMATCH() calls */
  380. register BOOL utf8; /* Local copy of UTF-8 flag for speed */
  381. BOOL minimize, possessive; /* Quantifier options */
  382. int condcode;
  383. /* When recursion is not being used, all "local" variables that have to be
  384. preserved over calls to RMATCH() are part of a "frame" which is obtained from
  385. heap storage. Set up the top-level frame here; others are obtained from the
  386. heap whenever RMATCH() does a "recursion". See the macro definitions above. */
  387. #ifdef NO_RECURSE
  388. heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
  389. if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
  390. frame->Xprevframe = NULL; /* Marks the top level */
  391. /* Copy in the original argument variables */
  392. frame->Xeptr = eptr;
  393. frame->Xecode = ecode;
  394. frame->Xmstart = mstart;
  395. frame->Xmarkptr = markptr;
  396. frame->Xoffset_top = offset_top;
  397. frame->Xims = ims;
  398. frame->Xeptrb = eptrb;
  399. frame->Xflags = flags;
  400. frame->Xrdepth = rdepth;
  401. /* This is where control jumps back to to effect "recursion" */
  402. HEAP_RECURSE:
  403. /* Macros make the argument variables come from the current frame */
  404. #define eptr frame->Xeptr
  405. #define ecode frame->Xecode
  406. #define mstart frame->Xmstart
  407. #define markptr frame->Xmarkptr
  408. #define offset_top frame->Xoffset_top
  409. #define ims frame->Xims
  410. #define eptrb frame->Xeptrb
  411. #define flags frame->Xflags
  412. #define rdepth frame->Xrdepth
  413. /* Ditto for the local variables */
  414. #ifdef SUPPORT_UTF8
  415. #define charptr frame->Xcharptr
  416. #endif
  417. #define callpat frame->Xcallpat
  418. #define codelink frame->Xcodelink
  419. #define data frame->Xdata
  420. #define next frame->Xnext
  421. #define pp frame->Xpp
  422. #define prev frame->Xprev
  423. #define saved_eptr frame->Xsaved_eptr
  424. #define new_recursive frame->Xnew_recursive
  425. #define cur_is_word frame->Xcur_is_word
  426. #define condition frame->Xcondition
  427. #define prev_is_word frame->Xprev_is_word
  428. #define original_ims frame->Xoriginal_ims
  429. #ifdef SUPPORT_UCP
  430. #define prop_type frame->Xprop_type
  431. #define prop_value frame->Xprop_value
  432. #define prop_fail_result frame->Xprop_fail_result
  433. #define prop_category frame->Xprop_category
  434. #define prop_chartype frame->Xprop_chartype
  435. #define prop_script frame->Xprop_script
  436. #define oclength frame->Xoclength
  437. #define occhars frame->Xocchars
  438. #endif
  439. #define ctype frame->Xctype
  440. #define fc frame->Xfc
  441. #define fi frame->Xfi
  442. #define length frame->Xlength
  443. #define max frame->Xmax
  444. #define min frame->Xmin
  445. #define number frame->Xnumber
  446. #define offset frame->Xoffset
  447. #define op frame->Xop
  448. #define save_capture_last frame->Xsave_capture_last
  449. #define save_offset1 frame->Xsave_offset1
  450. #define save_offset2 frame->Xsave_offset2
  451. #define save_offset3 frame->Xsave_offset3
  452. #define stacksave frame->Xstacksave
  453. #define newptrb frame->Xnewptrb
  454. /* When recursion is being used, local variables are allocated on the stack and
  455. get preserved during recursion in the normal way. In this environment, fi and
  456. i, and fc and c, can be the same variables. */
  457. #else /* NO_RECURSE not defined */
  458. #define fi i
  459. #define fc c
  460. #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
  461. const uschar *charptr; /* in small blocks of the code. My normal */
  462. #endif /* style of coding would have declared */
  463. const uschar *callpat; /* them within each of those blocks. */
  464. const uschar *data; /* However, in order to accommodate the */
  465. const uschar *next; /* version of this code that uses an */
  466. USPTR pp; /* external "stack" implemented on the */
  467. const uschar *prev; /* heap, it is easier to declare them all */
  468. USPTR saved_eptr; /* here, so the declarations can be cut */
  469. /* out in a block. The only declarations */
  470. recursion_info new_recursive; /* within blocks below are for variables */
  471. /* that do not have to be preserved over */
  472. BOOL cur_is_word; /* a recursive call to RMATCH(). */
  473. BOOL condition;
  474. BOOL prev_is_word;
  475. unsigned long int original_ims;
  476. #ifdef SUPPORT_UCP
  477. int prop_type;
  478. int prop_value;
  479. int prop_fail_result;
  480. int prop_category;
  481. int prop_chartype;
  482. int prop_script;
  483. int oclength;
  484. uschar occhars[8];
  485. #endif
  486. int codelink;
  487. int ctype;
  488. int length;
  489. int max;
  490. int min;
  491. int number;
  492. int offset;
  493. int op;
  494. int save_capture_last;
  495. int save_offset1, save_offset2, save_offset3;
  496. int stacksave[REC_STACK_SAVE_MAX];
  497. eptrblock newptrb;
  498. #endif /* NO_RECURSE */
  499. /* These statements are here to stop the compiler complaining about unitialized
  500. variables. */
  501. #ifdef SUPPORT_UCP
  502. prop_value = 0;
  503. prop_fail_result = 0;
  504. #endif
  505. /* This label is used for tail recursion, which is used in a few cases even
  506. when NO_RECURSE is not defined, in order to reduce the amount of stack that is
  507. used. Thanks to Ian Taylor for noticing this possibility and sending the
  508. original patch. */
  509. TAIL_RECURSE:
  510. /* OK, now we can get on with the real code of the function. Recursive calls
  511. are specified by the macro RMATCH and RRETURN is used to return. When
  512. NO_RECURSE is *not* defined, these just turn into a recursive call to match()
  513. and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
  514. defined). However, RMATCH isn't like a function call because it's quite a
  515. complicated macro. It has to be used in one particular way. This shouldn't,
  516. however, impact performance when true recursion is being used. */
  517. #ifdef SUPPORT_UTF8
  518. utf8 = md->utf8; /* Local copy of the flag */
  519. #else
  520. utf8 = FALSE;
  521. #endif
  522. /* First check that we haven't called match() too many times, or that we
  523. haven't exceeded the recursive call limit. */
  524. if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
  525. if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
  526. original_ims = ims; /* Save for resetting on ')' */
  527. /* At the start of a group with an unlimited repeat that may match an empty
  528. string, the match_cbegroup flag is set. When this is the case, add the current
  529. subject pointer to the chain of such remembered pointers, to be checked when we
  530. hit the closing ket, in order to break infinite loops that match no characters.
  531. When match() is called in other circumstances, don't add to the chain. The
  532. match_cbegroup flag must NOT be used with tail recursion, because the memory
  533. block that is used is on the stack, so a new one may be required for each
  534. match(). */
  535. if ((flags & match_cbegroup) != 0)
  536. {
  537. newptrb.epb_saved_eptr = eptr;
  538. newptrb.epb_prev = eptrb;
  539. eptrb = &newptrb;
  540. }
  541. /* Now start processing the opcodes. */
  542. for (;;)
  543. {
  544. minimize = possessive = FALSE;
  545. op = *ecode;
  546. switch(op)
  547. {
  548. case OP_MARK:
  549. markptr = ecode + 2;
  550. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
  551. ims, eptrb, flags, RM55);
  552. /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
  553. argument, and we must check whether that argument matches this MARK's
  554. argument. It is passed back in md->start_match_ptr (an overloading of that
  555. variable). If it does match, we reset that variable to the current subject
  556. position and return MATCH_SKIP. Otherwise, pass back the return code
  557. unaltered. */
  558. if (rrc == MATCH_SKIP_ARG &&
  559. strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
  560. {
  561. md->start_match_ptr = eptr;
  562. RRETURN(MATCH_SKIP);
  563. }
  564. if (md->mark == NULL) md->mark = markptr;
  565. RRETURN(rrc);
  566. case OP_FAIL:
  567. MRRETURN(MATCH_NOMATCH);
  568. /* COMMIT overrides PRUNE, SKIP, and THEN */
  569. case OP_COMMIT:
  570. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
  571. ims, eptrb, flags, RM52);
  572. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
  573. rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
  574. rrc != MATCH_THEN)
  575. RRETURN(rrc);
  576. MRRETURN(MATCH_COMMIT);
  577. /* PRUNE overrides THEN */
  578. case OP_PRUNE:
  579. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
  580. ims, eptrb, flags, RM51);
  581. if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
  582. MRRETURN(MATCH_PRUNE);
  583. case OP_PRUNE_ARG:
  584. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
  585. ims, eptrb, flags, RM56);
  586. if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
  587. md->mark = ecode + 2;
  588. RRETURN(MATCH_PRUNE);
  589. /* SKIP overrides PRUNE and THEN */
  590. case OP_SKIP:
  591. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
  592. ims, eptrb, flags, RM53);
  593. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
  594. RRETURN(rrc);
  595. md->start_match_ptr = eptr; /* Pass back current position */
  596. MRRETURN(MATCH_SKIP);
  597. case OP_SKIP_ARG:
  598. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
  599. ims, eptrb, flags, RM57);
  600. if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
  601. RRETURN(rrc);
  602. /* Pass back the current skip name by overloading md->start_match_ptr and
  603. returning the special MATCH_SKIP_ARG return code. This will either be
  604. caught by a matching MARK, or get to the top, where it is treated the same
  605. as PRUNE. */
  606. md->start_match_ptr = ecode + 2;
  607. RRETURN(MATCH_SKIP_ARG);
  608. /* For THEN (and THEN_ARG) we pass back the address of the bracket or
  609. the alt that is at the start of the current branch. This makes it possible
  610. to skip back past alternatives that precede the THEN within the current
  611. branch. */
  612. case OP_THEN:
  613. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
  614. ims, eptrb, flags, RM54);
  615. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  616. md->start_match_ptr = ecode - GET(ecode, 1);
  617. MRRETURN(MATCH_THEN);
  618. case OP_THEN_ARG:
  619. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
  620. offset_top, md, ims, eptrb, flags, RM58);
  621. if (rrc != MATCH_NOMATCH) RRETURN(rrc);
  622. md->start_match_ptr = ecode - GET(ecode, 1);
  623. md->mark = ecode + LINK_SIZE + 2;
  624. RRETURN(MATCH_THEN);
  625. /* Handle a capturing bracket. If there is space in the offset vector, save
  626. the current subject position in the working slot at the top of the vector.
  627. We mustn't change the current values of the data slot, because they may be
  628. set from a previous iteration of this group, and be referred to by a
  629. reference inside the group.
  630. If the bracket fails to match, we need to restore this value and also the
  631. values of the final offsets, in case they were set by a previous iteration
  632. of the same bracket.
  633. If there isn't enough space in the offset vector, treat this as if it were
  634. a non-capturing bracket. Don't worry about setting the flag for the error
  635. case here; that is handled in the code for KET. */
  636. case OP_CBRA:
  637. case OP_SCBRA:
  638. number = GET2(ecode, 1+LINK_SIZE);
  639. offset = number << 1;
  640. #ifdef PCRE_DEBUG
  641. printf("start bracket %d\n", number);
  642. printf("subject=");
  643. pchars(eptr, 16, TRUE, md);
  644. printf("\n");
  645. #endif
  646. if (offset < md->offset_max)
  647. {
  648. save_offset1 = md->offset_vector[offset];
  649. save_offset2 = md->offset_vector[offset+1];
  650. save_offset3 = md->offset_vector[md->offset_end - number];
  651. save_capture_last = md->capture_last;
  652. DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
  653. md->offset_vector[md->offset_end - number] =
  654. (int)(eptr - md->start_subject);
  655. flags = (op == OP_SCBRA)? match_cbegroup : 0;
  656. do
  657. {
  658. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
  659. ims, eptrb, flags, RM1);
  660. if (rrc != MATCH_NOMATCH &&
  661. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  662. RRETURN(rrc);
  663. md->capture_last = save_capture_last;
  664. ecode += GET(ecode, 1);
  665. }
  666. while (*ecode == OP_ALT);
  667. DPRINTF(("bracket %d failed\n", number));
  668. md->offset_vector[offset] = save_offset1;
  669. md->offset_vector[offset+1] = save_offset2;
  670. md->offset_vector[md->offset_end - number] = save_offset3;
  671. if (rrc != MATCH_THEN) md->mark = markptr;
  672. RRETURN(MATCH_NOMATCH);
  673. }
  674. /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
  675. as a non-capturing bracket. */
  676. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  677. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  678. DPRINTF(("insufficient capture room: treat as non-capturing\n"));
  679. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  680. /* VVVVVVVVVVVVVVVVVVVVVVVVV */
  681. /* Non-capturing bracket. Loop for all the alternatives. When we get to the
  682. final alternative within the brackets, we would return the result of a
  683. recursive call to match() whatever happened. We can reduce stack usage by
  684. turning this into a tail recursion, except in the case when match_cbegroup
  685. is set.*/
  686. case OP_BRA:
  687. case OP_SBRA:
  688. DPRINTF(("start non-capturing bracket\n"));
  689. flags = (op >= OP_SBRA)? match_cbegroup : 0;
  690. for (;;)
  691. {
  692. if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
  693. {
  694. if (flags == 0) /* Not a possibly empty group */
  695. {
  696. ecode += _pcre_OP_lengths[*ecode];
  697. DPRINTF(("bracket 0 tail recursion\n"));
  698. goto TAIL_RECURSE;
  699. }
  700. /* Possibly empty group; can't use tail recursion. */
  701. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
  702. eptrb, flags, RM48);
  703. if (rrc == MATCH_NOMATCH) md->mark = markptr;
  704. RRETURN(rrc);
  705. }
  706. /* For non-final alternatives, continue the loop for a NOMATCH result;
  707. otherwise return. */
  708. RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
  709. eptrb, flags, RM2);
  710. if (rrc != MATCH_NOMATCH &&
  711. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  712. RRETURN(rrc);
  713. ecode += GET(ecode, 1);
  714. }
  715. /* Control never reaches here. */
  716. /* Conditional group: compilation checked that there are no more than
  717. two branches. If the condition is false, skipping the first branch takes us
  718. past the end if there is only one branch, but that's OK because that is
  719. exactly what going to the ket would do. As there is only one branch to be
  720. obeyed, we can use tail recursion to avoid using another stack frame. */
  721. case OP_COND:
  722. case OP_SCOND:
  723. codelink= GET(ecode, 1);
  724. /* Because of the way auto-callout works during compile, a callout item is
  725. inserted between OP_COND and an assertion condition. */
  726. if (ecode[LINK_SIZE+1] == OP_CALLOUT)
  727. {
  728. if (pcre_callout != NULL)
  729. {
  730. pcre_callout_block cb;
  731. cb.version = 1; /* Version 1 of the callout block */
  732. cb.callout_number = ecode[LINK_SIZE+2];
  733. cb.offset_vector = md->offset_vector;
  734. cb.subject = (PCRE_SPTR)md->start_subject;
  735. cb.subject_length = (int)(md->end_subject - md->start_subject);
  736. cb.start_match = (int)(mstart - md->start_subject);
  737. cb.current_position = (int)(eptr - md->start_subject);
  738. cb.pattern_position = GET(ecode, LINK_SIZE + 3);
  739. cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
  740. cb.capture_top = offset_top/2;
  741. cb.capture_last = md->capture_last;
  742. cb.callout_data = md->callout_data;
  743. if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
  744. if (rrc < 0) RRETURN(rrc);
  745. }
  746. ecode += _pcre_OP_lengths[OP_CALLOUT];
  747. }
  748. condcode = ecode[LINK_SIZE+1];
  749. /* Now see what the actual condition is */
  750. if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
  751. {
  752. if (md->recursive == NULL) /* Not recursing => FALSE */
  753. {
  754. condition = FALSE;
  755. ecode += GET(ecode, 1);
  756. }
  757. else
  758. {
  759. int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
  760. condition = (recno == RREF_ANY || recno == md->recursive->group_num);
  761. /* If the test is for recursion into a specific subpattern, and it is
  762. false, but the test was set up by name, scan the table to see if the
  763. name refers to any other numbers, and test them. The condition is true
  764. if any one is set. */
  765. if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
  766. {
  767. uschar *slotA = md->name_table;
  768. for (i = 0; i < md->name_count; i++)
  769. {
  770. if (GET2(slotA, 0) == recno) break;
  771. slotA += md->name_entry_size;
  772. }
  773. /* Found a name for the number - there can be only one; duplicate
  774. names for different numbers are allowed, but not vice versa. First
  775. scan down for duplicates. */
  776. if (i < md->name_count)
  777. {
  778. uschar *slotB = slotA;
  779. while (slotB > md->name_table)
  780. {
  781. slotB -= md->name_entry_size;
  782. if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
  783. {
  784. condition = GET2(slotB, 0) == md->recursive->group_num;
  785. if (condition) break;
  786. }
  787. else break;
  788. }
  789. /* Scan up for duplicates */
  790. if (!condition)
  791. {
  792. slotB = slotA;
  793. for (i++; i < md->name_count; i++)
  794. {
  795. slotB += md->name_entry_size;
  796. if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
  797. {
  798. condition = GET2(slotB, 0) == md->recursive->group_num;
  799. if (condition) break;
  800. }
  801. else break;
  802. }
  803. }
  804. }
  805. }
  806. /* Choose branch according to the condition */
  807. ecode += condition? 3 : GET(ecode, 1);
  808. }
  809. }
  810. else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
  811. {
  812. offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
  813. condition = offset < offset_top && md->offset_vector[offset] >= 0;
  814. /* If the numbered capture is unset, but the reference was by name,
  815. scan the table to see if the name refers to any other numbers, and test
  816. them. The condition is true if any one is set. This is tediously similar
  817. to the code above, but not close enough to try to amalgamate. */
  818. if (!condition && condcode == OP_NCREF)
  819. {
  820. int refno = offset >> 1;
  821. uschar *slotA = md->name_table;
  822. for (i = 0; i < md->name_count; i++)
  823. {
  824. if (GET2(slotA, 0) == refno) break;
  825. slotA += md->name_entry_size;
  826. }
  827. /* Found a name for the number - there can be only one; duplicate names
  828. for different numbers are allowed, but not vice versa. First scan down
  829. for duplicates. */
  830. if (i < md->name_count)
  831. {
  832. uschar *slotB = slotA;
  833. while (slotB > md->name_table)
  834. {
  835. slotB -= md->name_entry_size;
  836. if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
  837. {
  838. offset = GET2(slotB, 0) << 1;
  839. condition = offset < offset_top &&
  840. md->offset_vector[offset] >= 0;
  841. if (condition) break;
  842. }
  843. else break;
  844. }
  845. /* Scan up for duplicates */
  846. if (!condition)
  847. {
  848. slotB = slotA;
  849. for (i++; i < md->name_count; i++)
  850. {
  851. slotB += md->name_entry_size;
  852. if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
  853. {
  854. offset = GET2(slotB, 0) << 1;
  855. condition = offset < offset_top &&
  856. md->offset_vector[offset] >= 0;
  857. if (condition) break;
  858. }
  859. else break;
  860. }
  861. }
  862. }
  863. }
  864. /* Choose branch according to the condition */
  865. ecode += condition? 3 : GET(ecode, 1);
  866. }
  867. else if (condcode == OP_DEF) /* DEFINE - always false */
  868. {
  869. condition = FALSE;
  870. ecode += GET(ecode, 1);
  871. }
  872. /* The condition is an assertion. Call match() to evaluate it - setting
  873. the final argument match_condassert causes it to stop at the end of an
  874. assertion. */
  875. else
  876. {
  877. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
  878. match_condassert, RM3);
  879. if (rrc == MATCH_MATCH)
  880. {
  881. condition = TRUE;
  882. ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
  883. while (*ecode == OP_ALT) ecode += GET(ecode, 1);
  884. }
  885. else if (rrc != MATCH_NOMATCH &&
  886. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  887. {
  888. RRETURN(rrc); /* Need braces because of following else */
  889. }
  890. else
  891. {
  892. condition = FALSE;
  893. ecode += codelink;
  894. }
  895. }
  896. /* We are now at the branch that is to be obeyed. As there is only one,
  897. we can use tail recursion to avoid using another stack frame, except when
  898. match_cbegroup is required for an unlimited repeat of a possibly empty
  899. group. If the second alternative doesn't exist, we can just plough on. */
  900. if (condition || *ecode == OP_ALT)
  901. {
  902. ecode += 1 + LINK_SIZE;
  903. if (op == OP_SCOND) /* Possibly empty group */
  904. {
  905. RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
  906. RRETURN(rrc);
  907. }
  908. else /* Group must match something */
  909. {
  910. flags = 0;
  911. goto TAIL_RECURSE;
  912. }
  913. }
  914. else /* Condition false & no alternative */
  915. {
  916. ecode += 1 + LINK_SIZE;
  917. }
  918. break;
  919. /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
  920. to close any currently open capturing brackets. */
  921. case OP_CLOSE:
  922. number = GET2(ecode, 1);
  923. offset = number << 1;
  924. #ifdef PCRE_DEBUG
  925. printf("end bracket %d at *ACCEPT", number);
  926. printf("\n");
  927. #endif
  928. md->capture_last = number;
  929. if (offset >= md->offset_max) md->offset_overflow = TRUE; else
  930. {
  931. md->offset_vector[offset] =
  932. md->offset_vector[md->offset_end - number];
  933. md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
  934. if (offset_top <= offset) offset_top = offset + 2;
  935. }
  936. ecode += 3;
  937. break;
  938. /* End of the pattern, either real or forced. If we are in a top-level
  939. recursion, we should restore the offsets appropriately and continue from
  940. after the call. */
  941. case OP_ACCEPT:
  942. case OP_END:
  943. if (md->recursive != NULL && md->recursive->group_num == 0)
  944. {
  945. recursion_info *rec = md->recursive;
  946. DPRINTF(("End of pattern in a (?0) recursion\n"));
  947. md->recursive = rec->prevrec;
  948. memmove(md->offset_vector, rec->offset_save,
  949. rec->saved_max * sizeof(int));
  950. offset_top = rec->save_offset_top;
  951. ims = original_ims;
  952. ecode = rec->after_call;
  953. break;
  954. }
  955. /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
  956. set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
  957. the subject. In both cases, backtracking will then try other alternatives,
  958. if any. */
  959. if (eptr == mstart &&
  960. (md->notempty ||
  961. (md->notempty_atstart &&
  962. mstart == md->start_subject + md->start_offset)))
  963. MRRETURN(MATCH_NOMATCH);
  964. /* Otherwise, we have a match. */
  965. md->end_match_ptr = eptr; /* Record where we ended */
  966. md->end_offset_top = offset_top; /* and how many extracts were taken */
  967. md->start_match_ptr = mstart; /* and the start (\K can modify) */
  968. /* For some reason, the macros don't work properly if an expression is
  969. given as the argument to MRRETURN when the heap is in use. */
  970. rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
  971. MRRETURN(rrc);
  972. /* Change option settings */
  973. case OP_OPT:
  974. ims = ecode[1];
  975. ecode += 2;
  976. DPRINTF(("ims set to %02lx\n", ims));
  977. break;
  978. /* Assertion brackets. Check the alternative branches in turn - the
  979. matching won't pass the KET for an assertion. If any one branch matches,
  980. the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
  981. start of each branch to move the current point backwards, so the code at
  982. this level is identical to the lookahead case. */
  983. case OP_ASSERT:
  984. case OP_ASSERTBACK:
  985. do
  986. {
  987. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
  988. RM4);
  989. if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
  990. {
  991. mstart = md->start_match_ptr; /* In case \K reset it */
  992. break;
  993. }
  994. if (rrc != MATCH_NOMATCH &&
  995. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  996. RRETURN(rrc);
  997. ecode += GET(ecode, 1);
  998. }
  999. while (*ecode == OP_ALT);
  1000. if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
  1001. /* If checking an assertion for a condition, return MATCH_MATCH. */
  1002. if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
  1003. /* Continue from after the assertion, updating the offsets high water
  1004. mark, since extracts may have been taken during the assertion. */
  1005. do ecode += GET(ecode,1); while (*ecode == OP_ALT);
  1006. ecode += 1 + LINK_SIZE;
  1007. offset_top = md->end_offset_top;
  1008. continue;
  1009. /* Negative assertion: all branches must fail to match. Encountering SKIP,
  1010. PRUNE, or COMMIT means we must assume failure without checking subsequent
  1011. branches. */
  1012. case OP_ASSERT_NOT:
  1013. case OP_ASSERTBACK_NOT:
  1014. do
  1015. {
  1016. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
  1017. RM5);
  1018. if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
  1019. if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
  1020. {
  1021. do ecode += GET(ecode,1); while (*ecode == OP_ALT);
  1022. break;
  1023. }
  1024. if (rrc != MATCH_NOMATCH &&
  1025. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  1026. RRETURN(rrc);
  1027. ecode += GET(ecode,1);
  1028. }
  1029. while (*ecode == OP_ALT);
  1030. if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
  1031. ecode += 1 + LINK_SIZE;
  1032. continue;
  1033. /* Move the subject pointer back. This occurs only at the start of
  1034. each branch of a lookbehind assertion. If we are too close to the start to
  1035. move back, this match function fails. When working with UTF-8 we move
  1036. back a number of characters, not bytes. */
  1037. case OP_REVERSE:
  1038. #ifdef SUPPORT_UTF8
  1039. if (utf8)
  1040. {
  1041. i = GET(ecode, 1);
  1042. while (i-- > 0)
  1043. {
  1044. eptr--;
  1045. if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
  1046. BACKCHAR(eptr);
  1047. }
  1048. }
  1049. else
  1050. #endif
  1051. /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
  1052. {
  1053. eptr -= GET(ecode, 1);
  1054. if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
  1055. }
  1056. /* Save the earliest consulted character, then skip to next op code */
  1057. if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
  1058. ecode += 1 + LINK_SIZE;
  1059. break;
  1060. /* The callout item calls an external function, if one is provided, passing
  1061. details of the match so far. This is mainly for debugging, though the
  1062. function is able to force a failure. */
  1063. case OP_CALLOUT:
  1064. if (pcre_callout != NULL)
  1065. {
  1066. pcre_callout_block cb;
  1067. cb.version = 1; /* Version 1 of the callout block */
  1068. cb.callout_number = ecode[1];
  1069. cb.offset_vector = md->offset_vector;
  1070. cb.subject = (PCRE_SPTR)md->start_subject;
  1071. cb.subject_length = (int)(md->end_subject - md->start_subject);
  1072. cb.start_match = (int)(mstart - md->start_subject);
  1073. cb.current_position = (int)(eptr - md->start_subject);
  1074. cb.pattern_position = GET(ecode, 2);
  1075. cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
  1076. cb.capture_top = offset_top/2;
  1077. cb.capture_last = md->capture_last;
  1078. cb.callout_data = md->callout_data;
  1079. if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
  1080. if (rrc < 0) RRETURN(rrc);
  1081. }
  1082. ecode += 2 + 2*LINK_SIZE;
  1083. break;
  1084. /* Recursion either matches the current regex, or some subexpression. The
  1085. offset data is the offset to the starting bracket from the start of the
  1086. whole pattern. (This is so that it works from duplicated subpatterns.)
  1087. If there are any capturing brackets started but not finished, we have to
  1088. save their starting points and reinstate them after the recursion. However,
  1089. we don't know how many such there are (offset_top records the completed
  1090. total) so we just have to save all the potential data. There may be up to
  1091. 65535 such values, which is too large to put on the stack, but using malloc
  1092. for small numbers seems expensive. As a compromise, the stack is used when
  1093. there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
  1094. is used. If the malloc fails, the offset data cannot be saved, so the
  1095. recursion is not attempted and PCRE_ERROR_NOMEMORY is returned from this
  1096. call of match() instead.
  1097. There are also other values that have to be saved. We use a chained
  1098. sequence of blocks that actually live on the stack. Thanks to Robin Houston
  1099. for the original version of this logic. */
  1100. case OP_RECURSE:
  1101. {
  1102. callpat = md->start_code + GET(ecode, 1);
  1103. new_recursive.group_num = (callpat == md->start_code)? 0 :
  1104. GET2(callpat, 1 + LINK_SIZE);
  1105. /* Add to "recursing stack" */
  1106. new_recursive.prevrec = md->recursive;
  1107. md->recursive = &new_recursive;
  1108. /* Find where to continue from afterwards */
  1109. ecode += 1 + LINK_SIZE;
  1110. new_recursive.after_call = ecode;
  1111. /* Now save the offset data. */
  1112. new_recursive.saved_max = md->offset_end;
  1113. if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
  1114. new_recursive.offset_save = stacksave;
  1115. else
  1116. {
  1117. new_recursive.offset_save =
  1118. (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
  1119. if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
  1120. }
  1121. memcpy(new_recursive.offset_save, md->offset_vector,
  1122. new_recursive.saved_max * sizeof(int));
  1123. new_recursive.save_offset_top = offset_top;
  1124. /* OK, now we can do the recursion. For each top-level alternative we
  1125. restore the offset and recursion data. */
  1126. DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
  1127. flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
  1128. do
  1129. {
  1130. RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
  1131. md, ims, eptrb, flags, RM6);
  1132. if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
  1133. {
  1134. DPRINTF(("Recursion matched\n"));
  1135. md->recursive = new_recursive.prevrec;
  1136. if (new_recursive.offset_save != stacksave)
  1137. (pcre_free)(new_recursive.offset_save);
  1138. MRRETURN(MATCH_MATCH);
  1139. }
  1140. else if (rrc != MATCH_NOMATCH &&
  1141. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  1142. {
  1143. DPRINTF(("Recursion gave error %d\n", rrc));
  1144. if (new_recursive.offset_save != stacksave)
  1145. (pcre_free)(new_recursive.offset_save);
  1146. RRETURN(rrc);
  1147. }
  1148. md->recursive = &new_recursive;
  1149. memcpy(md->offset_vector, new_recursive.offset_save,
  1150. new_recursive.saved_max * sizeof(int));
  1151. callpat += GET(callpat, 1);
  1152. }
  1153. while (*callpat == OP_ALT);
  1154. DPRINTF(("Recursion didn't match\n"));
  1155. md->recursive = new_recursive.prevrec;
  1156. if (new_recursive.offset_save != stacksave)
  1157. (pcre_free)(new_recursive.offset_save);
  1158. MRRETURN(MATCH_NOMATCH);
  1159. }
  1160. /* Control never reaches here */
  1161. /* "Once" brackets are like assertion brackets except that after a match,
  1162. the point in the subject string is not moved back. Thus there can never be
  1163. a move back into the brackets. Friedl calls these "atomic" subpatterns.
  1164. Check the alternative branches in turn - the matching won't pass the KET
  1165. for this kind of subpattern. If any one branch matches, we carry on as at
  1166. the end of a normal bracket, leaving the subject pointer, but resetting
  1167. the start-of-match value in case it was changed by \K. */
  1168. case OP_ONCE:
  1169. prev = ecode;
  1170. saved_eptr = eptr;
  1171. do
  1172. {
  1173. RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
  1174. if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
  1175. {
  1176. mstart = md->start_match_ptr;
  1177. break;
  1178. }
  1179. if (rrc != MATCH_NOMATCH &&
  1180. (rrc != MATCH_THEN || md->start_match_ptr != ecode))
  1181. RRETURN(rrc);
  1182. ecode += GET(ecode,1);
  1183. }
  1184. while (*ecode == OP_ALT);
  1185. /* If we hit the end of the group (which could be repeated), fail */
  1186. if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
  1187. /* Continue as from after the assertion, updating the offsets high water
  1188. mark, since extracts may have been taken. */
  1189. do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
  1190. offset_top = md->end_offset_top;
  1191. eptr = md->end_match_ptr;
  1192. /* For a non-repeating ket, just continue at this level. This also
  1193. happens for a repeating ket if no characters were matched in the group.
  1194. This is the forcible breaking of infinite loops as implemented in Perl
  1195. 5.005. If there is an options reset, it will get obeyed in the normal
  1196. course of events. */
  1197. if (*ecode == OP_KET || eptr == saved_eptr)
  1198. {
  1199. ecode += 1+LINK_SIZE;
  1200. break;
  1201. }
  1202. /* Th…

Large files are truncated, but you can click here to view the full file