/tags/beta3/harbour/source/hbpcre/pcrecomp.c
C | 2101 lines | 1254 code | 313 blank | 534 comment | 356 complexity | 176f1ce6df975a239bdb7a1ba855d8c9 MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1
Large files are truncated, but you can click here to view the full file
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Copyright (c) 1997-2005 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains the external function pcre_compile(), along with
- supporting internal functions that are not used by other modules. */
- #include "pcreinal.h"
- /*************************************************
- * Code parameters and static tables *
- *************************************************/
- /* Maximum number of items on the nested bracket stacks at compile time. This
- applies to the nesting of all kinds of parentheses. It does not limit
- un-nested, non-capturing parentheses. This number can be made bigger if
- necessary - it is used to dimension one int and one unsigned char vector at
- compile time. */
- #define BRASTACK_SIZE 200
- /* Table for handling escaped characters in the range '0'-'z'. Positive returns
- are simple data values; negative values are for special things like \d and so
- on. Zero means further processing is needed (for things like \x), or the escape
- is invalid. */
- #if !EBCDIC /* This is the "normal" table for ASCII systems */
- static const short int escapes[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
- 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
- '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
- 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
- -ESC_P, -ESC_Q, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
- -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
- '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
- 0, 0, 0, 0, 0, 0, ESC_n, 0, /* h - o */
- -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, 0, -ESC_w, /* p - w */
- 0, 0, -ESC_z /* x - z */
- };
- #else /* This is the "abnormal" table for EBCDIC systems */
- static const short int escapes[] = {
- /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
- /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
- /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
- /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
- /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
- /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
- /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
- /* 88 */ 0, 0, 0, '{', 0, 0, 0, 0,
- /* 90 */ 0, 0, 0, 'l', 0, ESC_n, 0, -ESC_p,
- /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
- /* A0 */ 0, '~', -ESC_s, ESC_tee, 0, 0, -ESC_w, 0,
- /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
- /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
- /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
- /* C8 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* D0 */ '}', 0, 0, 0, 0, 0, 0, -ESC_P,
- /* D8 */-ESC_Q, 0, 0, 0, 0, 0, 0, 0,
- /* E0 */ '\\', 0, -ESC_S, 0, 0, 0, -ESC_W, -ESC_X,
- /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
- /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
- };
- #endif
- /* Tables of names of POSIX character classes and their lengths. The list is
- terminated by a zero length entry. The first three must be alpha, lower, upper,
- as this is assumed for handling case independence. */
- static const char *const posix_names[] = {
- "alpha", "lower", "upper",
- "alnum", "ascii", "blank", "cntrl", "digit", "graph",
- "print", "punct", "space", "word", "xdigit" };
- static const uschar posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
- /* Table of class bit maps for each POSIX class; up to three may be combined
- to form the class. The table for [:blank:] is dynamically modified to remove
- the vertical space characters. */
- static const int posix_class_maps[] = {
- cbit_lower, cbit_upper, -1, /* alpha */
- cbit_lower, -1, -1, /* lower */
- cbit_upper, -1, -1, /* upper */
- cbit_digit, cbit_lower, cbit_upper, /* alnum */
- cbit_print, cbit_cntrl, -1, /* ascii */
- cbit_space, -1, -1, /* blank - a GNU extension */
- cbit_cntrl, -1, -1, /* cntrl */
- cbit_digit, -1, -1, /* digit */
- cbit_graph, -1, -1, /* graph */
- cbit_print, -1, -1, /* print */
- cbit_punct, -1, -1, /* punct */
- cbit_space, -1, -1, /* space */
- cbit_word, -1, -1, /* word - a Perl extension */
- cbit_xdigit,-1, -1 /* xdigit */
- };
- /* The texts of compile-time error messages. These are "char *" because they
- are passed to the outside world. */
/* Compile-time error message texts. The number in front of each string is its
position in the table; this presumably matches the ERRn codes defined elsewhere
(the ERRn uses visible in this file are consistent with that) - confirm against
the header before relying on it. */

static const char *error_texts[] = {
  /*  0 */ "no error",
  /*  1 */ "\\ at end of pattern",
  /*  2 */ "\\c at end of pattern",
  /*  3 */ "unrecognized character follows \\",
  /*  4 */ "numbers out of order in {} quantifier",
  /*  5 */ "number too big in {} quantifier",
  /*  6 */ "missing terminating ] for character class",
  /*  7 */ "invalid escape sequence in character class",
  /*  8 */ "range out of order in character class",
  /*  9 */ "nothing to repeat",
  /* 10 */ "operand of unlimited repeat could match the empty string",
  /* 11 */ "internal error: unexpected repeat",
  /* 12 */ "unrecognized character after (?",
  /* 13 */ "POSIX named classes are supported only within a class",
  /* 14 */ "missing )",
  /* 15 */ "reference to non-existent subpattern",
  /* 16 */ "erroffset passed as NULL",
  /* 17 */ "unknown option bit(s) set",
  /* 18 */ "missing ) after comment",
  /* 19 */ "parentheses nested too deeply",
  /* 20 */ "regular expression too large",
  /* 21 */ "failed to get memory",
  /* 22 */ "unmatched parentheses",
  /* 23 */ "internal error: code overflow",
  /* 24 */ "unrecognized character after (?<",
  /* 25 */ "lookbehind assertion is not fixed length",
  /* 26 */ "malformed number after (?(",
  /* 27 */ "conditional group contains more than two branches",
  /* 28 */ "assertion expected after (?(",
  /* 29 */ "(?R or (?digits must be followed by )",
  /* 30 */ "unknown POSIX class name",
  /* 31 */ "POSIX collating elements are not supported",
  /* 32 */ "this version of PCRE is not compiled with PCRE_UTF8 support",
  /* 33 */ "spare error",
  /* 34 */ "character value in \\x{...} sequence is too large",
  /* 35 */ "invalid condition (?(0)",
  /* 36 */ "\\C not allowed in lookbehind assertion",
  /* 37 */ "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
  /* 38 */ "number after (?C is > 255",
  /* 39 */ "closing ) for (?C expected",
  /* 40 */ "recursive call could loop indefinitely",
  /* 41 */ "unrecognized character after (?P",
  /* 42 */ "syntax error after (?P",
  /* 43 */ "two named groups have the same name",
  /* 44 */ "invalid UTF-8 string",
  /* 45 */ "support for \\P, \\p, and \\X has not been compiled",
  /* 46 */ "malformed \\P or \\p sequence",
  /* 47 */ "unknown property name after \\P or \\p"
};
- /* Table to identify digits and hex digits. This is used when compiling
- patterns. Note that the tables in chartables are dependent on the locale, and
- may mark arbitrary characters as digits - but the PCRE compiling code expects
- to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
- a private table here. It costs 256 bytes, but it is a lot faster than doing
- character value tests (at least in some simple cases I timed), and in some
- applications one wants PCRE to compile efficiently as well as match
- efficiently.
- For convenience, we use the same bit definitions as in chartables:
- 0x04 decimal digit
- 0x08 hexadecimal digit
- Then we can use ctype_digit and ctype_xdigit in the code. */
/* Private digit tables, indexed by character code (256 entries each). Bit 0x04
marks a decimal digit and bit 0x08 a hexadecimal digit, so 0x0c is a character
that is both. Each row holds eight entries; the comment gives the index range
of the row. */

#if !EBCDIC

/* ASCII build */

static const unsigned char digitab[] = {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   0 -   7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*   8 -  15 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  16 -  23 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  24 -  31 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  32 -  39 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  40 -  47 */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /*  48 -  55  '0'..'7' */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,   /*  56 -  63  '8'..'?' */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  64 -  71  '@'..'G' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  72 -  79 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  80 -  87 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /*  88 -  95 */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /*  96 - 103  '`'..'g' */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 104 - 111 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 112 - 119 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 120 - 127 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 128 - 135 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 136 - 143 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 144 - 151 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 152 - 159 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 160 - 167 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 168 - 175 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 176 - 183 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 184 - 191 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 192 - 199 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 200 - 207 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 208 - 215 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 216 - 223 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 224 - 231 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 232 - 239 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 240 - 247 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00    /* 248 - 255 */
};

#else

/* EBCDIC build */

static const unsigned char digitab[] = {
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x00 - 0x07 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x08 - 0x0F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x10 - 0x17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 - 0x1F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x20 - 0x27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 - 0x2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 - 0x37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 - 0x3F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 - 0x47 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x48 - 0x4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 - 0x57 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x58 - 0x5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 - 0x67 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x68 - 0x6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 - 0x77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 - 0x7F */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0x80 - 0x87 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 - 0x8F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x90 - 0x97 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 - 0x9F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA0 - 0xA7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 - 0xAF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 - 0xB7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 - 0xBF */
  0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00,   /* 0xC0 - 0xC7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 - 0xCF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD0 - 0xD7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 - 0xDF */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE0 - 0xE7 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 - 0xEF */
  0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,   /* 0xF0 - 0xF7 */
  0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xF8 - 0xFF */
};

/* Partial copy of the character-type table for EBCDIC; check_escape() tests
entries against the mask 0x0E to decide whether a character is alphameric. */

static const unsigned char ebcdic_chartab[] = {
  0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x00 - 0x07 */
  0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00,   /* 0x08 - 0x0F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x10 - 0x17 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x18 - 0x1F */
  0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,   /* 0x20 - 0x27 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x28 - 0x2F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x30 - 0x37 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x38 - 0x3F */
  0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x40 - 0x47 */
  0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80,   /* 0x48 - 0x4F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x50 - 0x57 */
  0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00,   /* 0x58 - 0x5F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x60 - 0x67 */
  0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80,   /* 0x68 - 0x6F */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x70 - 0x77 */
  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x78 - 0x7F */
  0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0x80 - 0x87 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x88 - 0x8F */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0x90 - 0x97 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0x98 - 0x9F */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xA0 - 0xA7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xA8 - 0xAF */
  0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xB0 - 0xB7 */
  0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00,   /* 0xB8 - 0xBF */
  0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12,   /* 0xC0 - 0xC7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xC8 - 0xCF */
  0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xD0 - 0xD7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xD8 - 0xDF */
  0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12,   /* 0xE0 - 0xE7 */
  0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00,   /* 0xE8 - 0xEF */
  0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,   /* 0xF0 - 0xF7 */
  0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00    /* 0xF8 - 0xFF */
};

#endif
- /* Definition to allow mutual recursion */
- static BOOL
- compile_regex(int, int, int *, uschar **, const uschar **, int *, BOOL, int,
- int *, int *, branch_chain *, compile_data *);
- /*************************************************
- * Handle escapes *
- *************************************************/
- /* This function is called when a \ has been encountered. It either returns a
- positive value for a simple escape such as \n, or a negative value which
- encodes one of the more complicated things such as \d. When UTF-8 is enabled,
- a positive value greater than 255 may be returned. On entry, ptr is pointing at
- the \. On exit, it is on the final character of the escape sequence.
- Arguments:
- ptrptr points to the pattern position pointer
- errorcodeptr points to the errorcode variable
- bracount number of previous extracting brackets
- options the options bits
- isclass TRUE if inside a character class
- Returns: zero or positive => a data character
- negative => a special escape sequence
- on error, *errorcodeptr is set
- */
- static int
- check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
- int options, BOOL isclass)
- {
- const uschar *ptr = *ptrptr;
- int c, i;
- /* If backslash is at the end of the pattern, it's an error. */
- c = *(++ptr);
- if (c == 0) *errorcodeptr = ERR1;
- /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
- a table. A non-zero result is something that can be returned immediately.
- Otherwise further processing may be required. */
- #if !EBCDIC /* ASCII coding */
- else if (c < '0' || c > 'z') {} /* Not alphameric */
- else if ((i = escapes[c - '0']) != 0) c = i;
- #else /* EBCDIC coding */
- else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphameric */
- else if ((i = escapes[c - 0x48]) != 0) c = i;
- #endif
- /* Escapes that need further processing, or are illegal. */
- else
- {
- const uschar *oldptr;
- switch (c)
- {
- /* A number of Perl escapes are not handled by PCRE. We give an explicit
- error. */
- case 'l':
- case 'L':
- case 'N':
- case 'u':
- case 'U':
- *errorcodeptr = ERR37;
- break;
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus \123 is likely to be octal
- 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
- value is greater than 377, the least significant 8 bits are taken. Inside a
- character class, \ followed by a digit is always an octal number. */
- case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9':
- if (!isclass)
- {
- oldptr = ptr;
- c -= '0';
- while ((digitab[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - '0';
- if (c < 10 || c <= bracount)
- {
- c = -(ESC_REF + c);
- break;
- }
- ptr = oldptr; /* Put the pointer back and fall through */
- }
- /* Handle an octal number following \. If the first digit is 8 or 9, Perl
- generates a binary zero byte and treats the digit as a following literal.
- Thus we have to pull back the pointer by one. */
- if ((c = *ptr) >= '8')
- {
- ptr--;
- c = 0;
- break;
- }
- /* \0 always starts an octal number, but we may drop through to here with a
- larger first octal digit. */
- case '0':
- c -= '0';
- while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
- c = c * 8 + *(++ptr) - '0';
- c &= 255; /* Take least significant 8 bits */
- break;
- /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
- which can be greater than 0xff, but only if the ddd are hex digits. */
- case 'x':
- #ifdef SUPPORT_UTF8
- if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
- {
- const uschar *pt = ptr + 2;
- register int count = 0;
- c = 0;
- while ((digitab[*pt] & ctype_xdigit) != 0)
- {
- int cc = *pt++;
- count++;
- #if !EBCDIC /* ASCII coding */
- if (cc >= 'a') cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
- #else /* EBCDIC coding */
- if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
- #endif
- }
- if (*pt == '}')
- {
- if (c < 0 || count > 8) *errorcodeptr = ERR34;
- ptr = pt;
- break;
- }
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
- #endif
- /* Read just a single hex char */
- c = 0;
- while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
- {
- int cc; /* Some compilers don't like ++ */
- cc = *(++ptr); /* in initializers */
- #if !EBCDIC /* ASCII coding */
- if (cc >= 'a') cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
- #else /* EBCDIC coding */
- if (cc <= 'z') cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
- #endif
- }
- break;
- /* Other special escapes not starting with a digit are straightforward */
- case 'c':
- c = *(++ptr);
- if (c == 0)
- {
- *errorcodeptr = ERR2;
- return 0;
- }
- /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
- is ASCII-specific, but then the whole concept of \cx is ASCII-specific.
- (However, an EBCDIC equivalent has now been added.) */
- #if !EBCDIC /* ASCII coding */
- if (c >= 'a' && c <= 'z') c -= 32;
- c ^= 0x40;
- #else /* EBCDIC coding */
- if (c >= 'a' && c <= 'z') c += 64;
- c ^= 0xC0;
- #endif
- break;
- /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
- other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
- for Perl compatibility, it is a literal. This code looks a bit odd, but
- there used to be some cases other than the default, and there may be again
- in future, so I haven't "optimized" it. */
- default:
- if ((options & PCRE_EXTRA) != 0) switch(c)
- {
- default:
- *errorcodeptr = ERR3;
- break;
- }
- break;
- }
- }
- *ptrptr = ptr;
- return c;
- }
- #ifdef SUPPORT_UCP
- /*************************************************
- * Handle \P and \p *
- *************************************************/
- /* This function is called after \P or \p has been encountered, provided that
- PCRE is compiled with support for Unicode properties. On entry, ptrptr is
- pointing at the P or p. On exit, it is pointing at the final character of the
- escape sequence.
- Argument:
- ptrptr points to the pattern position pointer
- negptr points to a boolean that is set TRUE for negation else FALSE
- errorcodeptr points to the error code variable
- Returns: value from ucp_type_table, or -1 for an invalid type
- */
- static int
- get_ucp(const uschar **ptrptr, BOOL *negptr, int *errorcodeptr)
- {
- int c, i, bot, top;
- const uschar *ptr = *ptrptr;
- char name[4];
- *negptr = FALSE;
- c = *(++ptr);
- if (c == 0) goto ERROR_RETURN;
- /* \P or \p can be followed by a one- or two-character name in {}, optionally
- preceded by ^ for negation. */
- if (c == '{')
- {
- if (ptr[1] == '^')
- {
- *negptr = TRUE;
- ptr++;
- }
- for (i = 0; i <= 2; i++)
- {
- c = *(++ptr);
- if (c == 0) goto ERROR_RETURN;
- if (c == '}') break;
- name[i] = c;
- }
- if (c !='}') /* Try to distinguish error cases */
- {
- while (*(++ptr) != 0 && *ptr != '}');
- if (*ptr == '}') goto UNKNOWN_RETURN; else goto ERROR_RETURN;
- }
- name[i] = 0;
- }
- /* Otherwise there is just one following character */
- else
- {
- name[0] = c;
- name[1] = 0;
- }
- *ptrptr = ptr;
- /* Search for a recognized property name using binary chop */
- bot = 0;
- top = _pcre_utt_size;
- while (bot < top)
- {
- i = (bot + top)/2;
- c = strcmp(name, _pcre_utt[i].name);
- if (c == 0) return _pcre_utt[i].value;
- if (c > 0) bot = i + 1; else top = i;
- }
- UNKNOWN_RETURN:
- *errorcodeptr = ERR47;
- *ptrptr = ptr;
- return -1;
- ERROR_RETURN:
- *errorcodeptr = ERR46;
- *ptrptr = ptr;
- return -1;
- }
- #endif
- /*************************************************
- * Check for counted repeat *
- *************************************************/
- /* This function is called when a '{' is encountered in a place where it might
- start a quantifier. It looks ahead to see if it really is a quantifier or not.
- It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
- where the ddds are digits.
- Arguments:
- p pointer to the first char after '{'
- Returns: TRUE or FALSE
- */
- static BOOL
- is_counted_repeat(const uschar *p)
- {
- if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
- while ((digitab[*p] & ctype_digit) != 0) p++;
- if (*p == '}') return TRUE;
- if (*p++ != ',') return FALSE;
- if (*p == '}') return TRUE;
- if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
- while ((digitab[*p] & ctype_digit) != 0) p++;
- return (*p == '}');
- }
- /*************************************************
- * Read repeat counts *
- *************************************************/
- /* Read an item of the form {n,m} and return the values. This is called only
- after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
- so the syntax is guaranteed to be correct, but we need to check the values.
- Arguments:
- p pointer to first char after '{'
- minp pointer to int for min
- maxp pointer to int for max
- returned as -1 if no max
- errorcodeptr points to error code variable
- Returns: pointer to '}' on success;
- current ptr on error, with errorcodeptr set non-zero
- */
- static const uschar *
- read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
- {
- int min = 0;
- int max = -1;
- /* Read the minimum value and do a paranoid check: a negative value indicates
- an integer overflow. */
- while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
- if (min < 0 || min > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- /* Read the maximum value if there is one, and again do a paranoid on its size.
- Also, max must not be less than min. */
- if (*p == '}') max = min; else
- {
- if (*(++p) != '}')
- {
- max = 0;
- while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
- if (max < 0 || max > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- if (max < min)
- {
- *errorcodeptr = ERR4;
- return p;
- }
- }
- }
- /* Fill in the required variables, and pass back the pointer to the terminating
- '}'. */
- *minp = min;
- *maxp = max;
- return p;
- }
- /*************************************************
- * Find first significant op code *
- *************************************************/
- /* This is called by several functions that scan a compiled expression looking
- for a fixed first character, or an anchoring op code etc. It skips over things
- that do not influence this. For some calls, a change of option is important.
- For some calls, it makes sense to skip negative forward and all backward
- assertions, and also the \b assertion; for others it does not.
- Arguments:
- code pointer to the start of the group
- options pointer to external options
- optbit the option bit whose changing is significant, or
- zero if none are
- skipassert TRUE if certain assertions are to be skipped
- Returns: pointer to the first significant opcode
- */
- static const uschar*
- first_significant_code(const uschar *code, int *options, int optbit,
- BOOL skipassert)
- {
- for (;;)
- {
- switch ((int)*code)
- {
- case OP_OPT:
- if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
- *options = (int)code[1];
- code += 2;
- break;
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- if (!skipassert) return code;
- do code += GET(code, 1); while (*code == OP_ALT);
- code += _pcre_OP_lengths[*code];
- break;
- case OP_WORD_BOUNDARY:
- case OP_NOT_WORD_BOUNDARY:
- if (!skipassert) return code;
- /* Fall through */
- case OP_CALLOUT:
- case OP_CREF:
- case OP_BRANUMBER:
- code += _pcre_OP_lengths[*code];
- break;
- default:
- return code;
- }
- }
- /* Control never reaches here */
- }
- /*************************************************
- * Find the fixed length of a pattern *
- *************************************************/
- /* Scan a pattern and compute the fixed length of subject that will match it,
- if the length is fixed. This is needed for dealing with backward assertions.
- In UTF8 mode, the result is in characters rather than bytes.
- Arguments:
- code points to the start of the pattern (the bracket)
- options the compiling options
- Returns: the fixed length, or -1 if there is no fixed length,
- or -2 if \C was encountered
- */
- static int
- find_fixedlength(uschar *code, int options)
- {
- int length = -1;
- register int branchlength = 0;
- register uschar *cc = code + 1 + LINK_SIZE;
- /* Scan along the opcodes for this branch. If we get to the end of the
- branch, check the length against that of the other branches. */
- for (;;)
- {
- int d;
- register int op = *cc;
- if (op >= OP_BRA) op = OP_BRA;
- switch (op)
- {
- case OP_BRA:
- case OP_ONCE:
- case OP_COND:
- d = find_fixedlength(cc, options);
- if (d < 0) return d;
- branchlength += d;
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- cc += 1 + LINK_SIZE;
- break;
- /* Reached end of a branch; if it's a ket it is the end of a nested
- call. If it's ALT it is an alternation in a nested call. If it is
- END it's the end of the outer call. All can be handled by the same code. */
- case OP_ALT:
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_END:
- if (length < 0) length = branchlength;
- else if (length != branchlength) return -1;
- if (*cc != OP_ALT) return length;
- cc += 1 + LINK_SIZE;
- branchlength = 0;
- break;
- /* Skip over assertive subpatterns */
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- /* Fall through */
- /* Skip over things that don't match chars */
- case OP_REVERSE:
- case OP_BRANUMBER:
- case OP_CREF:
- case OP_OPT:
- case OP_CALLOUT:
- case OP_SOD:
- case OP_SOM:
- case OP_EOD:
- case OP_EODN:
- case OP_CIRC:
- case OP_DOLL:
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- cc += _pcre_OP_lengths[*cc];
- break;
- /* Handle literal characters */
- case OP_CHAR:
- case OP_CHARNC:
- branchlength++;
- cc += 2;
- #ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while ((*cc & 0xc0) == 0x80) cc++;
- }
- #endif
- break;
- /* Handle exact repetitions. The count is already in characters, but we
- need to skip over a multibyte character in UTF8 mode. */
- case OP_EXACT:
- branchlength += GET2(cc,1);
- cc += 4;
- #ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while((*cc & 0x80) == 0x80) cc++;
- }
- #endif
- break;
- case OP_TYPEEXACT:
- branchlength += GET2(cc,1);
- cc += 4;
- break;
- /* Handle single-char matchers */
- case OP_PROP:
- case OP_NOTPROP:
- cc++;
- /* Fall through */
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- branchlength++;
- cc++;
- break;
- /* The single-byte matcher isn't allowed */
- case OP_ANYBYTE:
- return -2;
- /* Check a class for variable quantification */
- #ifdef SUPPORT_UTF8
- case OP_XCLASS:
- cc += GET(cc, 1) - 33;
- /* Fall through */
- #endif
- case OP_CLASS:
- case OP_NCLASS:
- cc += 33;
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- return -1;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(cc,1) != GET2(cc,3)) return -1;
- branchlength += GET2(cc,1);
- cc += 5;
- break;
- default:
- branchlength++;
- }
- break;
- /* Anything else is variable length */
- default:
- return -1;
- }
- }
- /* Control never gets here */
- }
- /*************************************************
- * Scan compiled regex for numbered bracket *
- *************************************************/
- /* This little function scans through a compiled pattern until it finds a
- capturing bracket with the given number.
- Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
- number the required bracket number
- Returns: pointer to the opcode for the bracket, or NULL if not found
- */
- static const uschar *
- find_bracket(const uschar *code, BOOL utf8, int number)
- {
- #ifndef SUPPORT_UTF8
- utf8 = utf8; /* Stop pedantic compilers complaining */
- #endif
- for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
- else if (c > OP_BRA)
- {
- int n = c - OP_BRA;
- if (n > EXTRACT_BASIC_MAX) n = GET2(code, 2+LINK_SIZE);
- if (n == number) return (uschar *)code;
- code += _pcre_OP_lengths[OP_BRA];
- }
- else
- {
- code += _pcre_OP_lengths[c];
- #ifdef SUPPORT_UTF8
- /* In UTF-8 mode, opcodes that are followed by a character may be followed
- by a multi-byte character. The length in the table is a minimum, so we have
- to scan along to skip the extra bytes. All opcodes are less than 128, so we
- can use relatively efficient code. */
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- while ((*code & 0xc0) == 0x80) code++;
- break;
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
- case OP_XCLASS:
- code += GET(code, 1) + 1;
- break;
- }
- #endif
- }
- }
- }
- /*************************************************
- * Scan compiled regex for recursion reference *
- *************************************************/
- /* This little function scans through a compiled pattern until it finds an
- instance of OP_RECURSE.
- Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
- Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
- */
- static const uschar *
- find_recurse(const uschar *code, BOOL utf8)
- {
- #ifndef SUPPORT_UTF8
- utf8 = utf8; /* Stop pedantic compilers complaining */
- #endif
- for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
- else if (c == OP_RECURSE) return code;
- else if (c > OP_BRA)
- {
- code += _pcre_OP_lengths[OP_BRA];
- }
- else
- {
- code += _pcre_OP_lengths[c];
- #ifdef SUPPORT_UTF8
- /* In UTF-8 mode, opcodes that are followed by a character may be followed
- by a multi-byte character. The length in the table is a minimum, so we have
- to scan along to skip the extra bytes. All opcodes are less than 128, so we
- can use relatively efficient code. */
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- while ((*code & 0xc0) == 0x80) code++;
- break;
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
- case OP_XCLASS:
- code += GET(code, 1) + 1;
- break;
- }
- #endif
- }
- }
- }
- /*************************************************
- * Scan compiled branch for non-emptiness *
- *************************************************/
- /* This function scans through a branch of a compiled pattern to see whether it
- can match the empty string or not. It is called only from could_be_empty()
- below. Note that first_significant_code() skips over assertions. If we hit an
- unclosed bracket, we return "empty" - this means we've struck an inner bracket
- whose current branch will already have been scanned.
- Arguments:
- code points to start of search
- endcode points to where to stop
- utf8 TRUE if in UTF8 mode
- Returns: TRUE if what is matched could be empty
- */
- static BOOL
- could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
- {
- register int c;
- for (code = first_significant_code(code + 1 + LINK_SIZE, NULL, 0, TRUE);
- code < endcode;
- code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
- {
- const uschar *ccode;
- c = *code;
- if (c >= OP_BRA)
- {
- BOOL empty_branch;
- if (GET(code, 1) == 0) return TRUE; /* Hit unclosed bracket */
- /* Scan a closed bracket */
- empty_branch = FALSE;
- do
- {
- if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
- empty_branch = TRUE;
- code += GET(code, 1);
- }
- while (*code == OP_ALT);
- if (!empty_branch) return FALSE; /* All branches are non-empty */
- code += 1 + LINK_SIZE;
- c = *code;
- }
- else switch (c)
- {
- /* Check for quantifiers after a class */
- #ifdef SUPPORT_UTF8
- case OP_XCLASS:
- ccode = code + GET(code, 1);
- goto CHECK_CLASS_REPEAT;
- #endif
- case OP_CLASS:
- case OP_NCLASS:
- ccode = code + 33;
- #ifdef SUPPORT_UTF8
- CHECK_CLASS_REPEAT:
- #endif
- switch (*ccode)
- {
- case OP_CRSTAR: /* These could be empty; continue */
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- break;
- default: /* Non-repeat => class must match */
- case OP_CRPLUS: /* These repeats aren't empty */
- case OP_CRMINPLUS:
- return FALSE;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(ccode, 1) > 0) return FALSE; /* Minimum > 0 */
- break;
- }
- break;
- /* Opcodes that must match a character */
- case OP_PROP:
- case OP_NOTPROP:
- case OP_EXTUNI:
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- case OP_ANYBYTE:
- case OP_CHAR:
- case OP_CHARNC:
- case OP_NOT:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_EXACT:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTEXACT:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEEXACT:
- return FALSE;
- /* End of branch */
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_ALT:
- return TRUE;
- /* In UTF-8 mode, STAR, MINSTAR, QUERY, MINQUERY, UPTO, and MINUPTO may be
- followed by a multibyte character */
- #ifdef SUPPORT_UTF8
- case OP_STAR:
- case OP_MINSTAR:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_UPTO:
- case OP_MINUPTO:
- if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
- break;
- #endif
- }
- }
- return TRUE;
- }
- /*************************************************
- * Scan compiled regex for non-emptiness *
- *************************************************/
- /* This function is called to check for left recursive calls. We want to check
- the current branch of the current pattern to see if it could match the empty
- string. If it could, we must look outwards for branches at other levels,
- stopping when we pass beyond the bracket which is the subject of the recursion.
- Arguments:
- code points to start of the recursion
- endcode points to where to stop (current RECURSE item)
- bcptr points to the chain of current (unclosed) branch starts
- utf8 TRUE if in UTF-8 mode
- Returns: TRUE if what is matched could be empty
- */
- static BOOL
- could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
- BOOL utf8)
- {
- while (bcptr != NULL && bcptr->current >= code)
- {
- if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
- bcptr = bcptr->outer;
- }
- return TRUE;
- }
- /*************************************************
- * Check for POSIX class syntax *
- *************************************************/
- /* This function is called when the sequence "[:" or "[." or "[=" is
- encountered in a character class. It checks whether this is followed by an
- optional ^ and then a sequence of letters, terminated by a matching ":]" or
- ".]" or "=]".
- Argument:
- ptr pointer to the initial [
- endptr where to return the end pointer
- cd pointer to compile data
- Returns: TRUE or FALSE
- */
- static BOOL
- check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
- {
- int terminator; /* Don't combine these lines; the Solaris cc */
- terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
- if (*(++ptr) == '^') ptr++;
- while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
- if (*ptr == terminator && ptr[1] == ']')
- {
- *endptr = ptr;
- return TRUE;
- }
- return FALSE;
- }
- /*************************************************
- * Check POSIX class name *
- *************************************************/
- /* This function is called to check the name given in a POSIX-style class entry
- such as [:alnum:].
- Arguments:
- ptr points to the first letter
- len the length of the name
- Returns: a value representing the name, or -1 if unknown
- */
- static int
- check_posix_name(const uschar *ptr, int len)
- {
- register int yield = 0;
- while (posix_name_lengths[yield] != 0)
- {
- if (len == posix_name_lengths[yield] &&
- strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
- yield++;
- }
- return -1;
- }
- /*************************************************
- * Adjust OP_RECURSE items in repeated group *
- *************************************************/
- /* OP_RECURSE items contain an offset from the start of the regex to the group
- that is referenced. This means that groups can be replicated for fixed
- repetition simply by copying (because the recursion is allowed to refer to
- earlier groups that are outside the current group). However, when a group is
- optional (i.e. the minimum quantifier is zero), OP_BRAZERO is inserted before
- it, after it has been compiled. This means that any OP_RECURSE items within it
- that refer to the group itself or any contained groups have to have their
- offsets adjusted. That is the job of this function. Before it is called, the
- partially compiled regex must be temporarily terminated with OP_END.
- Arguments:
- group points to the start of the group
- adjust the amount by which the group is to be moved
- utf8 TRUE in UTF-8 mode
- cd contains pointers to tables etc.
- Returns: nothing
- */
- static void
- adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd)
- {
- uschar *ptr = group;
- while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
- {
- int offset = GET(ptr, 1);
- if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
- ptr += 1 + LINK_SIZE;
- }
- }
- /*************************************************
- * Insert an automatic callout point *
- *************************************************/
- /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
- callout points before each pattern item.
- Arguments:
- code current code pointer
- ptr current pattern pointer
- cd pointers to tables etc
- Returns: new code pointer
- */
- static uschar *
- auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
- {
- *code++ = OP_CALLOUT;
- *code++ = 255;
- PUT(code, 0, ptr - cd->start_pattern); /* Pattern offset */
- PUT(code, LINK_SIZE, 0); /* Default length */
- return code + 2*LINK_SIZE;
- }
- /*************************************************
- * Complete a callout item *
- *************************************************/
- /* A callout item contains the length of the next item in the pattern, which
- we can't fill in till after we have reached the relevant point. This is used
- for both automatic and manual callouts.
- Arguments:
- previous_callout points to previous callout item
- ptr current pattern pointer
- cd pointers to tables etc
- Returns: nothing
- */
- static void
- complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
- {
- int length = ptr - cd->start_pattern - GET(previous_callout, 2);
- PUT(previous_callout, 2 + LINK_SIZE, length);
- }
#ifdef SUPPORT_UCP
/*************************************************
*           Get othercase range                  *
*************************************************/

/* This function is passed the start and end of a class range, in UTF-8 mode
with UCP support. It searches up the characters, looking for internal ranges of
characters in the "other" case. Each call returns the next one, updating the
start address.

A run starts at the first character that _pcre_ucp_findchar() reports as type
ucp_L with a non-zero other case. It continues for as long as each following
character is also of type ucp_L and its other case is exactly one more than
that of the character before it. When no run is found, nothing is written
through any of the pointers.

Arguments:
  cptr        points to starting character value; updated
  d           end value
  ocptr       where to put start of othercase range
  odptr       where to put end of othercase range

Yield:        TRUE when range returned; FALSE when no more
*/

static BOOL
get_othercase_range(int *cptr, int d, int *ocptr, int *odptr)
{
int c, chartype, othercase, last;

/* Find the first character in the range that has an other case */

c = *cptr;
while (c <= d)
  {
  if (_pcre_ucp_findchar(c, &chartype, &othercase) == ucp_L && othercase != 0)
    break;
  c++;
  }

if (c > d) return FALSE;

*ocptr = othercase;
last = othercase;            /* Other case of the last character accepted */

/* Extend the run while the other-case values stay consecutive */

for (;;)
  {
  c++;
  if (c > d) break;
  if (_pcre_ucp_findchar(c, &chartype, &othercase) != ucp_L ||
      othercase != last + 1)
    break;
  last++;
  }

*odptr = last;
*cptr = c;                   /* Where the next call should resume */

return TRUE;
}
#endif  /* SUPPORT_UCP */
- /*************************************************
- * Compile one branch *
- *************************************************/
- /* Scan the pattern, compiling it into the code vector. If the options are
- changed during the branch, the pointer is used to change the external options
- bits.
- Arguments:
- optionsptr pointer to the option bits
- brackets points to number of extracting brackets used
- codeptr points to the pointer to the current code point
- ptrptr points to the current pattern pointer
- errorcodeptr points to error code variable
- firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
- reqbyteptr set to the last literal character required, else < 0
- bcptr points to current branch chain
- cd contains pointers to tables etc.
- Returns: TRUE on success
- FALSE, with *errorcodeptr set non-zero on error
- */
- static BOOL
- compile_branch(int *optionsptr, int *brackets, uschar **codeptr,
- const uschar **ptrptr, int *errorcodeptr, int *firstbyteptr,
- int *reqbyteptr, branch_chain *bcptr, compile_data *cd)
- {
- int repeat_type, op_type;
- int repeat_min = 0, repeat_max = 0; /* To please picky compilers */
- int bravalue = 0;
- int greedy_default, greedy_non_default;
- int firstbyte, reqbyte;
- int zeroreqbyte, zerofirstbyte;
- int req_caseopt, reqvary, tempreqvary;
- int condcount = 0;
- int options = *optionsptr;
- int after_manual_callout = 0;
- register int c;
- register uschar *code = *codeptr;
- uschar *tempcode;
- BOOL inescq = FALSE;
- BOOL groupsetfirstbyte = FALSE;
- const uschar *ptr = *ptrptr;
- const uschar *tempptr;
- uschar *previous = NULL;
- uschar *previous_callout = NULL;
- uschar classbits[32];
- #ifdef SUPPORT_UTF8
- BOOL class_utf8;
- BOOL utf8 = (options & PCRE_UTF8) != 0;
- uschar *class_utf8data;
- uschar utf8_char[6];
- #else
- BOOL utf8 = FALSE;
- #endif
- /* Set up the default and non-default settings for greediness */
- greedy_default = ((options & PCRE_UNGREEDY) != 0);
- greedy_non_default = greedy_default ^ 1;
- /* Initialize no first byte, no required byte. REQ_UNSET means "no char
- matching encountered yet". It gets changed to REQ_NONE if we hit something that
- matches a non-fixed char first char; reqbyte just remains unset if we never
- find one.
- When we hit a repeat whose minimum is zero, we may have to adjust these values
- to take the zero repeat into account. This is implemented by setting them to
- zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
- item types that can be repeated set these backoff variables appropriately. */
- firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
- /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
- according to the current setting of the caseless flag. REQ_CASELESS is a bit
- value > 255. It is added into the firstbyte or reqbyte variables to record the
- case status of the value. This is used only for ASCII characters. */
- req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
- /* Switch on next character until the end of the branch */
- for (;; ptr++)
- {
- BOOL negate_class;
- BOOL possessive_quantifier;
- BOOL is_quantifier;
- int class_charcount;
- int class_lastchar;
- int newoptions;
- int recno;
- int skipbytes;
- int subreqbyte;
- int subfirstbyte;
- int mclength;
- uschar mcbuffer[8];
- /* Next byte in the pattern */
- c = *ptr;
- /* If in \Q...\E, check for the end; if n…
Large files files are truncated, but you can click here to view the full file