/tags/harbour-1.0.0/source/hbpcre/pcrecomp.c
C | 2147 lines | 1300 code | 310 blank | 537 comment | 426 complexity | 77a2ead75113a065678eef12a603e3c3 MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1
Large files are truncated, but you can click here to view the full file
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Copyright (c) 1997-2008 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains the external function pcre_compile(), along with
- supporting internal functions that are not used by other modules. */
- #if 1
- #include "_hbconf.h"
- #endif
- #define NLBLOCK cd /* Block containing newline information */
- #define PSSTART start_pattern /* Field containing processed string start */
- #define PSEND end_pattern /* Field containing processed string end */
- #include "pcreinal.h"
- /* When DEBUG is defined, we need the pcre_printint() function, which is also
- used by pcretest. DEBUG is not defined when building a production library. */
- #ifdef DEBUG
- #include "pcreprni.h"
- #endif
/* Macro for setting individual bits in class bitmaps. "a" is the bitmap (an
array of bytes) and "b" is the bit number: bit b lives in byte b/8 at bit
position b%8. Both arguments and the whole expansion are parenthesized so that
expression arguments such as SETBIT(map, c + 1) select the right byte and bit,
and so that the macro can be used safely inside a larger expression. Note that
"b" is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
- /* Maximum length value to check against when making sure that the integer that
- holds the compiled pattern length does not overflow. We make it a bit less than
- INT_MAX to allow for adding in group terminating bytes, so that we don't have
- to check them every time. The margin left below INT_MAX here is 20. */
- #define OFLOW_MAX (INT_MAX - 20)
- /*************************************************
- * Code parameters and static tables *
- *************************************************/
- /* This value specifies the size of stack workspace that is used during the
- first pre-compile phase that determines how much memory is required. The regex
- is partly compiled into this space, but the compiled parts are discarded as
- soon as they can be, so that hopefully there will never be an overrun. The code
- does, however, check for an overrun. The largest amount I've seen used is 218,
- so this number is very generous.
- The same workspace is used during the second, actual compile phase for
- remembering forward references to groups so that they can be filled in at the
- end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
- is 4 there is plenty of room.
- NOTE(review): the unit is presumably bytes (entries are described above as
- occupying LINK_SIZE bytes) - confirm where the workspace is declared. */
- #define COMPILE_WORK_SIZE (4096)
- /* Table for handling escaped characters in the range '0'-'z'. Positive returns
- are simple data values; negative values are for special things like \d and so
- on. Zero means further processing is needed (for things like \x), or the escape
- is invalid.
- The table is consulted by check_escape(): in the ASCII build it is indexed by
- (c - '0'), and in the EBCDIC build by (c - 0x48), which is why the first row
- of the EBCDIC table is labelled 48. */
- #ifndef EBCDIC /* This is the "normal" table for ASCII systems */
- static const short int escapes[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
- 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
- '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E, 0, -ESC_G, /* @ - G */
- -ESC_H, 0, 0, -ESC_K, 0, 0, 0, 0, /* H - O */
- -ESC_P, -ESC_Q, -ESC_R, -ESC_S, 0, 0, -ESC_V, -ESC_W, /* P - W */
- -ESC_X, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
- '`', 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0, /* ` - g */
- -ESC_h, 0, 0, -ESC_k, 0, 0, ESC_n, 0, /* h - o */
- -ESC_p, 0, ESC_r, -ESC_s, ESC_tee, 0, -ESC_v, -ESC_w, /* p - w */
- 0, 0, -ESC_z /* x - z */
- };
- #else /* This is the "abnormal" table for EBCDIC systems */
- static const short int escapes[] = {
- /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
- /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
- /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
- /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
- /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
- /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
- /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
- /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
- /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
- /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
- /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
- /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
- /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
- /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
- /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
- /* D0 */ '}', 0, -ESC_K, 0, 0, 0, 0, -ESC_P,
- /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
- /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
- /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
- /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
- };
- #endif
- /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
- searched linearly. Put all the names into a single string, in order to reduce
- the number of relocations when a shared library is dynamically linked.
- The entries of verbs[] are in the same order as the names in verbnames: each
- entry holds the length of its name (excluding the terminating zero) and the
- associated OP_xxx value. Both "F" and "FAIL" map to OP_FAIL. verbcount is the
- number of entries in verbs[]. */
- typedef struct verbitem {
- int len;
- int op;
- } verbitem;
- static const char verbnames[] =
- "ACCEPT\0"
- "COMMIT\0"
- "F\0"
- "FAIL\0"
- "PRUNE\0"
- "SKIP\0"
- "THEN";
- static const verbitem verbs[] = {
- { 6, OP_ACCEPT },
- { 6, OP_COMMIT },
- { 1, OP_FAIL },
- { 4, OP_FAIL },
- { 5, OP_PRUNE },
- { 4, OP_SKIP },
- { 4, OP_THEN }
- };
- static const int verbcount = sizeof(verbs)/sizeof(verbitem);
- /* Tables of names of POSIX character classes and their lengths. The names are
- now all in a single string, to reduce the number of relocations when a shared
- library is dynamically loaded. The list of lengths is terminated by a zero
- length entry. The first three must be alpha, lower, upper, as this is assumed
- for handling case independence. The lengths in posix_name_lengths are in the
- same order as the names in posix_names and exclude the terminating zero. */
- static const char posix_names[] =
- "alpha\0" "lower\0" "upper\0" "alnum\0" "ascii\0" "blank\0"
- "cntrl\0" "digit\0" "graph\0" "print\0" "punct\0" "space\0"
- "word\0" "xdigit";
- static const uschar posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
- /* Table of class bit maps for each POSIX class. Each class is formed from a
- base map, with an optional addition or removal of another map. Then, for some
- classes, there is some additional tweaking: for [:blank:] the vertical space
- characters are removed, and for [:alpha:] and [:alnum:] the underscore
- character is removed. The triples in the table consist of the base map offset,
- second map offset or -1 if no second map, and a non-negative value for map
- addition or a negative value for map subtraction (if there are two maps). The
- absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
- remove vertical space characters, 2 => remove underscore.
- There is one triple per class, in the same order as the names in posix_names,
- so the triple for the class at position n starts at index 3*n. */
- static const int posix_class_maps[] = {
- cbit_word, cbit_digit, -2, /* alpha */
- cbit_lower, -1, 0, /* lower */
- cbit_upper, -1, 0, /* upper */
- cbit_word, -1, 2, /* alnum - word without underscore */
- cbit_print, cbit_cntrl, 0, /* ascii */
- cbit_space, -1, 1, /* blank - a GNU extension */
- cbit_cntrl, -1, 0, /* cntrl */
- cbit_digit, -1, 0, /* digit */
- cbit_graph, -1, 0, /* graph */
- cbit_print, -1, 0, /* print */
- cbit_punct, -1, 0, /* punct */
- cbit_space, -1, 0, /* space */
- cbit_word, -1, 0, /* word - a Perl extension */
- cbit_xdigit,-1, 0 /* xdigit */
- };
- #define STRING(a) # a
- #define XSTRING(s) STRING(s)
- /* The texts of compile-time error messages. These are "char *" because they
- are passed to the outside world. Do not ever re-use any error number, because
- they are documented. Always add a new error instead. Messages marked DEAD below
- are no longer used. This used to be a table of strings, but in order to reduce
- the number of relocations needed when a shared library is loaded dynamically,
- it is now one long string. We cannot use a table of offsets, because the
- lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
- simply count through to the one we want - this isn't a performance issue
- because these strings are used only when there is a compilation error.
- The STRING and XSTRING macros above give two-level stringification, so that
- MAX_NAME_SIZE and MAX_NAME_COUNT are macro-expanded before being turned into
- string literals. Each message except the last ends with an explicit \0; the
- last one relies on the terminator that the compiler adds to the literal. The
- numbered comments (5, 10, ...) give the error number of the message that
- immediately follows; find_error_text() does the counting. */
- static const char error_texts[] =
- "no error\0"
- "\\ at end of pattern\0"
- "\\c at end of pattern\0"
- "unrecognized character follows \\\0"
- "numbers out of order in {} quantifier\0"
- /* 5 */
- "number too big in {} quantifier\0"
- "missing terminating ] for character class\0"
- "invalid escape sequence in character class\0"
- "range out of order in character class\0"
- "nothing to repeat\0"
- /* 10 */
- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
- "internal error: unexpected repeat\0"
- "unrecognized character after (? or (?-\0"
- "POSIX named classes are supported only within a class\0"
- "missing )\0"
- /* 15 */
- "reference to non-existent subpattern\0"
- "erroffset passed as NULL\0"
- "unknown option bit(s) set\0"
- "missing ) after comment\0"
- "parentheses nested too deeply\0" /** DEAD **/
- /* 20 */
- "regular expression is too large\0"
- "failed to get memory\0"
- "unmatched parentheses\0"
- "internal error: code overflow\0"
- "unrecognized character after (?<\0"
- /* 25 */
- "lookbehind assertion is not fixed length\0"
- "malformed number or name after (?(\0"
- "conditional group contains more than two branches\0"
- "assertion expected after (?(\0"
- "(?R or (?[+-]digits must be followed by )\0"
- /* 30 */
- "unknown POSIX class name\0"
- "POSIX collating elements are not supported\0"
- "this version of PCRE is not compiled with PCRE_UTF8 support\0"
- "spare error\0" /** DEAD **/
- "character value in \\x{...} sequence is too large\0"
- /* 35 */
- "invalid condition (?(0)\0"
- "\\C not allowed in lookbehind assertion\0"
- "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
- "number after (?C is > 255\0"
- "closing ) for (?C expected\0"
- /* 40 */
- "recursive call could loop indefinitely\0"
- "unrecognized character after (?P\0"
- "syntax error in subpattern name (missing terminator)\0"
- "two named subpatterns have the same name\0"
- "invalid UTF-8 string\0"
- /* 45 */
- "support for \\P, \\p, and \\X has not been compiled\0"
- "malformed \\P or \\p sequence\0"
- "unknown property name after \\P or \\p\0"
- "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
- "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
- /* 50 */
- "repeated subpattern is too long\0" /** DEAD **/
- "octal value is greater than \\377 (not in UTF-8 mode)\0"
- "internal error: overran compiling workspace\0"
- "internal error: previously-checked referenced subpattern not found\0"
- "DEFINE group contains more than one branch\0"
- /* 55 */
- "repeating a DEFINE group is not allowed\0"
- "inconsistent NEWLINE options\0"
- "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
- "a numbered reference must not be zero\0"
- "(*VERB) with an argument is not supported\0"
- /* 60 */
- "(*VERB) not recognized\0"
- "number is too big\0"
- "subpattern name expected\0"
- "digit expected after (?+\0"
- "] is an invalid data character in JavaScript compatibility mode";
- /* Table to identify digits and hex digits. This is used when compiling
- patterns. Note that the tables in chartables are dependent on the locale, and
- may mark arbitrary characters as digits - but the PCRE compiling code expects
- to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
- a private table here. It costs 256 bytes, but it is a lot faster than doing
- character value tests (at least in some simple cases I timed), and in some
- applications one wants PCRE to compile efficiently as well as match
- efficiently.
- For convenience, we use the same bit definitions as in chartables:
- 0x04 decimal digit
- 0x08 hexadecimal digit
- Then we can use ctype_digit and ctype_xdigit in the code.
- The table is indexed by character value: decimal digits carry 0x0c (both
- bits) and the letters a-f and A-F carry 0x08. In the EBCDIC build a second
- table, ebcdic_chartab, is defined as well; check_escape() tests its entries
- against 0x0E to decide whether an escaped character is alphanumeric. */
- #ifndef EBCDIC /* This is the "normal" case, for ASCII systems */
- static const unsigned char digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
- #else /* This is the "abnormal" case, for EBCDIC systems */
- static const unsigned char digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
- static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
- 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
- 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
- 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
- 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
- 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
- #endif
- /* Forward declaration (prototype only) to allow mutual recursion: it lets
- functions that appear earlier in this file call compile_regex(). */
- static BOOL
- compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
- int *, int *, branch_chain *, compile_data *, int *);
- /*************************************************
- * Find an error text *
- *************************************************/
- /* The error texts are now all in one long string, to save on relocations. As
- some of the text is of unknown length, we can't use a table of offsets.
- Instead, just count through the strings. This is not a performance issue
- because it happens only when there has been a compilation error.
- Argument: the error number
- Returns: pointer to the error string
- */
- static const char *
- find_error_text(int n)
- {
- const char *s = error_texts;
- for (; n > 0; n--) while (*s++ != 0) {};
- return s;
- }
- /*************************************************
- * Handle escapes *
- *************************************************/
- /* This function is called when a \ has been encountered. It either returns a
- positive value for a simple escape such as \n, or a negative value which
- encodes one of the more complicated things such as \d. A backreference to group
- n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
- UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
- ptr is pointing at the \. On exit, it is on the final character of the escape
- sequence.
- Arguments:
- ptrptr points to the pattern position pointer
- errorcodeptr points to the errorcode variable
- bracount number of previous extracting brackets
- options the options bits
- isclass TRUE if inside a character class
- Returns: zero or positive => a data character
- negative => a special escape sequence
- on error, errorcodeptr is set
- */
- static int
- check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
- int options, BOOL isclass)
- {
- BOOL utf8 = (options & PCRE_UTF8) != 0;
- const uschar *ptr = *ptrptr + 1;
- int c, i;
- GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
- ptr--; /* Set pointer back to the last byte */
- /* If backslash is at the end of the pattern, it's an error. */
- if (c == 0) *errorcodeptr = ERR1;
- /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
- in a table. A non-zero result is something that can be returned immediately.
- Otherwise further processing may be required. */
- #ifndef EBCDIC /* ASCII coding */
- else if (c < '0' || c > 'z') {} /* Not alphanumeric */
- else if ((i = escapes[c - '0']) != 0) c = i;
- #else /* EBCDIC coding */
- else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {} /* Not alphanumeric */
- else if ((i = escapes[c - 0x48]) != 0) c = i;
- #endif
- /* Escapes that need further processing, or are illegal. */
- else
- {
- const uschar *oldptr;
- BOOL braced, negated;
- switch (c)
- {
- /* A number of Perl escapes are not handled by PCRE. We give an explicit
- error. */
- case 'l':
- case 'L':
- case 'N':
- case 'u':
- case 'U':
- *errorcodeptr = ERR37;
- break;
- /* \g must be followed by one of a number of specific things:
- (1) A number, either plain or braced. If positive, it is an absolute
- backreference. If negative, it is a relative backreference. This is a Perl
- 5.10 feature.
- (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
- is part of Perl's movement towards a unified syntax for back references. As
- this is synonymous with \k{name}, we fudge it up by pretending it really
- was \k.
- (3) For Oniguruma compatibility we also support \g followed by a name or a
- number either in angle brackets or in single quotes. However, these are
- (possibly recursive) subroutine calls, _not_ backreferences. Just return
- the -ESC_g code (cf \k). */
- case 'g':
- if (ptr[1] == '<' || ptr[1] == '\'')
- {
- c = -ESC_g;
- break;
- }
- /* Handle the Perl-compatible cases */
- if (ptr[1] == '{')
- {
- const uschar *p;
- for (p = ptr+2; *p != 0 && *p != '}'; p++)
- if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
- if (*p != 0 && *p != '}')
- {
- c = -ESC_k;
- break;
- }
- braced = TRUE;
- ptr++;
- }
- else braced = FALSE;
- if (ptr[1] == '-')
- {
- negated = TRUE;
- ptr++;
- }
- else negated = FALSE;
- c = 0;
- while ((digitab[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - '0';
- if (c < 0) /* Integer overflow */
- {
- *errorcodeptr = ERR61;
- break;
- }
- if (braced && *(++ptr) != '}')
- {
- *errorcodeptr = ERR57;
- break;
- }
- if (c == 0)
- {
- *errorcodeptr = ERR58;
- break;
- }
- if (negated)
- {
- if (c > bracount)
- {
- *errorcodeptr = ERR15;
- break;
- }
- c = bracount - (c - 1);
- }
- c = -(ESC_REF + c);
- break;
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus \123 is likely to be octal
- 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
- value is greater than 377, the least significant 8 bits are taken. Inside a
- character class, \ followed by a digit is always an octal number. */
- case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9':
- if (!isclass)
- {
- oldptr = ptr;
- c -= '0';
- while ((digitab[ptr[1]] & ctype_digit) != 0)
- c = c * 10 + *(++ptr) - '0';
- if (c < 0) /* Integer overflow */
- {
- *errorcodeptr = ERR61;
- break;
- }
- if (c < 10 || c <= bracount)
- {
- c = -(ESC_REF + c);
- break;
- }
- ptr = oldptr; /* Put the pointer back and fall through */
- }
- /* Handle an octal number following \. If the first digit is 8 or 9, Perl
- generates a binary zero byte and treats the digit as a following literal.
- Thus we have to pull back the pointer by one. */
- if ((c = *ptr) >= '8')
- {
- ptr--;
- c = 0;
- break;
- }
- /* \0 always starts an octal number, but we may drop through to here with a
- larger first octal digit. The original code used just to take the least
- significant 8 bits of octal numbers (I think this is what early Perls used
- to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
- than 3 octal digits. */
- case '0':
- c -= '0';
- while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
- c = c * 8 + *(++ptr) - '0';
- if (!utf8 && c > 255) *errorcodeptr = ERR51;
- break;
- /* \x is complicated. \x{ddd} is a character number which can be greater
- than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
- treated as a data character. */
- case 'x':
- if (ptr[1] == '{')
- {
- const uschar *pt = ptr + 2;
- int count = 0;
- c = 0;
- while ((digitab[*pt] & ctype_xdigit) != 0)
- {
- register int cc = *pt++;
- if (c == 0 && cc == '0') continue; /* Leading zeroes */
- count++;
- #ifndef EBCDIC /* ASCII coding */
- if (cc >= 'a') cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
- #else /* EBCDIC coding */
- if (cc >= 'a' && cc <= 'z') cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
- #endif
- }
- if (*pt == '}')
- {
- if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
- ptr = pt;
- break;
- }
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
- /* Read just a single-byte hex-defined char */
- c = 0;
- while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
- {
- int cc; /* Some compilers don't like ++ */
- cc = *(++ptr); /* in initializers */
- #ifndef EBCDIC /* ASCII coding */
- if (cc >= 'a') cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
- #else /* EBCDIC coding */
- if (cc <= 'z') cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
- #endif
- }
- break;
- /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- This coding is ASCII-specific, but then the whole concept of \cx is
- ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
- case 'c':
- c = *(++ptr);
- if (c == 0)
- {
- *errorcodeptr = ERR2;
- break;
- }
- #ifndef EBCDIC /* ASCII coding */
- if (c >= 'a' && c <= 'z') c -= 32;
- c ^= 0x40;
- #else /* EBCDIC coding */
- if (c >= 'a' && c <= 'z') c += 64;
- c ^= 0xC0;
- #endif
- break;
- /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
- other alphanumeric following \ is an error if PCRE_EXTRA was set;
- otherwise, for Perl compatibility, it is a literal. This code looks a bit
- odd, but there used to be some cases other than the default, and there may
- be again in future, so I haven't "optimized" it. */
- default:
- if ((options & PCRE_EXTRA) != 0) switch(c)
- {
- default:
- *errorcodeptr = ERR3;
- break;
- }
- break;
- }
- }
- *ptrptr = ptr;
- return c;
- }
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up. This is only
compiled when PCRE has Unicode property support. The caller passes a pointer
that is on the P or p; on return it has been moved to the last character of
the escape sequence (or to wherever parsing stopped, if there was an error).

The name is either a single character, or a string in braces that may start
with ^ to request negation. Names that do not fit in the local 32-byte buffer
are rejected as malformed.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const uschar *ptr = *ptrptr;
char name[32];
int ch, len, lo, hi;

ch = *(++ptr);
if (ch == 0) goto ERROR_RETURN;

*negptr = FALSE;

if (ch != '{')
  {
  /* The short form: exactly one character names the property. */

  name[0] = ch;
  name[1] = 0;
  }
else
  {
  /* The braced form, with an optional leading ^ for negation. */

  if (ptr[1] == '^')
    {
    *negptr = TRUE;
    ptr++;
    }

  len = 0;
  while (len < (int)sizeof(name) - 1)
    {
    ch = *(++ptr);
    if (ch == 0) goto ERROR_RETURN;
    if (ch == '}') break;
    name[len++] = ch;
    }

  /* Either the closing brace was seen, or the buffer filled up first. */

  if (ch != '}') goto ERROR_RETURN;
  name[len] = 0;
  }

*ptrptr = ptr;

/* Binary search of the property name table. */

lo = 0;
hi = _pcre_utt_size;

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = strcmp(name, _pcre_utt_names + _pcre_utt[mid].name_offset);
  if (cmp == 0)
    {
    *dptr = _pcre_utt[mid].value;
    return _pcre_utt[mid].type;
    }
  if (cmp > 0) lo = mid + 1;
    else hi = mid;
  }

/* The name was well-formed but is not in the table. */

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

/* The sequence itself was malformed. */

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
- /*************************************************
- * Check for counted repeat *
- *************************************************/
- /* This function is called when a '{' is encountered in a place where it might
- start a quantifier. It looks ahead to see if it really is a quantifier or not.
- It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
- where the ddds are digits.
- Arguments:
- p pointer to the first char after '{'
- Returns: TRUE or FALSE
- */
- static BOOL
- is_counted_repeat(const uschar *p)
- {
- if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
- while ((digitab[*p] & ctype_digit) != 0) p++;
- if (*p == '}') return TRUE;
- if (*p++ != ',') return FALSE;
- if (*p == '}') return TRUE;
- if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
- while ((digitab[*p] & ctype_digit) != 0) p++;
- return (*p == '}');
- }
- /*************************************************
- * Read repeat counts *
- *************************************************/
- /* Read an item of the form {n,m} and return the values. This is called only
- after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
- so the syntax is guaranteed to be correct, but we need to check the values.
- Arguments:
- p pointer to first char after '{'
- minp pointer to int for min
- maxp pointer to int for max
- returned as -1 if no max
- errorcodeptr points to error code variable
- Returns: pointer to '}' on success;
- current ptr on error, with errorcodeptr set non-zero
- */
- static const uschar *
- read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
- {
- int min = 0;
- int max = -1;
- /* Read the minimum value and do a paranoid check: a negative value indicates
- an integer overflow. */
- while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
- if (min < 0 || min > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- /* Read the maximum value if there is one, and again do a paranoid on its size.
- Also, max must not be less than min. */
- if (*p == '}') max = min; else
- {
- if (*(++p) != '}')
- {
- max = 0;
- while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
- if (max < 0 || max > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- if (max < min)
- {
- *errorcodeptr = ERR4;
- return p;
- }
- }
- }
- /* Fill in the required variables, and pass back the pointer to the terminating
- '}'. */
- *minp = min;
- *maxp = max;
- return p;
- }
- /*************************************************
- * Find forward referenced subpattern *
- *************************************************/
- /* This function scans along a pattern's text looking for capturing
- subpatterns, and counting them. If it finds a named pattern that matches the
- name it is given, it returns its number. Alternatively, if the name is NULL, it
- returns when it reaches a given numbered subpattern. This is used for forward
- references to subpatterns. We know that if (?P< is encountered, the name will
- be terminated by '>' because that is checked in the first pass.
- Arguments:
- ptr current position in the pattern
- cd compile background data
- name name to seek, or NULL if seeking a numbered subpattern
- lorn name length, or subpattern number if name is NULL
- xmode TRUE if we are in /x mode
- Returns: the number of the named subpattern, or -1 if not found
- */
- static int
- find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
- BOOL xmode)
- {
- const uschar *thisname;
- int count = cd->bracount;
- for (; *ptr != 0; ptr++)
- {
- int term;
- /* Skip over backslashed characters and also entire \Q...\E */
- if (*ptr == '\\')
- {
- if (*(++ptr) == 0) return -1;
- if (*ptr == 'Q') for (;;)
- {
- while (*(++ptr) != 0 && *ptr != '\\') {};
- if (*ptr == 0) return -1;
- if (*(++ptr) == 'E') break;
- }
- continue;
- }
- /* Skip over character classes; this logic must be similar to the way they
- are handled for real. If the first character is '^', skip it. Also, if the
- first few characters (either before or after ^) are \Q\E or \E we skip them
- too. This makes for compatibility with Perl. */
- if (*ptr == '[')
- {
- BOOL negate_class = FALSE;
- for (;;)
- {
- int c = *(++ptr);
- if (c == '\\')
- {
- if (ptr[1] == 'E') ptr++;
- else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
- else break;
- }
- else if (!negate_class && c == '^')
- negate_class = TRUE;
- else break;
- }
- /* If the next character is ']', it is a data character that must be
- skipped, except in JavaScript compatibility mode. */
- if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
- ptr++;
- while (*(++ptr) != ']')
- {
- if (*ptr == 0) return -1;
- if (*ptr == '\\')
- {
- if (*(++ptr) == 0) return -1;
- if (*ptr == 'Q') for (;;)
- {
- while (*(++ptr) != 0 && *ptr != '\\') {};
- if (*ptr == 0) return -1;
- if (*(++ptr) == 'E') break;
- }
- continue;
- }
- }
- continue;
- }
- /* Skip comments in /x mode */
- if (xmode && *ptr == '#')
- {
- while (*(++ptr) != 0 && *ptr != '\n') {};
- if (*ptr == 0) return -1;
- continue;
- }
- /* An opening parens must now be a real metacharacter */
- if (*ptr != '(') continue;
- if (ptr[1] != '?' && ptr[1] != '*')
- {
- count++;
- if (name == NULL && count == lorn) return count;
- continue;
- }
- ptr += 2;
- if (*ptr == 'P') ptr++; /* Allow optional P */
- /* We have to disambiguate (?<! and (?<= from (?<name> */
- if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
- *ptr != '\'')
- continue;
- count++;
- if (name == NULL && count == lorn) return count;
- term = *ptr++;
- if (term == '<') term = '>';
- thisname = ptr;
- while (*ptr != term) ptr++;
- if (name != NULL && lorn == ptr - thisname &&
- strncmp((const char *)name, (const char *)thisname, lorn) == 0)
- return count;
- }
- return -1;
- }
- /*************************************************
- * Find first significant op code *
- *************************************************/
- /* This is called by several functions that scan a compiled expression looking
- for a fixed first character, or an anchoring op code etc. It skips over things
- that do not influence this. For some calls, a change of option is important.
- For some calls, it makes sense to skip negative forward and all backward
- assertions, and also the \b assertion; for others it does not.
- Arguments:
- code pointer to the start of the group
- options pointer to external options
- optbit the option bit whose changing is significant, or
- zero if none are
- skipassert TRUE if certain assertions are to be skipped
- Returns: pointer to the first significant opcode
- */
- static const uschar*
- first_significant_code(const uschar *code, int *options, int optbit,
- BOOL skipassert)
- {
- for (;;)
- {
- switch ((int)*code)
- {
- case OP_OPT:
- if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
- *options = (int)code[1];
- code += 2;
- break;
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- if (!skipassert) return code;
- do code += GET(code, 1); while (*code == OP_ALT);
- code += _pcre_OP_lengths[*code];
- break;
- case OP_WORD_BOUNDARY:
- case OP_NOT_WORD_BOUNDARY:
- if (!skipassert) return code;
- /* Fall through */
- case OP_CALLOUT:
- case OP_CREF:
- case OP_RREF:
- case OP_DEF:
- code += _pcre_OP_lengths[*code];
- break;
- default:
- return code;
- }
- }
- /* Control never reaches here */
- }
- /*************************************************
- * Find the fixed length of a pattern *
- *************************************************/
- /* Scan a pattern and compute the fixed length of subject that will match it,
- if the length is fixed. This is needed for dealing with backward assertions.
- In UTF8 mode, the result is in characters rather than bytes.
- Arguments:
- code points to the start of the pattern (the bracket)
- options the compiling options
- Returns: the fixed length, or -1 if there is no fixed length,
- or -2 if \C was encountered
- */
- static int
- find_fixedlength(uschar *code, int options)
- {
- int length = -1;
- register int branchlength = 0;
- register uschar *cc = code + 1 + LINK_SIZE;
- /* Scan along the opcodes for this branch. If we get to the end of the
- branch, check the length against that of the other branches. */
- for (;;)
- {
- int d;
- register int op = *cc;
- switch (op)
- {
- case OP_CBRA:
- case OP_BRA:
- case OP_ONCE:
- case OP_COND:
- d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
- if (d < 0) return d;
- branchlength += d;
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- cc += 1 + LINK_SIZE;
- break;
- /* Reached end of a branch; if it's a ket it is the end of a nested
- call. If it's ALT it is an alternation in a nested call. If it is
- END it's the end of the outer call. All can be handled by the same code. */
- case OP_ALT:
- case OP_KET:
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_END:
- if (length < 0) length = branchlength;
- else if (length != branchlength) return -1;
- if (*cc != OP_ALT) return length;
- cc += 1 + LINK_SIZE;
- branchlength = 0;
- break;
- /* Skip over assertive subpatterns */
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- do cc += GET(cc, 1); while (*cc == OP_ALT);
- /* Fall through */
- /* Skip over things that don't match chars */
- case OP_REVERSE:
- case OP_CREF:
- case OP_RREF:
- case OP_DEF:
- case OP_OPT:
- case OP_CALLOUT:
- case OP_SOD:
- case OP_SOM:
- case OP_EOD:
- case OP_EODN:
- case OP_CIRC:
- case OP_DOLL:
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- cc += _pcre_OP_lengths[*cc];
- break;
- /* Handle literal characters */
- case OP_CHAR:
- case OP_CHARNC:
- case OP_NOT:
- branchlength++;
- cc += 2;
- #ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while ((*cc & 0xc0) == 0x80) cc++;
- }
- #endif
- break;
- /* Handle exact repetitions. The count is already in characters, but we
- need to skip over a multibyte character in UTF8 mode. */
- case OP_EXACT:
- branchlength += GET2(cc,1);
- cc += 4;
- #ifdef SUPPORT_UTF8
- if ((options & PCRE_UTF8) != 0)
- {
- while((*cc & 0x80) == 0x80) cc++;
- }
- #endif
- break;
- case OP_TYPEEXACT:
- branchlength += GET2(cc,1);
- if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
- cc += 4;
- break;
- /* Handle single-char matchers */
- case OP_PROP:
- case OP_NOTPROP:
- cc += 2;
- /* Fall through */
- case OP_NOT_DIGIT:
- case OP_DIGIT:
- case OP_NOT_WHITESPACE:
- case OP_WHITESPACE:
- case OP_NOT_WORDCHAR:
- case OP_WORDCHAR:
- case OP_ANY:
- case OP_ALLANY:
- branchlength++;
- cc++;
- break;
- /* The single-byte matcher isn't allowed */
- case OP_ANYBYTE:
- return -2;
- /* Check a class for variable quantification */
- #ifdef SUPPORT_UTF8
- case OP_XCLASS:
- cc += GET(cc, 1) - 33;
- /* Fall through */
- #endif
- case OP_CLASS:
- case OP_NCLASS:
- cc += 33;
- switch (*cc)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- return -1;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- if (GET2(cc,1) != GET2(cc,3)) return -1;
- branchlength += GET2(cc,1);
- cc += 5;
- break;
- default:
- branchlength++;
- }
- break;
- /* Anything else is variable length */
- default:
- return -1;
- }
- }
- /* Control never gets here */
- }
- /*************************************************
- * Scan compiled regex for numbered bracket *
- *************************************************/
- /* This little function scans through a compiled pattern until it finds a
- capturing bracket with the given number.
- Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
- number the required bracket number
- Returns: pointer to the opcode for the bracket, or NULL if not found
- */
- static const uschar *
- find_bracket(const uschar *code, BOOL utf8, int number)
- {
- for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
- if (c == OP_XCLASS) code += GET(code, 1);
- /* Handle capturing bracket */
- else if (c == OP_CBRA)
- {
- int n = GET2(code, 1+LINK_SIZE);
- if (n == number) return (uschar *)code;
- code += _pcre_OP_lengths[c];
- }
- /* Otherwise, we can get the item's length from the table, except that for
- repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
- else
- {
- switch(c)
- {
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSSTAR:
- case OP_TYPEPOSPLUS:
- case OP_TYPEPOSQUERY:
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
- break;
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEEXACT:
- case OP_TYPEPOSUPTO:
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
- break;
- }
- /* Add in the fixed length from the table */
- code += _pcre_OP_lengths[c];
- /* In UTF-8 mode, opcodes that are followed by a character may be followed by
- a multi-byte character. The length in the table is a minimum, so we have to
- arrange to skip the extra bytes. */
- #ifdef SUPPORT_UTF8
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
- break;
- }
- #else
- /* pacify warnings */
- (void)(utf8);
- #endif
- }
- }
- }
- /*************************************************
- * Scan compiled regex for recursion reference *
- *************************************************/
- /* This little function scans through a compiled pattern until it finds an
- instance of OP_RECURSE.
- Arguments:
- code points to start of expression
- utf8 TRUE in UTF-8 mode
- Returns: pointer to the opcode for OP_RECURSE, or NULL if not found
- */
- static const uschar *
- find_recurse(const uschar *code, BOOL utf8)
- {
- for (;;)
- {
- register int c = *code;
- if (c == OP_END) return NULL;
- if (c == OP_RECURSE) return code;
- /* XCLASS is used for classes that cannot be represented just by a bit
- map. This includes negated single high-valued characters. The length in
- the table is zero; the actual length is stored in the compiled code. */
- if (c == OP_XCLASS) code += GET(code, 1);
- /* Otherwise, we can get the item's length from the table, except that for
- repeated character types, we have to test for \p and \P, which have an extra
- two bytes of parameters. */
- else
- {
- switch(c)
- {
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- case OP_TYPEPOSSTAR:
- case OP_TYPEPOSPLUS:
- case OP_TYPEPOSQUERY:
- if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
- break;
- case OP_TYPEPOSUPTO:
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- case OP_TYPEEXACT:
- if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
- break;
- }
- /* Add in the fixed length from the table */
- code += _pcre_OP_lengths[c];
- /* In UTF-8 mode, opcodes that are followed by a character may be followed
- by a multi-byte character. The length in the table is a minimum, so we have
- to arrange to skip the extra bytes. */
- #ifdef SUPPORT_UTF8
- if (utf8) switch(c)
- {
- case OP_CHAR:
- case OP_CHARNC:
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- case OP_POSUPTO:
- case OP_STAR:
- case OP_MINSTAR:
- case OP_POSSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_POSPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_POSQUERY:
- if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
- break;
- }
- #else
- /* pacify warnings */
- (void)(utf8);
- #endif
- }
- }
- }
- /*************************************************
- * Scan compiled branch for non-emptiness *
- *************************************************/
- /* This function scans through a branch of a compiled pattern to see whether it
- can match the empty string or not. It is called from could_be_empty()
- below and from compile_branch() when checking for an unlimited repeat of a
- group that can match nothing. Note that first_significant_code() skips over
- backward and negative forward assertions when its final argument is TRUE. If we
- hit an unclosed bracket, we return "empty" - this means we've struck an inner
- bracket whose current branch will already have been scanned.
- Arguments:
- code points to start of search
- endcode points to where to stop
- utf8 TRUE if in UTF8 mode
- Returns: TRUE if what is matched could be empty
- */
- static BOOL
- could_be_empty_branch(const…
Large files are truncated, but you can click here to view the full file