/trunk/harbour/src/3rd/pcre/pcrecomp.c
C | 1912 lines | 1271 code | 230 blank | 411 comment | 369 complexity | 700c459a2e37a69c39465fc6c4e3864c MD5 | raw file
Possible License(s): AGPL-1.0, BSD-3-Clause, CC-BY-SA-3.0, LGPL-3.0, GPL-2.0, LGPL-2.0, LGPL-2.1
Large files are truncated, but you can click here to view the full file
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /* PCRE is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language.
- Written by Philip Hazel
- Copyright (c) 1997-2012 University of Cambridge
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains the external function pcre_compile(), along with
- supporting internal functions that are not used by other modules. */
- #ifdef HAVE_CONFIG_H
- #include "config.h"
- #endif
- #define NLBLOCK cd /* Block containing newline information */
- #define PSSTART start_pattern /* Field containing processed string start */
- #define PSEND end_pattern /* Field containing processed string end */
- #include "pcreinal.h"
- /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
- is also used by pcretest. PCRE_DEBUG is not defined when building a production
- library. We do not need to select pcre16_printint.c specially, because the
- COMPILE_PCREx macro will already be appropriately set. */
- #ifdef PCRE_DEBUG
- /* pcre_printint.c should not include any headers */
- #define PCRE_INCLUDED
- #include "pcreprni.c"
- #undef PCRE_INCLUDED
- #endif
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a (byte b/8, bit b%8). Both arguments and the whole
expansion are parenthesized so that expression arguments, for example
SETBIT(map, c + 1), and uses inside larger expressions expand as intended.
Note that b is evaluated twice, so it must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
- /* Maximum length value to check against when making sure that the integer that
- holds the compiled pattern length does not overflow. We make it a bit less than
- INT_MAX to allow for adding in group terminating bytes, so that we don't have
- to check them every time. */
- #define OFLOW_MAX (INT_MAX - 20)
- /*************************************************
- * Code parameters and static tables *
- *************************************************/
- /* This value specifies the size of stack workspace that is used during the
- first pre-compile phase that determines how much memory is required. The regex
- is partly compiled into this space, but the compiled parts are discarded as
- soon as they can be, so that hopefully there will never be an overrun. The code
- does, however, check for an overrun. The largest amount I've seen used is 218,
- so this number is very generous.
- The same workspace is used during the second, actual compile phase for
- remembering forward references to groups so that they can be filled in at the
- end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
- is 4 there is plenty of room for most patterns. However, the memory can get
- filled up by repetitions of forward references, for example patterns like
- /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
- that the workspace is expanded using malloc() in this situation. The value
- below is therefore a minimum, and we put a maximum on it for safety. The
- minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
- kicks in at the same number of forward references in all cases. */
- #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
- #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
- /* The overrun tests check for a slightly smaller size so that they detect the
- overrun before it actually does run off the end of the data block. */
- #define WORK_SIZE_SAFETY_MARGIN (100)
/* Private flags added to firstchar and reqchar. The long suffix is written as
an upper case L because a lower case l is easily misread as the digit 1; the
values themselves are unchanged. */

#define REQ_CASELESS   0x10000000L    /* Indicates caselessness */
#define REQ_VARY       0x20000000L    /* Reqchar followed non-literal item */

/* Repeated character flags. */

#define UTF_LENGTH     0x10000000L    /* The char contains its length. */
- /* Table for handling escaped characters in the range '0'-'z'. Positive returns
- are simple data values; negative values are for special things like \d and so
- on. Zero means further processing is needed (for things like \x), or the escape
- is invalid. */
- #ifndef EBCDIC
- /* This is the "normal" table for ASCII systems or for EBCDIC systems running
- in UTF-8 mode. */
- static const short int escapes[] = {
- 0, 0,
- 0, 0,
- 0, 0,
- 0, 0,
- 0, 0,
- CHAR_COLON, CHAR_SEMICOLON,
- CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
- CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
- CHAR_COMMERCIAL_AT, -ESC_A,
- -ESC_B, -ESC_C,
- -ESC_D, -ESC_E,
- 0, -ESC_G,
- -ESC_H, 0,
- 0, -ESC_K,
- 0, 0,
- -ESC_N, 0,
- -ESC_P, -ESC_Q,
- -ESC_R, -ESC_S,
- 0, 0,
- -ESC_V, -ESC_W,
- -ESC_X, 0,
- -ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
- CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
- CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
- CHAR_GRAVE_ACCENT, 7,
- -ESC_b, 0,
- -ESC_d, ESC_e,
- ESC_f, 0,
- -ESC_h, 0,
- 0, -ESC_k,
- 0, 0,
- ESC_n, 0,
- -ESC_p, 0,
- ESC_r, -ESC_s,
- ESC_tee, 0,
- -ESC_v, -ESC_w,
- 0, 0,
- -ESC_z
- };
- #else
- /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
- static const short int escapes[] = {
- /* 48 */ 0, 0, 0, '.', '<', '(', '+', '|',
- /* 50 */ '&', 0, 0, 0, 0, 0, 0, 0,
- /* 58 */ 0, 0, '!', '$', '*', ')', ';', '~',
- /* 60 */ '-', '/', 0, 0, 0, 0, 0, 0,
- /* 68 */ 0, 0, '|', ',', '%', '_', '>', '?',
- /* 70 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* 78 */ 0, '`', ':', '#', '@', '\'', '=', '"',
- /* 80 */ 0, 7, -ESC_b, 0, -ESC_d, ESC_e, ESC_f, 0,
- /* 88 */-ESC_h, 0, 0, '{', 0, 0, 0, 0,
- /* 90 */ 0, 0, -ESC_k, 'l', 0, ESC_n, 0, -ESC_p,
- /* 98 */ 0, ESC_r, 0, '}', 0, 0, 0, 0,
- /* A0 */ 0, '~', -ESC_s, ESC_tee, 0,-ESC_v, -ESC_w, 0,
- /* A8 */ 0,-ESC_z, 0, 0, 0, '[', 0, 0,
- /* B0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* B8 */ 0, 0, 0, 0, 0, ']', '=', '-',
- /* C0 */ '{',-ESC_A, -ESC_B, -ESC_C, -ESC_D,-ESC_E, 0, -ESC_G,
- /* C8 */-ESC_H, 0, 0, 0, 0, 0, 0, 0,
- /* D0 */ '}', 0, -ESC_K, 0, 0,-ESC_N, 0, -ESC_P,
- /* D8 */-ESC_Q,-ESC_R, 0, 0, 0, 0, 0, 0,
- /* E0 */ '\\', 0, -ESC_S, 0, 0,-ESC_V, -ESC_W, -ESC_X,
- /* E8 */ 0,-ESC_Z, 0, 0, 0, 0, 0, 0,
- /* F0 */ 0, 0, 0, 0, 0, 0, 0, 0,
- /* F8 */ 0, 0, 0, 0, 0, 0, 0, 0
- };
- #endif
- /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
- searched linearly. Put all the names into a single string, in order to reduce
- the number of relocations when a shared library is dynamically linked. The
- string is built from string macros so that it works in UTF-8 mode on EBCDIC
- platforms. */
- typedef struct verbitem {
- int len; /* Length of verb name */
- int op; /* Op when no arg, or -1 if arg mandatory */
- int op_arg; /* Op when arg present, or -1 if not allowed */
- } verbitem;
- static const char verbnames[] =
- "\0" /* Empty name is a shorthand for MARK */
- STRING_MARK0
- STRING_ACCEPT0
- STRING_COMMIT0
- STRING_F0
- STRING_FAIL0
- STRING_PRUNE0
- STRING_SKIP0
- STRING_THEN;
- static const verbitem verbs[] = {
- { 0, -1, OP_MARK },
- { 4, -1, OP_MARK },
- { 6, OP_ACCEPT, -1 },
- { 6, OP_COMMIT, -1 },
- { 1, OP_FAIL, -1 },
- { 4, OP_FAIL, -1 },
- { 5, OP_PRUNE, OP_PRUNE_ARG },
- { 4, OP_SKIP, OP_SKIP_ARG },
- { 4, OP_THEN, OP_THEN_ARG }
- };
- static const int verbcount = sizeof(verbs)/sizeof(verbitem);
- /* Tables of names of POSIX character classes and their lengths. The names are
- now all in a single string, to reduce the number of relocations when a shared
- library is dynamically loaded. The list of lengths is terminated by a zero
- length entry. The first three must be alpha, lower, upper, as this is assumed
- for handling case independence. */
- static const char posix_names[] =
- STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
- STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
- STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
- STRING_word0 STRING_xdigit;
- static const pcre_uint8 posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
- /* Table of class bit maps for each POSIX class. Each class is formed from a
- base map, with an optional addition or removal of another map. Then, for some
- classes, there is some additional tweaking: for [:blank:] the vertical space
- characters are removed, and for [:alpha:] and [:alnum:] the underscore
- character is removed. The triples in the table consist of the base map offset,
- second map offset or -1 if no second map, and a non-negative value for map
- addition or a negative value for map subtraction (if there are two maps). The
- absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
- remove vertical space characters, 2 => remove underscore. */
- static const int posix_class_maps[] = {
- cbit_word, cbit_digit, -2, /* alpha */
- cbit_lower, -1, 0, /* lower */
- cbit_upper, -1, 0, /* upper */
- cbit_word, -1, 2, /* alnum - word without underscore */
- cbit_print, cbit_cntrl, 0, /* ascii */
- cbit_space, -1, 1, /* blank - a GNU extension */
- cbit_cntrl, -1, 0, /* cntrl */
- cbit_digit, -1, 0, /* digit */
- cbit_graph, -1, 0, /* graph */
- cbit_print, -1, 0, /* print */
- cbit_punct, -1, 0, /* punct */
- cbit_space, -1, 0, /* space */
- cbit_word, -1, 0, /* word - a Perl extension */
- cbit_xdigit,-1, 0 /* xdigit */
- };
- /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
- substitutes must be in the order of the names, defined above, and there are
- both positive and negative cases. NULL means no substitute. */
- #ifdef SUPPORT_UCP
- static const pcre_uchar string_PNd[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pNd[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PXsp[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pXsp[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PXwd[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pXwd[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar *substitutes[] = {
- string_PNd, /* \D */
- string_pNd, /* \d */
- string_PXsp, /* \S */ /* NOTE: Xsp is Perl space */
- string_pXsp, /* \s */
- string_PXwd, /* \W */
- string_pXwd /* \w */
- };
- static const pcre_uchar string_pL[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pLl[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pLu[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_pXan[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_h[] = {
- CHAR_BACKSLASH, CHAR_h, '\0' };
- static const pcre_uchar string_pXps[] = {
- CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PL[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PLl[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PLu[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_PXan[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar string_H[] = {
- CHAR_BACKSLASH, CHAR_H, '\0' };
- static const pcre_uchar string_PXps[] = {
- CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
- CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
- static const pcre_uchar *posix_substitutes[] = {
- string_pL, /* alpha */
- string_pLl, /* lower */
- string_pLu, /* upper */
- string_pXan, /* alnum */
- NULL, /* ascii */
- string_h, /* blank */
- NULL, /* cntrl */
- string_pNd, /* digit */
- NULL, /* graph */
- NULL, /* print */
- NULL, /* punct */
- string_pXps, /* space */ /* NOTE: Xps is POSIX space */
- string_pXwd, /* word */
- NULL, /* xdigit */
- /* Negated cases */
- string_PL, /* ^alpha */
- string_PLl, /* ^lower */
- string_PLu, /* ^upper */
- string_PXan, /* ^alnum */
- NULL, /* ^ascii */
- string_H, /* ^blank */
- NULL, /* ^cntrl */
- string_PNd, /* ^digit */
- NULL, /* ^graph */
- NULL, /* ^print */
- NULL, /* ^punct */
- string_PXps, /* ^space */ /* NOTE: Xps is POSIX space */
- string_PXwd, /* ^word */
- NULL /* ^xdigit */
- };
- #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
- #endif
- #define STRING(a) # a
- #define XSTRING(s) STRING(s)
- /* The texts of compile-time error messages. These are "char *" because they
- are passed to the outside world. Do not ever re-use any error number, because
- they are documented. Always add a new error instead. Messages marked DEAD below
- are no longer used. This used to be a table of strings, but in order to reduce
- the number of relocations needed when a shared library is loaded dynamically,
- it is now one long string. We cannot use a table of offsets, because the
- lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
- simply count through to the one we want - this isn't a performance issue
- because these strings are used only when there is a compilation error.
- Each substring ends with \0 to insert a null character. This includes the final
- substring, so that the whole string ends with \0\0, which can be detected when
- counting through. */
- static const char error_texts[] =
- "no error\0"
- "\\ at end of pattern\0"
- "\\c at end of pattern\0"
- "unrecognized character follows \\\0"
- "numbers out of order in {} quantifier\0"
- /* 5 */
- "number too big in {} quantifier\0"
- "missing terminating ] for character class\0"
- "invalid escape sequence in character class\0"
- "range out of order in character class\0"
- "nothing to repeat\0"
- /* 10 */
- "operand of unlimited repeat could match the empty string\0" /** DEAD **/
- "internal error: unexpected repeat\0"
- "unrecognized character after (? or (?-\0"
- "POSIX named classes are supported only within a class\0"
- "missing )\0"
- /* 15 */
- "reference to non-existent subpattern\0"
- "erroffset passed as NULL\0"
- "unknown option bit(s) set\0"
- "missing ) after comment\0"
- "parentheses nested too deeply\0" /** DEAD **/
- /* 20 */
- "regular expression is too large\0"
- "failed to get memory\0"
- "unmatched parentheses\0"
- "internal error: code overflow\0"
- "unrecognized character after (?<\0"
- /* 25 */
- "lookbehind assertion is not fixed length\0"
- "malformed number or name after (?(\0"
- "conditional group contains more than two branches\0"
- "assertion expected after (?(\0"
- "(?R or (?[+-]digits must be followed by )\0"
- /* 30 */
- "unknown POSIX class name\0"
- "POSIX collating elements are not supported\0"
- "this version of PCRE is compiled without UTF support\0"
- "spare error\0" /** DEAD **/
- "character value in \\x{...} sequence is too large\0"
- /* 35 */
- "invalid condition (?(0)\0"
- "\\C not allowed in lookbehind assertion\0"
- "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
- "number after (?C is > 255\0"
- "closing ) for (?C expected\0"
- /* 40 */
- "recursive call could loop indefinitely\0"
- "unrecognized character after (?P\0"
- "syntax error in subpattern name (missing terminator)\0"
- "two named subpatterns have the same name\0"
- "invalid UTF-8 string\0"
- /* 45 */
- "support for \\P, \\p, and \\X has not been compiled\0"
- "malformed \\P or \\p sequence\0"
- "unknown property name after \\P or \\p\0"
- "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
- "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
- /* 50 */
- "repeated subpattern is too long\0" /** DEAD **/
- "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
- "internal error: overran compiling workspace\0"
- "internal error: previously-checked referenced subpattern not found\0"
- "DEFINE group contains more than one branch\0"
- /* 55 */
- "repeating a DEFINE group is not allowed\0" /** DEAD **/
- "inconsistent NEWLINE options\0"
- "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
- "a numbered reference must not be zero\0"
- "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
- /* 60 */
- "(*VERB) not recognized\0"
- "number is too big\0"
- "subpattern name expected\0"
- "digit expected after (?+\0"
- "] is an invalid data character in JavaScript compatibility mode\0"
- /* 65 */
- "different names for subpatterns of the same number are not allowed\0"
- "(*MARK) must have an argument\0"
- "this version of PCRE is not compiled with Unicode property support\0"
- "\\c must be followed by an ASCII character\0"
- "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
- /* 70 */
- "internal error: unknown opcode in find_fixedlength()\0"
- "\\N is not supported in a class\0"
- "too many forward references\0"
- "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
- "invalid UTF-16 string\0"
- ;
- /* Table to identify digits and hex digits. This is used when compiling
- patterns. Note that the tables in chartables are dependent on the locale, and
- may mark arbitrary characters as digits - but the PCRE compiling code expects
- to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
- a private table here. It costs 256 bytes, but it is a lot faster than doing
- character value tests (at least in some simple cases I timed), and in some
- applications one wants PCRE to compile efficiently as well as match
- efficiently.
- For convenience, we use the same bit definitions as in chartables:
- 0x04 decimal digit
- 0x08 hexadecimal digit
- Then we can use ctype_digit and ctype_xdigit in the code. */
- /* Using a simple comparison for decimal numbers rather than a memory read
- is much faster, and the resulting code is simpler (the compiler turns it
- into a subtraction and unsigned comparison). */
- #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
- #ifndef EBCDIC
- /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
- UTF-8 mode. */
- static const pcre_uint8 digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - ' */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ( - / */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /* 8 - ? */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* @ - G */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H - O */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* P - W */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* X - _ */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* ` - g */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h - o */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p - w */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* x -127 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
- #else
- /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
- static const pcre_uint8 digitab[] =
- {
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 0- 7 0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 16- 23 10 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 32- 39 20 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 30 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 40 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 50 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 60 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g 80 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p 90 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x A0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 B0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* { - G C0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* } - P D0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* \ - X E0 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /* 0 - 7 F0 */
- 0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
- static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
- 0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 0- 7 */
- 0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /* 8- 15 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 16- 23 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 24- 31 */
- 0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /* 32- 39 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 40- 47 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 48- 55 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 56- 63 */
- 0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - 71 */
- 0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /* 72- | */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* & - 87 */
- 0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /* 88- 95 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* - -103 */
- 0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ? */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
- 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- " */
- 0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* h -143 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* q -159 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* y -175 */
- 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* ^ -183 */
- 0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
- 0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* { - G */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* H -207 */
- 0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* } - P */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Q -223 */
- 0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* \ - X */
- 0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /* Y -239 */
- 0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /* 0 - 7 */
- 0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/* 8 -255 */
- #endif
- /* Definition to allow mutual recursion */
- static BOOL
- compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
- int *, int *, branch_chain *, compile_data *, int *);
- /*************************************************
- * Find an error text *
- *************************************************/
- /* The error texts are now all in one long string, to save on relocations. As
- some of the text is of unknown length, we can't use a table of offsets.
- Instead, just count through the strings. This is not a performance issue
- because it happens only when there has been a compilation error.
- Argument: the error number
- Returns: pointer to the error string
- */
- static const char *
- find_error_text(int n)
- {
- const char *s = error_texts;
- for (; n > 0; n--)
- {
- while (*s++ != 0) {};
- if (*s == 0) return "Error text not found (please report)";
- }
- return s;
- }
- /*************************************************
- * Expand the workspace *
- *************************************************/
- /* This function is called during the second compiling phase, if the number of
- forward references fills the existing workspace, which is originally a block on
- the stack. A larger block is obtained from malloc() unless the ultimate limit
- has been reached or the increase will be rather small.
- Argument: pointer to the compile data block
- Returns: 0 if all went well, else an error number
- */
- static int
- expand_workspace(compile_data *cd)
- {
- pcre_uchar *newspace;
- int newsize = cd->workspace_size * 2;
- if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
- if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
- newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
- return ERR72;
- newspace = (PUBL(malloc))(IN_UCHARS(newsize));
- if (newspace == NULL) return ERR21;
- memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
- cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
- if (cd->workspace_size > COMPILE_WORK_SIZE)
- (PUBL(free))((void *)cd->start_workspace);
- cd->start_workspace = newspace;
- cd->workspace_size = newsize;
- return 0;
- }
- /*************************************************
- * Check for counted repeat *
- *************************************************/
- /* This function is called when a '{' is encountered in a place where it might
- start a quantifier. It looks ahead to see if it really is a quantifier or not.
- It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
- where the ddds are digits.
- Arguments:
- p pointer to the first char after '{'
- Returns: TRUE or FALSE
- */
- static BOOL
- is_counted_repeat(const pcre_uchar *p)
- {
- if (!IS_DIGIT(*p)) return FALSE;
- p++;
- while (IS_DIGIT(*p)) p++;
- if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
- if (*p++ != CHAR_COMMA) return FALSE;
- if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
- if (!IS_DIGIT(*p)) return FALSE;
- p++;
- while (IS_DIGIT(*p)) p++;
- return (*p == CHAR_RIGHT_CURLY_BRACKET);
- }
- /*************************************************
- * Handle escapes *
- *************************************************/
- /* This function is called when a \ has been encountered. It either returns a
- positive value for a simple escape such as \n, or a negative value which
- encodes one of the more complicated things such as \d. A backreference to group
- n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
- UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
- ptr is pointing at the \. On exit, it is on the final character of the escape
- sequence.
- Arguments:
- ptrptr points to the pattern position pointer
- errorcodeptr points to the errorcode variable
- bracount number of previous extracting brackets
- options the options bits
- isclass TRUE if inside a character class
- Returns: zero or positive => a data character
- negative => a special escape sequence
- on error, errorcodeptr is set
- */
- static int
- check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
- int options, BOOL isclass)
- {
- /* PCRE_UTF16 has the same value as PCRE_UTF8. */
- BOOL utf = (options & PCRE_UTF8) != 0;
- const pcre_uchar *ptr = *ptrptr + 1;
- pcre_int32 c;
- int i;
- GETCHARINCTEST(c, ptr); /* Get character value, increment pointer */
- ptr--; /* Set pointer back to the last byte */
- /* If backslash is at the end of the pattern, it's an error. */
- if (c == 0) *errorcodeptr = ERR1;
- /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
- in a table. A non-zero result is something that can be returned immediately.
- Otherwise further processing may be required. */
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- /* Not alphanumeric */
- else if (c < CHAR_0 || c > CHAR_z) {}
- else if ((i = escapes[c - CHAR_0]) != 0) c = i;
- #else /* EBCDIC coding */
- /* Not alphanumeric */
- else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
- else if ((i = escapes[c - 0x48]) != 0) c = i;
- #endif
- /* Escapes that need further processing, or are illegal. */
- else
- {
- const pcre_uchar *oldptr;
- BOOL braced, negated;
- switch (c)
- {
- /* A number of Perl escapes are not handled by PCRE. We give an explicit
- error. */
- case CHAR_l:
- case CHAR_L:
- *errorcodeptr = ERR37;
- break;
- case CHAR_u:
- if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
- {
- /* In JavaScript, \u must be followed by four hexadecimal numbers.
- Otherwise it is a lowercase u letter. */
- if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
- && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0
- && MAX_255(ptr[3]) && (digitab[ptr[3]] & ctype_xdigit) != 0
- && MAX_255(ptr[4]) && (digitab[ptr[4]] & ctype_xdigit) != 0)
- {
- c = 0;
- for (i = 0; i < 4; ++i)
- {
- register int cc = *(++ptr);
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
- #else /* EBCDIC coding */
- if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
- #endif
- }
- }
- }
- else
- *errorcodeptr = ERR37;
- break;
- case CHAR_U:
- /* In JavaScript, \U is an uppercase U letter. */
- if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
- break;
- /* In a character class, \g is just a literal "g". Outside a character
- class, \g must be followed by one of a number of specific things:
- (1) A number, either plain or braced. If positive, it is an absolute
- backreference. If negative, it is a relative backreference. This is a Perl
- 5.10 feature.
- (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
- is part of Perl's movement towards a unified syntax for back references. As
- this is synonymous with \k{name}, we fudge it up by pretending it really
- was \k.
- (3) For Oniguruma compatibility we also support \g followed by a name or a
- number either in angle brackets or in single quotes. However, these are
- (possibly recursive) subroutine calls, _not_ backreferences. Just return
- the -ESC_g code (cf \k). */
- case CHAR_g:
- if (isclass) break;
- if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
- {
- c = -ESC_g;
- break;
- }
- /* Handle the Perl-compatible cases */
- if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- const pcre_uchar *p;
- for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
- if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
- if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
- {
- c = -ESC_k;
- break;
- }
- braced = TRUE;
- ptr++;
- }
- else braced = FALSE;
- if (ptr[1] == CHAR_MINUS)
- {
- negated = TRUE;
- ptr++;
- }
- else negated = FALSE;
- /* The integer range is limited by the machine's int representation. */
- c = 0;
- while (IS_DIGIT(ptr[1]))
- {
- if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
- {
- c = -1;
- break;
- }
- c = c * 10 + *(++ptr) - CHAR_0;
- }
- if (((unsigned int)c) > INT_MAX) /* Integer overflow */
- {
- while (IS_DIGIT(ptr[1]))
- ptr++;
- *errorcodeptr = ERR61;
- break;
- }
- if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
- {
- *errorcodeptr = ERR57;
- break;
- }
- if (c == 0)
- {
- *errorcodeptr = ERR58;
- break;
- }
- if (negated)
- {
- if (c > bracount)
- {
- *errorcodeptr = ERR15;
- break;
- }
- c = bracount - (c - 1);
- }
- c = -(ESC_REF + c);
- break;
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus \123 is likely to be octal
- 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
- value is greater than 377, the least significant 8 bits are taken. Inside a
- character class, \ followed by a digit is always an octal number. */
- case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
- case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
- if (!isclass)
- {
- oldptr = ptr;
- /* The integer range is limited by the machine's int representation. */
- c -= CHAR_0;
- while (IS_DIGIT(ptr[1]))
- {
- if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
- {
- c = -1;
- break;
- }
- c = c * 10 + *(++ptr) - CHAR_0;
- }
- if (((unsigned int)c) > INT_MAX) /* Integer overflow */
- {
- while (IS_DIGIT(ptr[1]))
- ptr++;
- *errorcodeptr = ERR61;
- break;
- }
- if (c < 10 || c <= bracount)
- {
- c = -(ESC_REF + c);
- break;
- }
- ptr = oldptr; /* Put the pointer back and fall through */
- }
- /* Handle an octal number following \. If the first digit is 8 or 9, Perl
- generates a binary zero byte and treats the digit as a following literal.
- Thus we have to pull back the pointer by one. */
- if ((c = *ptr) >= CHAR_8)
- {
- ptr--;
- c = 0;
- break;
- }
- /* \0 always starts an octal number, but we may drop through to here with a
- larger first octal digit. The original code used just to take the least
- significant 8 bits of octal numbers (I think this is what early Perls used
- to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
- but no more than 3 octal digits. */
- case CHAR_0:
- c -= CHAR_0;
- while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
- c = c * 8 + *(++ptr) - CHAR_0;
- #ifdef COMPILE_PCRE8
- if (!utf && c > 0xff) *errorcodeptr = ERR51;
- #endif
- break;
- /* \x is complicated. \x{ddd} is a character number which can be greater
- than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
- If not, { is treated as a data character. */
- case CHAR_x:
- if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
- {
- /* In JavaScript, \x must be followed by two hexadecimal numbers.
- Otherwise it is a lowercase x letter. */
- if (MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0
- && MAX_255(ptr[2]) && (digitab[ptr[2]] & ctype_xdigit) != 0)
- {
- c = 0;
- for (i = 0; i < 2; ++i)
- {
- register int cc = *(++ptr);
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
- #else /* EBCDIC coding */
- if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
- #endif
- }
- }
- break;
- }
- if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
- {
- const pcre_uchar *pt = ptr + 2;
- c = 0;
- while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0)
- {
- register int cc = *pt++;
- if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
- #else /* EBCDIC coding */
- if (cc >= CHAR_a && cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
- #endif
- #ifdef COMPILE_PCRE8
- if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
- #else
- #ifdef COMPILE_PCRE16
- if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
- #endif
- #endif
- }
- if (c < 0)
- {
- while (MAX_255(*pt) && (digitab[*pt] & ctype_xdigit) != 0) pt++;
- *errorcodeptr = ERR34;
- }
- if (*pt == CHAR_RIGHT_CURLY_BRACKET)
- {
- if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
- ptr = pt;
- break;
- }
- /* If the sequence of hex digits does not end with '}', then we don't
- recognize this construct; fall through to the normal \x handling. */
- }
- /* Read just a single-byte hex-defined char */
- c = 0;
- while (i++ < 2 && MAX_255(ptr[1]) && (digitab[ptr[1]] & ctype_xdigit) != 0)
- {
- int cc; /* Some compilers don't like */
- cc = *(++ptr); /* ++ in initializers */
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (cc >= CHAR_a) cc -= 32; /* Convert to upper case */
- c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
- #else /* EBCDIC coding */
- if (cc <= CHAR_z) cc += 64; /* Convert to upper case */
- c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
- #endif
- }
- break;
- /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
- An error is given if the byte following \c is not an ASCII character. This
- coding is ASCII-specific, but then the whole concept of \cx is
- ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
- case CHAR_c:
- c = *(++ptr);
- if (c == 0)
- {
- *errorcodeptr = ERR2;
- break;
- }
- #ifndef EBCDIC /* ASCII/UTF-8 coding */
- if (c > 127) /* Excludes all non-ASCII in either mode */
- {
- *errorcodeptr = ERR68;
- break;
- }
- if (c >= CHAR_a && c <= CHAR_z) c -= 32;
- c ^= 0x40;
- #else /* EBCDIC coding */
- if (c >= CHAR_a && c <= CHAR_z) c += 64;
- c ^= 0xC0;
- #endif
- break;
- /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
- other alphanumeric following \ is an error if PCRE_EXTRA was set;
- otherwise, for Perl compatibility, it is a literal. This code looks a bit
- odd, but there used to be some cases other than the default, and there may
- be again in future, so I haven't "optimized" it. */
- default:
- if ((options & PCRE_EXTRA) != 0) switch(c)
- {
- default:
- *errorcodeptr = ERR3;
- break;
- }
- break;
- }
- }
- /* Perl supports \N{name} for character names, as well as plain \N for "not
- newline". PCRE does not support \N{name}. However, it does support
- quantification such as \N{2,3}. */
- if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
- !is_counted_repeat(ptr+2))
- *errorcodeptr = ERR37;
- /* If PCRE_UCP is set, we change the values for \d etc. */
- if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
- c -= (ESC_DU - ESC_D);
- /* Set the pointer to the final character before returning. */
- *ptrptr = ptr;
- return c;
- }
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* Parse the property name that follows \P or \p and look it up in the table
of known Unicode property names. This function is compiled only when PCRE is
built with support for Unicode properties. On entry, *ptrptr addresses the P
or p. On exit, it addresses the last character of the escape sequence that was
consumed (also on the error paths).

The name is either a single character, or a sequence enclosed in {} that may
start with ^ to request negation. A braced name that is unterminated, or too
long for the local buffer, is a syntax error.

Arguments:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from the property table, or -1 for an invalid
                 type, in which case *errorcodeptr has been set
*/

static int
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
const pcre_uchar *cursor = *ptrptr;
pcre_uchar propname[32];
int ch, len, lo, hi;

ch = *(++cursor);
if (ch == 0) goto BAD_SYNTAX;

*negptr = FALSE;

if (ch != CHAR_LEFT_CURLY_BRACKET)
  {
  /* The short form: exactly one character names the property. */
  propname[0] = ch;
  propname[1] = 0;
  }
else
  {
  /* The braced form, with an optional leading ^ for negation. */
  if (cursor[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    cursor++;
    }

  /* Copy characters up to the closing brace, always leaving room for the
  terminating zero. */
  len = 0;
  while (len < (int)(sizeof(propname) / sizeof(pcre_uchar)) - 1)
    {
    ch = *(++cursor);
    if (ch == 0) goto BAD_SYNTAX;
    if (ch == CHAR_RIGHT_CURLY_BRACKET) break;
    propname[len] = ch;
    len++;
    }

  /* Leaving the loop without having seen '}' means the name was too long. */
  if (ch != CHAR_RIGHT_CURLY_BRACKET) goto BAD_SYNTAX;
  propname[len] = 0;
  }

*ptrptr = cursor;

/* Binary chop through the table of property names. */

lo = 0;
hi = PRIV(utt_size);

while (lo < hi)
  {
  int mid = (lo + hi) >> 1;
  int cmp = STRCMP_UC_C8(propname, PRIV(utt_names) + PRIV(utt)[mid].name_offset);

  if (cmp == 0)
    {
    *dptr = PRIV(utt)[mid].value;
    return PRIV(utt)[mid].type;
    }

  if (cmp > 0)
    lo = mid + 1;
  else
    hi = mid;
  }

/* The syntax was fine, but the name is not a known property. */

*errorcodeptr = ERR47;
*ptrptr = cursor;
return -1;

/* Malformed sequence: pattern ended early, or the braced name was too long. */

BAD_SYNTAX:
*errorcodeptr = ERR46;
*ptrptr = cursor;
return -1;
}
#endif
- /*************************************************
- * Read repeat counts *
- *************************************************/
- /* Read an item of the form {n,m} and return the values. This is called only
- after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
- so the syntax is guaranteed to be correct, but we need to check the values.
- Arguments:
- p pointer to first char after '{'
- minp pointer to int for min
- maxp pointer to int for max
- returned as -1 if no max
- errorcodeptr points to error code variable
- Returns: pointer to '}' on success;
- current ptr on error, with errorcodeptr set non-zero
- */
- static const pcre_uchar *
- read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
- {
- int min = 0;
- int max = -1;
- /* Read the minimum value and do a paranoid check: a negative value indicates
- an integer overflow. */
- while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
- if (min < 0 || min > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- /* Read the maximum value if there is one, and again do a paranoid on its size.
- Also, max must not be less than min. */
- if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
- {
- if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
- {
- max = 0;
- while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
- if (max < 0 || max > 65535)
- {
- *errorcodeptr = ERR5;
- return p;
- }
- if (max < min)
- {
- *errorcodeptr = ERR4;
- return p;
- }
- }
- }
- /* Fill in the required variables, and pass back the pointer to the terminating
- '}'. */
- *minp = min;
- *maxp = max;
- return p;
- }
- /*************************************************
- * Subroutine for finding forward reference *
- *************************************************/
- /* This recursive function is called only from find_parens() below. The
- top-level call starts at the beginning of the pattern. All other calls must
- start at a parenthesis. It scans along a pattern's text looking for capturing
- subpatterns, and counting them. If it finds a named pattern that matches the
- name it is given, it returns its number. Alternatively, if the name is NULL, it
- returns when it reaches a given numbered subpattern. Recursion is used to keep
- track of subpatterns that reset the capturing group numbers - the (?| feature.
- This function was originally called only from the second pass, in which we know
- that if (?< or (?' or (?P< is encountered, the name will be correctly
- terminated because that is checked in the first pass. There is now one call to
- this function in the first pass, to check for a recursive back reference by
- name (so that we can make the whole group atomic). In this case, we need check
- only up to the current position in the pattern, and that is still OK because
- any previous occurrences will have been checked. To make this work, the test
- for "end of pattern" is a check against cd->end_pattern in the main loop,
- instead of looking for a binary zero. This means that the special first-pass
- call can adjust cd->end_pattern temporarily. (Checks for binary zero while
- processing items within the loop are OK, because afterwards the main loop will
- terminate.)
- Arguments:
- ptrptr address of the current character pointer (updated)
- cd compile background data
- name name to seek, or NULL if seeking a numbered subpattern
- lorn name length, or subpattern number if name is NULL
- xmode TRUE if we are in /x mode
- utf TRUE if we are in UTF-8 / UTF-16 mode
- count pointer to the current ca…
Large files are truncated, but you can click here to view the full file