/quakeforge/trunk/libs/gib/regex.c
C | 2003 lines | 1200 code | 375 blank | 428 comment | 320 complexity | efadfcf393f2444084f238984ade1afc MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, AGPL-3.0, AGPL-1.0, Unlicense
Large files are truncated, but you can click here to view the full file
- /* Extended regular expression matching and search library,
- version 0.12.
- (Implements POSIX draft P10003.2/D11.2, except for
- internationalization features.)
- Copyright (C) 1993 Free Software Foundation, Inc.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2, or (at your option)
- any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
- /* AIX requires this to be the first thing in the file. */
- #if defined (_AIX) && !defined (REGEX_MALLOC)
- #pragma alloca
- #endif
- #define _GNU_SOURCE
- /* We need this for `regex.h', and perhaps for the Emacs include files. */
- #include <sys/types.h>
- #ifdef HAVE_CONFIG_H
- #include "config.h"
- #endif
- /* The `emacs' switch turns on certain matching commands
- that make sense only in Emacs. */
- #ifdef emacs
- #include "lisp.h"
- #include "buffer.h"
- #include "syntax.h"
- /* Emacs uses `NULL' as a predicate. */
- #undef NULL
- #else /* not emacs */
- /* We used to test for `BSTRING' here, but only GCC and Emacs define
- `BSTRING', as far as I know, and neither of them use this code. */
- #if HAVE_STRING_H || STDC_HEADERS
- #include <string.h>
- #ifndef bcmp
- #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
- #endif
- #ifndef bcopy
- #define bcopy(s, d, n) memcpy ((d), (s), (n))
- #endif
- #ifndef bzero
- #define bzero(s, n) memset ((s), 0, (n))
- #endif
- #else
- #include <strings.h>
- #endif
- #ifdef STDC_HEADERS
- #include <stdlib.h>
- #ifdef HAVE_MALLOC_H
- #include <malloc.h>
- #endif
- #else
- char *malloc ();
- char *realloc ();
- #endif
- /* Define the syntax stuff for \<, \>, etc. */
- /* This must be nonzero for the wordchar and notwordchar pattern
- commands in re_match_2. */
- #ifndef Sword
- #define Sword 1
- #endif
- #ifdef SYNTAX_TABLE
- extern char *re_syntax_table;
- #else /* not SYNTAX_TABLE */
- /* How many characters in the character set. */
- #define CHAR_SET_SIZE 256
- static char re_syntax_table[CHAR_SET_SIZE];
- static void
- init_syntax_once (void)
- {
- register int c;
- static int done = 0;
- if (done)
- return;
- bzero (re_syntax_table, sizeof re_syntax_table);
- for (c = 'a'; c <= 'z'; c++)
- re_syntax_table[c] = Sword;
- for (c = 'A'; c <= 'Z'; c++)
- re_syntax_table[c] = Sword;
- for (c = '0'; c <= '9'; c++)
- re_syntax_table[c] = Sword;
- re_syntax_table['_'] = Sword;
- done = 1;
- }
- #endif /* not SYNTAX_TABLE */
- #define SYNTAX(c) re_syntax_table[c]
- #endif /* not emacs */
- /* Get the interface, including the syntax bits. */
- #include "regex.h"
- /* isalpha etc. are used for the character classes. */
- #include <ctype.h>
- #ifndef isascii
- #define isascii(c) 1
- #endif
- #ifdef isblank
- #define ISBLANK(c) (isascii (c) && isblank (c))
- #else
- #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
- #endif
- #ifdef isgraph
- #define ISGRAPH(c) (isascii (c) && isgraph (c))
- #else
- #define ISGRAPH(c) (isascii (c) && isprint (c) && !isspace (c))
- #endif
- #define ISPRINT(c) (isascii (c) && isprint (c))
- #define ISDIGIT(c) (isascii (c) && isdigit (c))
- #define ISALNUM(c) (isascii (c) && isalnum (c))
- #define ISALPHA(c) (isascii (c) && isalpha (c))
- #define ISCNTRL(c) (isascii (c) && iscntrl (c))
- #define ISLOWER(c) (isascii (c) && islower (c))
- #define ISPUNCT(c) (isascii (c) && ispunct (c))
- #define ISSPACE(c) (isascii (c) && isspace (c))
- #define ISUPPER(c) (isascii (c) && isupper (c))
- #define ISXDIGIT(c) (isascii (c) && isxdigit (c))
- #ifndef NULL
- #define NULL 0
- #endif
- /* We remove any previous definition of `SIGN_EXTEND_CHAR',
- since ours (we hope) works properly with all combinations of
- machines, compilers, `char' and `unsigned char' argument types.
- (Per Bothner suggested the basic approach.) */
- #undef SIGN_EXTEND_CHAR
- #if __STDC__
- #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
- #else /* not __STDC__ */
- /* As in Harbison and Steele. */
- #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
- #endif
- /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
- use `alloca' instead of `malloc'. This is because using malloc in
- re_search* or re_match* could cause memory leaks when C-g is used in
- Emacs; also, malloc is slower and causes storage fragmentation. On
- the other hand, malloc is more portable, and easier to debug.
-
- Because we sometimes use alloca, some routines have to be macros,
- not functions -- `alloca'-allocated space disappears at the end of the
- function it is called in. */
- #ifdef REGEX_MALLOC
- #define REGEX_ALLOCATE malloc
- #define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
- #else /* not REGEX_MALLOC */
- /* Emacs already defines alloca, sometimes. */
- #ifndef alloca
- /* Make alloca work the best possible way. */
- #ifdef __GNUC__
- #define alloca __builtin_alloca
- #else /* not __GNUC__ */
- #if HAVE_ALLOCA_H
- #include <alloca.h>
- #else /* not __GNUC__ or HAVE_ALLOCA_H */
- #ifndef _AIX /* Already did AIX, up at the top. */
- #ifndef _WIN32
- char *alloca ();
- #endif
- #endif /* not _AIX */
- #endif /* not HAVE_ALLOCA_H */
- #endif /* not __GNUC__ */
- #endif /* not alloca */
- #define REGEX_ALLOCATE alloca
- /* Assumes a `char *destination' variable. */
- #define REGEX_REALLOCATE(source, osize, nsize) \
- (destination = (char *) alloca (nsize), \
- bcopy (source, destination, osize), \
- destination)
- #endif /* not REGEX_MALLOC */
- /* True if `size1' is nonzero and PTR is pointing anywhere inside
- `string1' or just past its end. This works if PTR is NULL, which is
- a good thing. */
- #define FIRST_STRING_P(ptr) \
- (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
- /* (Re)Allocate N items of type T using malloc, or fail. */
- #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
- #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
- #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
- #define BYTEWIDTH 8 /* In bits. */
- #define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
- typedef char boolean;
- #define false 0
- #define true 1
/* Opcodes of the compiled regular expression.  An opcode may be
   followed by argument bytes whose meaning is entirely up to that
   opcode, and zero bytes may legitimately occur in a compiled pattern.

   The numeric values are spelled out because they are part of the
   compiled format.  In particular Emacs (search.c, search_buffer)
   relies on `exactn' being 1; regex.h exports that value as
   `RE_EXACTN_VALUE', so the two must stay in step.  */
typedef enum
{
  no_op = 0,

  /* One count byte N, then N literal bytes.  */
  exactn = 1,

  /* Matches any (more or less) character.  */
  anychar = 2,

  /* Matches one character from a set.  The first argument byte is the
     number of bitmap bytes that follow.  Bits within each bitmap byte
     are ordered low-bit-first; a character belongs to the set when its
     bit is 1, and a character too large to have a bit in the map is
     never in the set.  */
  charset = 3,

  /* Same arguments as `charset', but matches any character that is
     NOT in the set.  */
  charset_not = 4,

  /* Begin remembering matched text for a register.  Arguments: one
     byte with the register number (0 .. re_nsub - 1 of the pattern
     buffer), then one byte with the number of groups nested inside
     this one.  (The latter is only here because the on_failure_jump
     handling in re_match_2 needs it.)  */
  start_memory = 5,

  /* Stop remembering matched text and store it in a register.  Takes
     the same two argument bytes as `start_memory'; the inner-group
     count is repeated because there is no easy way to locate the
     matching start_memory from a stop_memory.  */
  stop_memory = 6,

  /* Match a copy of what a register remembered.  One argument byte:
     the register number.  */
  duplicate = 7,

  /* Fail unless at beginning of line.  */
  begline = 8,

  /* Fail unless at end of line.  */
  endline = 9,

  /* Succeed at beginning of buffer (emacs) or at beginning of the
     string being matched (otherwise).  */
  begbuf = 10,

  /* Likewise for the end of the buffer/string.  */
  endbuf = 11,

  /* Two-byte relative address to jump to.  */
  jump = 12,

  /* Like `jump', but marks the end of an alternative.  */
  jump_past_alt = 13,

  /* Two-byte relative address at which to resume if matching fails.  */
  on_failure_jump = 14,

  /* Like `on_failure_jump', except that a placeholder is pushed in
     place of the current string position.  */
  on_failure_keep_string_jump = 15,

  /* Discard the most recent failure point, then jump to the two-byte
     relative address that follows.  */
  pop_failure_jump = 16,

  /* Jump back to the start of a repeat; two-byte address follows.
     Rewritten to `pop_failure_jump' when what follows clearly cannot
     match what the repeat matches (so backtracking out of completed
     repetitions is pointless), and to `jump' otherwise.  */
  maybe_pop_jump = 17,

  /* Jump to the two-byte address that follows and push a dummy failure
     point, which is thrown away if anything tries to use it for a
     failure.  A `+' construct emits this before the first repeat; it
     is also an intermediate jump used while compiling alternatives.  */
  dummy_failure_jump = 18,

  /* Push a dummy failure point and carry on.  Used at the end of
     alternatives.  */
  push_dummy_failure = 19,

  /* Two-byte relative address, then two-byte number N.  After matching
     N times, jump to the address upon failure.  */
  succeed_n = 20,

  /* Two-byte relative address, then two-byte number N.  Jump to the
     address N times, then fail.  */
  jump_n = 21,

  /* Store the trailing two-byte number at the two-byte relative
     address that precedes it.  The address *includes* the two bytes of
     the number.  */
  set_number_at = 22,

  wordchar = 23,	/* Matches any word-constituent character.  */
  notwordchar = 24,	/* Matches any char that is not a word-constituent.  */

  wordbeg = 25,		/* Succeeds if at word beginning.  */
  wordend = 26,		/* Succeeds if at word end.  */

  wordbound = 27,	/* Succeeds if at a word boundary.  */
  notwordbound = 28	/* Succeeds if not at a word boundary.  */

#ifdef emacs
  , before_dot = 29,	/* Succeeds if before point.  */
  at_dot = 30,		/* Succeeds if at point.  */
  after_dot = 31,	/* Succeeds if after point.  */

  /* Matches any character with the given syntax.  One argument byte:
     a syntax code such as Sword.  */
  syntaxspec = 32,

  /* Matches any character whose syntax is not the given one.  */
  notsyntaxspec = 33
#endif /* emacs */
} re_opcode_t;
- /* Common operations on the compiled pattern. */
/* Write NUMBER into the two bytes beginning at DESTINATION, low-order
   byte first.  NUMBER is evaluated twice.  */
#define STORE_NUMBER(destination, number)				\
  do									\
    {									\
      (destination)[0] = (number) & 0xff;				\
      (destination)[1] = (number) >> 8;					\
    }									\
  while (0)

/* Like STORE_NUMBER, but afterwards advance DESTINATION past the two
   bytes just written.  DESTINATION therefore has to be an lvalue.  */
#define STORE_NUMBER_AND_INCR(destination, number)			\
  do									\
    {									\
      STORE_NUMBER (destination, number);				\
      (destination) += 2;						\
    }									\
  while (0)
/* Put into DESTINATION the number stored in the two contiguous bytes
   starting at SOURCE.  The first byte is the low-order half; the second
   byte is sign-extended and supplies the high-order half, so negative
   numbers (e.g. backward jump offsets) are recovered correctly.

   The high half is scaled with `* 256' rather than `<< 8': when the
   sign-extended byte is negative, left-shifting it is undefined
   behavior in ISO C, whereas the multiplication is well defined and
   yields the same value for non-negative input.  */
#define EXTRACT_NUMBER(destination, source)				\
  do {									\
    (destination) = *(source) & 0377;					\
    (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) * 256;		\
  } while (0)
- #ifdef DEBUG
/* Function counterpart of the EXTRACT_NUMBER macro, compiled only when
   DEBUG is defined: decode the two bytes at SOURCE (low-order byte
   first, second byte sign-extended) and store the result in *DEST.

   The sign-extended high byte is scaled with `* 256' instead of `<< 8'
   because left-shifting a negative value is undefined behavior in
   ISO C.  */
static void
extract_number (dest, source)
    int *dest;
    unsigned char *source;
{
  int high = SIGN_EXTEND_CHAR (*(source + 1));

  *dest = *source & 0377;
  *dest += high * 256;
}
- #ifndef EXTRACT_MACROS /* To debug the macros. */
- #undef EXTRACT_NUMBER
- #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
- #endif /* not EXTRACT_MACROS */
- #endif /* DEBUG */
- /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
- SOURCE must be an lvalue. */
- #define EXTRACT_NUMBER_AND_INCR(destination, source) \
- do { \
- EXTRACT_NUMBER (destination, source); \
- (source) += 2; \
- } while (0)
- #ifdef DEBUG
/* Decode the two-byte number at *SOURCE into *DESTINATION by way of
   extract_number, then move *SOURCE forward past those two bytes.  */
static void
extract_number_and_incr (destination, source)
    int *destination;
    unsigned char **source;
{
  unsigned char *cursor = *source;

  extract_number (destination, cursor);
  *source = cursor + 2;
}
- #ifndef EXTRACT_MACROS
- #undef EXTRACT_NUMBER_AND_INCR
- #define EXTRACT_NUMBER_AND_INCR(dest, src) \
- extract_number_and_incr (&dest, &src)
- #endif /* not EXTRACT_MACROS */
- #endif /* DEBUG */
- /* If DEBUG is defined, Regex prints many voluminous messages about what
- it is doing (if the variable `debug' is nonzero). If linked with the
- main program in `iregex.c', you can enter patterns and strings
- interactively. And if linked with the main program in `main.c' and
- the other test files, you can run the already-written tests. */
- #ifdef DEBUG
- /* We use standard I/O for debugging. */
- #include <stdio.h>
- /* It is useful to test things that ``must'' be true when debugging. */
- #include <assert.h>
- static int debug = 0;
- #define DEBUG_STATEMENT(e) e
- #define DEBUG_PRINT1(x) if (debug) printf (x)
- #define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2)
- #define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3)
- #define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4)
- #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
- if (debug) print_partial_compiled_pattern (s, e)
- #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
- if (debug) print_double_string (w, s1, sz1, s2, sz2)
- extern void printchar ();
- /* Print the fastmap in human-readable form. */
- void
- print_fastmap (fastmap)
- char *fastmap;
- {
- unsigned was_a_range = 0;
- unsigned i = 0;
-
- while (i < (1 << BYTEWIDTH))
- {
- if (fastmap[i++])
- {
- was_a_range = 0;
- printchar (i - 1);
- while (i < (1 << BYTEWIDTH) && fastmap[i])
- {
- was_a_range = 1;
- i++;
- }
- if (was_a_range)
- {
- printf ("-");
- printchar (i - 1);
- }
- }
- }
- putchar ('\n');
- }
- /* Print a compiled pattern string in human-readable form, starting at
- the START pointer into it and ending just before the pointer END. */
- void
- print_partial_compiled_pattern (start, end)
- unsigned char *start;
- unsigned char *end;
- {
- int mcnt, mcnt2;
- unsigned char *p = start;
- unsigned char *pend = end;
- if (start == NULL)
- {
- printf ("(null)\n");
- return;
- }
-
- /* Loop over pattern commands. */
- while (p < pend)
- {
- switch ((re_opcode_t) *p++)
- {
- case no_op:
- printf ("/no_op");
- break;
- case exactn:
- mcnt = *p++;
- printf ("/exactn/%d", mcnt);
- do
- {
- putchar ('/');
- printchar (*p++);
- }
- while (--mcnt);
- break;
- case start_memory:
- mcnt = *p++;
- printf ("/start_memory/%d/%d", mcnt, *p++);
- break;
- case stop_memory:
- mcnt = *p++;
- printf ("/stop_memory/%d/%d", mcnt, *p++);
- break;
- case duplicate:
- printf ("/duplicate/%d", *p++);
- break;
- case anychar:
- printf ("/anychar");
- break;
- case charset:
- case charset_not:
- {
- register int c;
- printf ("/charset%s",
- (re_opcode_t) *(p - 1) == charset_not ? "_not" : "");
-
- assert (p + *p < pend);
- for (c = 0; c < *p; c++)
- {
- unsigned bit;
- unsigned char map_byte = p[1 + c];
-
- putchar ('/');
- for (bit = 0; bit < BYTEWIDTH; bit++)
- if (map_byte & (1 << bit))
- printchar (c * BYTEWIDTH + bit);
- }
- p += 1 + *p;
- break;
- }
- case begline:
- printf ("/begline");
- break;
- case endline:
- printf ("/endline");
- break;
- case on_failure_jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_jump/0/%d", mcnt);
- break;
- case on_failure_keep_string_jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/on_failure_keep_string_jump/0/%d", mcnt);
- break;
- case dummy_failure_jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/dummy_failure_jump/0/%d", mcnt);
- break;
- case push_dummy_failure:
- printf ("/push_dummy_failure");
- break;
-
- case maybe_pop_jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/maybe_pop_jump/0/%d", mcnt);
- break;
- case pop_failure_jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/pop_failure_jump/0/%d", mcnt);
- break;
-
- case jump_past_alt:
- extract_number_and_incr (&mcnt, &p);
- printf ("/jump_past_alt/0/%d", mcnt);
- break;
-
- case jump:
- extract_number_and_incr (&mcnt, &p);
- printf ("/jump/0/%d", mcnt);
- break;
- case succeed_n:
- extract_number_and_incr (&mcnt, &p);
- extract_number_and_incr (&mcnt2, &p);
- printf ("/succeed_n/0/%d/0/%d", mcnt, mcnt2);
- break;
-
- case jump_n:
- extract_number_and_incr (&mcnt, &p);
- extract_number_and_incr (&mcnt2, &p);
- printf ("/jump_n/0/%d/0/%d", mcnt, mcnt2);
- break;
-
- case set_number_at:
- extract_number_and_incr (&mcnt, &p);
- extract_number_and_incr (&mcnt2, &p);
- printf ("/set_number_at/0/%d/0/%d", mcnt, mcnt2);
- break;
-
- case wordbound:
- printf ("/wordbound");
- break;
- case notwordbound:
- printf ("/notwordbound");
- break;
- case wordbeg:
- printf ("/wordbeg");
- break;
-
- case wordend:
- printf ("/wordend");
-
- #ifdef emacs
- case before_dot:
- printf ("/before_dot");
- break;
- case at_dot:
- printf ("/at_dot");
- break;
- case after_dot:
- printf ("/after_dot");
- break;
- case syntaxspec:
- printf ("/syntaxspec");
- mcnt = *p++;
- printf ("/%d", mcnt);
- break;
-
- case notsyntaxspec:
- printf ("/notsyntaxspec");
- mcnt = *p++;
- printf ("/%d", mcnt);
- break;
- #endif /* emacs */
- case wordchar:
- printf ("/wordchar");
- break;
-
- case notwordchar:
- printf ("/notwordchar");
- break;
- case begbuf:
- printf ("/begbuf");
- break;
- case endbuf:
- printf ("/endbuf");
- break;
- default:
- printf ("?%d", *(p-1));
- }
- }
- printf ("/\n");
- }
- void
- print_compiled_pattern (bufp)
- struct re_pattern_buffer *bufp;
- {
- unsigned char *buffer = bufp->buffer;
- print_partial_compiled_pattern (buffer, buffer + bufp->used);
- printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated);
- if (bufp->fastmap_accurate && bufp->fastmap)
- {
- printf ("fastmap: ");
- print_fastmap (bufp->fastmap);
- }
- printf ("re_nsub: %d\t", bufp->re_nsub);
- printf ("regs_alloc: %d\t", bufp->regs_allocated);
- printf ("can_be_null: %d\t", bufp->can_be_null);
- printf ("newline_anchor: %d\n", bufp->newline_anchor);
- printf ("no_sub: %d\t", bufp->no_sub);
- printf ("not_bol: %d\t", bufp->not_bol);
- printf ("not_eol: %d\t", bufp->not_eol);
- printf ("syntax: %d\n", bufp->syntax);
- /* Perhaps we should print the translate table? */
- }
/* Print, with printchar, the text from WHERE to the end of the
   two-part subject STRING1 (SIZE1 bytes) followed by STRING2 (SIZE2
   bytes).  When WHERE lies within STRING1 (per FIRST_STRING_P) the rest
   of STRING1 is printed and then all of STRING2; otherwise WHERE is
   taken as a position in STRING2.  A null WHERE prints `(null)'.  */
void
print_double_string (where, string1, size1, string2, size2)
    const char *where;
    const char *string1;
    const char *string2;
    int size1;
    int size2;
{
  unsigned idx;

  if (where == NULL)
    {
      printf ("(null)");
      return;
    }

  if (FIRST_STRING_P (where))
    {
      /* Tail of the first string, then restart at the second.  */
      idx = where - string1;
      while (idx < size1)
        printchar (string1[idx++]);

      where = string2;
    }

  idx = where - string2;
  while (idx < size2)
    printchar (string2[idx++]);
}
- #else /* not DEBUG */
- #undef assert
- #define assert(e)
- #define DEBUG_STATEMENT(e)
- #define DEBUG_PRINT1(x)
- #define DEBUG_PRINT2(x1, x2)
- #define DEBUG_PRINT3(x1, x2, x3)
- #define DEBUG_PRINT4(x1, x2, x3, x4)
- #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
- #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
- #endif /* not DEBUG */
- /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
- also be assigned to arbitrarily: each pattern buffer stores its own
- syntax, so it can be changed between regex compilations. */
- reg_syntax_t re_syntax_options = RE_SYNTAX_EMACS;
- /* Specify the precise syntax of regexps for compilation. This provides
- for compatibility for various utilities which historically have
- different, incompatible syntaxes.
- The argument SYNTAX is a bit mask comprised of the various bits
- defined in regex.h. We return the old syntax. */
- reg_syntax_t
- re_set_syntax (syntax)
- reg_syntax_t syntax;
- {
- reg_syntax_t ret = re_syntax_options;
-
- re_syntax_options = syntax;
- return ret;
- }
/* Message text for each error code listed in regex.h, indexed by the
   code's numeric value; the order here therefore has to be the same as
   the order of the codes there.  REG_NOERROR has no message.

   Both the strings and the array of pointers are const: nothing is
   meant to write to the table.  */
static const char *const re_error_msg[] =
  {
    NULL,                                       /* REG_NOERROR */
    "No match",                                 /* REG_NOMATCH */
    "Invalid regular expression",               /* REG_BADPAT */
    "Invalid collation character",              /* REG_ECOLLATE */
    "Invalid character class name",             /* REG_ECTYPE */
    "Trailing backslash",                       /* REG_EESCAPE */
    "Invalid back reference",                   /* REG_ESUBREG */
    "Unmatched [ or [^",                        /* REG_EBRACK */
    "Unmatched ( or \\(",                       /* REG_EPAREN */
    "Unmatched \\{",                            /* REG_EBRACE */
    "Invalid content of \\{\\}",                /* REG_BADBR */
    "Invalid range end",                        /* REG_ERANGE */
    "Memory exhausted",                         /* REG_ESPACE */
    "Invalid preceding regular expression",     /* REG_BADRPT */
    "Premature end of regular expression",      /* REG_EEND */
    "Regular expression too big",               /* REG_ESIZE */
    "Unmatched ) or \\)",                       /* REG_ERPAREN */
  };
- /* Subroutine declarations and macros for regex_compile. */
/* The stack regex_compile keeps while compiling: a malloc'd array of
   elements together with its capacity and fill level.  */
typedef struct
{
  struct compile_stack_elt_t *stack;	/* The element array.  */
  unsigned size;			/* Elements allocated.  */
  unsigned avail;			/* Offset of next open position.  */
} compile_stack_type;
- /* But patterns can have more than `MAX_REGNUM' registers. We just
- ignore the excess. */
- typedef unsigned regnum_t;
- static void store_op1 (re_opcode_t op, unsigned char *loc, int arg), store_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2);
- static void insert_op1 (re_opcode_t op, unsigned char *loc, int arg, unsigned char *end), insert_op2 (re_opcode_t op, unsigned char *loc, int arg1, int arg2, unsigned char *end);
- static boolean at_begline_loc_p (const char *pattern, const char *p, reg_syntax_t syntax), at_endline_loc_p (const char *pattern, const char *p, int syntax);
- static boolean group_in_compile_stack (compile_stack_type compile_stack, regnum_t regnum);
- static reg_errcode_t compile_range (const char **p_ptr, const char *pend, char *translate, reg_syntax_t syntax, unsigned char *b);
- /* Fetch the next character in the uncompiled pattern---translating it
- if necessary. Also cast from a signed character in the constant
- string passed to us by the user to an unsigned char that we can use
- as an array index (in, e.g., `translate'). */
- #define PATFETCH(c) \
- do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
- if (translate) c = translate[c]; \
- } while (0)
- /* Fetch the next character in the uncompiled pattern, with no
- translation. */
- #define PATFETCH_RAW(c) \
- do {if (p == pend) return REG_EEND; \
- c = (unsigned char) *p++; \
- } while (0)
- /* Go backwards one character in the pattern. */
- #define PATUNFETCH p--
- /* If `translate' is non-null, return translate[D], else just D. We
- cast the subscript to translate because some data is declared as
- `char *', to avoid warnings when a string constant is passed. But
- when we use a character as a subscript we must make it unsigned. */
- #define TRANSLATE(d) (translate ? translate[(unsigned char) (d)] : (unsigned char)(d))
- /* Macros for outputting the compiled pattern into `buffer'. */
- /* If the buffer isn't allocated when it comes in, use this. */
- #define INIT_BUF_SIZE 32
- /* Make sure we have at least N more bytes of space in buffer. */
- #define GET_BUFFER_SPACE(n) \
- while ((unsigned long)(b - bufp->buffer + (n)) > bufp->allocated) \
- EXTEND_BUFFER ()
- /* Make sure we have one more byte of buffer space and then add C to it. */
- #define BUF_PUSH(c) \
- do { \
- GET_BUFFER_SPACE (1); \
- *b++ = (unsigned char) (c); \
- } while (0)
- /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
- #define BUF_PUSH_2(c1, c2) \
- do { \
- GET_BUFFER_SPACE (2); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
- } while (0)
- /* As with BUF_PUSH_2, except for three bytes. */
- #define BUF_PUSH_3(c1, c2, c3) \
- do { \
- GET_BUFFER_SPACE (3); \
- *b++ = (unsigned char) (c1); \
- *b++ = (unsigned char) (c2); \
- *b++ = (unsigned char) (c3); \
- } while (0)
- /* Store a jump with opcode OP at LOC to location TO. We store a
- relative address offset by the three bytes the jump itself occupies. */
- #define STORE_JUMP(op, loc, to) \
- store_op1 (op, loc, (to) - (loc) - 3)
- /* Likewise, for a two-argument jump. */
- #define STORE_JUMP2(op, loc, to, arg) \
- store_op2 (op, loc, (to) - (loc) - 3, arg)
- /* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */
- #define INSERT_JUMP(op, loc, to) \
- insert_op1 (op, loc, (to) - (loc) - 3, b)
- /* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */
- #define INSERT_JUMP2(op, loc, to, arg) \
- insert_op2 (op, loc, (to) - (loc) - 3, arg, b)
- /* This is not an arbitrary limit: the arguments which represent offsets
- into the pattern are two bytes long. So if 2^16 bytes turns out to
- be too small, many things would have to change. */
- #define MAX_BUF_SIZE (1L << 16)
- /* Grow the buffer to twice its current size via realloc (capped at
- MAX_BUF_SIZE) and reset the pointers that pointed into the old block
- to point to the correct places in the new one. If the buffer is
- already MAX_BUF_SIZE bytes, return REG_ESIZE instead of growing. */
- #define EXTEND_BUFFER() \
- do { \
- unsigned char *old_buffer = bufp->buffer; \
- if (bufp->allocated == MAX_BUF_SIZE) \
- return REG_ESIZE; \
- bufp->allocated <<= 1; \
- if (bufp->allocated > MAX_BUF_SIZE) \
- bufp->allocated = MAX_BUF_SIZE; \
- bufp->buffer = (unsigned char *) realloc (bufp->buffer, bufp->allocated);\
- if (bufp->buffer == NULL) \
- return REG_ESPACE; \
- /* If the buffer moved, move all the pointers into it. */ \
- if (old_buffer != bufp->buffer) \
- { \
- b = (b - old_buffer) + bufp->buffer; \
- begalt = (begalt - old_buffer) + bufp->buffer; \
- if (fixup_alt_jump) \
- fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
- if (laststart) \
- laststart = (laststart - old_buffer) + bufp->buffer; \
- if (pending_exact) \
- pending_exact = (pending_exact - old_buffer) + bufp->buffer; \
- } \
- } while (0)
- /* Since we have one byte reserved for the register number argument to
- {start,stop}_memory, the maximum number of groups we can report
- things about is what fits in that byte. */
- #define MAX_REGNUM 255
- /* Macros for the compile stack. */
- /* Since offsets can go either forwards or backwards, this type needs to
- be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
- typedef int pattern_offset_t;
- typedef struct compile_stack_elt_t
- {
- pattern_offset_t begalt_offset;
- pattern_offset_t fixup_alt_jump;
- pattern_offset_t inner_group_offset;
- pattern_offset_t laststart_offset;
- regnum_t regnum;
- } compile_stack_elt_t;
- #define INIT_COMPILE_STACK_SIZE 32
- #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
- #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
- /* The next available element. */
- #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
/* Set the bit for character C in the bitmap that `b' points at: byte
   C / BYTEWIDTH, bit C % BYTEWIDTH (low bit first).  C is converted to
   unsigned char before indexing.  Both uses of C are parenthesized so
   that the cast applies to the whole argument even when an expression
   such as `x - 1' is passed.  C is evaluated twice.  */
#define SET_LIST_BIT(c)							\
  (b[((unsigned char) (c)) / BYTEWIDTH]					\
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
- /* Get the next unsigned number in the uncompiled pattern. */
- #define GET_UNSIGNED_NUMBER(num) \
- { if (p != pend) \
- { \
- PATFETCH (c); \
- while (ISDIGIT (c)) \
- { \
- if (num < 0) \
- num = 0; \
- num = num * 10 + c - '0'; \
- if (p == pend) \
- break; \
- PATFETCH (c); \
- } \
- } \
- }
- #define CHAR_CLASS_MAX_LENGTH 6 /* Namely, `xdigit'. */
- #define IS_CHAR_CLASS(string) \
- (STREQ (string, "alpha") || STREQ (string, "upper") \
- || STREQ (string, "lower") || STREQ (string, "digit") \
- || STREQ (string, "alnum") || STREQ (string, "xdigit") \
- || STREQ (string, "space") || STREQ (string, "print") \
- || STREQ (string, "punct") || STREQ (string, "graph") \
- || STREQ (string, "cntrl") || STREQ (string, "blank"))
- /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
- Returns one of error codes defined in `regex.h', or zero for success.
- Assumes the `allocated' (and perhaps `buffer') and `translate'
- fields are set in BUFP on entry.
- If it succeeds, results are put in BUFP (if it returns an error, the
- contents of BUFP are undefined):
- `buffer' is the compiled pattern;
- `syntax' is set to SYNTAX;
- `used' is set to the length of the compiled pattern;
- `fastmap_accurate' is zero;
- `re_nsub' is the number of subexpressions in PATTERN;
- `not_bol' and `not_eol' are zero;
-
- The `fastmap' and `newline_anchor' fields are neither
- examined nor set. */
- static reg_errcode_t regex_compile (const char *pattern, int size, reg_syntax_t syntax, struct re_pattern_buffer *bufp);
- static reg_errcode_t
- regex_compile (pattern, size, syntax, bufp)
- const char *pattern;
- int size;
- reg_syntax_t syntax;
- struct re_pattern_buffer *bufp;
- {
- /* We fetch characters from PATTERN here. Even though PATTERN is
- `char *' (i.e., signed), we declare these variables as unsigned, so
- they can be reliably used as array indices. */
- register unsigned char c, c1;
-
- /* A random temporary spot in PATTERN. */
- const char *p1;
- /* Points to the end of the buffer, where we should append. */
- register unsigned char *b;
-
- /* Keeps track of unclosed groups. */
- compile_stack_type compile_stack;
- /* Points to the current (ending) position in the pattern. */
- const char *p = pattern;
- const char *pend = pattern + size;
-
- /* How to translate the characters in the pattern. */
- char *translate = bufp->translate;
- /* Address of the count-byte of the most recently inserted `exactn'
- command. This makes it possible to tell if a new exact-match
- character can be added to that command or if the character requires
- a new `exactn' command. */
- unsigned char *pending_exact = 0;
- /* Address of start of the most recently finished expression.
- This tells, e.g., postfix * where to find the start of its
- operand. Reset at the beginning of groups and alternatives. */
- unsigned char *laststart = 0;
- /* Address of beginning of regexp, or inside of last group. */
- unsigned char *begalt;
- /* Place in the uncompiled pattern (i.e., the {) to
- which to go back if the interval is invalid. */
- const char *beg_interval;
-
- /* Address of the place where a forward jump should go to the end of
- the containing expression. Each alternative of an `or' -- except the
- last -- ends with a forward jump of this sort. */
- unsigned char *fixup_alt_jump = 0;
- /* Counts open-groups as they are encountered. Remembered for the
- matching close-group on the compile stack, so the same register
- number is put in the stop_memory as the start_memory. */
- regnum_t regnum = 0;
- #ifdef DEBUG
- DEBUG_PRINT1 ("\nCompiling pattern: ");
- if (debug)
- {
- unsigned debug_count;
-
- for (debug_count = 0; debug_count < size; debug_count++)
- printchar (pattern[debug_count]);
- putchar ('\n');
- }
- #endif /* DEBUG */
- /* Initialize the compile stack. */
- compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
- if (compile_stack.stack == NULL)
- return REG_ESPACE;
- compile_stack.size = INIT_COMPILE_STACK_SIZE;
- compile_stack.avail = 0;
- /* Initialize the pattern buffer. */
- bufp->syntax = syntax;
- bufp->fastmap_accurate = 0;
- bufp->not_bol = bufp->not_eol = 0;
- /* Set `used' to zero, so that if we return an error, the pattern
- printer (for debugging) will think there's no pattern. We reset it
- at the end. */
- bufp->used = 0;
-
- /* Always count groups, whether or not bufp->no_sub is set. */
- bufp->re_nsub = 0;
- #if !defined (emacs) && !defined (SYNTAX_TABLE)
- /* Initialize the syntax table. */
- init_syntax_once ();
- #endif
- if (bufp->allocated == 0)
- {
- if (bufp->buffer)
- { /* If zero allocated, but buffer is non-null, try to realloc
- enough space. This loses if buffer's address is bogus, but
- that is the user's responsibility. */
- RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);
- }
- else
- { /* Caller did not allocate a buffer. Do it for them. */
- bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);
- }
- if (!bufp->buffer) return REG_ESPACE;
- bufp->allocated = INIT_BUF_SIZE;
- }
- begalt = b = bufp->buffer;
- /* Loop through the uncompiled pattern until we're at the end. */
- while (p != pend)
- {
- PATFETCH (c);
- switch (c)
- {
- case '^':
- {
- if ( /* If at start of pattern, it's an operator. */
- p == pattern + 1
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's come before. */
- || at_begline_loc_p (pattern, p, syntax))
- BUF_PUSH (begline);
- else
- goto normal_char;
- }
- break;
- case '$':
- {
- if ( /* If at end of pattern, it's an operator. */
- p == pend
- /* If context independent, it's an operator. */
- || syntax & RE_CONTEXT_INDEP_ANCHORS
- /* Otherwise, depends on what's next. */
- || at_endline_loc_p (p, pend, syntax))
- BUF_PUSH (endline);
- else
- goto normal_char;
- }
- break;
- case '+':
- case '?':
- if ((syntax & RE_BK_PLUS_QM)
- || (syntax & RE_LIMITED_OPS))
- goto normal_char;
- handle_plus:
- case '*':
- /* If there is no previous pattern... */
- if (!laststart)
- {
- if (syntax & RE_CONTEXT_INVALID_OPS)
- return REG_BADRPT;
- else if (!(syntax & RE_CONTEXT_INDEP_OPS))
- goto normal_char;
- }
- {
- /* Are we optimizing this jump? */
- boolean keep_string_p = false;
-
- /* 1 means zero (many) matches is allowed. */
- char zero_times_ok = 0, many_times_ok = 0;
- /* If there is a sequence of repetition chars, collapse it
- down to just one (the right one). We can't combine
- interval operators with these because of, e.g., `a{2}*',
- which should only match an even number of `a's. */
- for (;;)
- {
- zero_times_ok |= c != '+';
- many_times_ok |= c != '?';
- if (p == pend)
- break;
- PATFETCH (c);
- if (c == '*'
- || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))
- ;
- else if (syntax & RE_BK_PLUS_QM && c == '\\')
- {
- if (p == pend) return REG_EESCAPE;
- PATFETCH (c1);
- if (!(c1 == '+' || c1 == '?'))
- {
- PATUNFETCH;
- PATUNFETCH;
- break;
- }
- c = c1;
- }
- else
- {
- PATUNFETCH;
- break;
- }
- /* If we get here, we found another repeat character. */
- }
- /* Star, etc. applied to an empty pattern is equivalent
- to an empty pattern. */
- if (!laststart)
- break;
- /* Now we know whether or not zero matches is allowed
- and also whether or not two or more matches is allowed. */
- if (many_times_ok)
- { /* More than one repetition is allowed, so put in at the
- end a backward relative jump from `b' to before the next
- jump we're going to put in below (which jumps from
- laststart to after this jump).
- But if we are at the `*' in the exact sequence `.*\n',
- insert an unconditional jump backwards to the .,
- instead of the beginning of the loop. This way we only
- push a failure point once, instead of every time
- through the loop. */
- assert (p - 1 > pattern);
- /* Allocate the space for the jump. */
- GET_BUFFER_SPACE (3);
- /* We know we are not at the first character of the pattern,
- because laststart was nonzero. And we've already
- incremented `p', by the way, to be the character after
- the `*'. Do we have to do something analogous here
- for null bytes, because of RE_DOT_NOT_NULL? */
- if (TRANSLATE (*(p - 2)) == TRANSLATE ('.')
- && zero_times_ok
- && p < pend && TRANSLATE (*p) == TRANSLATE ('\n')
- && !(syntax & RE_DOT_NEWLINE))
- { /* We have .*\n. */
- STORE_JUMP (jump, b, laststart);
- keep_string_p = true;
- }
- else
- /* Anything else. */
- STORE_JUMP (maybe_pop_jump, b, laststart - 3);
- /* We've added more stuff to the buffer. */
- b += 3;
- }
- /* On failure, jump from laststart to b + 3, which will be the
- end of the buffer after this jump is inserted. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump
- : on_failure_jump,
- laststart, b + 3);
- pending_exact = 0;
- b += 3;
- if (!zero_times_ok)
- {
- /* At least one repetition is required, so insert a
- `dummy_failure_jump' before the initial
- `on_failure_jump' instruction of the loop. This
- effects a skip over that instruction the first time
- we hit that loop. */
- GET_BUFFER_SPACE (3);
- INSERT_JUMP (dummy_failure_jump, laststart, laststart + 6);
- b += 3;
- }
- }
- break;
- case '.':
- laststart = b;
- BUF_PUSH (anychar);
- break;
- case '[':
- {
- boolean had_char_class = false;
- if (p == pend) return REG_EBRACK;
- /* Ensure that we have enough space to push a charset: the
- opcode, the length count, and the bitset; 34 bytes in all. */
- GET_BUFFER_SPACE (34);
- laststart = b;
- /* We test `*p == '^' twice, instead of using an if
- statement, so we only need one BUF_PUSH. */
- BUF_PUSH (*p == '^' ? charset_not : charset);
- if (*p == '^')
- p++;
- /* Remember the first position in the bracket expression. */
- p1 = p;
- /* Push the number of bytes in the bitmap. */
- BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH);
- /* Clear the whole map. */
- bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH);
- /* charset_not matches newline according to a syntax bit. */
- if ((re_opcode_t) b[-2] == charset_not
- && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
- SET_LIST_BIT ('\n');
- /* Read in characters and ranges, setting map bits. */
- for (;;)
- {
- if (p == pend) return REG_EBRACK;
- PATFETCH (c);
- /* \ might escape characters inside [...] and [^...]. */
- if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\')
- {
- if (p == pend) return REG_EESCAPE;
- PATFETCH (c1);
- SET_LIST_BIT (c1);
- continue;
- }
- /* Could be the end of the bracket expression. If it's
- not (i.e., when the bracket expression is `[]' so
- far), the ']' character bit gets set way below. */
- if (c == ']' && p != p1 + 1)
- break;
- /* Look ahead to see if it's a range when the last thing
- was a character class. */
- if (had_char_class && c == '-' && *p != ']')
- return REG_ERANGE;
- /* Look ahead to see if it's a range when the last thing
- was a character: if this is a hyphen not at the
- beginning or the end of a list, then it's the range
- operator. */
- if (c == '-'
- && !(p - 2 >= pattern && p[-2] == '[')
- && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^')
- && *p != ']')
- {
- reg_errcode_t ret
- = compile_range (&p, pend, translate, syntax, b);
- if (ret != REG_NOERROR) return ret;
- }
- else if (p[0] == '-' && p[1] != ']')
- { /* This handles ranges made up of characters only. */
- reg_errcode_t ret;
- /* Move past the `-'. */
- PATFETCH (c1);
-
- ret = compile_range (&p, pend, translate, syntax, b);
- if (ret != REG_NOERROR) return ret;
- }
- /* See if we're at the beginning of a possible character
- class. */
- else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':')
- { /* Leave room for the null. */
- char str[CHAR_CLASS_MAX_LENGTH + 1];
- PATFETCH (c);
- c1 = 0;
- /* If pattern is `[[:'. */
- if (p == pend) return REG_EBRACK;
- for (;;)
- {
- PATFETCH (c);
- if (c == ':' || c == ']' || p == pend
- || c1 == CHAR_CLASS_MAX_LENGTH)
- break;
- str[c1++] = c;
- }
- str[c1] = '\0';
- /* If it isn't a word bracketed by `[:' and `:]':
- undo the ending character, the letters, and leave
- the leading `:' and `[' (but set bits for them). */
- if (c == ':' && *p == ']')
- {
- int ch;
- boolean is_alnum = STREQ (str, "alnum");
- boolean is_alpha = STREQ (str, "alpha");
- boolean is_blank = STREQ (str, "blank");
- boolean is_cntrl = STREQ (str, "cntrl");
- boolean is_digit = STREQ (str, "digit");
- boolean is_graph = STREQ (str, "graph");
- boolean is_lower = STREQ (str, "lower");
- boolean is_print = STREQ (str, "print");
- boolean is_punct = STREQ (str, "punct");
- boolean is_space = STREQ (str, "space");
- boolean is_upper = STREQ (str, "upper");
- boolean is_xdigit = STREQ (str, "xdigit");
-
- if (!IS_CHAR_CLASS (str)) return REG_ECTYPE;
- /* Throw away the ] at the end of the character
- class. */
- PATFETCH (c);
- if (p == pend) return REG_EBRACK;
- for (ch = 0; ch < 1 << BYTEWIDTH; ch++)
- {
- if ( (is_alnum && ISALNUM (ch))
- || (is_alpha && ISALPHA (ch))
- || (is_blank && ISBLANK (ch))
- || (is_cntrl && ISCNTRL (ch))
- || (is_digit && ISDIGIT (ch))
- || (is_graph && ISGRAPH (ch))
- || (is_lower && ISLOWER (ch))
- || (is_print && ISPRINT (ch))
- || (is_punct && ISPUNCT (ch))
- || (is_space && ISSPACE (ch))
- || (is_upper && ISUPPER (ch))
- || (is_xdigit && ISXDIGIT (ch)))
- SET_LIST_BIT (ch);
- }
- had_char_class = true;
- }
- else
- {
- c1++;
- while (c1--)
- PATUNFETCH;
- SET_LIST_BIT ('[');
- SET_LIST_BIT (':');
- had_char_class = false;
- }
- }
- else
- {
- had_char_class = false;
- SET_LIST_BIT (c);
- }
- }
- /* Discard any (non)matching list bytes that are all 0 at the
- end of the map. Decrease the map-length byte too. */
- while ((int) b[-1] > 0 && b[b[-1] - 1] == 0)
- b[-1]--;
- b += b[-1];
- }
- break;
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_open;
- else
- goto normal_char;
- case ')':
- if (syntax & RE_NO_BK_PARENS)
- goto handle_close;
- else
- goto normal_char;
- case '\n':
- if (syntax & RE_NEWLINE_ALT)
- goto handle_alt;
- else
- goto normal_char;
- case '|':
- if (syntax & RE_NO_BK_VBAR)
- goto handle_alt;
- else
- goto normal_char;
- case '{':
- if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
- goto handle_interval;
- else
- goto normal_char;
- case '\\':
- if (p == pend) return REG_EESCAPE;
- /* Do not translate the character after the \, so that we can
- distinguish, e.g., \B from \b, even if we normally would
- translate, e.g., B to b. */
- PATFETCH_RAW (c);
- switch (c)
- {
- case '(':
- if (syntax & RE_NO_BK_PARENS)
- goto normal_backslash;
- handle_open:
- bufp->re_nsub++;
- regnum++;
- if (COMPILE_STACK_FULL)
- {
- RETALLOC (compile_stack.stack, compile_stack.size << 1,
- compile_stack_elt_t);
- …
Large files are truncated, but you can click here to view the full file