/contrib/cvs/lib/regex.c
C | 6375 lines | 3925 code | 1031 blank | 1419 comment | 970 complexity | 31ee443b3692f366763a0c58c9d6d303 MD5 | raw file
Large files are truncated, but you can click here to view the full file
1/* Extended regular expression matching and search library, version 2 0.12. (Implements POSIX draft P10003.2/D11.2, except for 3 internationalization features.) 4 5 Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998 Free Software Foundation, Inc. 6 7 This program is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with this program; if not, write to the Free Software 19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, 20 USA. */ 21 22/* AIX requires this to be the first thing in the file. */ 23#if defined (_AIX) && !defined (REGEX_MALLOC) 24 #pragma alloca 25#endif 26 27#undef _GNU_SOURCE 28#define _GNU_SOURCE 29 30#ifdef emacs 31/* Converts the pointer to the char to BEG-based offset from the start. */ 32#define PTR_TO_OFFSET(d) \ 33 POS_AS_IN_BUFFER (MATCHING_IN_FIRST_STRING \ 34 ? (d) - string1 : (d) - (string2 - size1)) 35#define POS_AS_IN_BUFFER(p) ((p) + (NILP (re_match_object) || BUFFERP (re_match_object))) 36#else 37#define PTR_TO_OFFSET(d) 0 38#endif 39 40#ifdef HAVE_CONFIG_H 41#include <config.h> 42#endif 43 44/* We need this for `regex.h', and perhaps for the Emacs include files. */ 45#include <sys/types.h> 46 47/* This is for other GNU distributions with internationalized messages. */ 48#if HAVE_LIBINTL_H || defined (_LIBC) 49# include <libintl.h> 50#else 51# define gettext(msgid) (msgid) 52#endif 53 54#ifndef gettext_noop 55/* This define is so xgettext can find the internationalizable 56 strings. 
*/ 57#define gettext_noop(String) String 58#endif 59 60/* The `emacs' switch turns on certain matching commands 61 that make sense only in Emacs. */ 62#ifdef emacs 63 64#include "lisp.h" 65#include "buffer.h" 66 67/* Make syntax table lookup grant data in gl_state. */ 68#define SYNTAX_ENTRY_VIA_PROPERTY 69 70#include "syntax.h" 71#include "charset.h" 72#include "category.h" 73 74#define malloc xmalloc 75#define realloc xrealloc 76#define free xfree 77 78#else /* not emacs */ 79 80/* If we are not linking with Emacs proper, 81 we can't use the relocating allocator 82 even if config.h says that we can. */ 83#undef REL_ALLOC 84 85#if defined (STDC_HEADERS) || defined (_LIBC) 86#include <stdlib.h> 87#else 88char *malloc (); 89char *realloc (); 90#endif 91 92/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. 93 If nothing else has been done, use the method below. */ 94#ifdef INHIBIT_STRING_HEADER 95#if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY)) 96#if !defined (bzero) && !defined (bcopy) 97#undef INHIBIT_STRING_HEADER 98#endif 99#endif 100#endif 101 102/* This is the normal way of making sure we have a bcopy and a bzero. 103 This is used in most programs--a few other programs avoid this 104 by defining INHIBIT_STRING_HEADER. */ 105#ifndef INHIBIT_STRING_HEADER 106#if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC) 107#include <string.h> 108#ifndef bcmp 109#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n)) 110#endif 111#ifndef bcopy 112#define bcopy(s, d, n) memcpy ((d), (s), (n)) 113#endif 114#ifndef bzero 115#define bzero(s, n) memset ((s), 0, (n)) 116#endif 117#else 118#include <strings.h> 119#endif 120#endif 121 122/* Define the syntax stuff for \<, \>, etc. */ 123 124/* This must be nonzero for the wordchar and notwordchar pattern 125 commands in re_match_2. 
*/ 126#ifndef Sword 127#define Sword 1 128#endif 129 130#ifdef SWITCH_ENUM_BUG 131#define SWITCH_ENUM_CAST(x) ((int)(x)) 132#else 133#define SWITCH_ENUM_CAST(x) (x) 134#endif 135 136#ifdef SYNTAX_TABLE 137 138extern char *re_syntax_table; 139 140#else /* not SYNTAX_TABLE */ 141 142/* How many characters in the character set. */ 143#define CHAR_SET_SIZE 256 144 145static char re_syntax_table[CHAR_SET_SIZE]; 146 147static void 148init_syntax_once () 149{ 150 register int c; 151 static int done = 0; 152 153 if (done) 154 return; 155 156 bzero (re_syntax_table, sizeof re_syntax_table); 157 158 for (c = 'a'; c <= 'z'; c++) 159 re_syntax_table[c] = Sword; 160 161 for (c = 'A'; c <= 'Z'; c++) 162 re_syntax_table[c] = Sword; 163 164 for (c = '0'; c <= '9'; c++) 165 re_syntax_table[c] = Sword; 166 167 re_syntax_table['_'] = Sword; 168 169 done = 1; 170} 171 172#endif /* not SYNTAX_TABLE */ 173 174#define SYNTAX(c) re_syntax_table[c] 175 176/* Dummy macros for non-Emacs environments. */ 177#define BASE_LEADING_CODE_P(c) (0) 178#define WORD_BOUNDARY_P(c1, c2) (0) 179#define CHAR_HEAD_P(p) (1) 180#define SINGLE_BYTE_CHAR_P(c) (1) 181#define SAME_CHARSET_P(c1, c2) (1) 182#define MULTIBYTE_FORM_LENGTH(p, s) (1) 183#define STRING_CHAR(p, s) (*(p)) 184#define STRING_CHAR_AND_LENGTH(p, s, actual_len) ((actual_len) = 1, *(p)) 185#define GET_CHAR_AFTER_2(c, p, str1, end1, str2, end2) \ 186 (c = ((p) == (end1) ? *(str2) : *(p))) 187#define GET_CHAR_BEFORE_2(c, p, str1, end1, str2, end2) \ 188 (c = ((p) == (str2) ? *((end1) - 1) : *((p) - 1))) 189#endif /* not emacs */ 190 191/* Get the interface, including the syntax bits. */ 192#include "regex.h" 193 194/* isalpha etc. are used for the character classes. */ 195#include <ctype.h> 196 197/* Jim Meyering writes: 198 199 "... Some ctype macros are valid only for character codes that 200 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when 201 using /bin/cc or gcc but without giving an ansi option). 
So, all 202 ctype uses should be through macros like ISPRINT... If 203 STDC_HEADERS is defined, then autoconf has verified that the ctype 204 macros don't need to be guarded with references to isascii. ... 205 Defining isascii to 1 should let any compiler worth its salt 206 eliminate the && through constant folding." */ 207 208#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) 209#define ISASCII(c) 1 210#else 211#define ISASCII(c) isascii(c) 212#endif 213 214#ifdef isblank 215#define ISBLANK(c) (ISASCII (c) && isblank (c)) 216#else 217#define ISBLANK(c) ((c) == ' ' || (c) == '\t') 218#endif 219#ifdef isgraph 220#define ISGRAPH(c) (ISASCII (c) && isgraph (c)) 221#else 222#define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) 223#endif 224 225#define ISPRINT(c) (ISASCII (c) && isprint (c)) 226#define ISDIGIT(c) (ISASCII (c) && isdigit (c)) 227#define ISALNUM(c) (ISASCII (c) && isalnum (c)) 228#define ISALPHA(c) (ISASCII (c) && isalpha (c)) 229#define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) 230#define ISLOWER(c) (ISASCII (c) && islower (c)) 231#define ISPUNCT(c) (ISASCII (c) && ispunct (c)) 232#define ISSPACE(c) (ISASCII (c) && isspace (c)) 233#define ISUPPER(c) (ISASCII (c) && isupper (c)) 234#define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) 235 236#ifndef NULL 237#define NULL (void *)0 238#endif 239 240/* We remove any previous definition of `SIGN_EXTEND_CHAR', 241 since ours (we hope) works properly with all combinations of 242 machines, compilers, `char' and `unsigned char' argument types. 243 (Per Bothner suggested the basic approach.) */ 244#undef SIGN_EXTEND_CHAR 245#if __STDC__ 246#define SIGN_EXTEND_CHAR(c) ((signed char) (c)) 247#else /* not __STDC__ */ 248/* As in Harbison and Steele. */ 249#define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) 250#endif 251 252/* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we 253 use `alloca' instead of `malloc'. 
This is because using malloc in 254 re_search* or re_match* could cause memory leaks when C-g is used in 255 Emacs; also, malloc is slower and causes storage fragmentation. On 256 the other hand, malloc is more portable, and easier to debug. 257 258 Because we sometimes use alloca, some routines have to be macros, 259 not functions -- `alloca'-allocated space disappears at the end of the 260 function it is called in. */ 261 262#ifdef REGEX_MALLOC 263 264#define REGEX_ALLOCATE malloc 265#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) 266#define REGEX_FREE free 267 268#else /* not REGEX_MALLOC */ 269 270/* Emacs already defines alloca, sometimes. */ 271#ifndef alloca 272 273/* Make alloca work the best possible way. */ 274#ifdef __GNUC__ 275#define alloca __builtin_alloca 276#else /* not __GNUC__ */ 277#if HAVE_ALLOCA_H 278#include <alloca.h> 279#else /* not __GNUC__ or HAVE_ALLOCA_H */ 280#if 0 /* It is a bad idea to declare alloca. We always cast the result. */ 281#ifndef _AIX /* Already did AIX, up at the top. */ 282char *alloca (); 283#endif /* not _AIX */ 284#endif 285#endif /* not HAVE_ALLOCA_H */ 286#endif /* not __GNUC__ */ 287 288#endif /* not alloca */ 289 290#define REGEX_ALLOCATE alloca 291 292/* Assumes a `char *destination' variable. */ 293#define REGEX_REALLOCATE(source, osize, nsize) \ 294 (destination = (char *) alloca (nsize), \ 295 bcopy (source, destination, osize), \ 296 destination) 297 298/* No need to do anything to free, after alloca. */ 299#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ 300 301#endif /* not REGEX_MALLOC */ 302 303/* Define how to allocate the failure stack. 
*/ 304 305#if defined (REL_ALLOC) && defined (REGEX_MALLOC) 306 307#define REGEX_ALLOCATE_STACK(size) \ 308 r_alloc (&failure_stack_ptr, (size)) 309#define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 310 r_re_alloc (&failure_stack_ptr, (nsize)) 311#define REGEX_FREE_STACK(ptr) \ 312 r_alloc_free (&failure_stack_ptr) 313 314#else /* not using relocating allocator */ 315 316#ifdef REGEX_MALLOC 317 318#define REGEX_ALLOCATE_STACK malloc 319#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) 320#define REGEX_FREE_STACK free 321 322#else /* not REGEX_MALLOC */ 323 324#define REGEX_ALLOCATE_STACK alloca 325 326#define REGEX_REALLOCATE_STACK(source, osize, nsize) \ 327 REGEX_REALLOCATE (source, osize, nsize) 328/* No need to explicitly free anything. */ 329#define REGEX_FREE_STACK(arg) 330 331#endif /* not REGEX_MALLOC */ 332#endif /* not using relocating allocator */ 333 334 335/* True if `size1' is non-NULL and PTR is pointing anywhere inside 336 `string1' or just past its end. This works if PTR is NULL, which is 337 a good thing. */ 338#define FIRST_STRING_P(ptr) \ 339 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) 340 341/* (Re)Allocate N items of type T using malloc, or fail. */ 342#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) 343#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) 344#define RETALLOC_IF(addr, n, t) \ 345 if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) 346#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) 347 348#define BYTEWIDTH 8 /* In bits. */ 349 350#define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) 351 352#undef MAX 353#undef MIN 354#define MAX(a, b) ((a) > (b) ? (a) : (b)) 355#define MIN(a, b) ((a) < (b) ? (a) : (b)) 356 357typedef char boolean; 358#define false 0 359#define true 1 360 361static int re_match_2_internal (); 362 363/* These are the command codes that appear in compiled regular 364 expressions. 
Some opcodes are followed by argument bytes.  A
   command code can specify any interpretation whatsoever for its
   arguments.  Zero bytes may appear in the compiled regular expression.  */

typedef enum
{
  no_op = 0,

  /* Succeed right away--no more backtracking.  */
  succeed,

  /* Followed by one byte giving n, then by n literal bytes.  */
  exactn,

  /* Matches any (more or less) character.  */
  anychar,

  /* Matches any one char belonging to specified set.  First
     following byte is number of bitmap bytes.  Then come bytes
     for a bitmap saying which chars are in.  Bits in each byte
     are ordered low-bit-first.  A character is in the set if its
     bit is 1.  A character too large to have a bit in the map is
     automatically not in the set.  */
  charset,

  /* Same parameters as charset, but match any character that is
     not one of those specified.  */
  charset_not,

  /* Start remembering the text that is matched, for storing in a
     register.  Followed by one byte with the register number, in
     the range 0 to one less than the pattern buffer's re_nsub
     field.  Then followed by one byte with the number of groups
     inner to this one.  (This last has to be part of the
     start_memory only because we need it in the on_failure_jump
     of re_match_2.)  */
  start_memory,

  /* Stop remembering the text that is matched and store it in a
     memory register.  Followed by one byte with the register
     number, in the range 0 to one less than `re_nsub' in the
     pattern buffer, and one byte with the number of inner groups,
     just like `start_memory'.  (We need the number of inner
     groups here because we don't have any easy way of finding the
     corresponding start_memory when we're at a stop_memory.)  */
  stop_memory,

  /* Match a duplicate of something remembered.  Followed by one
     byte containing the register number.  */
  duplicate,

  /* Fail unless at beginning of line.  */
  begline,

  /* Fail unless at end of line.  */
  endline,

  /* Succeeds if at beginning of buffer (if emacs) or at beginning
     of string to be matched (if not).  */
  begbuf,

  /* Analogously, for end of buffer/string.  */
  endbuf,

  /* Followed by a two-byte relative address to which to jump.  */
  jump,

  /* Same as jump, but marks the end of an alternative.  */
  jump_past_alt,

  /* Followed by two-byte relative address of place to resume at
     in case of failure.  */
  on_failure_jump,

  /* Like on_failure_jump, but pushes a placeholder instead of the
     current string position when executed.  */
  on_failure_keep_string_jump,

  /* Throw away latest failure point and then jump to following
     two-byte relative address.  */
  pop_failure_jump,

  /* Change to pop_failure_jump if we know we won't have to backtrack
     to match; otherwise change to jump.  This is used to jump
     back to the beginning of a repeat.  If what follows this jump
     clearly won't match what the repeat does, such that we can be
     sure that there is no use backtracking out of repetitions
     already matched, then we change it to a pop_failure_jump.
     Followed by two-byte address.  */
  maybe_pop_jump,

  /* Jump to following two-byte address, and push a dummy failure
     point.  This failure point will be thrown away if an attempt
     is made to use it for a failure.  A `+' construct makes this
     before the first repeat.  Also used as an intermediary kind
     of jump when compiling an alternative.  */
  dummy_failure_jump,

  /* Push a dummy failure point and continue.  Used at the end of
     alternatives.  */
  push_dummy_failure,

  /* Followed by two-byte relative address and two-byte number n.
     After matching N times, jump to the address upon failure.  */
  succeed_n,

  /* Followed by two-byte relative address, and two-byte number n.
     Jump to the address N times, then fail.  */
  jump_n,

  /* Set the following two-byte relative address to the
     subsequent two-byte number.  The address *includes* the two
     bytes of number.  */
  set_number_at,

  wordchar,     /* Matches any word-constituent character.  */
  notwordchar,  /* Matches any char that is not a word-constituent.  */

  wordbeg,      /* Succeeds if at word beginning.  */
  wordend,      /* Succeeds if at word end.  */

  wordbound,    /* Succeeds if at a word boundary.  */
  notwordbound  /* Succeeds if not at a word boundary.  */

#ifdef emacs
  ,before_dot,  /* Succeeds if before point.  */
  at_dot,       /* Succeeds if at point.  */
  after_dot,    /* Succeeds if after point.  */

  /* Matches any character whose syntax is specified.  Followed by
     a byte which contains a syntax code, e.g., Sword.  */
  syntaxspec,

  /* Matches any character whose syntax is not that specified.  */
  notsyntaxspec,

  /* Matches any character whose category-set contains the specified
     category.  The operator is followed by a byte which contains a
     category code (mnemonic ASCII character).  */
  categoryspec,

  /* Matches any character whose category-set does not contain the
     specified category.  The operator is followed by a byte which
     contains the category code (mnemonic ASCII character).  */
  notcategoryspec
#endif /* emacs */
} re_opcode_t;

/* Common operations on the compiled pattern.  */

/* Store NUMBER in two contiguous bytes starting at DESTINATION:
   the low-order eight bits go in the first byte, the rest in the
   second.  */

#define STORE_NUMBER(destination, number) \
  do { \
    (destination)[0] = (number) & 0377; \
    (destination)[1] = (number) >> 8; \
  } while (0)

/* Same as STORE_NUMBER, except increment DESTINATION to
   the byte after where the number is stored.  Therefore, DESTINATION
   must be an lvalue.  
*/ 525 526#define STORE_NUMBER_AND_INCR(destination, number) \ 527 do { \ 528 STORE_NUMBER (destination, number); \ 529 (destination) += 2; \ 530 } while (0) 531 532/* Put into DESTINATION a number stored in two contiguous bytes starting 533 at SOURCE. */ 534 535#define EXTRACT_NUMBER(destination, source) \ 536 do { \ 537 (destination) = *(source) & 0377; \ 538 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \ 539 } while (0) 540 541#ifdef DEBUG 542static void 543extract_number (dest, source) 544 int *dest; 545 unsigned char *source; 546{ 547 int temp = SIGN_EXTEND_CHAR (*(source + 1)); 548 *dest = *source & 0377; 549 *dest += temp << 8; 550} 551 552#ifndef EXTRACT_MACROS /* To debug the macros. */ 553#undef EXTRACT_NUMBER 554#define EXTRACT_NUMBER(dest, src) extract_number (&dest, src) 555#endif /* not EXTRACT_MACROS */ 556 557#endif /* DEBUG */ 558 559/* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. 560 SOURCE must be an lvalue. */ 561 562#define EXTRACT_NUMBER_AND_INCR(destination, source) \ 563 do { \ 564 EXTRACT_NUMBER (destination, source); \ 565 (source) += 2; \ 566 } while (0) 567 568#ifdef DEBUG 569static void 570extract_number_and_incr (destination, source) 571 int *destination; 572 unsigned char **source; 573{ 574 extract_number (destination, *source); 575 *source += 2; 576} 577 578#ifndef EXTRACT_MACROS 579#undef EXTRACT_NUMBER_AND_INCR 580#define EXTRACT_NUMBER_AND_INCR(dest, src) \ 581 extract_number_and_incr (&dest, &src) 582#endif /* not EXTRACT_MACROS */ 583 584#endif /* DEBUG */ 585 586/* Store a multibyte character in three contiguous bytes starting 587 DESTINATION, and increment DESTINATION to the byte after where the 588 character is stored. Therefore, DESTINATION must be an lvalue. 
*/ 589 590#define STORE_CHARACTER_AND_INCR(destination, character) \ 591 do { \ 592 (destination)[0] = (character) & 0377; \ 593 (destination)[1] = ((character) >> 8) & 0377; \ 594 (destination)[2] = (character) >> 16; \ 595 (destination) += 3; \ 596 } while (0) 597 598/* Put into DESTINATION a character stored in three contiguous bytes 599 starting at SOURCE. */ 600 601#define EXTRACT_CHARACTER(destination, source) \ 602 do { \ 603 (destination) = ((source)[0] \ 604 | ((source)[1] << 8) \ 605 | ((source)[2] << 16)); \ 606 } while (0) 607 608 609/* Macros for charset. */ 610 611/* Size of bitmap of charset P in bytes. P is a start of charset, 612 i.e. *P is (re_opcode_t) charset or (re_opcode_t) charset_not. */ 613#define CHARSET_BITMAP_SIZE(p) ((p)[1] & 0x7F) 614 615/* Nonzero if charset P has range table. */ 616#define CHARSET_RANGE_TABLE_EXISTS_P(p) ((p)[1] & 0x80) 617 618/* Return the address of range table of charset P. But not the start 619 of table itself, but the before where the number of ranges is 620 stored. `2 +' means to skip re_opcode_t and size of bitmap. */ 621#define CHARSET_RANGE_TABLE(p) (&(p)[2 + CHARSET_BITMAP_SIZE (p)]) 622 623/* Test if C is listed in the bitmap of charset P. */ 624#define CHARSET_LOOKUP_BITMAP(p, c) \ 625 ((c) < CHARSET_BITMAP_SIZE (p) * BYTEWIDTH \ 626 && (p)[2 + (c) / BYTEWIDTH] & (1 << ((c) % BYTEWIDTH))) 627 628/* Return the address of end of RANGE_TABLE. COUNT is number of 629 ranges (which is a pair of (start, end)) in the RANGE_TABLE. `* 2' 630 is start of range and end of range. `* 3' is size of each start 631 and end. */ 632#define CHARSET_RANGE_TABLE_END(range_table, count) \ 633 ((range_table) + (count) * 2 * 3) 634 635/* Test if C is in RANGE_TABLE. A flag NOT is negated if C is in. 636 COUNT is number of ranges in RANGE_TABLE. 
*/ 637#define CHARSET_LOOKUP_RANGE_TABLE_RAW(not, c, range_table, count) \ 638 do \ 639 { \ 640 int range_start, range_end; \ 641 unsigned char *p; \ 642 unsigned char *range_table_end \ 643 = CHARSET_RANGE_TABLE_END ((range_table), (count)); \ 644 \ 645 for (p = (range_table); p < range_table_end; p += 2 * 3) \ 646 { \ 647 EXTRACT_CHARACTER (range_start, p); \ 648 EXTRACT_CHARACTER (range_end, p + 3); \ 649 \ 650 if (range_start <= (c) && (c) <= range_end) \ 651 { \ 652 (not) = !(not); \ 653 break; \ 654 } \ 655 } \ 656 } \ 657 while (0) 658 659/* Test if C is in range table of CHARSET. The flag NOT is negated if 660 C is listed in it. */ 661#define CHARSET_LOOKUP_RANGE_TABLE(not, c, charset) \ 662 do \ 663 { \ 664 /* Number of ranges in range table. */ \ 665 int count; \ 666 unsigned char *range_table = CHARSET_RANGE_TABLE (charset); \ 667 \ 668 EXTRACT_NUMBER_AND_INCR (count, range_table); \ 669 CHARSET_LOOKUP_RANGE_TABLE_RAW ((not), (c), range_table, count); \ 670 } \ 671 while (0) 672 673/* If DEBUG is defined, Regex prints many voluminous messages about what 674 it is doing (if the variable `debug' is nonzero). If linked with the 675 main program in `iregex.c', you can enter patterns and strings 676 interactively. And if linked with the main program in `main.c' and 677 the other test files, you can run the already-written tests. */ 678 679#ifdef DEBUG 680 681/* We use standard I/O for debugging. */ 682#include <stdio.h> 683 684/* It is useful to test things that ``must'' be true when debugging. 
*/ 685#include <assert.h> 686 687static int debug = 0; 688 689#define DEBUG_STATEMENT(e) e 690#define DEBUG_PRINT1(x) if (debug) printf (x) 691#define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) 692#define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) 693#define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) 694#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ 695 if (debug) print_partial_compiled_pattern (s, e) 696#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ 697 if (debug) print_double_string (w, s1, sz1, s2, sz2) 698 699 700/* Print the fastmap in human-readable form. */ 701 702void 703print_fastmap (fastmap) 704 char *fastmap; 705{ 706 unsigned was_a_range = 0; 707 unsigned i = 0; 708 709 while (i < (1 << BYTEWIDTH)) 710 { 711 if (fastmap[i++]) 712 { 713 was_a_range = 0; 714 putchar (i - 1); 715 while (i < (1 << BYTEWIDTH) && fastmap[i]) 716 { 717 was_a_range = 1; 718 i++; 719 } 720 if (was_a_range) 721 { 722 printf ("-"); 723 putchar (i - 1); 724 } 725 } 726 } 727 putchar ('\n'); 728} 729 730 731/* Print a compiled pattern string in human-readable form, starting at 732 the START pointer into it and ending just before the pointer END. */ 733 734void 735print_partial_compiled_pattern (start, end) 736 unsigned char *start; 737 unsigned char *end; 738{ 739 int mcnt, mcnt2; 740 unsigned char *p = start; 741 unsigned char *pend = end; 742 743 if (start == NULL) 744 { 745 printf ("(null)\n"); 746 return; 747 } 748 749 /* Loop over pattern commands. 
*/ 750 while (p < pend) 751 { 752 printf ("%d:\t", p - start); 753 754 switch ((re_opcode_t) *p++) 755 { 756 case no_op: 757 printf ("/no_op"); 758 break; 759 760 case exactn: 761 mcnt = *p++; 762 printf ("/exactn/%d", mcnt); 763 do 764 { 765 putchar ('/'); 766 putchar (*p++); 767 } 768 while (--mcnt); 769 break; 770 771 case start_memory: 772 mcnt = *p++; 773 printf ("/start_memory/%d/%d", mcnt, *p++); 774 break; 775 776 case stop_memory: 777 mcnt = *p++; 778 printf ("/stop_memory/%d/%d", mcnt, *p++); 779 break; 780 781 case duplicate: 782 printf ("/duplicate/%d", *p++); 783 break; 784 785 case anychar: 786 printf ("/anychar"); 787 break; 788 789 case charset: 790 case charset_not: 791 { 792 register int c, last = -100; 793 register int in_range = 0; 794 795 printf ("/charset [%s", 796 (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); 797 798 assert (p + *p < pend); 799 800 for (c = 0; c < 256; c++) 801 if (c / 8 < *p 802 && (p[1 + (c/8)] & (1 << (c % 8)))) 803 { 804 /* Are we starting a range? */ 805 if (last + 1 == c && ! in_range) 806 { 807 putchar ('-'); 808 in_range = 1; 809 } 810 /* Have we broken a range? */ 811 else if (last + 1 != c && in_range) 812 { 813 putchar (last); 814 in_range = 0; 815 } 816 817 if (! 
in_range) 818 putchar (c); 819 820 last = c; 821 } 822 823 if (in_range) 824 putchar (last); 825 826 putchar (']'); 827 828 p += 1 + *p; 829 } 830 break; 831 832 case begline: 833 printf ("/begline"); 834 break; 835 836 case endline: 837 printf ("/endline"); 838 break; 839 840 case on_failure_jump: 841 extract_number_and_incr (&mcnt, &p); 842 printf ("/on_failure_jump to %d", p + mcnt - start); 843 break; 844 845 case on_failure_keep_string_jump: 846 extract_number_and_incr (&mcnt, &p); 847 printf ("/on_failure_keep_string_jump to %d", p + mcnt - start); 848 break; 849 850 case dummy_failure_jump: 851 extract_number_and_incr (&mcnt, &p); 852 printf ("/dummy_failure_jump to %d", p + mcnt - start); 853 break; 854 855 case push_dummy_failure: 856 printf ("/push_dummy_failure"); 857 break; 858 859 case maybe_pop_jump: 860 extract_number_and_incr (&mcnt, &p); 861 printf ("/maybe_pop_jump to %d", p + mcnt - start); 862 break; 863 864 case pop_failure_jump: 865 extract_number_and_incr (&mcnt, &p); 866 printf ("/pop_failure_jump to %d", p + mcnt - start); 867 break; 868 869 case jump_past_alt: 870 extract_number_and_incr (&mcnt, &p); 871 printf ("/jump_past_alt to %d", p + mcnt - start); 872 break; 873 874 case jump: 875 extract_number_and_incr (&mcnt, &p); 876 printf ("/jump to %d", p + mcnt - start); 877 break; 878 879 case succeed_n: 880 extract_number_and_incr (&mcnt, &p); 881 extract_number_and_incr (&mcnt2, &p); 882 printf ("/succeed_n to %d, %d times", p + mcnt - start, mcnt2); 883 break; 884 885 case jump_n: 886 extract_number_and_incr (&mcnt, &p); 887 extract_number_and_incr (&mcnt2, &p); 888 printf ("/jump_n to %d, %d times", p + mcnt - start, mcnt2); 889 break; 890 891 case set_number_at: 892 extract_number_and_incr (&mcnt, &p); 893 extract_number_and_incr (&mcnt2, &p); 894 printf ("/set_number_at location %d to %d", p + mcnt - start, mcnt2); 895 break; 896 897 case wordbound: 898 printf ("/wordbound"); 899 break; 900 901 case notwordbound: 902 printf 
("/notwordbound"); 903 break; 904 905 case wordbeg: 906 printf ("/wordbeg"); 907 break; 908 909 case wordend: 910 printf ("/wordend"); 911 912#ifdef emacs 913 case before_dot: 914 printf ("/before_dot"); 915 break; 916 917 case at_dot: 918 printf ("/at_dot"); 919 break; 920 921 case after_dot: 922 printf ("/after_dot"); 923 break; 924 925 case syntaxspec: 926 printf ("/syntaxspec"); 927 mcnt = *p++; 928 printf ("/%d", mcnt); 929 break; 930 931 case notsyntaxspec: 932 printf ("/notsyntaxspec"); 933 mcnt = *p++; 934 printf ("/%d", mcnt); 935 break; 936#endif /* emacs */ 937 938 case wordchar: 939 printf ("/wordchar"); 940 break; 941 942 case notwordchar: 943 printf ("/notwordchar"); 944 break; 945 946 case begbuf: 947 printf ("/begbuf"); 948 break; 949 950 case endbuf: 951 printf ("/endbuf"); 952 break; 953 954 default: 955 printf ("?%d", *(p-1)); 956 } 957 958 putchar ('\n'); 959 } 960 961 printf ("%d:\tend of pattern.\n", p - start); 962} 963 964 965void 966print_compiled_pattern (bufp) 967 struct re_pattern_buffer *bufp; 968{ 969 unsigned char *buffer = bufp->buffer; 970 971 print_partial_compiled_pattern (buffer, buffer + bufp->used); 972 printf ("%d bytes used/%d bytes allocated.\n", bufp->used, bufp->allocated); 973 974 if (bufp->fastmap_accurate && bufp->fastmap) 975 { 976 printf ("fastmap: "); 977 print_fastmap (bufp->fastmap); 978 } 979 980 printf ("re_nsub: %d\t", bufp->re_nsub); 981 printf ("regs_alloc: %d\t", bufp->regs_allocated); 982 printf ("can_be_null: %d\t", bufp->can_be_null); 983 printf ("newline_anchor: %d\n", bufp->newline_anchor); 984 printf ("no_sub: %d\t", bufp->no_sub); 985 printf ("not_bol: %d\t", bufp->not_bol); 986 printf ("not_eol: %d\t", bufp->not_eol); 987 printf ("syntax: %d\n", bufp->syntax); 988 /* Perhaps we should print the translate table? 
*/ 989} 990 991 992void 993print_double_string (where, string1, size1, string2, size2) 994 const char *where; 995 const char *string1; 996 const char *string2; 997 int size1; 998 int size2; 999{ 1000 unsigned this_char; 1001 1002 if (where == NULL) 1003 printf ("(null)"); 1004 else 1005 { 1006 if (FIRST_STRING_P (where)) 1007 { 1008 for (this_char = where - string1; this_char < size1; this_char++) 1009 putchar (string1[this_char]); 1010 1011 where = string2; 1012 } 1013 1014 for (this_char = where - string2; this_char < size2; this_char++) 1015 putchar (string2[this_char]); 1016 } 1017} 1018 1019#else /* not DEBUG */ 1020 1021#undef assert 1022#define assert(e) 1023 1024#define DEBUG_STATEMENT(e) 1025#define DEBUG_PRINT1(x) 1026#define DEBUG_PRINT2(x1, x2) 1027#define DEBUG_PRINT3(x1, x2, x3) 1028#define DEBUG_PRINT4(x1, x2, x3, x4) 1029#define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) 1030#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) 1031 1032#endif /* not DEBUG */ 1033 1034/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can 1035 also be assigned to arbitrarily: each pattern buffer stores its own 1036 syntax, so it can be changed between regex compilations. */ 1037/* This has no initializer because initialized variables in Emacs 1038 become read-only after dumping. */ 1039reg_syntax_t re_syntax_options; 1040 1041 1042/* Specify the precise syntax of regexps for compilation. This provides 1043 for compatibility for various utilities which historically have 1044 different, incompatible syntaxes. 1045 1046 The argument SYNTAX is a bit mask comprised of the various bits 1047 defined in regex.h. We return the old syntax. */ 1048 1049reg_syntax_t 1050re_set_syntax (syntax) 1051 reg_syntax_t syntax; 1052{ 1053 reg_syntax_t ret = re_syntax_options; 1054 1055 re_syntax_options = syntax; 1056 return ret; 1057} 1058 1059/* This table gives an error message for each of the error codes listed 1060 in regex.h. 
Obviously the order here has to be the same as there.
   POSIX doesn't require that we do anything for REG_NOERROR,
   but why not be nice?

   Each entry is wrapped in gettext_noop, which marks the string for
   xgettext without translating it here.  */

static const char *re_error_msgid[] =
  {
    gettext_noop ("Success"),   /* REG_NOERROR */
    gettext_noop ("No match"),  /* REG_NOMATCH */
    gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
    gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
    gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
    gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
    gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
    gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
    gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
    gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
    gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
    gettext_noop ("Invalid range end"), /* REG_ERANGE */
    gettext_noop ("Memory exhausted"), /* REG_ESPACE */
    gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
    gettext_noop ("Premature end of regular expression"), /* REG_EEND */
    gettext_noop ("Regular expression too big"), /* REG_ESIZE */
    gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
  };

/* Avoiding alloca during matching, to placate r_alloc.  */

/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
   searching and matching functions should not call alloca.  On some
   systems, alloca is implemented in terms of malloc, and if we're
   using the relocating allocator routines, then malloc could cause a
   relocation, which might (if the strings being searched are in the
   ralloc heap) shift the data out from underneath the regexp
   routines.

   Here's another reason to avoid allocation: Emacs
   processes input from X in a signal handler; processing X input may
   call malloc; if input arrives while a matching routine is calling
   malloc, then we're scrod.  
But Emacs can't just block input while 1099 calling matching routines; then we don't notice interrupts when 1100 they come in. So, Emacs blocks input around all regexp calls 1101 except the matching calls, which it leaves unprotected, in the 1102 faith that they will not malloc. */ 1103 1104/* Normally, this is fine. */ 1105#define MATCH_MAY_ALLOCATE 1106 1107/* When using GNU C, we are not REALLY using the C alloca, no matter 1108 what config.h may say. So don't take precautions for it. */ 1109#ifdef __GNUC__ 1110#undef C_ALLOCA 1111#endif 1112 1113/* The match routines may not allocate if (1) they would do it with malloc 1114 and (2) it's not safe for them to use malloc. 1115 Note that if REL_ALLOC is defined, matching would not use malloc for the 1116 failure stack, but we would still use it for the register vectors; 1117 so REL_ALLOC should not affect this. */ 1118#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs) 1119#undef MATCH_MAY_ALLOCATE 1120#endif 1121 1122 1123/* Failure stack declarations and macros; both re_compile_fastmap and 1124 re_match_2 use a failure stack. These have to be macros because of 1125 REGEX_ALLOCATE_STACK. */ 1126 1127 1128/* Approximate number of failure points for which to initially allocate space 1129 when matching. If this number is exceeded, we allocate more 1130 space, so it is not a hard limit. */ 1131#ifndef INIT_FAILURE_ALLOC 1132#define INIT_FAILURE_ALLOC 20 1133#endif 1134 1135/* Roughly the maximum number of failure points on the stack. Would be 1136 exactly that if always used TYPICAL_FAILURE_SIZE items each time we failed. 1137 This is a variable only so users of regex can assign to it; we never 1138 change it ourselves. */ 1139#if defined (MATCH_MAY_ALLOCATE) 1140/* Note that 4400 is enough to cause a crash on Alpha OSF/1, 1141 whose default stack limit is 2mb. In order for a larger 1142 value to work reliably, you have to try to make it accord 1143 with the process stack limit. 
*/ 1144int re_max_failures = 40000; 1145#else 1146int re_max_failures = 4000; 1147#endif 1148 1149union fail_stack_elt 1150{ 1151 unsigned char *pointer; 1152 int integer; 1153}; 1154 1155typedef union fail_stack_elt fail_stack_elt_t; 1156 1157typedef struct 1158{ 1159 fail_stack_elt_t *stack; 1160 unsigned size; 1161 unsigned avail; /* Offset of next open position. */ 1162} fail_stack_type; 1163 1164#define FAIL_STACK_EMPTY() (fail_stack.avail == 0) 1165#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0) 1166#define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size) 1167 1168 1169/* Define macros to initialize and free the failure stack. 1170 Do `return -2' if the alloc fails. */ 1171 1172#ifdef MATCH_MAY_ALLOCATE 1173#define INIT_FAIL_STACK() \ 1174 do { \ 1175 fail_stack.stack = (fail_stack_elt_t *) \ 1176 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * TYPICAL_FAILURE_SIZE \ 1177 * sizeof (fail_stack_elt_t)); \ 1178 \ 1179 if (fail_stack.stack == NULL) \ 1180 return -2; \ 1181 \ 1182 fail_stack.size = INIT_FAILURE_ALLOC; \ 1183 fail_stack.avail = 0; \ 1184 } while (0) 1185 1186#define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack) 1187#else 1188#define INIT_FAIL_STACK() \ 1189 do { \ 1190 fail_stack.avail = 0; \ 1191 } while (0) 1192 1193#define RESET_FAIL_STACK() 1194#endif 1195 1196 1197/* Double the size of FAIL_STACK, up to a limit 1198 which allows approximately `re_max_failures' items. 1199 1200 Return 1 if succeeds, and 0 if either ran out of memory 1201 allocating space for it or it was already too large. 1202 1203 REGEX_REALLOCATE_STACK requires `destination' be declared. */ 1204 1205/* Factor to increase the failure stack size by 1206 when we increase it. 1207 This used to be 2, but 2 was too wasteful 1208 because the old discarded stacks added up to as much space 1209 were as ultimate, maximum-size stack. 
*/ 1210#define FAIL_STACK_GROWTH_FACTOR 4 1211 1212#define GROW_FAIL_STACK(fail_stack) \ 1213 (((fail_stack).size * sizeof (fail_stack_elt_t) \ 1214 >= re_max_failures * TYPICAL_FAILURE_SIZE) \ 1215 ? 0 \ 1216 : ((fail_stack).stack \ 1217 = (fail_stack_elt_t *) \ 1218 REGEX_REALLOCATE_STACK ((fail_stack).stack, \ 1219 (fail_stack).size * sizeof (fail_stack_elt_t), \ 1220 MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ 1221 ((fail_stack).size * sizeof (fail_stack_elt_t) \ 1222 * FAIL_STACK_GROWTH_FACTOR))), \ 1223 \ 1224 (fail_stack).stack == NULL \ 1225 ? 0 \ 1226 : ((fail_stack).size \ 1227 = (MIN (re_max_failures * TYPICAL_FAILURE_SIZE, \ 1228 ((fail_stack).size * sizeof (fail_stack_elt_t) \ 1229 * FAIL_STACK_GROWTH_FACTOR)) \ 1230 / sizeof (fail_stack_elt_t)), \ 1231 1))) 1232 1233 1234/* Push pointer POINTER on FAIL_STACK. 1235 Return 1 if was able to do so and 0 if ran out of memory allocating 1236 space to do so. */ 1237#define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \ 1238 ((FAIL_STACK_FULL () \ 1239 && !GROW_FAIL_STACK (FAIL_STACK)) \ 1240 ? 0 \ 1241 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \ 1242 1)) 1243 1244/* Push a pointer value onto the failure stack. 1245 Assumes the variable `fail_stack'. Probably should only 1246 be called from within `PUSH_FAILURE_POINT'. */ 1247#define PUSH_FAILURE_POINTER(item) \ 1248 fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item) 1249 1250/* This pushes an integer-valued item onto the failure stack. 1251 Assumes the variable `fail_stack'. Probably should only 1252 be called from within `PUSH_FAILURE_POINT'. */ 1253#define PUSH_FAILURE_INT(item) \ 1254 fail_stack.stack[fail_stack.avail++].integer = (item) 1255 1256/* Push a fail_stack_elt_t value onto the failure stack. 1257 Assumes the variable `fail_stack'. Probably should only 1258 be called from within `PUSH_FAILURE_POINT'. 
*/ 1259#define PUSH_FAILURE_ELT(item) \ 1260 fail_stack.stack[fail_stack.avail++] = (item) 1261 1262/* These three POP... operations complement the three PUSH... operations. 1263 All assume that `fail_stack' is nonempty. */ 1264#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer 1265#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer 1266#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail] 1267 1268/* Used to omit pushing failure point id's when we're not debugging. */ 1269#ifdef DEBUG 1270#define DEBUG_PUSH PUSH_FAILURE_INT 1271#define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT () 1272#else 1273#define DEBUG_PUSH(item) 1274#define DEBUG_POP(item_addr) 1275#endif 1276 1277 1278/* Push the information about the state we will need 1279 if we ever fail back to it. 1280 1281 Requires variables fail_stack, regstart, regend, reg_info, and 1282 num_regs be declared. GROW_FAIL_STACK requires `destination' be 1283 declared. 1284 1285 Does `return FAILURE_CODE' if runs out of memory. */ 1286 1287#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \ 1288 do { \ 1289 char *destination; \ 1290 /* Must be int, so when we don't save any registers, the arithmetic \ 1291 of 0 + -1 isn't done as unsigned. */ \ 1292 int this_reg; \ 1293 \ 1294 DEBUG_STATEMENT (failure_id++); \ 1295 DEBUG_STATEMENT (nfailure_points_pushed++); \ 1296 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \ 1297 DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail);\ 1298 DEBUG_PRINT2 (" size: %d\n", (fail_stack).size);\ 1299 \ 1300 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \ 1301 DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \ 1302 \ 1303 /* Ensure we have enough space allocated for what we will push. 
*/ \ 1304 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \ 1305 { \ 1306 if (!GROW_FAIL_STACK (fail_stack)) \ 1307 return failure_code; \ 1308 \ 1309 DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \ 1310 (fail_stack).size); \ 1311 DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS);\ 1312 } \ 1313 \ 1314 /* Push the info, starting with the registers. */ \ 1315 DEBUG_PRINT1 ("\n"); \ 1316 \ 1317 if (1) \ 1318 for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \ 1319 this_reg++) \ 1320 { \ 1321 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \ 1322 DEBUG_STATEMENT (num_regs_pushed++); \ 1323 \ 1324 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 1325 PUSH_FAILURE_POINTER (regstart[this_reg]); \ 1326 \ 1327 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 1328 PUSH_FAILURE_POINTER (regend[this_reg]); \ 1329 \ 1330 DEBUG_PRINT2 (" info: 0x%x\n ", reg_info[this_reg]); \ 1331 DEBUG_PRINT2 (" match_null=%d", \ 1332 REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \ 1333 DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \ 1334 DEBUG_PRINT2 (" matched_something=%d", \ 1335 MATCHED_SOMETHING (reg_info[this_reg])); \ 1336 DEBUG_PRINT2 (" ever_matched=%d", \ 1337 EVER_MATCHED_SOMETHING (reg_info[this_reg])); \ 1338 DEBUG_PRINT1 ("\n"); \ 1339 PUSH_FAILURE_ELT (reg_info[this_reg].word); \ 1340 } \ 1341 \ 1342 DEBUG_PRINT2 (" Pushing low active reg: %d\n", lowest_active_reg);\ 1343 PUSH_FAILURE_INT (lowest_active_reg); \ 1344 \ 1345 DEBUG_PRINT2 (" Pushing high active reg: %d\n", highest_active_reg);\ 1346 PUSH_FAILURE_INT (highest_active_reg); \ 1347 \ 1348 DEBUG_PRINT2 (" Pushing pattern 0x%x: ", pattern_place); \ 1349 DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \ 1350 PUSH_FAILURE_POINTER (pattern_place); \ 1351 \ 1352 DEBUG_PRINT2 (" Pushing string 0x%x: `", string_place); \ 1353 DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \ 1354 size2); \ 1355 DEBUG_PRINT1 ("'\n"); \ 1356 
PUSH_FAILURE_POINTER (string_place); \ 1357 \ 1358 DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \ 1359 DEBUG_PUSH (failure_id); \ 1360 } while (0) 1361 1362/* This is the number of items that are pushed and popped on the stack 1363 for each register. */ 1364#define NUM_REG_ITEMS 3 1365 1366/* Individual items aside from the registers. */ 1367#ifdef DEBUG 1368#define NUM_NONREG_ITEMS 5 /* Includes failure point id. */ 1369#else 1370#define NUM_NONREG_ITEMS 4 1371#endif 1372 1373/* Estimate the size of data pushed by a typical failure stack entry. 1374 An estimate is all we need, because all we use this for 1375 is to choose a limit for how big to make the failure stack. */ 1376 1377#define TYPICAL_FAILURE_SIZE 20 1378 1379/* This is how many items we actually use for a failure point. 1380 It depends on the regexp. */ 1381#define NUM_FAILURE_ITEMS \ 1382 (((0 \ 1383 ? 0 : highest_active_reg - lowest_active_reg + 1) \ 1384 * NUM_REG_ITEMS) \ 1385 + NUM_NONREG_ITEMS) 1386 1387/* How many items can still be added to the stack without overflowing it. */ 1388#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail) 1389 1390 1391/* Pops what PUSH_FAIL_STACK pushes. 1392 1393 We restore into the parameters, all of which should be lvalues: 1394 STR -- the saved data position. 1395 PAT -- the saved pattern position. 1396 LOW_REG, HIGH_REG -- the highest and lowest active registers. 1397 REGSTART, REGEND -- arrays of string positions. 1398 REG_INFO -- array of information about each subexpression. 1399 1400 Also assumes the variables `fail_stack' and (if debugging), `bufp', 1401 `pend', `string1', `size1', `string2', and `size2'. 
*/ 1402 1403#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\ 1404{ \ 1405 DEBUG_STATEMENT (fail_stack_elt_t failure_id;) \ 1406 int this_reg; \ 1407 const unsigned char *string_temp; \ 1408 \ 1409 assert (!FAIL_STACK_EMPTY ()); \ 1410 \ 1411 /* Remove failure points and point to how many regs pushed. */ \ 1412 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \ 1413 DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \ 1414 DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \ 1415 \ 1416 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \ 1417 \ 1418 DEBUG_POP (&failure_id); \ 1419 DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \ 1420 \ 1421 /* If the saved string location is NULL, it came from an \ 1422 on_failure_keep_string_jump opcode, and we want to throw away the \ 1423 saved NULL, thus retaining our current position in the string. */ \ 1424 string_temp = POP_FAILURE_POINTER (); \ 1425 if (string_temp != NULL) \ 1426 str = (const char *) string_temp; \ 1427 \ 1428 DEBUG_PRINT2 (" Popping string 0x%x: `", str); \ 1429 DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \ 1430 DEBUG_PRINT1 ("'\n"); \ 1431 \ 1432 pat = (unsigned char *) POP_FAILURE_POINTER (); \ 1433 DEBUG_PRINT2 (" Popping pattern 0x%x: ", pat); \ 1434 DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \ 1435 \ 1436 /* Restore register info. 
*/ \ 1437 high_reg = (unsigned) POP_FAILURE_INT (); \ 1438 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \ 1439 \ 1440 low_reg = (unsigned) POP_FAILURE_INT (); \ 1441 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \ 1442 \ 1443 if (1) \ 1444 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \ 1445 { \ 1446 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \ 1447 \ 1448 reg_info[this_reg].word = POP_FAILURE_ELT (); \ 1449 DEBUG_PRINT2 (" info: 0x%x\n", reg_info[this_reg]); \ 1450 \ 1451 regend[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1452 DEBUG_PRINT2 (" end: 0x%x\n", regend[this_reg]); \ 1453 \ 1454 regstart[this_reg] = (const char *) POP_FAILURE_POINTER (); \ 1455 DEBUG_PRINT2 (" start: 0x%x\n", regstart[this_reg]); \ 1456 } \ 1457 else \ 1458 { \ 1459 for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \ 1460 { \ 1461 reg_info[this_reg].word.integer = 0; \ 1462 regend[this_reg] = 0; \ 1463 regstart[this_reg] = 0; \ 1464 } \ 1465 highest_active_reg = high_reg; \ 1466 } \ 1467 \ 1468 set_regs_matched_done = 0; \ 1469 DEBUG_STATEMENT (nfailure_points_popped++); \ 1470} /* POP_FAILURE_POINT */ 1471 1472 1473 1474/* Structure for per-register (a.k.a. per-group) information. 1475 Other register information, such as the 1476 starting and ending positions (which are addresses), and the list of 1477 inner groups (which is a bits list) are maintained in separate 1478 variables. 1479 1480 We are making a (strictly speaking) nonportable assumption here: that 1481 the compiler will pack our bit fields into something that fits into 1482 the type of `word', i.e., is something that fits into one item on the 1483 failure stack. */ 1484 1485typedef union 1486{ 1487 fail_stack_elt_t word; 1488 struct 1489 { 1490 /* This field is one if this group can match the empty string, 1491 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. 
*/ 1492#define MATCH_NULL_UNSET_VALUE 3 1493 unsigned match_null_string_p : 2; 1494 unsigned is_active : 1; 1495 unsigned matched_something : 1; 1496 unsigned ever_matched_something : 1; 1497 } bits; 1498} register_info_type; 1499 1500#define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p) 1501#define IS_ACTIVE(R) ((R).bits.is_active) 1502#define MATCHED_SOMETHING(R) ((R).bits.matched_something) 1503#define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something) 1504 1505 1506/* Call this when have matched a real character; it sets `matched' flags 1507 for the subexpressions which we are currently inside. Also records 1508 that those subexprs have matched. */ 1509#define SET_REGS_MATCHED() \ 1510 do \ 1511 { \ 1512 if (!set_regs_matched_done) \ 1513 { \ 1514 unsigned r; \ 1515 set_regs_matched_done = 1; \ 1516 for (r = lowest_active_reg; r <= highest_active_reg; r++) \ 1517 { \ 1518 MATCHED_SOMETHING (reg_info[r]) \ 1519 = EVER_MATCHED_SOMETHING (reg_info[r]) \ 1520 = 1; \ 1521 } \ 1522 } \ 1523 } \ 1524 while (0) 1525 1526/* Registers are set to a sentinel when they haven't yet matched. */ 1527static char reg_unset_dummy; 1528#define REG_UNSET_VALUE (®_unset_dummy) 1529#define REG_UNSET(e) ((e) == REG_UNSET_VALUE) 1530 1531/* Subroutine declarations and macros for regex_compile. */ 1532 1533static void store_op1 (), store_op2 (); 1534static void insert_op1 (), insert_op2 (); 1535static boolean at_begline_loc_p (), at_endline_loc_p (); 1536static boolean group_in_compile_stack (); 1537 1538/* Fetch the next character in the uncompiled pattern---translating it 1539 if necessary. Also cast from a signed character in the constant 1540 string passed to us by the user to an unsigned char that we can use 1541 as an array index (in, e.g., `translate'). 
*/ 1542#ifndef PATFETCH 1543#define PATFETCH(c) \ 1544 do {if (p == pend) return REG_EEND; \ 1545 c = (unsigned char) *p++; \ 1546 if (RE_TRANSLATE_P (translate)) c = RE_TRANSLATE (translate, c); \ 1547 } while (0) 1548#endif 1549 1550/* Fetch the next character in the uncompiled pattern, with no 1551 translation. */ 1552#define PATFETCH_RAW(c) \ 1553 do {if (p == pend) return REG_EEND; \ 1554 c = (unsigned char) *p++; \ 1555 } while (0) 1556 1557/* Go backwards one character in the pattern. */ 1558#define PATUNFETCH p-- 1559 1560 1561/* If `translate' is non-null, return translate[D], else just D. We 1562 cast the subscript to translate because some data is declared as 1563 `char *', to avoid warnings when a string constant is passed. But 1564 when we use a character as a subscript we must make it unsigned. */ 1565#ifndef TRANSLATE 1566#define TRANSLATE(d) \ 1567 (RE_TRANSLATE_P (translate) \ 1568 ? (unsigned) RE_TRANSLATE (translate, (unsigned) (d)) : (d)) 1569#endif 1570 1571 1572/* Macros for outputting the compiled pattern into `buffer'. */ 1573 1574/* If the buffer isn't allocated when it comes in, use this. */ 1575#define INIT_BUF_SIZE 32 1576 1577/* Make sure we have at least N more bytes of space in buffer. */ 1578#define GET_BUFFER_SPACE(n) \ 1579 while (b - bufp->buffer + (n) > bufp->allocated) \ 1580 EXTEND_BUFFER () 1581 1582/* Make sure we have one more byte of buffer space and then add C to it. */ 1583#define BUF_PUSH(c) \ 1584 do { \ 1585 GET_BUFFER_SPACE (1); \ 1586 *b++ = (unsigned char) (c); \ 1587 } while (0) 1588 1589 1590/* Ensure we have two more bytes of buffer space and then append C1 and C2. */ 1591#define BUF_PUSH_2(c1, c2) \ 1592 do { \ 1593 GET_BUFFER_SPACE (2); \ 1594 *b++ = (unsigned char) (c1); \ 1595 *b++ = (unsigned char) (c2); \ 1596 } while (0) 1597 1598 1599/* As with BUF_PUSH_2, except for three bytes. 
*/ 1600#define BUF_PUSH_3(c1, c2, c3) \ 1601 do { \ 1602 GET_BUFFER_SPACE (3); \ 1603 *b++ = (unsigned char) (c1); \ 1604 *b++ = (unsigned char) (c2); \ 1605 *b++ = (unsigned char) (c3); \ 1606 } while (0) 1607 1608 1609/* Store a jump with opcode OP at LOC to location TO. We store a 1610 relative address offset by the three bytes the jump itself occupies. */ 1611#define STORE_JUMP(op, loc, to) \ 1612 store_op1 (op, loc, (to) - (loc) - 3) 1613 1614/* Likewise, for a two-argument jump. */ 1615#define STORE_JUMP2(op, loc, to, arg) \ 1616 store_op2 (op, loc, (to) - (loc) - 3, arg) 1617 1618/* Like `STORE_JUMP', but for inserting. Assume `b' is the buffer end. */ 1619#define INSERT_JUMP(op, loc, to) \ 1620 insert_op1 (op, loc, (to) - (loc) - 3, b) 1621 1622/* Like `STORE_JUMP2', but for inserting. Assume `b' is the buffer end. */ 1623#define INSERT_JUMP2(op, loc, to, arg) \ 1624 insert_op2 (op, loc, (to) - (loc) - 3, arg, b) 1625 1626 1627/* This is not an arbitrary limit: the arguments which represent offsets 1628 into the pattern are two bytes long. So if 2^16 bytes turns out to 1629 be too small, many things would have to change. */ 1630#define MAX_BUF_SIZE (1L << 16) 1631 1632 1633/* E…
Large files files are truncated, but you can click here to view the full file