/js/lib/Socket.IO-node/support/expresso/deps/jscoverage/js/jsregexp.cpp
C++ | 1906 lines | 1433 code | 148 blank | 325 comment | 293 complexity | 3040154f5eb7ab91c45eedc02b82c675 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1, MPL-2.0-no-copyleft-exception, BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
- * vim: set sw=4 ts=8 et tw=78:
- *
- * ***** BEGIN LICENSE BLOCK *****
- * Version: MPL 1.1/GPL 2.0/LGPL 2.1
- *
- * The contents of this file are subject to the Mozilla Public License Version
- * 1.1 (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS" basis,
- * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
- * for the specific language governing rights and limitations under the
- * License.
- *
- * The Original Code is Mozilla Communicator client code, released
- * March 31, 1998.
- *
- * The Initial Developer of the Original Code is
- * Netscape Communications Corporation.
- * Portions created by the Initial Developer are Copyright (C) 1998
- * the Initial Developer. All Rights Reserved.
- *
- * Contributor(s):
- *
- * Alternatively, the contents of this file may be used under the terms of
- * either of the GNU General Public License Version 2 or later (the "GPL"),
- * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
- * in which case the provisions of the GPL or the LGPL are applicable instead
- * of those above. If you wish to allow use of your version of this file only
- * under the terms of either the GPL or the LGPL, and not to allow others to
- * use your version of this file under the terms of the MPL, indicate your
- * decision by deleting the provisions above and replace them with the notice
- * and other provisions required by the GPL or the LGPL. If you do not delete
- * the provisions above, a recipient may use your version of this file under
- * the terms of any one of the MPL, the GPL or the LGPL.
- *
- * ***** END LICENSE BLOCK ***** */
- /*
- * JS regular expressions, after Perl.
- */
- #include "jsstddef.h"
- #include <stdlib.h>
- #include <string.h>
- #include <stdarg.h>
- #include "jstypes.h"
- #include "jsarena.h" /* Added by JSIFY */
- #include "jsutil.h" /* Added by JSIFY */
- #include "jsapi.h"
- #include "jsarray.h"
- #include "jsatom.h"
- #include "jsbuiltins.h"
- #include "jscntxt.h"
- #include "jsversion.h"
- #include "jsfun.h"
- #include "jsgc.h"
- #include "jsinterp.h"
- #include "jslock.h"
- #include "jsnum.h"
- #include "jsobj.h"
- #include "jsopcode.h"
- #include "jsregexp.h"
- #include "jsscan.h"
- #include "jsscope.h"
- #include "jsstr.h"
- #ifdef JS_TRACER
- #include "jstracer.h"
- using namespace avmplus;
- using namespace nanojit;
/*
 * FIXME: this duplicates a definition in jstracer.cpp.  It is repeated here
 * for now so that it stays private to the files that need it.
 *
 * When JS_JIT_SPEW is defined, debug_only_v(x) runs x only if the
 * TRACEMONKEY environment variable is set and contains "verbose".  Without
 * JS_JIT_SPEW the macro expands to nothing.
 */
#ifdef JS_JIT_SPEW
static bool verbose_debug = getenv("TRACEMONKEY") && strstr(getenv("TRACEMONKEY"), "verbose");
#define debug_only_v(x) if (verbose_debug) { x; }
#else
#define debug_only_v(x)
#endif
- #endif
- typedef enum REOp {
- #define REOP_DEF(opcode, name) opcode,
- #include "jsreops.tbl"
- #undef REOP_DEF
- REOP_LIMIT /* META: no operator >= to this */
- } REOp;
- #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS)
- #ifdef REGEXP_DEBUG
- const char *reop_names[] = {
- #define REOP_DEF(opcode, name) name,
- #include "jsreops.tbl"
- #undef REOP_DEF
- NULL
- };
- #endif
- #ifdef __GNUC__
- static int
- re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2)));
- #endif
- #ifdef REGEXP_DEBUG
- static int
- re_debug(const char *fmt, ...)
- {
- va_list ap;
- int retval;
- va_start(ap, fmt);
- retval = vprintf(fmt, ap);
- va_end(ap);
- return retval;
- }
- static void
- re_debug_chars(const jschar *chrs, size_t length)
- {
- int i = 0;
- printf(" \"");
- while (*chrs && i++ < length) {
- putchar((char)*chrs++);
- }
- printf("\"");
- }
- #else /* !REGEXP_DEBUG */
- /* This should be optimized to a no-op by our tier-1 compilers. */
- static int
- re_debug(const char *fmt, ...)
- {
- return 0;
- }
- static void
- re_debug_chars(const jschar *chrs, size_t length)
- {
- }
- #endif /* !REGEXP_DEBUG */
- struct RENode {
- REOp op; /* r.e. op bytecode */
- RENode *next; /* next in concatenation order */
- void *kid; /* first operand */
- union {
- void *kid2; /* second operand */
- jsint num; /* could be a number */
- size_t parenIndex; /* or a parenthesis index */
- struct { /* or a quantifier range */
- uintN min;
- uintN max;
- JSPackedBool greedy;
- } range;
- struct { /* or a character class */
- size_t startIndex;
- size_t kidlen; /* length of string at kid, in jschars */
- size_t index; /* index into class list */
- uint16 bmsize; /* bitmap size, based on max char code */
- JSPackedBool sense;
- } ucclass;
- struct { /* or a literal sequence */
- jschar chr; /* of one character */
- size_t length; /* or many (via the kid) */
- } flat;
- struct {
- RENode *kid2; /* second operand from ALT */
- jschar ch1; /* match char for ALTPREREQ */
- jschar ch2; /* ditto, or class index for ALTPREREQ2 */
- } altprereq;
- } u;
- };
/*
 * Character classification helpers.  Every use of the argument is
 * parenthesized so that any expression (for example a conditional
 * expression) can be passed and still be compared as a whole.  The argument
 * is evaluated more than once, so it must not have side effects.
 */
#define RE_IS_LETTER(c)     ((((c) >= 'A') && ((c) <= 'Z')) ||                \
                             (((c) >= 'a') && ((c) <= 'z')))
#define RE_IS_LINE_TERM(c)  (((c) == '\n') || ((c) == '\r') ||                \
                             ((c) == LINE_SEPARATOR) ||                       \
                             ((c) == PARA_SEPARATOR))

/* Number of entries in the CompilerState class-string cache. */
#define CLASS_CACHE_SIZE    4
- typedef struct CompilerState {
- JSContext *context;
- JSTokenStream *tokenStream; /* For reporting errors */
- const jschar *cpbegin;
- const jschar *cpend;
- const jschar *cp;
- size_t parenCount;
- size_t classCount; /* number of [] encountered */
- size_t treeDepth; /* maximum depth of parse tree */
- size_t progLength; /* estimated bytecode length */
- RENode *result;
- size_t classBitmapsMem; /* memory to hold all class bitmaps */
- struct {
- const jschar *start; /* small cache of class strings */
- size_t length; /* since they're often the same */
- size_t index;
- } classCache[CLASS_CACHE_SIZE];
- uint16 flags;
- } CompilerState;
- typedef struct EmitStateStackEntry {
- jsbytecode *altHead; /* start of REOP_ALT* opcode */
- jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */
- jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */
- jsbytecode *endTermFixup; /* fixup ptr. to REOPT_ALTPREREQ* offset */
- RENode *continueNode; /* original REOP_ALT* node being stacked */
- jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */
- JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to
- avoid 16-bit unsigned offset overflow */
- } EmitStateStackEntry;
/*
 * Sizes of immediate operands, plus their getters and setters.  Note the
 * difference from the similar macros in jsopcode.h: here pc points at the
 * operand itself, not at the opcode that precedes it.  An operand is stored
 * in two bytes, high byte first.
 */
#define ARG_LEN             2
#define GET_ARG(pc)         ((uint16)(((pc)[0] << 8) | (pc)[1]))
#define SET_ARG(pc, arg)    ((pc)[0] = (jsbytecode) ((arg) >> 8),             \
                             (pc)[1] = (jsbytecode) (arg))

/* Offsets are stored exactly like arguments. */
#define OFFSET_LEN          ARG_LEN
#define OFFSET_MAX          (JS_BIT(ARG_LEN * 8) - 1)
#define GET_OFFSET(pc)      GET_ARG(pc)

/*
 * The deepest parse tree we support is bounded by the largest
 * EmitStateStackEntry stack we are willing to allocate.  For sanity that
 * stack is capped at 2^24 bytes.
 */
#define TREE_DEPTH_MAX      (JS_BIT(24) / sizeof(EmitStateStackEntry))

/*
 * Upper bound on the memory that may be allocated for class bitmaps.
 * For sanity this is capped at 2^24 bytes as well.
 */
#define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24)
/*
 * Helpers that size, write and read bytecode holding a small index in a
 * compact, variable-length form.
 *
 * The index is split into 7-bit chunks, lowest chunk first, one chunk per
 * byte.  A byte with its 8th bit set is followed by another byte carrying
 * more bits of the index; the byte with the 8th bit clear is the last one
 * and holds the highest bits.
 */

/* Return the number of bytes the compact encoding of index occupies. */
static size_t
GetCompactIndexWidth(size_t index)
{
    size_t width = 1;

    /* Each additional 7 bits of magnitude costs one more byte. */
    while (index > 0x7F) {
        index >>= 7;
        ++width;
    }
    return width;
}
- static JS_ALWAYS_INLINE jsbytecode *
- WriteCompactIndex(jsbytecode *pc, size_t index)
- {
- size_t next;
- while ((next = index >> 7) != 0) {
- *pc++ = (jsbytecode)(index | 0x80);
- index = next;
- }
- *pc++ = (jsbytecode)index;
- return pc;
- }
- static JS_ALWAYS_INLINE jsbytecode *
- ReadCompactIndex(jsbytecode *pc, size_t *result)
- {
- size_t nextByte;
- nextByte = *pc++;
- if ((nextByte & 0x80) == 0) {
- /*
- * Short-circuit the most common case when compact index <= 127.
- */
- *result = nextByte;
- } else {
- size_t shift = 7;
- *result = 0x7F & nextByte;
- do {
- nextByte = *pc++;
- *result |= (nextByte & 0x7F) << shift;
- shift += 7;
- } while ((nextByte & 0x80) != 0);
- }
- return pc;
- }
/* Location of one captured substring within the text being matched. */
typedef struct RECapture {
    ptrdiff_t   index;      /* start of contents, -1 for empty */
    size_t      length;     /* length of capture */
} RECapture;
- typedef struct REMatchState {
- const jschar *cp;
- RECapture parens[1]; /* first of 're->parenCount' captures,
- allocated at end of this struct */
- } REMatchState;
- struct REBackTrackData;
- typedef struct REProgState {
- jsbytecode *continue_pc; /* current continuation data */
- jsbytecode continue_op;
- ptrdiff_t index; /* progress in text */
- size_t parenSoFar; /* highest indexed paren started */
- union {
- struct {
- uintN min; /* current quantifier limits */
- uintN max;
- } quantifier;
- struct {
- size_t top; /* backtrack stack state */
- size_t sz;
- } assertion;
- } u;
- } REProgState;
- typedef struct REBackTrackData {
- size_t sz; /* size of previous stack entry */
- jsbytecode *backtrack_pc; /* where to backtrack to */
- jsbytecode backtrack_op;
- const jschar *cp; /* index in text of match at backtrack */
- size_t parenIndex; /* start index of saved paren contents */
- size_t parenCount; /* # of saved paren contents */
- size_t saveStateStackTop; /* number of parent states */
- /* saved parent states follow */
- /* saved paren contents follow */
- } REBackTrackData;
/*
 * Initial sizes for the matcher's state stack and backtrack stack.
 * NOTE(review): the units are set by the allocation code, which is not
 * visible in this part of the file.
 */
#define INITIAL_STATESTACK  100
#define INITIAL_BACKTRACK   8000
- typedef struct REGlobalData {
- JSContext *cx;
- JSRegExp *regexp; /* the RE in execution */
- JSBool ok; /* runtime error (out_of_memory only?) */
- size_t start; /* offset to start at */
- ptrdiff_t skipped; /* chars skipped anchoring this r.e. */
- const jschar *cpbegin; /* text base address */
- const jschar *cpend; /* text limit address */
- REProgState *stateStack; /* stack of state of current parents */
- size_t stateStackTop;
- size_t stateStackLimit;
- REBackTrackData *backTrackStack;/* stack of matched-so-far positions */
- REBackTrackData *backTrackSP;
- size_t backTrackStackSize;
- size_t cursz; /* size of current stack entry */
- size_t backTrackCount; /* how many times we've backtracked */
- size_t backTrackLimit; /* upper limit on backtrack states */
- } REGlobalData;
- /*
- * 1. If IgnoreCase is false, return ch.
- * 2. Let u be ch converted to upper case as if by calling
- * String.prototype.toUpperCase on the one-character string ch.
- * 3. If u does not consist of a single character, return ch.
- * 4. Let cu be u's character.
- * 5. If ch's code point value is greater than or equal to decimal 128 and cu's
- * code point value is less than decimal 128, then return ch.
- * 6. Return cu.
- */
- static JS_ALWAYS_INLINE uintN
- upcase(uintN ch)
- {
- uintN cu;
- JS_ASSERT((uintN) (jschar) ch == ch);
- if (ch < 128) {
- if (ch - (uintN) 'a' <= (uintN) ('z' - 'a'))
- ch -= (uintN) ('a' - 'A');
- return ch;
- }
- cu = JS_TOUPPER(ch);
- return (cu < 128) ? ch : cu;
- }
- static JS_ALWAYS_INLINE uintN
- downcase(uintN ch)
- {
- JS_ASSERT((uintN) (jschar) ch == ch);
- if (ch < 128) {
- if (ch - (uintN) 'A' <= (uintN) ('Z' - 'A'))
- ch += (uintN) ('a' - 'A');
- return ch;
- }
- return JS_TOLOWER(ch);
- }
- /* Construct and initialize an RENode, returning NULL for out-of-memory */
- static RENode *
- NewRENode(CompilerState *state, REOp op)
- {
- JSContext *cx;
- RENode *ren;
- cx = state->context;
- JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren);
- if (!ren) {
- js_ReportOutOfScriptQuota(cx);
- return NULL;
- }
- ren->op = op;
- ren->next = NULL;
- ren->kid = NULL;
- return ren;
- }
- /*
- * Validates and converts hex ascii value.
- */
- static JSBool
- isASCIIHexDigit(jschar c, uintN *digit)
- {
- uintN cv = c;
- if (cv < '0')
- return JS_FALSE;
- if (cv <= '9') {
- *digit = cv - '0';
- return JS_TRUE;
- }
- cv |= 0x20;
- if (cv >= 'a' && cv <= 'f') {
- *digit = cv - 'a' + 10;
- return JS_TRUE;
- }
- return JS_FALSE;
- }
- typedef struct {
- REOp op;
- const jschar *errPos;
- size_t parenIndex;
- } REOpData;
- static JSBool
- ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber,
- const jschar *arg)
- {
- if (state->tokenStream) {
- return js_ReportCompileErrorNumber(state->context, state->tokenStream,
- NULL, JSREPORT_UC | flags,
- errorNumber, arg);
- }
- return JS_ReportErrorFlagsAndNumberUC(state->context, flags,
- js_GetErrorMessage, NULL,
- errorNumber, arg);
- }
- static JSBool
- ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber)
- {
- return ReportRegExpErrorHelper(state, flags, errorNumber, NULL);
- }
- /*
- * Process the op against the two top operands, reducing them to a single
- * operand in the penultimate slot. Update progLength and treeDepth.
- */
- static JSBool
- ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
- intN operandSP)
- {
- RENode *result;
- switch (opData->op) {
- case REOP_ALT:
- result = NewRENode(state, REOP_ALT);
- if (!result)
- return JS_FALSE;
- result->kid = operandStack[operandSP - 2];
- result->u.kid2 = operandStack[operandSP - 1];
- operandStack[operandSP - 2] = result;
- if (state->treeDepth == TREE_DEPTH_MAX) {
- ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
- return JS_FALSE;
- }
- ++state->treeDepth;
- /*
- * Look at both alternates to see if there's a FLAT or a CLASS at
- * the start of each. If so, use a prerequisite match.
- */
- if (((RENode *) result->kid)->op == REOP_FLAT &&
- ((RENode *) result->u.kid2)->op == REOP_FLAT &&
- (state->flags & JSREG_FOLD) == 0) {
- result->op = REOP_ALTPREREQ;
- result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
- result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
- /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
- JUMP, <end> ... ENDALT */
- state->progLength += 13;
- }
- else
- if (((RENode *) result->kid)->op == REOP_CLASS &&
- ((RENode *) result->kid)->u.ucclass.index < 256 &&
- ((RENode *) result->u.kid2)->op == REOP_FLAT &&
- (state->flags & JSREG_FOLD) == 0) {
- result->op = REOP_ALTPREREQ2;
- result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
- result->u.altprereq.ch2 = ((RENode *) result->kid)->u.ucclass.index;
- /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
- JUMP, <end> ... ENDALT */
- state->progLength += 13;
- }
- else
- if (((RENode *) result->kid)->op == REOP_FLAT &&
- ((RENode *) result->u.kid2)->op == REOP_CLASS &&
- ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
- (state->flags & JSREG_FOLD) == 0) {
- result->op = REOP_ALTPREREQ2;
- result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
- result->u.altprereq.ch2 =
- ((RENode *) result->u.kid2)->u.ucclass.index;
- /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
- JUMP, <end> ... ENDALT */
- state->progLength += 13;
- }
- else {
- /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
- state->progLength += 7;
- }
- break;
- case REOP_CONCAT:
- result = operandStack[operandSP - 2];
- while (result->next)
- result = result->next;
- result->next = operandStack[operandSP - 1];
- break;
- case REOP_ASSERT:
- case REOP_ASSERT_NOT:
- case REOP_LPARENNON:
- case REOP_LPAREN:
- /* These should have been processed by a close paren. */
- ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
- opData->errPos);
- return JS_FALSE;
- default:;
- }
- return JS_TRUE;
- }
- /*
- * Parser forward declarations.
- */
- static JSBool ParseTerm(CompilerState *state);
- static JSBool ParseQuantifier(CompilerState *state);
- static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);
- /*
- * Top-down regular expression grammar, based closely on Perl4.
- *
- * regexp: altern A regular expression is one or more
- * altern '|' regexp alternatives separated by vertical bar.
- */
- #define INITIAL_STACK_SIZE 128
- static JSBool
- ParseRegExp(CompilerState *state)
- {
- size_t parenIndex;
- RENode *operand;
- REOpData *operatorStack;
- RENode **operandStack;
- REOp op;
- intN i;
- JSBool result = JS_FALSE;
- intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
- intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;
- /* Watch out for empty regexp */
- if (state->cp == state->cpend) {
- state->result = NewRENode(state, REOP_EMPTY);
- return (state->result != NULL);
- }
- operatorStack = (REOpData *)
- JS_malloc(state->context, sizeof(REOpData) * operatorStackSize);
- if (!operatorStack)
- return JS_FALSE;
- operandStack = (RENode **)
- JS_malloc(state->context, sizeof(RENode *) * operandStackSize);
- if (!operandStack)
- goto out;
- for (;;) {
- parenIndex = state->parenCount;
- if (state->cp == state->cpend) {
- /*
- * If we are at the end of the regexp and we're short one or more
- * operands, the regexp must have the form /x|/ or some such, with
- * left parentheses making us short more than one operand.
- */
- if (operatorSP >= operandSP) {
- operand = NewRENode(state, REOP_EMPTY);
- if (!operand)
- goto out;
- goto pushOperand;
- }
- } else {
- switch (*state->cp) {
- case '(':
- ++state->cp;
- if (state->cp + 1 < state->cpend &&
- *state->cp == '?' &&
- (state->cp[1] == '=' ||
- state->cp[1] == '!' ||
- state->cp[1] == ':')) {
- switch (state->cp[1]) {
- case '=':
- op = REOP_ASSERT;
- /* ASSERT, <next>, ... ASSERTTEST */
- state->progLength += 4;
- break;
- case '!':
- op = REOP_ASSERT_NOT;
- /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
- state->progLength += 4;
- break;
- default:
- op = REOP_LPARENNON;
- break;
- }
- state->cp += 2;
- } else {
- op = REOP_LPAREN;
- /* LPAREN, <index>, ... RPAREN, <index> */
- state->progLength
- += 2 * (1 + GetCompactIndexWidth(parenIndex));
- state->parenCount++;
- if (state->parenCount == 65535) {
- ReportRegExpError(state, JSREPORT_ERROR,
- JSMSG_TOO_MANY_PARENS);
- goto out;
- }
- }
- goto pushOperator;
- case ')':
- /*
- * If there's no stacked open parenthesis, throw syntax error.
- */
- for (i = operatorSP - 1; ; i--) {
- if (i < 0) {
- ReportRegExpError(state, JSREPORT_ERROR,
- JSMSG_UNMATCHED_RIGHT_PAREN);
- goto out;
- }
- if (operatorStack[i].op == REOP_ASSERT ||
- operatorStack[i].op == REOP_ASSERT_NOT ||
- operatorStack[i].op == REOP_LPARENNON ||
- operatorStack[i].op == REOP_LPAREN) {
- break;
- }
- }
- /* FALL THROUGH */
- case '|':
- /* Expected an operand before these, so make an empty one */
- operand = NewRENode(state, REOP_EMPTY);
- if (!operand)
- goto out;
- goto pushOperand;
- default:
- if (!ParseTerm(state))
- goto out;
- operand = state->result;
- pushOperand:
- if (operandSP == operandStackSize) {
- RENode **tmp;
- operandStackSize += operandStackSize;
- tmp = (RENode **)
- JS_realloc(state->context, operandStack,
- sizeof(RENode *) * operandStackSize);
- if (!tmp)
- goto out;
- operandStack = tmp;
- }
- operandStack[operandSP++] = operand;
- break;
- }
- }
- /* At the end; process remaining operators. */
- restartOperator:
- if (state->cp == state->cpend) {
- while (operatorSP) {
- --operatorSP;
- if (!ProcessOp(state, &operatorStack[operatorSP],
- operandStack, operandSP))
- goto out;
- --operandSP;
- }
- JS_ASSERT(operandSP == 1);
- state->result = operandStack[0];
- result = JS_TRUE;
- goto out;
- }
- switch (*state->cp) {
- case '|':
- /* Process any stacked 'concat' operators */
- ++state->cp;
- while (operatorSP &&
- operatorStack[operatorSP - 1].op == REOP_CONCAT) {
- --operatorSP;
- if (!ProcessOp(state, &operatorStack[operatorSP],
- operandStack, operandSP)) {
- goto out;
- }
- --operandSP;
- }
- op = REOP_ALT;
- goto pushOperator;
- case ')':
- /*
- * If there's no stacked open parenthesis, throw syntax error.
- */
- for (i = operatorSP - 1; ; i--) {
- if (i < 0) {
- ReportRegExpError(state, JSREPORT_ERROR,
- JSMSG_UNMATCHED_RIGHT_PAREN);
- goto out;
- }
- if (operatorStack[i].op == REOP_ASSERT ||
- operatorStack[i].op == REOP_ASSERT_NOT ||
- operatorStack[i].op == REOP_LPARENNON ||
- operatorStack[i].op == REOP_LPAREN) {
- break;
- }
- }
- ++state->cp;
- /* Process everything on the stack until the open parenthesis. */
- for (;;) {
- JS_ASSERT(operatorSP);
- --operatorSP;
- switch (operatorStack[operatorSP].op) {
- case REOP_ASSERT:
- case REOP_ASSERT_NOT:
- case REOP_LPAREN:
- operand = NewRENode(state, operatorStack[operatorSP].op);
- if (!operand)
- goto out;
- operand->u.parenIndex =
- operatorStack[operatorSP].parenIndex;
- JS_ASSERT(operandSP);
- operand->kid = operandStack[operandSP - 1];
- operandStack[operandSP - 1] = operand;
- if (state->treeDepth == TREE_DEPTH_MAX) {
- ReportRegExpError(state, JSREPORT_ERROR,
- JSMSG_REGEXP_TOO_COMPLEX);
- goto out;
- }
- ++state->treeDepth;
- /* FALL THROUGH */
- case REOP_LPARENNON:
- state->result = operandStack[operandSP - 1];
- if (!ParseQuantifier(state))
- goto out;
- operandStack[operandSP - 1] = state->result;
- goto restartOperator;
- default:
- if (!ProcessOp(state, &operatorStack[operatorSP],
- operandStack, operandSP))
- goto out;
- --operandSP;
- break;
- }
- }
- break;
- case '{':
- {
- const jschar *errp = state->cp;
- if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
- /*
- * This didn't even scan correctly as a quantifier, so we should
- * treat it as flat.
- */
- op = REOP_CONCAT;
- goto pushOperator;
- }
- state->cp = errp;
- /* FALL THROUGH */
- }
- case '+':
- case '*':
- case '?':
- ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
- state->cp);
- result = JS_FALSE;
- goto out;
- default:
- /* Anything else is the start of the next term. */
- op = REOP_CONCAT;
- pushOperator:
- if (operatorSP == operatorStackSize) {
- REOpData *tmp;
- operatorStackSize += operatorStackSize;
- tmp = (REOpData *)
- JS_realloc(state->context, operatorStack,
- sizeof(REOpData) * operatorStackSize);
- if (!tmp)
- goto out;
- operatorStack = tmp;
- }
- operatorStack[operatorSP].op = op;
- operatorStack[operatorSP].errPos = state->cp;
- operatorStack[operatorSP++].parenIndex = parenIndex;
- break;
- }
- }
- out:
- if (operatorStack)
- JS_free(state->context, operatorStack);
- if (operandStack)
- JS_free(state->context, operandStack);
- return result;
- }
/*
 * Two bits borrowed from CompilerState.flags for FindParenCount: the first
 * marks that FindParenCount is already on the stack, the second carries an
 * error back to its callers.
 */
#define JSREG_FIND_PAREN_COUNT  0x8000
#define JSREG_FIND_PAREN_ERROR  0x4000

/*
 * Magic value returned by FindParenCount and GetDecimalValue.  It signals
 * overflow beyond GetDecimalValue's max parameter, or beyond a computed
 * maximum if its findMax parameter is non-null.
 */
#define OVERFLOW_VALUE          ((uintN)-1)
- static uintN
- FindParenCount(CompilerState *state)
- {
- CompilerState temp;
- int i;
- if (state->flags & JSREG_FIND_PAREN_COUNT)
- return OVERFLOW_VALUE;
- /*
- * Copy state into temp, flag it so we never report an invalid backref,
- * and reset its members to parse the entire regexp. This is obviously
- * suboptimal, but GetDecimalValue calls us only if a backref appears to
- * refer to a forward parenthetical, which is rare.
- */
- temp = *state;
- temp.flags |= JSREG_FIND_PAREN_COUNT;
- temp.cp = temp.cpbegin;
- temp.parenCount = 0;
- temp.classCount = 0;
- temp.progLength = 0;
- temp.treeDepth = 0;
- temp.classBitmapsMem = 0;
- for (i = 0; i < CLASS_CACHE_SIZE; i++)
- temp.classCache[i].start = NULL;
- if (!ParseRegExp(&temp)) {
- state->flags |= JSREG_FIND_PAREN_ERROR;
- return OVERFLOW_VALUE;
- }
- return temp.parenCount;
- }
- /*
- * Extract and return a decimal value at state->cp. The initial character c
- * has already been read. Return OVERFLOW_VALUE if the result exceeds max.
- * Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
- * state->flags to discover whether an error occurred under findMax.
- */
- static uintN
- GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
- CompilerState *state)
- {
- uintN value = JS7_UNDEC(c);
- JSBool overflow = (value > max && (!findMax || value > findMax(state)));
- /* The following restriction allows simpler overflow checks. */
- JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
- while (state->cp < state->cpend) {
- c = *state->cp;
- if (!JS7_ISDEC(c))
- break;
- value = 10 * value + JS7_UNDEC(c);
- if (!overflow && value > max && (!findMax || value > findMax(state)))
- overflow = JS_TRUE;
- ++state->cp;
- }
- return overflow ? OVERFLOW_VALUE : value;
- }
- /*
- * Calculate the total size of the bitmap required for a class expression.
- */
- static JSBool
- CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
- const jschar *end)
- {
- uintN max = 0;
- JSBool inRange = JS_FALSE;
- jschar c, rangeStart = 0;
- uintN n, digit, nDigits, i;
- target->u.ucclass.bmsize = 0;
- target->u.ucclass.sense = JS_TRUE;
- if (src == end)
- return JS_TRUE;
- if (*src == '^') {
- ++src;
- target->u.ucclass.sense = JS_FALSE;
- }
- while (src != end) {
- JSBool canStartRange = JS_TRUE;
- uintN localMax = 0;
- switch (*src) {
- case '\\':
- ++src;
- c = *src++;
- switch (c) {
- case 'b':
- localMax = 0x8;
- break;
- case 'f':
- localMax = 0xC;
- break;
- case 'n':
- localMax = 0xA;
- break;
- case 'r':
- localMax = 0xD;
- break;
- case 't':
- localMax = 0x9;
- break;
- case 'v':
- localMax = 0xB;
- break;
- case 'c':
- if (src < end && RE_IS_LETTER(*src)) {
- localMax = (uintN) (*src++) & 0x1F;
- } else {
- --src;
- localMax = '\\';
- }
- break;
- case 'x':
- nDigits = 2;
- goto lexHex;
- case 'u':
- nDigits = 4;
- lexHex:
- n = 0;
- for (i = 0; (i < nDigits) && (src < end); i++) {
- c = *src++;
- if (!isASCIIHexDigit(c, &digit)) {
- /*
- * Back off to accepting the original
- *'\' as a literal.
- */
- src -= i + 1;
- n = '\\';
- break;
- }
- n = (n << 4) | digit;
- }
- localMax = n;
- break;
- case 'd':
- canStartRange = JS_FALSE;
- if (inRange) {
- JS_ReportErrorNumber(state->context,
- js_GetErrorMessage, NULL,
- JSMSG_BAD_CLASS_RANGE);
- return JS_FALSE;
- }
- localMax = '9';
- break;
- case 'D':
- case 's':
- case 'S':
- case 'w':
- case 'W':
- canStartRange = JS_FALSE;
- if (inRange) {
- JS_ReportErrorNumber(state->context,
- js_GetErrorMessage, NULL,
- JSMSG_BAD_CLASS_RANGE);
- return JS_FALSE;
- }
- max = 65535;
- /*
- * If this is the start of a range, ensure that it's less than
- * the end.
- */
- localMax = 0;
- break;
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- /*
- * This is a non-ECMA extension - decimal escapes (in this
- * case, octal!) are supposed to be an error inside class
- * ranges, but supported here for backwards compatibility.
- *
- */
- n = JS7_UNDEC(c);
- c = *src;
- if ('0' <= c && c <= '7') {
- src++;
- n = 8 * n + JS7_UNDEC(c);
- c = *src;
- if ('0' <= c && c <= '7') {
- src++;
- i = 8 * n + JS7_UNDEC(c);
- if (i <= 0377)
- n = i;
- else
- src--;
- }
- }
- localMax = n;
- break;
- default:
- localMax = c;
- break;
- }
- break;
- default:
- localMax = *src++;
- break;
- }
- if (inRange) {
- /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
- if (rangeStart > localMax) {
- JS_ReportErrorNumber(state->context,
- js_GetErrorMessage, NULL,
- JSMSG_BAD_CLASS_RANGE);
- return JS_FALSE;
- }
- inRange = JS_FALSE;
- } else {
- if (canStartRange && src < end - 1) {
- if (*src == '-') {
- ++src;
- inRange = JS_TRUE;
- rangeStart = (jschar)localMax;
- continue;
- }
- }
- if (state->flags & JSREG_FOLD)
- rangeStart = localMax; /* one run of the uc/dc loop below */
- }
- if (state->flags & JSREG_FOLD) {
- jschar maxch = localMax;
- for (i = rangeStart; i <= localMax; i++) {
- jschar uch, dch;
- uch = upcase(i);
- dch = downcase(i);
- maxch = JS_MAX(maxch, uch);
- maxch = JS_MAX(maxch, dch);
- }
- localMax = maxch;
- }
- if (localMax > max)
- max = localMax;
- }
- target->u.ucclass.bmsize = max;
- return JS_TRUE;
- }
- /*
- * item: assertion An item is either an assertion or
- * quantatom a quantified atom.
- *
- * assertion: '^' Assertions match beginning of string
- * (or line if the class static property
- * RegExp.multiline is true).
- * '$' End of string (or line if the class
- * static property RegExp.multiline is
- * true).
- * '\b' Word boundary (between \w and \W).
- * '\B' Word non-boundary.
- *
- * quantatom: atom An unquantified atom.
- * quantatom '{' n ',' m '}'
- * Atom must occur between n and m times.
- * quantatom '{' n ',' '}' Atom must occur at least n times.
- * quantatom '{' n '}' Atom must occur exactly n times.
- * quantatom '*' Zero or more times (same as {0,}).
- * quantatom '+' One or more times (same as {1,}).
- * quantatom '?' Zero or one time (same as {0,1}).
- *
- * any of which can be optionally followed by '?' for ungreedy
- *
- * atom: '(' regexp ')' A parenthesized regexp (what matched
- * can be addressed using a backreference,
- * see '\' n below).
- * '.' Matches any char except '\n'.
- * '[' classlist ']' A character class.
- * '[' '^' classlist ']' A negated character class.
- * '\f' Form Feed.
- * '\n' Newline (Line Feed).
- * '\r' Carriage Return.
- * '\t' Horizontal Tab.
- * '\v' Vertical Tab.
- * '\d' A digit (same as [0-9]).
- * '\D' A non-digit.
- * '\w' A word character, [0-9a-z_A-Z].
- * '\W' A non-word character.
- * '\s' A whitespace character, [ \b\f\n\r\t\v].
- * '\S' A non-whitespace character.
- * '\' n A backreference to the nth (n decimal
- * and positive) parenthesized expression.
- * '\' octal An octal escape sequence (octal must be
- * two or three digits long, unless it is
- * 0 for the null character).
- * '\x' hex A hex escape (hex must be two digits).
- * '\u' unicode A unicode escape (must be four digits).
- * '\c' ctrl A control character, ctrl is a letter.
- * '\' literalatomchar Any character except one of the above
- * that follow '\' in an atom.
- * otheratomchar Any character not first among the other
- * atom right-hand sides.
- */
- static JSBool
- ParseTerm(CompilerState *state)
- {
- jschar c = *state->cp++;
- uintN nDigits;
- uintN num, tmp, n, i;
- const jschar *termStart;
- switch (c) {
- /* assertions and atoms */
- case '^':
- state->result = NewRENode(state, REOP_BOL);
- if (!state->result)
- return JS_FALSE;
- state->progLength++;
- return JS_TRUE;
- case '$':
- state->result = NewRENode(state, REOP_EOL);
- if (!state->result)
- return JS_FALSE;
- state->progLength++;
- return JS_TRUE;
- case '\\':
- if (state->cp >= state->cpend) {
- /* a trailing '\' is an error */
- ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH);
- return JS_FALSE;
- }
- c = *state->cp++;
- switch (c) {
- /* assertion escapes */
- case 'b' :
- state->result = NewRENode(state, REOP_WBDRY);
- if (!state->result)
- return JS_FALSE;
- state->progLength++;
- return JS_TRUE;
- case 'B':
- state->result = NewRENode(state, REOP_WNONBDRY);
- if (!state->result)
- return JS_FALSE;
- state->progLength++;
- return JS_TRUE;
- /* Decimal escape */
- case '0':
- /* Give a strict warning. See also the note below. */
- if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT,
- JSMSG_INVALID_BACKREF)) {
- return JS_FALSE;
- }
- doOctal:
- num = 0;
- while (state->cp < state->cpend) {
- c = *state->cp;
- if (c < '0' || '7' < c)
- break;
- state->cp++;
- tmp = 8 * num + (uintN)JS7_UNDEC(c);
- if (tmp > 0377)
- break;
- num = tmp;
- }
- c = (jschar)num;
- doFlat:
- state->result = NewRENode(state, REOP_FLAT);
- if (!state->result)
- return JS_FALSE;
- state->result->u.flat.chr = c;
- state->result->u.flat.length = 1;
- state->progLength += 3;
- break;
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- termStart = state->cp - 1;
- num = GetDecimalValue(c, state->parenCount, FindParenCount, state);
- if (state->flags & JSREG_FIND_PAREN_ERROR)
- return JS_FALSE;
- if (num == OVERFLOW_VALUE) {
- /* Give a strict mode warning. */
- if (!ReportRegExpError(state,
- JSREPORT_WARNING | JSREPORT_STRICT,
- (c >= '8')
- ? JSMSG_INVALID_BACKREF
- : JSMSG_BAD_BACKREF)) {
- return JS_FALSE;
- }
- /*
- * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax
- * error here. However, for compatibility with IE, we treat the
- * whole backref as flat if the first character in it is not a
- * valid octal character, and as an octal escape otherwise.
- */
- state->cp = termStart;
- if (c >= '8') {
- /* Treat this as flat. termStart - 1 is the \. */
- c = '\\';
- goto asFlat;
- }
- /* Treat this as an octal escape. */
- goto doOctal;
- }
- JS_ASSERT(1 <= num && num <= 0x10000);
- state->result = NewRENode(state, REOP_BACKREF);
- if (!state->result)
- return JS_FALSE;
- state->result->u.parenIndex = num - 1;
- state->progLength
- += 1 + GetCompactIndexWidth(state->result->u.parenIndex);
- break;
- /* Control escape */
- case 'f':
- c = 0xC;
- goto doFlat;
- case 'n':
- c = 0xA;
- goto doFlat;
- case 'r':
- c = 0xD;
- goto doFlat;
- case 't':
- c = 0x9;
- goto doFlat;
- case 'v':
- c = 0xB;
- goto doFlat;
- /* Control letter */
- case 'c':
- if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) {
- c = (jschar) (*state->cp++ & 0x1F);
- } else {
- /* back off to accepting the original '\' as a literal */
- --state->cp;
- c = '\\';
- }
- goto doFlat;
- /* HexEscapeSequence */
- case 'x':
- nDigits = 2;
- goto lexHex;
- /* UnicodeEscapeSequence */
- case 'u':
- nDigits = 4;
- lexHex:
- n = 0;
- for (i = 0; i < nDigits && state->cp < state->cpend; i++) {
- uintN digit;
- c = *state->cp++;
- if (!isASCIIHexDigit(c, &digit)) {
- /*
- * Back off to accepting the original 'u' or 'x' as a
- * literal.
- */
- state->cp -= i + 2;
- n = *state->cp++;
- break;
- }
- n = (n << 4) | digit;
- }
- c = (jschar) n;
- goto doFlat;
- /* Character class escapes */
- case 'd':
- state->result = NewRENode(state, REOP_DIGIT);
- doSimple:
- if (!state->result)
- return JS_FALSE;
- state->progLength++;
- break;
- case 'D':
- state->result = NewRENode(state, REOP_NONDIGIT);
- goto doSimple;
- case 's':
- state->result = NewRENode(state, REOP_SPACE);
- goto doSimple;
- case 'S':
- state->result = NewRENode(state, REOP_NONSPACE);
- goto doSimple;
- case 'w':
- state->result = NewRENode(state, REOP_ALNUM);
- goto doSimple;
- case 'W':
- state->result = NewRENode(state, REOP_NONALNUM);
- goto doSimple;
- /* IdentityEscape */
- default:
- state->result = NewRENode(state, REOP_FLAT);
- if (!state->result)
- return JS_FALSE;
- state->result->u.flat.chr = c;
- state->result->u.flat.length = 1;
- state->result->kid = (void *) (state->cp - 1);
- state->progLength += 3;
- break;
- }
- break;
- case '[':
- state->result = NewRENode(state, REOP_CLASS);
- if (!state->result)
- return JS_FALSE;
- termStart = state->cp;
- state->result->u.ucclass.startIndex = termStart - state->cpbegin;
- for (;;) {
- if (state->cp == state->cpend) {
- ReportRegExpErrorHelper(state, JSREPORT_ERROR,
- JSMSG_UNTERM_CLASS, termStart);
- return JS_FALSE;
- }
- if (*state->cp == '\\') {
- state->cp++;
- if (state->cp != state->cpend)
- state->cp++;
- continue;
- }
- if (*state->cp == ']') {
- state->result->u.ucclass.kidlen = state->cp - termStart;
- break;
- }
- state->cp++;
- }
- for (i = 0; i < CLASS_CACHE_SIZE; i++) {
- if (!state->classCache[i].start) {
- state->classCache[i].start = termStart;
- state->classCache[i].length = state->result->u.ucclass.kidlen;
- state->classCache[i].index = state->classCount;
- break;
- }
- if (state->classCache[i].length ==
- state->result->u.ucclass.kidlen) {
- for (n = 0; ; n++) {
- if (n == state->classCache[i].length) {
- state->result->u.ucclass.index
- = state->classCache[i].index;
- goto claim;
- }
- if (state->classCache[i].start[n] != termStart[n])
- break;
- }
- }
- }
- state->result->u.ucclass.index = state->classCount++;
- claim:
- /*
- * Call CalculateBitmapSize now as we want any errors it finds
- * to be reported during the parse phase, not at execution.
- */
- if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
- return JS_FALSE;
- /*
- * Update classBitmapsMem with number of bytes to hold bmsize bits,
- * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
- * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
- */
- n = (state->result->u.ucclass.bmsize >> 3) + 1;
- if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
- ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
- return JS_FALSE;
- }
- state->classBitmapsMem += n;
- /* CLASS, <index> */
- state->progLength
- += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
- break;
- case '.':
- state->result = NewRENode(state, REOP_DOT);
- goto doSimple;
- case '{':
- {
- const jschar *errp = state->cp--;
- intN err;
- err = ParseMinMaxQuantifier(state, JS_TRUE);
- state->cp = errp;
- if (err < 0)
- goto asFlat;
- /* FALL THROUGH */
- }
- case '*':
- case '+':
- case '?':
- ReportRegExpErrorHelper(state, JSREPORT_ERROR,
- JSMSG_BAD_QUANTIFIER, state->cp - 1);
- return JS_FALSE;
- default:
- asFlat:
- state->result = NewRENode(state, REOP_FLAT);
- if (!state->result)
- return JS_FALSE;
- state->result->u.flat.chr = c;
- state->result->u.flat.length = 1;
- state->result->kid = (void *) (state->cp - 1);
- state->progLength += 3;
- break;
- }
- return ParseQuantifier(state);
- }
- static JSBool
- ParseQuantifier(CompilerState *s…
Large files are truncated, but you can click here to view the full file