/js/src/yarr/pcre/pcre_exec.cpp
http://github.com/zpao/v8monkey · C++ · 2192 lines · 1480 code · 331 blank · 381 comment · 428 complexity · 9ffcd184ede3ce61e3c420ee9f8311da MD5 · raw file
Large files are truncated click here to view the full file
- /* This is JavaScriptCore's variant of the PCRE library. While this library
- started out as a copy of PCRE, many of the features of PCRE have been
- removed. This library now supports only the regular expression features
- required by the JavaScript language specification, and has only the functions
- needed by JavaScriptCore and the rest of WebKit.
- Originally written by Philip Hazel
- Copyright (c) 1997-2006 University of Cambridge
- Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
- Copyright (C) 2007 Eric Seidel <eric@webkit.org>
- -----------------------------------------------------------------------------
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the University of Cambridge nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- POSSIBILITY OF SUCH DAMAGE.
- -----------------------------------------------------------------------------
- */
- /* This module contains jsRegExpExecute(), the externally visible function
- that does pattern matching using an NFA algorithm, following the rules from
- the JavaScript specification. There are also some supporting functions. */
- #include "pcre_internal.h"
- #include <limits.h>
- #include "yarr/ASCIICType.h"
- #include "jsarena.h"
- #include "jscntxt.h"
- using namespace WTF;
- #if !WTF_COMPILER_MSVC && !WTF_COMPILER_SUNPRO
- #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
- #endif
- /* Note: Webkit sources have USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP disabled. */
- /* Note: There are hardcoded constants all over the place, but in the port of
- Yarr to TraceMonkey two bytes are added to the OP_BRA* opcodes, so the
- instruction stream now looks like this at the start of a bracket group:
- OP_BRA* [link:LINK_SIZE] [minNestedBracket,maxNestedBracket:2]
- Both capturing and non-capturing brackets encode this information. */
- /* Avoid warnings on Windows. */
- #undef min
- #undef max
/* Type used to record where a "recursive" match call resumes: a label address
   when computed goto is in use, otherwise a plain integer label number. */
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
typedef void* ReturnLocation;
#else
typedef int ReturnLocation;
#endif
- /* Node on a stack of brackets. This is used to detect and reject
- matches of the empty string per ECMAScript repeat match rules. This
- also prevents infinite loops on quantified empty matches. One node
- represents the start state at the start of this bracket group. */
- struct BracketChainNode {
- BracketChainNode* previousBracket;
- const UChar* bracketStart;
- /* True if the minimum number of matches was already satisfied
- when we started matching this group. */
- bool minSatisfied;
- };
- struct MatchFrame {
- ReturnLocation returnLocation;
- struct MatchFrame* previousFrame;
- int *savedOffsets;
- /* The frame allocates saved offsets into the regular expression arena pool so
- that they can be restored during backtracking. */
- size_t savedOffsetsSize;
- JSArenaPool *regExpPool;
- MatchFrame() : savedOffsetsSize(0), regExpPool(0) {}
- void init(JSArenaPool *regExpPool) { this->regExpPool = regExpPool; }
-
- /* Function arguments that may change */
- struct {
- const UChar* subjectPtr;
- const unsigned char* instructionPtr;
- int offsetTop;
- BracketChainNode* bracketChain;
- } args;
-
-
- /* PCRE uses "fake" recursion built off of gotos, thus
- stack-based local variables are not safe to use. Instead we have to
- store local variables on the current MatchFrame. */
- struct {
- const unsigned char* data;
- const unsigned char* startOfRepeatingBracket;
- const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare
- const unsigned char* instructionPtrAtStartOfOnce;
-
- int repeatOthercase;
- int savedSubjectOffset;
-
- int ctype;
- int fc;
- int fi;
- int length;
- int max;
- int number;
- int offset;
- int skipBytes;
- int minBracket;
- int limitBracket;
- int bracketsBefore;
- bool minSatisfied;
-
- BracketChainNode bracketChainNode;
- } locals;
- void saveOffsets(int minBracket, int limitBracket, int *offsets, int offsetEnd) {
- JS_ASSERT(regExpPool);
- JS_ASSERT(minBracket >= 0);
- JS_ASSERT(limitBracket >= minBracket);
- JS_ASSERT(offsetEnd >= 0);
- if (minBracket == limitBracket)
- return;
- const size_t newSavedOffsetCount = 3 * (limitBracket - minBracket);
- /* Increase saved offset space if necessary. */
- {
- size_t targetSize = sizeof(*savedOffsets) * newSavedOffsetCount;
- if (savedOffsetsSize < targetSize) {
- JS_ARENA_ALLOCATE_CAST(savedOffsets, int *, regExpPool, targetSize);
- JS_ASSERT(savedOffsets); /* FIXME: error code, bug 574459. */
- savedOffsetsSize = targetSize;
- }
- }
- for (unsigned i = 0; i < unsigned(limitBracket - minBracket); ++i) {
- int bracketIter = minBracket + i;
- JS_ASSERT(2 * bracketIter + 1 <= offsetEnd);
- int start = offsets[2 * bracketIter];
- int end = offsets[2 * bracketIter + 1];
- JS_ASSERT(bracketIter <= offsetEnd);
- int offset = offsets[offsetEnd - bracketIter];
- DPRINTF(("saving bracket %d; start: %d; end: %d; offset: %d\n", bracketIter, start, end, offset));
- JS_ASSERT(start <= end);
- JS_ASSERT(i * 3 + 2 < newSavedOffsetCount);
- savedOffsets[i * 3 + 0] = start;
- savedOffsets[i * 3 + 1] = end;
- savedOffsets[i * 3 + 2] = offset;
- }
- }
- void clobberOffsets(int minBracket, int limitBracket, int *offsets, int offsetEnd) {
- for (int i = 0; i < limitBracket - minBracket; ++i) {
- int bracketIter = minBracket + i;
- JS_ASSERT(2 * bracketIter + 1 < offsetEnd);
- offsets[2 * bracketIter + 0] = -1;
- offsets[2 * bracketIter + 1] = -1;
- }
- }
- void restoreOffsets(int minBracket, int limitBracket, int *offsets, int offsetEnd) {
- JS_ASSERT(regExpPool);
- JS_ASSERT_IF(limitBracket > minBracket, savedOffsets);
- for (int i = 0; i < limitBracket - minBracket; ++i) {
- int bracketIter = minBracket + i;
- int start = savedOffsets[i * 3 + 0];
- int end = savedOffsets[i * 3 + 1];
- int offset = savedOffsets[i * 3 + 2];
- DPRINTF(("restoring bracket %d; start: %d; end: %d; offset: %d\n", bracketIter, start, end, offset));
- JS_ASSERT(start <= end);
- offsets[2 * bracketIter + 0] = start;
- offsets[2 * bracketIter + 1] = end;
- offsets[offsetEnd - bracketIter] = offset;
- }
- }
- /* Extract the bracket data after the current opcode/link at |instructionPtr| into the locals. */
- void extractBrackets(const unsigned char *instructionPtr) {
- uint16_t bracketMess = get2ByteValue(instructionPtr + 1 + LINK_SIZE);
- locals.minBracket = (bracketMess >> 8) & 0xff;
- locals.limitBracket = (bracketMess & 0xff);
- JS_ASSERT(locals.minBracket <= locals.limitBracket);
- }
- /* At the start of a bracketed group, add the current subject pointer to the
- stack of such pointers, to be re-instated at the end of the group when we hit
- the closing ket. When match() is called in other circumstances, we don't add to
- this stack. */
- void startNewGroup(bool minSatisfied) {
- locals.bracketChainNode.previousBracket = args.bracketChain;
- locals.bracketChainNode.bracketStart = args.subjectPtr;
- locals.bracketChainNode.minSatisfied = minSatisfied;
- args.bracketChain = &locals.bracketChainNode;
- }
- };
- /* Structure for passing "static" information around between the functions
- doing traditional NFA matching, so that they are thread-safe. */
- struct MatchData {
- int *offsetVector; /* Offset vector */
- int offsetEnd; /* One past the end */
- int offsetMax; /* The maximum usable for return data */
- bool offsetOverflow; /* Set if too many extractions */
- const UChar *startSubject; /* Start of the subject string */
- const UChar *endSubject; /* End of the subject string */
- const UChar *endMatchPtr; /* Subject position at end match */
- int endOffsetTop; /* Highwater mark at end of match */
- bool multiline;
- bool ignoreCase;
- void setOffsetPair(size_t pairNum, int start, int end) {
- JS_ASSERT(int(2 * pairNum + 1) < offsetEnd && int(pairNum) < offsetEnd);
- JS_ASSERT(start <= end);
- JS_ASSERT_IF(start < 0, start == end && start == -1);
- DPRINTF(("setting offset pair at %u (%d, %d)\n", pairNum, start, end));
- offsetVector[2 * pairNum + 0] = start;
- offsetVector[2 * pairNum + 1] = end;
- }
- };
- /* The maximum remaining length of subject we are prepared to search for a
- reqByte match. */
- #define REQ_BYTE_MAX 1000
- /* The below limit restricts the number of "recursive" match calls in order to
- avoid spending exponential time on complex regular expressions. */
- static const unsigned matchLimit = 1000000;
- /*************************************************
- * Match a back-reference *
- *************************************************/
- /* If a back reference hasn't been set, the length that is passed is greater
- than the number of characters left in the string, so the match fails.
- Arguments:
- offset index into the offset vector
- subjectPtr points into the subject
- length length to be matched
- md points to match data block
- Returns: true if matched
- */
- static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md)
- {
- const UChar* p = md.startSubject + md.offsetVector[offset];
-
- /* Always fail if not enough characters left */
-
- if (length > md.endSubject - subjectPtr)
- return false;
-
- /* Separate the caselesss case for speed */
-
- if (md.ignoreCase) {
- while (length-- > 0) {
- UChar c = *p++;
- int othercase = jsc_pcre_ucp_othercase(c);
- UChar d = *subjectPtr++;
- if (c != d && othercase != d)
- return false;
- }
- }
- else {
- while (length-- > 0)
- if (*p++ != *subjectPtr++)
- return false;
- }
-
- return true;
- }
#ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
/* Use GCC's computed goto extension. */
/* For one test case this is more than 40% faster than the switch statement.
   We could avoid the use of the num argument entirely by using local labels,
   but using it for the GCC case as well as the non-GCC case allows us to share
   a bit more code and notice if we use conflicting numbers. */
#define RMATCH_WHERE(num) JS_EXTENSION(&&RRETURN_##num)
#define RRETURN_LABEL *stack.currentFrame->returnLocation
#else
/* Use numbered labels and switch statement at the bottom of the match function. */
#define RMATCH_WHERE(num) num
#define RRETURN_LABEL RRETURN_SWITCH
#endif

/* Shared tail of both "call" macros: jump to the top of the matcher, and define
   the label the callee returns to, where the callee's frame is popped. */
#define RECURSIVE_MATCH_COMMON(num) \
    goto RECURSE; \
    RRETURN_##num: \
    stack.popCurrentFrame();

/* Simulated call: push a frame for instruction |ra| with bracket chain |rb|,
   resuming at return site |num|. */
#define RECURSIVE_MATCH(num, ra, rb) \
    do { \
        stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
        RECURSIVE_MATCH_COMMON(num) \
    } while (0)

/* As RECURSIVE_MATCH, but the new frame also opens a bracket group, with |gm|
   passed as its minSatisfied flag. */
#define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb, gm) \
    do { \
        stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \
        stack.currentFrame->startNewGroup(gm); \
        RECURSIVE_MATCH_COMMON(num) \
    } while (0)

/* Simulated return to whichever site the current frame recorded. */
#define RRETURN do { JS_EXTENSION_(goto RRETURN_LABEL); } while (0)

/* Simulated return reporting failure. */
#define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0)
- /*************************************************
- * Match from current position *
- *************************************************/
- /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character
- in the subject string, while substringStart holds the value of subjectPtr at the start of the
- last bracketed group - used for breaking infinite loops matching zero-length
- strings. This function is called recursively in many circumstances. Whenever it
- returns a negative (error) response, the outer match() call must also return the
- same response.
- Arguments:
- subjectPtr pointer in subject
- instructionPtr position in code
- offsetTop current top pointer
- md pointer to "static" info for the match
- Returns: 1 if matched ) these values are >= 0
- 0 if failed to match )
- a negative error value if aborted by an error condition
- (e.g. stopped by repeated call or recursion limit)
- */
- static const unsigned numFramesOnStack = 16;
- struct MatchStack {
- JSArenaPool *regExpPool;
- void *regExpPoolMark;
- MatchStack(JSArenaPool *regExpPool)
- : regExpPool(regExpPool)
- , regExpPoolMark(JS_ARENA_MARK(regExpPool))
- , framesEnd(frames + numFramesOnStack)
- , currentFrame(frames)
- , size(1) // match() creates accesses the first frame w/o calling pushNewFrame
- {
- JS_ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack);
- JS_ASSERT(regExpPool);
- for (size_t i = 0; i < numFramesOnStack; ++i)
- frames[i].init(regExpPool);
- }
- ~MatchStack() { JS_ARENA_RELEASE(regExpPool, regExpPoolMark); }
-
- MatchFrame frames[numFramesOnStack];
- MatchFrame* framesEnd;
- MatchFrame* currentFrame;
- unsigned size;
-
- bool canUseStackBufferForNextFrame() {
- return size < numFramesOnStack;
- }
-
- MatchFrame* allocateNextFrame() {
- if (canUseStackBufferForNextFrame())
- return currentFrame + 1;
- // FIXME: bug 574459 -- no NULL check
- MatchFrame *frame = js::OffTheBooks::new_<MatchFrame>();
- frame->init(regExpPool);
- return frame;
- }
-
- void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) {
- MatchFrame* newframe = allocateNextFrame();
- newframe->previousFrame = currentFrame;
- newframe->args.subjectPtr = currentFrame->args.subjectPtr;
- newframe->args.offsetTop = currentFrame->args.offsetTop;
- newframe->args.instructionPtr = instructionPtr;
- newframe->args.bracketChain = bracketChain;
- newframe->returnLocation = returnLocation;
- size++;
- currentFrame = newframe;
- }
-
- void popCurrentFrame() {
- MatchFrame* oldFrame = currentFrame;
- currentFrame = currentFrame->previousFrame;
- if (size > numFramesOnStack)
- js::Foreground::delete_(oldFrame);
- size--;
- }
- void popAllFrames() {
- while (size)
- popCurrentFrame();
- }
- };
- static int matchError(int errorCode, MatchStack& stack)
- {
- stack.popAllFrames();
- return errorCode;
- }
- /* Get the next UTF-8 character, not advancing the pointer, incrementing length
- if there are extra bytes. This is called when we know we are in UTF-8 mode. */
- static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len)
- {
- c = *subjectPtr;
- if ((c & 0xc0) == 0xc0) {
- int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
- int gcss = 6 * gcaa;
- c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss;
- for (int gcii = 1; gcii <= gcaa; gcii++) {
- gcss -= 6;
- c |= (subjectPtr[gcii] & 0x3f) << gcss;
- }
- len += gcaa;
- }
- }
- static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats)
- {
- // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR
- static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 };
- static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 };
- JS_ASSERT(instructionOffset >= 0);
- JS_ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR));
- minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2
- minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset];
- maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset];
- }
- /* Helper class for passing a flag value from one op to the next that runs.
- This allows us to set the flag in certain ops. When the flag is read, it
- will be true only if the previous op set the flag, otherwise it is false. */
class LinearFlag {
public:
    /* The flag starts out cleared. */
    LinearFlag() : flag(false) {}

    /* Report whether the flag was set since the last read, and clear it so
       the next read sees false unless set() is called again. */
    bool readAndClear() {
        if (!flag)
            return false;
        flag = false;
        return true;
    }

    /* Raise the flag for the next reader. */
    void set() { flag = true; }

private:
    bool flag;
};
- static int
- match(JSArenaPool *regExpPool, const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md)
- {
- bool isMatch = false;
- int min;
- bool minimize = false; /* Initialization not really needed, but some compilers think so. */
- unsigned remainingMatchCount = matchLimit;
- int othercase; /* Declare here to avoid errors during jumps */
- bool minSatisfied;
-
- MatchStack stack(regExpPool);
- LinearFlag minSatNextBracket;
- /* The opcode jump table. */
- #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
- #define EMIT_JUMP_TABLE_ENTRY(opcode) JS_EXTENSION(&&LABEL_OP_##opcode)
- static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) };
- #undef EMIT_JUMP_TABLE_ENTRY
- #endif
-
- /* One-time setup of the opcode jump table. */
- #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
- for (int i = 255; !opcodeJumpTable[i]; i--)
- opcodeJumpTable[i] = &&CAPTURING_BRACKET;
- #endif
-
- #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION
- // Shark shows this as a hot line
- // Using a static const here makes this line disappear, but makes later access hotter (not sure why)
- stack.currentFrame->returnLocation = JS_EXTENSION(&&RETURN);
- #else
- stack.currentFrame->returnLocation = 0;
- #endif
- stack.currentFrame->args.subjectPtr = subjectPtr;
- stack.currentFrame->args.instructionPtr = instructionPtr;
- stack.currentFrame->args.offsetTop = offsetTop;
- stack.currentFrame->args.bracketChain = 0;
- stack.currentFrame->startNewGroup(false);
-
- /* This is where control jumps back to to effect "recursion" */
-
- RECURSE:
- if (!--remainingMatchCount)
- return matchError(JSRegExpErrorHitLimit, stack);
- /* Now start processing the operations. */
-
- #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
- while (true)
- #endif
- {
-
- #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
- #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode
- #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr]
- #else
- #define BEGIN_OPCODE(opcode) case OP_##opcode
- #define NEXT_OPCODE continue
- #endif
- #define LOCALS(__ident) (stack.currentFrame->locals.__ident)
-
- #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP
- NEXT_OPCODE;
- #else
- switch (*stack.currentFrame->args.instructionPtr)
- #endif
- {
- /* Non-capturing bracket: optimized */
-
- BEGIN_OPCODE(BRA):
- NON_CAPTURING_BRACKET:
- DPRINTF(("start non-capturing bracket\n"));
- stack.currentFrame->extractBrackets(stack.currentFrame->args.instructionPtr);
- /* If we see no ALT, we have to skip three bytes of bracket data (link plus nested
- bracket data). */
- stack.currentFrame->locals.skipBytes = 3;
- /* We must compute this value at the top, before we move the instruction pointer. */
- stack.currentFrame->locals.minSatisfied = minSatNextBracket.readAndClear();
- do {
- /* We need to extract this into a variable so we can correctly pass it by value
- through RECURSIVE_MATCH_NEW_GROUP, which modifies currentFrame. */
- minSatisfied = stack.currentFrame->locals.minSatisfied;
- RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + stack.currentFrame->locals.skipBytes + LINK_SIZE, stack.currentFrame->args.bracketChain, minSatisfied);
- if (isMatch) {
- DPRINTF(("non-capturing bracket succeeded\n"));
- RRETURN;
- }
- stack.currentFrame->locals.skipBytes = 1;
- stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
- } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
- DPRINTF(("non-capturing bracket failed\n"));
- for (size_t i = LOCALS(minBracket); i < size_t(LOCALS(limitBracket)); ++i)
- md.setOffsetPair(i, -1, -1);
- RRETURN;
-
- /* Skip over large extraction number data if encountered. */
-
- BEGIN_OPCODE(BRANUMBER):
- stack.currentFrame->args.instructionPtr += 3;
- NEXT_OPCODE;
-
- /* End of the pattern. */
-
- BEGIN_OPCODE(END):
- md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */
- md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */
- isMatch = true;
- RRETURN;
-
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
-
- BEGIN_OPCODE(ASSERT):
- {
- uint16_t bracketMess = get2ByteValue(stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE);
- LOCALS(minBracket) = (bracketMess >> 8) & 0xff;
- LOCALS(limitBracket) = bracketMess & 0xff;
- JS_ASSERT(LOCALS(minBracket) <= LOCALS(limitBracket));
- }
- stack.currentFrame->locals.skipBytes = 3;
- do {
- RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + stack.currentFrame->locals.skipBytes + LINK_SIZE, NULL, false);
- if (isMatch)
- break;
- stack.currentFrame->locals.skipBytes = 1;
- stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
- } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
- if (*stack.currentFrame->args.instructionPtr == OP_KET) {
- for (size_t i = LOCALS(minBracket); i < size_t(LOCALS(limitBracket)); ++i)
- md.setOffsetPair(i, -1, -1);
- RRETURN_NO_MATCH;
- }
-
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
-
- advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
- stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
- stack.currentFrame->args.offsetTop = md.endOffsetTop;
- NEXT_OPCODE;
-
- /* Negative assertion: all branches must fail to match */
-
- BEGIN_OPCODE(ASSERT_NOT):
- stack.currentFrame->locals.skipBytes = 3;
- {
- unsigned bracketMess = get2ByteValue(stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE);
- LOCALS(minBracket) = (bracketMess >> 8) & 0xff;
- LOCALS(limitBracket) = bracketMess & 0xff;
- }
- JS_ASSERT(LOCALS(minBracket) <= LOCALS(limitBracket));
- do {
- RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + stack.currentFrame->locals.skipBytes + LINK_SIZE, NULL, false);
- if (isMatch)
- RRETURN_NO_MATCH;
- stack.currentFrame->locals.skipBytes = 1;
- stack.currentFrame->args.instructionPtr += getLinkValue(stack.currentFrame->args.instructionPtr + 1);
- } while (*stack.currentFrame->args.instructionPtr == OP_ALT);
-
- stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.skipBytes + LINK_SIZE;
- NEXT_OPCODE;
-
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
-
- BEGIN_OPCODE(ALT):
- advanceToEndOfBracket(stack.currentFrame->args.instructionPtr);
- NEXT_OPCODE;
-
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
-
- BEGIN_OPCODE(BRAZERO): {
- stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
- stack.currentFrame->extractBrackets(stack.currentFrame->args.instructionPtr + 1);
- stack.currentFrame->saveOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- minSatNextBracket.set();
- RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain, true);
- if (isMatch)
- RRETURN;
- stack.currentFrame->restoreOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
- stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE;
- NEXT_OPCODE;
- }
-
- BEGIN_OPCODE(BRAMINZERO): {
- stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1;
- advanceToEndOfBracket(stack.currentFrame->locals.startOfRepeatingBracket);
- RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain, false);
- if (isMatch)
- RRETURN;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- }
-
- /* End of a group, repeated or non-repeating. If we are at the end of
- an assertion "group", stop matching and return 1, but record the
- current high water mark for use by positive assertions. Do this also
- for the "once" (not-backup up) groups. */
-
- BEGIN_OPCODE(KET):
- BEGIN_OPCODE(KETRMIN):
- BEGIN_OPCODE(KETRMAX):
- stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(stack.currentFrame->args.instructionPtr + 1);
- stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart;
- stack.currentFrame->locals.minSatisfied = stack.currentFrame->args.bracketChain->minSatisfied;
- /* Back up the stack of bracket start pointers. */
- stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket;
- if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) {
- md.endOffsetTop = stack.currentFrame->args.offsetTop;
- isMatch = true;
- RRETURN;
- }
-
- /* In all other cases except a conditional group we have to check the
- group number back at the start and if necessary complete handling an
- extraction by setting the offsets and bumping the high water mark. */
-
- stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA;
-
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
-
- if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX)
- stack.currentFrame->locals.number = get2ByteValue(stack.currentFrame->locals.instructionPtrAtStartOfOnce + 4 + LINK_SIZE);
- stack.currentFrame->locals.offset = 2 * stack.currentFrame->locals.number;
-
- DPRINTF(("end bracket %d\n", stack.currentFrame->locals.number));
-
- /* Test for a numbered group. This includes groups called as a result
- of recursion. Note that whole-pattern recursion is coded as a recurse
- into group 0, so it won't be picked up here. Instead, we catch it when
- the OP_END is reached. */
-
- if (stack.currentFrame->locals.number > 0) {
- if (stack.currentFrame->locals.offset >= md.offsetMax)
- md.offsetOverflow = true;
- else {
- int start = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number];
- int end = stack.currentFrame->args.subjectPtr - md.startSubject;
- if (start == end && stack.currentFrame->locals.minSatisfied) {
- DPRINTF(("empty string while group already matched; bailing"));
- RRETURN_NO_MATCH;
- }
- DPRINTF(("saving; start: %d; end: %d\n", start, end));
- JS_ASSERT(start <= end);
- md.setOffsetPair(stack.currentFrame->locals.number, start, end);
- if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset)
- stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2;
- }
- }
-
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
-
- if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
- DPRINTF(("non-repeating ket or empty match\n"));
- if (stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction && stack.currentFrame->locals.minSatisfied) {
- DPRINTF(("empty string while group already matched; bailing"));
- RRETURN_NO_MATCH;
- }
- stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE;
- NEXT_OPCODE;
- }
-
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. */
-
- stack.currentFrame->extractBrackets(LOCALS(instructionPtrAtStartOfOnce));
- JS_ASSERT_IF(LOCALS(number), LOCALS(minBracket) <= LOCALS(number) && LOCALS(number) < LOCALS(limitBracket));
- if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) {
- stack.currentFrame->saveOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
- if (isMatch)
- RRETURN;
- else
- stack.currentFrame->restoreOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- DPRINTF(("recursively matching lazy group\n"));
- minSatNextBracket.set();
- RECURSIVE_MATCH_NEW_GROUP(17, LOCALS(instructionPtrAtStartOfOnce), stack.currentFrame->args.bracketChain, true);
- } else { /* OP_KETRMAX */
- stack.currentFrame->saveOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- stack.currentFrame->clobberOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- DPRINTF(("recursively matching greedy group\n"));
- minSatNextBracket.set();
- RECURSIVE_MATCH_NEW_GROUP(18, LOCALS(instructionPtrAtStartOfOnce), stack.currentFrame->args.bracketChain, true);
- if (isMatch)
- RRETURN;
- else
- stack.currentFrame->restoreOffsets(LOCALS(minBracket), LOCALS(limitBracket), md.offsetVector, md.offsetEnd);
- RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain);
- }
- RRETURN;
-
- /* Start of subject. */
- BEGIN_OPCODE(CIRC):
- if (stack.currentFrame->args.subjectPtr != md.startSubject)
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- /* After internal newline if multiline. */
- BEGIN_OPCODE(BOL):
- if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(stack.currentFrame->args.subjectPtr[-1]))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- /* End of subject. */
- BEGIN_OPCODE(DOLL):
- if (stack.currentFrame->args.subjectPtr < md.endSubject)
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- /* Before internal newline if multiline. */
- BEGIN_OPCODE(EOL):
- if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(*stack.currentFrame->args.subjectPtr))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
-
- /* Word boundary assertions */
-
- BEGIN_OPCODE(NOT_WORD_BOUNDARY):
- BEGIN_OPCODE(WORD_BOUNDARY): {
- bool currentCharIsWordChar = false;
- bool previousCharIsWordChar = false;
-
- if (stack.currentFrame->args.subjectPtr > md.startSubject)
- previousCharIsWordChar = isWordChar(stack.currentFrame->args.subjectPtr[-1]);
- if (stack.currentFrame->args.subjectPtr < md.endSubject)
- currentCharIsWordChar = isWordChar(*stack.currentFrame->args.subjectPtr);
-
- /* Now see if the situation is what we want */
- bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY);
- if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar)
- RRETURN_NO_MATCH;
- NEXT_OPCODE;
- }
-
- /* Match a single character type; inline for speed */
-
- BEGIN_OPCODE(NOT_NEWLINE):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (isNewline(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- BEGIN_OPCODE(NOT_DIGIT):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- BEGIN_OPCODE(DIGIT):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (!isASCIIDigit(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- BEGIN_OPCODE(NOT_WHITESPACE):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (isSpaceChar(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
- BEGIN_OPCODE(WHITESPACE):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (!isSpaceChar(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
-
- BEGIN_OPCODE(NOT_WORDCHAR):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (isWordChar(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
-
- BEGIN_OPCODE(WORDCHAR):
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- if (!isWordChar(*stack.currentFrame->args.subjectPtr++))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.instructionPtr++;
- NEXT_OPCODE;
-
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
-
- BEGIN_OPCODE(REF):
- stack.currentFrame->locals.offset = get2ByteValue(stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */
- stack.currentFrame->args.instructionPtr += 3; /* Advance past item */
-
- /* If the reference is unset (its offset is not below offsetTop, or the
- recorded start offset is negative), treat it as having length zero. For a
- repeated reference the zero-length check below then simply continues with
- the main loop; otherwise matchRef is called with a length of zero. */
-
- if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0)
- stack.currentFrame->locals.length = 0;
- else
- stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset];
-
- /* Set up for repetition, or handle the non-repeated case */
-
- switch (*stack.currentFrame->args.instructionPtr) {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
- min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
- stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
- if (stack.currentFrame->locals.max == 0)
- stack.currentFrame->locals.max = INT_MAX;
- stack.currentFrame->args.instructionPtr += 5;
- break;
-
- default: /* No repeat follows */
- if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
- NEXT_OPCODE;
- }
-
- /* If the length of the reference is zero, just continue with the
- main loop. */
-
- if (stack.currentFrame->locals.length == 0)
- NEXT_OPCODE;
-
- /* First, ensure the minimum number of matches are present. */
-
- for (int i = 1; i <= min; i++) {
- if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
- RRETURN_NO_MATCH;
- stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
- }
-
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
-
- if (min == stack.currentFrame->locals.max)
- NEXT_OPCODE;
-
- /* If minimizing, keep trying and advancing the pointer */
-
- if (minimize) {
- for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) {
- RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
- if (isMatch)
- RRETURN;
- if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
- RRETURN;
- stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
- }
- /* Control never reaches here */
- }
-
- /* If maximizing, find the longest string and work backwards */
-
- else {
- stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr;
- for (int i = min; i < stack.currentFrame->locals.max; i++) {
- if (!matchRef(stack.currentFrame->locals.offset, stack.currentFrame->args.subjectPtr, stack.currentFrame->locals.length, md))
- break;
- stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length;
- }
- while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) {
- RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain);
- if (isMatch)
- RRETURN;
- stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length;
- }
- RRETURN_NO_MATCH;
- }
- /* Control never reaches here */
-
- /* Match a bit-mapped character class, possibly repeatedly. This op code is
- used when all the characters in the class have values in the range 0-255,
- and either the matching is caseful, or the characters are in the range
- 0-127 when UTF-8 processing is enabled. The only difference between
- OP_CLASS and OP_NCLASS occurs when a data character outside the range is
- encountered.
-
- First, look past the end of the item to see if there is repeat information
- following. Then obey similar code to character type repeats - written out
- again for speed. */
-
- BEGIN_OPCODE(NCLASS):
- BEGIN_OPCODE(CLASS):
- stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */
- stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */
-
- switch (*stack.currentFrame->args.instructionPtr) {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- repeatInformationFromInstructionOffset(*stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, min, stack.currentFrame->locals.max);
- break;
-
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE);
- min = get2ByteValue(stack.currentFrame->args.instructionPtr + 1);
- stack.currentFrame->locals.max = get2ByteValue(stack.currentFrame->args.instructionPtr + 3);
- if (stack.currentFrame->locals.max == 0)
- stack.currentFrame->locals.max = INT_MAX;
- stack.currentFrame->args.instructionPtr += 5;
- break;
-
- default: /* No repeat follows */
- min = stack.currentFrame->locals.max = 1;
- break;
- }
-
- /* First, ensure the minimum number of matches are present. */
-
- for (int i = 1; i <= min; i++) {
- if (stack.currentFrame->args.subjectPtr >= md.endSubject)
- RRETURN_NO_MATCH;
- int c = *stack.currentFrame->args.subjectPtr++;
- if (c > 255) {
- if (stack.currentFrame->locals.data[-1] == OP_CLASS)
- RRETURN_NO_MATCH;
- } else {
- if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7))))
- RRETURN_NO_MATCH;
- }…