/deps/v8/src/jsregexp.cc
C++ | 6113 lines | 4598 code | 684 blank | 831 comment | 1004 complexity | aef8ad3ebb01bc84c3cbdd8aea488a1e MD5 | raw file
Possible License(s): 0BSD, Apache-2.0, MPL-2.0-no-copyleft-exception, JSON, WTFPL, CC-BY-SA-3.0, Unlicense, ISC, BSD-3-Clause, MIT, AGPL-3.0
Large files are truncated, but you can click here to view the full file
- // Copyright 2012 the V8 project authors. All rights reserved.
- // Use of this source code is governed by a BSD-style license that can be
- // found in the LICENSE file.
- #include "src/v8.h"
- #include "src/ast.h"
- #include "src/base/platform/platform.h"
- #include "src/compilation-cache.h"
- #include "src/compiler.h"
- #include "src/execution.h"
- #include "src/factory.h"
- #include "src/jsregexp-inl.h"
- #include "src/jsregexp.h"
- #include "src/ostreams.h"
- #include "src/parser.h"
- #include "src/regexp-macro-assembler.h"
- #include "src/regexp-macro-assembler-irregexp.h"
- #include "src/regexp-macro-assembler-tracer.h"
- #include "src/regexp-stack.h"
- #include "src/runtime.h"
- #include "src/string-search.h"
- #ifndef V8_INTERPRETED_REGEXP
- #if V8_TARGET_ARCH_IA32
- #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT
- #elif V8_TARGET_ARCH_X64
- #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT
- #elif V8_TARGET_ARCH_ARM64
- #include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT
- #elif V8_TARGET_ARCH_ARM
- #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT
- #elif V8_TARGET_ARCH_MIPS
- #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT
- #elif V8_TARGET_ARCH_MIPS64
- #include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT
- #elif V8_TARGET_ARCH_X87
- #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT
- #else
- #error Unsupported target architecture.
- #endif
- #endif
- #include "src/interpreter-irregexp.h"
- namespace v8 {
- namespace internal {
- MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
- Handle<JSFunction> constructor,
- Handle<String> pattern,
- Handle<String> flags) {
- // Call the construct code with 2 arguments.
- Handle<Object> argv[] = { pattern, flags };
- return Execution::New(constructor, ARRAY_SIZE(argv), argv);
- }
- static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
- int flags = JSRegExp::NONE;
- for (int i = 0; i < str->length(); i++) {
- switch (str->Get(i)) {
- case 'i':
- flags |= JSRegExp::IGNORE_CASE;
- break;
- case 'g':
- flags |= JSRegExp::GLOBAL;
- break;
- case 'm':
- flags |= JSRegExp::MULTILINE;
- break;
- }
- }
- return JSRegExp::Flags(flags);
- }
- MUST_USE_RESULT
- static inline MaybeHandle<Object> ThrowRegExpException(
- Handle<JSRegExp> re,
- Handle<String> pattern,
- Handle<String> error_text,
- const char* message) {
- Isolate* isolate = re->GetIsolate();
- Factory* factory = isolate->factory();
- Handle<FixedArray> elements = factory->NewFixedArray(2);
- elements->set(0, *pattern);
- elements->set(1, *error_text);
- Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
- Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
- return isolate->Throw<Object>(regexp_err);
- }
- ContainedInLattice AddRange(ContainedInLattice containment,
- const int* ranges,
- int ranges_length,
- Interval new_range) {
- DCHECK((ranges_length & 1) == 1);
- DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
- if (containment == kLatticeUnknown) return containment;
- bool inside = false;
- int last = 0;
- for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
- // Consider the range from last to ranges[i].
- // We haven't got to the new range yet.
- if (ranges[i] <= new_range.from()) continue;
- // New range is wholly inside last-ranges[i]. Note that new_range.to() is
- // inclusive, but the values in ranges are not.
- if (last <= new_range.from() && new_range.to() < ranges[i]) {
- return Combine(containment, inside ? kLatticeIn : kLatticeOut);
- }
- return kLatticeUnknown;
- }
- return containment;
- }
// Maximum lookahead considered for Boyer-Moore. A larger value makes code
// generation slower, a smaller value lowers the V8 benchmark score.
const int kMaxLookaheadForBoyerMoore = 8;

// Length threshold below which Boyer-Moore is not attempted. In a 3-character
// pattern you can step forwards at most 3 characters at a time, which is not
// always enough to pay for the extra logic.
const int kPatternTooShortForBoyerMoore = 2;
- // Identifies the sort of regexps where the regexp engine is faster
- // than the code used for atom matches.
- static bool HasFewDifferentCharacters(Handle<String> pattern) {
- int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
- if (length <= kPatternTooShortForBoyerMoore) return false;
- const int kMod = 128;
- bool character_found[kMod];
- int different = 0;
- memset(&character_found[0], 0, sizeof(character_found));
- for (int i = 0; i < length; i++) {
- int ch = (pattern->Get(i) & (kMod - 1));
- if (!character_found[ch]) {
- character_found[ch] = true;
- different++;
- // We declare a regexp low-alphabet if it has at least 3 times as many
- // characters as it has different characters.
- if (different * 3 > length) return false;
- }
- }
- return true;
- }
- // Generic RegExp methods. Dispatches to implementation specific methods.
- MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
- Handle<String> pattern,
- Handle<String> flag_str) {
- Isolate* isolate = re->GetIsolate();
- Zone zone(isolate);
- JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
- CompilationCache* compilation_cache = isolate->compilation_cache();
- MaybeHandle<FixedArray> maybe_cached =
- compilation_cache->LookupRegExp(pattern, flags);
- Handle<FixedArray> cached;
- bool in_cache = maybe_cached.ToHandle(&cached);
- LOG(isolate, RegExpCompileEvent(re, in_cache));
- Handle<Object> result;
- if (in_cache) {
- re->set_data(*cached);
- return re;
- }
- pattern = String::Flatten(pattern);
- PostponeInterruptsScope postpone(isolate);
- RegExpCompileData parse_result;
- FlatStringReader reader(isolate, pattern);
- if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
- &parse_result, &zone)) {
- // Throw an exception if we fail to parse the pattern.
- return ThrowRegExpException(re,
- pattern,
- parse_result.error,
- "malformed_regexp");
- }
- bool has_been_compiled = false;
- if (parse_result.simple &&
- !flags.is_ignore_case() &&
- !HasFewDifferentCharacters(pattern)) {
- // Parse-tree is a single atom that is equal to the pattern.
- AtomCompile(re, pattern, flags, pattern);
- has_been_compiled = true;
- } else if (parse_result.tree->IsAtom() &&
- !flags.is_ignore_case() &&
- parse_result.capture_count == 0) {
- RegExpAtom* atom = parse_result.tree->AsAtom();
- Vector<const uc16> atom_pattern = atom->data();
- Handle<String> atom_string;
- ASSIGN_RETURN_ON_EXCEPTION(
- isolate, atom_string,
- isolate->factory()->NewStringFromTwoByte(atom_pattern),
- Object);
- if (!HasFewDifferentCharacters(atom_string)) {
- AtomCompile(re, pattern, flags, atom_string);
- has_been_compiled = true;
- }
- }
- if (!has_been_compiled) {
- IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
- }
- DCHECK(re->data()->IsFixedArray());
- // Compilation succeeded so the data is set on the regexp
- // and we can store it in the cache.
- Handle<FixedArray> data(FixedArray::cast(re->data()));
- compilation_cache->PutRegExp(pattern, flags, data);
- return re;
- }
- MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
- Handle<String> subject,
- int index,
- Handle<JSArray> last_match_info) {
- switch (regexp->TypeTag()) {
- case JSRegExp::ATOM:
- return AtomExec(regexp, subject, index, last_match_info);
- case JSRegExp::IRREGEXP: {
- return IrregexpExec(regexp, subject, index, last_match_info);
- }
- default:
- UNREACHABLE();
- return MaybeHandle<Object>();
- }
- }
- // RegExp Atom implementation: Simple string search using indexOf.
- void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
- Handle<String> pattern,
- JSRegExp::Flags flags,
- Handle<String> match_pattern) {
- re->GetIsolate()->factory()->SetRegExpAtomData(re,
- JSRegExp::ATOM,
- pattern,
- flags,
- match_pattern);
- }
- static void SetAtomLastCapture(FixedArray* array,
- String* subject,
- int from,
- int to) {
- SealHandleScope shs(array->GetIsolate());
- RegExpImpl::SetLastCaptureCount(array, 2);
- RegExpImpl::SetLastSubject(array, subject);
- RegExpImpl::SetLastInput(array, subject);
- RegExpImpl::SetCapture(array, 0, from);
- RegExpImpl::SetCapture(array, 1, to);
- }
- int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
- Handle<String> subject,
- int index,
- int32_t* output,
- int output_size) {
- Isolate* isolate = regexp->GetIsolate();
- DCHECK(0 <= index);
- DCHECK(index <= subject->length());
- subject = String::Flatten(subject);
- DisallowHeapAllocation no_gc; // ensure vectors stay valid
- String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
- int needle_len = needle->length();
- DCHECK(needle->IsFlat());
- DCHECK_LT(0, needle_len);
- if (index + needle_len > subject->length()) {
- return RegExpImpl::RE_FAILURE;
- }
- for (int i = 0; i < output_size; i += 2) {
- String::FlatContent needle_content = needle->GetFlatContent();
- String::FlatContent subject_content = subject->GetFlatContent();
- DCHECK(needle_content.IsFlat());
- DCHECK(subject_content.IsFlat());
- // dispatch on type of strings
- index = (needle_content.IsAscii()
- ? (subject_content.IsAscii()
- ? SearchString(isolate,
- subject_content.ToOneByteVector(),
- needle_content.ToOneByteVector(),
- index)
- : SearchString(isolate,
- subject_content.ToUC16Vector(),
- needle_content.ToOneByteVector(),
- index))
- : (subject_content.IsAscii()
- ? SearchString(isolate,
- subject_content.ToOneByteVector(),
- needle_content.ToUC16Vector(),
- index)
- : SearchString(isolate,
- subject_content.ToUC16Vector(),
- needle_content.ToUC16Vector(),
- index)));
- if (index == -1) {
- return i / 2; // Return number of matches.
- } else {
- output[i] = index;
- output[i+1] = index + needle_len;
- index += needle_len;
- }
- }
- return output_size / 2;
- }
- Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
- Handle<String> subject,
- int index,
- Handle<JSArray> last_match_info) {
- Isolate* isolate = re->GetIsolate();
- static const int kNumRegisters = 2;
- STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
- int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
- int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
- if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
- DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
- SealHandleScope shs(isolate);
- FixedArray* array = FixedArray::cast(last_match_info->elements());
- SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
- return last_match_info;
- }
- // Irregexp implementation.
- // Ensures that the regexp object contains a compiled version of the
- // source for either ASCII or non-ASCII strings.
- // If the compiled version doesn't already exist, it is compiled
- // from the source pattern.
- // If compilation fails, an exception is thrown and this function
- // returns false.
- bool RegExpImpl::EnsureCompiledIrregexp(
- Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
- Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
- #ifdef V8_INTERPRETED_REGEXP
- if (compiled_code->IsByteArray()) return true;
- #else // V8_INTERPRETED_REGEXP (RegExp native code)
- if (compiled_code->IsCode()) return true;
- #endif
- // We could potentially have marked this as flushable, but have kept
- // a saved version if we did not flush it yet.
- Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
- if (saved_code->IsCode()) {
- // Reinstate the code in the original place.
- re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
- DCHECK(compiled_code->IsSmi());
- return true;
- }
- return CompileIrregexp(re, sample_subject, is_ascii);
- }
- static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
- bool is_ascii,
- Handle<String> error_message,
- Isolate* isolate) {
- Factory* factory = isolate->factory();
- Handle<FixedArray> elements = factory->NewFixedArray(2);
- elements->set(0, re->Pattern());
- elements->set(1, *error_message);
- Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
- Handle<Object> regexp_err =
- factory->NewSyntaxError("malformed_regexp", array);
- isolate->Throw(*regexp_err);
- return false;
- }
- bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
- Handle<String> sample_subject,
- bool is_ascii) {
- // Compile the RegExp.
- Isolate* isolate = re->GetIsolate();
- Zone zone(isolate);
- PostponeInterruptsScope postpone(isolate);
- // If we had a compilation error the last time this is saved at the
- // saved code index.
- Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
- // When arriving here entry can only be a smi, either representing an
- // uncompiled regexp, a previous compilation error, or code that has
- // been flushed.
- DCHECK(entry->IsSmi());
- int entry_value = Smi::cast(entry)->value();
- DCHECK(entry_value == JSRegExp::kUninitializedValue ||
- entry_value == JSRegExp::kCompilationErrorValue ||
- (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
- if (entry_value == JSRegExp::kCompilationErrorValue) {
- // A previous compilation failed and threw an error which we store in
- // the saved code index (we store the error message, not the actual
- // error). Recreate the error object and throw it.
- Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
- DCHECK(error_string->IsString());
- Handle<String> error_message(String::cast(error_string));
- CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
- return false;
- }
- JSRegExp::Flags flags = re->GetFlags();
- Handle<String> pattern(re->Pattern());
- pattern = String::Flatten(pattern);
- RegExpCompileData compile_data;
- FlatStringReader reader(isolate, pattern);
- if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
- &compile_data,
- &zone)) {
- // Throw an exception if we fail to parse the pattern.
- // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
- USE(ThrowRegExpException(re,
- pattern,
- compile_data.error,
- "malformed_regexp"));
- return false;
- }
- RegExpEngine::CompilationResult result =
- RegExpEngine::Compile(&compile_data,
- flags.is_ignore_case(),
- flags.is_global(),
- flags.is_multiline(),
- pattern,
- sample_subject,
- is_ascii,
- &zone);
- if (result.error_message != NULL) {
- // Unable to compile regexp.
- Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
- CStrVector(result.error_message)).ToHandleChecked();
- CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
- return false;
- }
- Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
- data->set(JSRegExp::code_index(is_ascii), result.code);
- int register_max = IrregexpMaxRegisterCount(*data);
- if (result.num_registers > register_max) {
- SetIrregexpMaxRegisterCount(*data, result.num_registers);
- }
- return true;
- }
- int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
- return Smi::cast(
- re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
- }
- void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
- re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
- }
- int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
- return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
- }
- int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
- return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
- }
- ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
- return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
- }
- Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
- return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
- }
- void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
- Handle<String> pattern,
- JSRegExp::Flags flags,
- int capture_count) {
- // Initialize compiled code entries to null.
- re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
- JSRegExp::IRREGEXP,
- pattern,
- flags,
- capture_count);
- }
- int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
- Handle<String> subject) {
- subject = String::Flatten(subject);
- // Check the asciiness of the underlying storage.
- bool is_ascii = subject->IsOneByteRepresentationUnderneath();
- if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
- #ifdef V8_INTERPRETED_REGEXP
- // Byte-code regexp needs space allocated for all its registers.
- // The result captures are copied to the start of the registers array
- // if the match succeeds. This way those registers are not clobbered
- // when we set the last match info from last successful match.
- return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
- (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
- #else // V8_INTERPRETED_REGEXP
- // Native regexp only needs room to output captures. Registers are handled
- // internally.
- return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
- #endif // V8_INTERPRETED_REGEXP
- }
- int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
- Handle<String> subject,
- int index,
- int32_t* output,
- int output_size) {
- Isolate* isolate = regexp->GetIsolate();
- Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
- DCHECK(index >= 0);
- DCHECK(index <= subject->length());
- DCHECK(subject->IsFlat());
- bool is_ascii = subject->IsOneByteRepresentationUnderneath();
- #ifndef V8_INTERPRETED_REGEXP
- DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
- do {
- EnsureCompiledIrregexp(regexp, subject, is_ascii);
- Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
- // The stack is used to allocate registers for the compiled regexp code.
- // This means that in case of failure, the output registers array is left
- // untouched and contains the capture results from the previous successful
- // match. We can use that to set the last match info lazily.
- NativeRegExpMacroAssembler::Result res =
- NativeRegExpMacroAssembler::Match(code,
- subject,
- output,
- output_size,
- index,
- isolate);
- if (res != NativeRegExpMacroAssembler::RETRY) {
- DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
- isolate->has_pending_exception());
- STATIC_ASSERT(
- static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
- STATIC_ASSERT(
- static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
- STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
- == RE_EXCEPTION);
- return static_cast<IrregexpResult>(res);
- }
- // If result is RETRY, the string has changed representation, and we
- // must restart from scratch.
- // In this case, it means we must make sure we are prepared to handle
- // the, potentially, different subject (the string can switch between
- // being internal and external, and even between being ASCII and UC16,
- // but the characters are always the same).
- IrregexpPrepare(regexp, subject);
- is_ascii = subject->IsOneByteRepresentationUnderneath();
- } while (true);
- UNREACHABLE();
- return RE_EXCEPTION;
- #else // V8_INTERPRETED_REGEXP
- DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
- // We must have done EnsureCompiledIrregexp, so we can get the number of
- // registers.
- int number_of_capture_registers =
- (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
- int32_t* raw_output = &output[number_of_capture_registers];
- // We do not touch the actual capture result registers until we know there
- // has been a match so that we can use those capture results to set the
- // last match info.
- for (int i = number_of_capture_registers - 1; i >= 0; i--) {
- raw_output[i] = -1;
- }
- Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
- IrregexpResult result = IrregexpInterpreter::Match(isolate,
- byte_codes,
- subject,
- raw_output,
- index);
- if (result == RE_SUCCESS) {
- // Copy capture results to the start of the registers array.
- MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
- }
- if (result == RE_EXCEPTION) {
- DCHECK(!isolate->has_pending_exception());
- isolate->StackOverflow();
- }
- return result;
- #endif // V8_INTERPRETED_REGEXP
- }
- MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
- Handle<String> subject,
- int previous_index,
- Handle<JSArray> last_match_info) {
- Isolate* isolate = regexp->GetIsolate();
- DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
- // Prepare space for the return values.
- #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
- if (FLAG_trace_regexp_bytecodes) {
- String* pattern = regexp->Pattern();
- PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
- PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
- }
- #endif
- int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
- if (required_registers < 0) {
- // Compiling failed with an exception.
- DCHECK(isolate->has_pending_exception());
- return MaybeHandle<Object>();
- }
- int32_t* output_registers = NULL;
- if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
- output_registers = NewArray<int32_t>(required_registers);
- }
- SmartArrayPointer<int32_t> auto_release(output_registers);
- if (output_registers == NULL) {
- output_registers = isolate->jsregexp_static_offsets_vector();
- }
- int res = RegExpImpl::IrregexpExecRaw(
- regexp, subject, previous_index, output_registers, required_registers);
- if (res == RE_SUCCESS) {
- int capture_count =
- IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
- return SetLastMatchInfo(
- last_match_info, subject, capture_count, output_registers);
- }
- if (res == RE_EXCEPTION) {
- DCHECK(isolate->has_pending_exception());
- return MaybeHandle<Object>();
- }
- DCHECK(res == RE_FAILURE);
- return isolate->factory()->null_value();
- }
- Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
- Handle<String> subject,
- int capture_count,
- int32_t* match) {
- DCHECK(last_match_info->HasFastObjectElements());
- int capture_register_count = (capture_count + 1) * 2;
- JSArray::EnsureSize(last_match_info,
- capture_register_count + kLastMatchOverhead);
- DisallowHeapAllocation no_allocation;
- FixedArray* array = FixedArray::cast(last_match_info->elements());
- if (match != NULL) {
- for (int i = 0; i < capture_register_count; i += 2) {
- SetCapture(array, i, match[i]);
- SetCapture(array, i + 1, match[i + 1]);
- }
- }
- SetLastCaptureCount(array, capture_register_count);
- SetLastSubject(array, *subject);
- SetLastInput(array, *subject);
- return last_match_info;
- }
- RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
- Handle<String> subject,
- bool is_global,
- Isolate* isolate)
- : register_array_(NULL),
- register_array_size_(0),
- regexp_(regexp),
- subject_(subject) {
- #ifdef V8_INTERPRETED_REGEXP
- bool interpreted = true;
- #else
- bool interpreted = false;
- #endif // V8_INTERPRETED_REGEXP
- if (regexp_->TypeTag() == JSRegExp::ATOM) {
- static const int kAtomRegistersPerMatch = 2;
- registers_per_match_ = kAtomRegistersPerMatch;
- // There is no distinction between interpreted and native for atom regexps.
- interpreted = false;
- } else {
- registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
- if (registers_per_match_ < 0) {
- num_matches_ = -1; // Signal exception.
- return;
- }
- }
- if (is_global && !interpreted) {
- register_array_size_ =
- Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
- max_matches_ = register_array_size_ / registers_per_match_;
- } else {
- // Global loop in interpreted regexp is not implemented. We choose
- // the size of the offsets vector so that it can only store one match.
- register_array_size_ = registers_per_match_;
- max_matches_ = 1;
- }
- if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
- register_array_ = NewArray<int32_t>(register_array_size_);
- } else {
- register_array_ = isolate->jsregexp_static_offsets_vector();
- }
- // Set state so that fetching the results the first time triggers a call
- // to the compiled regexp.
- current_match_index_ = max_matches_ - 1;
- num_matches_ = max_matches_;
- DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
- DCHECK_GE(register_array_size_, registers_per_match_);
- int32_t* last_match =
- ®ister_array_[current_match_index_ * registers_per_match_];
- last_match[0] = -1;
- last_match[1] = 0;
- }
- // -------------------------------------------------------------------
- // Implementation of the Irregexp regular expression engine.
- //
- // The Irregexp regular expression engine is intended to be a complete
- // implementation of ECMAScript regular expressions. It generates either
- // bytecodes or native code.
- // The Irregexp regexp engine is structured in three steps.
- // 1) The parser generates an abstract syntax tree. See ast.cc.
- // 2) From the AST a node network is created. The nodes are all
- // subclasses of RegExpNode. The nodes represent states when
- // executing a regular expression. Several optimizations are
- // performed on the node network.
- // 3) From the nodes we generate either byte codes or native code
- // that can actually execute the regular expression (perform
- // the search). The code generation step is described in more
- // detail below.
- // Code generation.
- //
- // The nodes are divided into four main categories.
- // * Choice nodes
- // These represent places where the regular expression can
- // match in more than one way. For example on entry to an
- // alternation (foo|bar) or a repetition (*, +, ? or {}).
- // * Action nodes
- // These represent places where some action should be
- // performed. Examples include recording the current position
- // in the input string to a register (in order to implement
- // captures) or other actions on register for example in order
- // to implement the counters needed for {} repetitions.
- // * Matching nodes
- // These attempt to match some element part of the input string.
- // Examples of elements include character classes, plain strings
- // or back references.
- // * End nodes
- // These are used to implement the actions required on finding
- // a successful match or failing to find a match.
- //
- // The code generated (whether as byte codes or native code) maintains
- // some state as it runs. This consists of the following elements:
- //
- // * The capture registers. Used for string captures.
- // * Other registers. Used for counters etc.
- // * The current position.
- // * The stack of backtracking information. Used when a matching node
- // fails to find a match and needs to try an alternative.
- //
- // Conceptual regular expression execution model:
- //
- // There is a simple conceptual model of regular expression execution
- // which will be presented first. The actual code generated is a more
- // efficient simulation of the simple conceptual model:
- //
- // * Choice nodes are implemented as follows:
- // For each choice except the last {
- // push current position
- // push backtrack code location
- // <generate code to test for choice>
- // backtrack code location:
- // pop current position
- // }
- // <generate code to test for last choice>
- //
- // * Action nodes are generated as follows:
- // <push affected registers on backtrack stack>
- // <generate code to perform action>
- // push backtrack code location
- // <generate code to test for following nodes>
- // backtrack code location:
- // <pop affected registers to restore their state>
- // <pop backtrack location from stack and go to it>
- //
- // * Matching nodes are generated as follows:
- // if input string matches at current position
- // update current position
- // <generate code to test for following nodes>
- // else
- // <pop backtrack location from stack and go to it>
- //
- // Thus it can be seen that the current position is saved and restored
- // by the choice nodes, whereas the registers are saved and restored by
- // the action nodes that manipulate them.
- //
- // The other interesting aspect of this model is that nodes are generated
- // at the point where they are needed by a recursive call to Emit(). If
- // the node has already been code generated then the Emit() call will
- // generate a jump to the previously generated code instead. In order to
- // limit recursion it is possible for the Emit() function to put the node
- // on a work list for later generation and instead generate a jump. The
- // destination of the jump is resolved later when the code is generated.
- //
- // Actual regular expression code generation.
- //
- // Code generation is actually more complicated than the above. In order
- // to improve the efficiency of the generated code some optimizations are
- // performed
- //
- // * Choice nodes have 1-character lookahead.
- // A choice node looks at the following character and eliminates some of
- // the choices immediately based on that character. This is not yet
- // implemented.
- // * Simple greedy loops store reduced backtracking information.
- // A quantifier like /.*foo/m will greedily match the whole input. It will
- // then need to backtrack to a point where it can match "foo". The naive
- // implementation of this would push each character position onto the
- // backtracking stack, then pop them off one by one. This would use space
- // proportional to the length of the input string. However since the "."
- // can only match in one way and always has a constant length (in this case
- // of 1) it suffices to store the current position on the top of the stack
- // once. Matching now becomes merely incrementing the current position and
- // backtracking becomes decrementing the current position and checking the
- // result against the stored current position. This is faster and saves
- // space.
- // * The current state is virtualized.
- // This is used to defer expensive operations until it is clear that they
- // are needed and to generate code for a node more than once, allowing
- // specialized and efficient versions of the code to be created. This is
- // explained in the section below.
- //
- // Execution state virtualization.
- //
- // Instead of emitting code, nodes that manipulate the state can record their
- // manipulation in an object called the Trace. The Trace object can record a
- // current position offset, an optional backtrack code location on the top of
- // the virtualized backtrack stack and some register changes. When a node is
- // to be emitted it can flush the Trace or update it. Flushing the Trace
- // will emit code to bring the actual state into line with the virtual state.
- // Avoiding flushing the state can postpone some work (e.g. updates of capture
- // registers). Postponing work can save time when executing the regular
- // expression since it may be found that the work never has to be done as a
- // failure to match can occur. In addition it is much faster to jump to a
- // known backtrack code location than it is to pop an unknown backtrack
- // location from the stack and jump there.
- //
- // The virtual state found in the Trace affects code generation. For example
- // the virtual state contains the difference between the actual current
- // position and the virtual current position, and matching code needs to use
- // this offset to attempt a match in the correct location of the input
- // string. Therefore code generated for a non-trivial trace is specialized
- // to that trace. The code generator therefore has the ability to generate
- // code for each node several times. In order to limit the size of the
- // generated code there is an arbitrary limit on how many specialized sets of
- // code may be generated for a given node. If the limit is reached, the
- // trace is flushed and a generic version of the code for a node is emitted.
- // This is subsequently used for that node. The code emitted for a
- // non-generic trace is not recorded in the node and so it cannot currently be
- // reused in the event that code generation is requested for an identical
- // trace.
- void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
- UNREACHABLE();
- }
- void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
- text->AddElement(TextElement::Atom(this), zone);
- }
- void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
- text->AddElement(TextElement::CharClass(this), zone);
- }
- void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
- for (int i = 0; i < elements()->length(); i++)
- text->AddElement(elements()->at(i), zone);
- }
- TextElement TextElement::Atom(RegExpAtom* atom) {
- return TextElement(ATOM, atom);
- }
- TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
- return TextElement(CHAR_CLASS, char_class);
- }
- int TextElement::length() const {
- switch (text_type()) {
- case ATOM:
- return atom()->length();
- case CHAR_CLASS:
- return 1;
- }
- UNREACHABLE();
- return 0;
- }
- DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
- if (table_ == NULL) {
- table_ = new(zone()) DispatchTable(zone());
- DispatchTableConstructor cons(table_, ignore_case, zone());
- cons.BuildTable(this);
- }
- return table_;
- }
- class FrequencyCollator {
- public:
- FrequencyCollator() : total_samples_(0) {
- for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
- frequencies_[i] = CharacterFrequency(i);
- }
- }
- void CountCharacter(int character) {
- int index = (character & RegExpMacroAssembler::kTableMask);
- frequencies_[index].Increment();
- total_samples_++;
- }
- // Does not measure in percent, but rather per-128 (the table size from the
- // regexp macro assembler).
- int Frequency(int in_character) {
- DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
- if (total_samples_ < 1) return 1; // Division by zero.
- int freq_in_per128 =
- (frequencies_[in_character].counter() * 128) / total_samples_;
- return freq_in_per128;
- }
- private:
- class CharacterFrequency {
- public:
- CharacterFrequency() : counter_(0), character_(-1) { }
- explicit CharacterFrequency(int character)
- : counter_(0), character_(character) { }
- void Increment() { counter_++; }
- int counter() { return counter_; }
- int character() { return character_; }
- private:
- int counter_;
- int character_;
- };
- private:
- CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
- int total_samples_;
- };
- class RegExpCompiler {
- public:
- RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
- Zone* zone);
- int AllocateRegister() {
- if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
- reg_exp_too_big_ = true;
- return next_register_;
- }
- return next_register_++;
- }
- RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
- RegExpNode* start,
- int capture_count,
- Handle<String> pattern);
- inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
- static const int kImplementationOffset = 0;
- static const int kNumberOfRegistersOffset = 0;
- static const int kCodeOffset = 1;
- RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
- EndNode* accept() { return accept_; }
- static const int kMaxRecursion = 100;
- inline int recursion_depth() { return recursion_depth_; }
- inline void IncrementRecursionDepth() { recursion_depth_++; }
- inline void DecrementRecursionDepth() { recursion_depth_--; }
- void SetRegExpTooBig() { reg_exp_too_big_ = true; }
- inline bool ignore_case() { return ignore_case_; }
- inline bool ascii() { return ascii_; }
- FrequencyCollator* frequency_collator() { return &frequency_collator_; }
- int current_expansion_factor() { return current_expansion_factor_; }
- void set_current_expansion_factor(int value) {
- current_expansion_factor_ = value;
- }
- Zone* zone() const { return zone_; }
- static const int kNoRegister = -1;
- private:
- EndNode* accept_;
- int next_register_;
- List<RegExpNode*>* work_list_;
- int recursion_depth_;
- RegExpMacroAssembler* macro_assembler_;
- bool ignore_case_;
- bool ascii_;
- bool reg_exp_too_big_;
- int current_expansion_factor_;
- FrequencyCollator frequency_collator_;
- Zone* zone_;
- };
- class RecursionCheck {
- public:
- explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
- compiler->IncrementRecursionDepth();
- }
- ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
- private:
- RegExpCompiler* compiler_;
- };
- static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
- return RegExpEngine::CompilationResult(isolate, "RegExp too big");
- }
- // Attempts to compile the regexp using an Irregexp code generator. Returns
- // a fixed array or a null handle depending on whether it succeeded.
- RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
- Zone* zone)
- : next_register_(2 * (capture_count + 1)),
- work_list_(NULL),
- recursion_depth_(0),
- ignore_case_(ignore_case),
- ascii_(ascii),
- reg_exp_too_big_(false),
- current_expansion_factor_(1),
- frequency_collator_(),
- zone_(zone) {
- accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
- DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
- }
- RegExpEngine::CompilationResult RegExpCompiler::Assemble(
- RegExpMacroAssembler* macro_assembler,
- RegExpNode* start,
- int capture_count,
- Handle<String> pattern) {
- Heap* heap = pattern->GetHeap();
- bool use_slow_safe_regexp_compiler = false;
- if (heap->total_regexp_code_generated() >
- RegExpImpl::kRegWxpCompiledLimit &&
- heap->isolate()->memory_allocator()->SizeExecutable() >
- RegExpImpl::kRegExpExecutableMemoryLimit) {
- use_slow_safe_regexp_compiler = true;
- }
- macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
- #ifdef DEBUG
- if (FLAG_trace_regexp_assembler)
- macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
- else
- #endif
- macro_assembler_ = macro_assembler;
- List <RegExpNode*> work_list(0);
- work_list_ = &work_list;
- Label fail;
- macro_assembler_->PushBacktrack(&fail);
- Trace new_trace;
- start->Emit(this, &new_trace);
- macro_assembler_->Bind(&fail);
- macro_assembler_->Fail();
- while (!work_list.is_empty()) {
- work_list.RemoveLast()->Emit(this, &new_trace);
- }
- if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
- Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
- heap->IncreaseTotalRegexpCodeGenerated(code->Size());
- work_list_ = NULL;
- #ifdef DEBUG
- if (FLAG_print_code) {
- CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
- OFStream os(trace_scope.file());
- Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
- }
- if (FLAG_trace_regexp_assembler) {
- delete macro_assembler_;
- }
- #endif
- return RegExpEngine::CompilationResult(*code, next_register_);
- }
- bool Trace::DeferredAction::Mentions(int that) {
- if (action_type() == ActionNode::CLEAR_CAPTURES) {
- Interval range = static_cast<DeferredClearCaptures*>(this)->range();
- return range.Contains(that);
- } else {
- return reg() == that;
- }
- }
- bool Trace::mentions_reg(int reg) {
- for (DeferredAction* action = actions_;
- action != NULL;
- action = action->next()) {
- if (action->Mentions(reg))
- return true;
- }
- return false;
- }
- bool Trace::GetStoredPosition(int reg, int* cp_offset) {
- DCHECK_EQ(0, *cp_offset);
- for (DeferredAction* action = actions_;
- action != NULL;
- action = action->next()) {
- if (action->Mentions(reg)) {
- if (action->action_type() == ActionNode::STORE_POSITION) {
- *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
- return true;
- } else {
- return false;
- }
- }
- }
- return false;
- }
- int Trace::FindAffectedRegisters(OutSet* affected_registers,
- Zone* zone) {
- int max_register = RegExpCompiler::kNoRegister;
- for (DeferredAction* action = actions_;
- action != NULL;
- action = action->next()) {
- if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
- Interval range = static_cast<DeferredClearCaptures*>(action)->range();
- for (int i = range.from(); i <= range.to(); i++)
- affected_registers->Set(i, zone);
- if (range.to() > max_register) max_register = range.to();
- } else {
- affected_registers->Set(action->reg(), zone);
- if (action->reg() > max_register) max_register = action->reg();
- }
- }
- return max_register;
- }
- void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
- int max_register,
- const OutSet& registers_to_pop,
- const OutSet& registers_to_clear) {
- for (int reg = max_register; reg >= 0; reg--) {
- if (registers_to_pop.Get(reg)) {
- assembler->PopRegister(reg);
- } else if (registers_to_clear.Get(reg)) {
- int clear_to = reg;
- while (reg > 0 && registers_to_clear.Get(reg - 1)) {
- reg--;
- }
- assembler->ClearRegisters(reg, clear_to);
- }
- }
- }
- void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
- int max_register,
- const OutSet& affected_registers,
- OutSet* registers_to_pop,
- OutSet* registers_to_clear,
- Zone* zone) {
- // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
- const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
- // Count pushes performed to force a stack limit check occasionally.
- int pushes = 0;
- for (int reg = 0; reg <= max_register; reg++) {
- if (!affected_registers.Get(reg)) {
- continue;
- }
- // The chronologically first deferred action in the trace
- // is used to infer the action needed to restore a register
- // to its previous state (or not, if it's safe to ignore it).
- enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
- DeferredActionUndoType undo_action = IGNORE;
- int value = 0;
- bool absolute = false;
- bool clear = false;
- int store_position = -1;
- // This is a little tricky because we are scanning the actions in reverse
- // historical order (newest first).
- for (DeferredAction* action = actions_;
- action != NULL;
- action = action->next()) {
- if (action->Mentions(reg)) {
- switch (action->action_type()) {
- case ActionNode::SET_REGISTER: {
- Trace::DeferredSetRegister* psr =
- static_cast<Trace::DeferredSetRegister*>(action);
- if (!absolute) {
- value += psr->value();
- absolute = true;
- }
- // SET_REGISTER is currently only used for newly introduced loop
- // counters. They can have a significant previous value if they
- // occour in a loop. TODO(lrn): Propagate this information, so
- // we can set undo_action to IGNORE if we know there is no value to
- // restore.
- undo_action = RESTORE;
- DCHECK_EQ(store_position, -1);
- DCHECK(!clear);
- break;
- }
- case ActionNode::INCREMENT_REGISTER:
- if (!absolute) {
- value++;
- }
- DCHECK_EQ(store_position, -1);
- DCHECK(!clear);
- undo_action = RESTORE;
- break;
- case ActionNode::STORE_POSITION: {
- Trace::DeferredCapture* pc =
- static_cast<Trace::DeferredCapture*>(action);
- if (!clear && store_position == -1) {
- store_position = pc->cp_offset();
- }
- // For captures we know that stores and clears alternate.
- // Other register, are never cleared, and if the occur
- // inside a loop, they might be assigned more than once.
- if (reg <= 1) {
- // Registers zero and one, aka "capture zero", is
- // always set correctly if we succeed. There is no
- // need to undo a setting on backtrack, because we
- // will set it again or fail.
- undo_action = IGNORE;
- } else {
- undo_action = pc->is_capture() ? CLEAR : RESTORE;
- }
- DCHECK(!absolute);
- DCHECK_EQ(value, 0);
- break;
- }
- case ActionNode::CLEAR_CAPTURES: {
- // Since we're scanning in reverse order, if we've already
- // set the position we have to ignore historically earlier
- // clearing operations.
- if (store_position == -1) {
- clear = true;
- }
- undo_action = RESTORE;
- DCHECK(!absolute);
- DCHECK_EQ(value, 0);
- break;
- }
- default:
- UNREACHABLE();
- break;
- }
- }
- }
- // Prepare for the undo-action (e.g., push if it's going to be popped).
- if (undo_action == RESTORE) {
- pushes++;
- RegExpMacroAssembler::StackCheckFlag stack_check =
- RegExpMacroAssembler::kNoStackLimitCheck;
- if (pushes == push_limit) {
- stack_check = RegExpMacroAssembler::kCheckStackLimit;
- pushes = 0;
- }
- assembler->PushRegister(reg, stack_check);
- registers_to_pop->Set(reg, zone);
- } else if (undo_action == CLEAR) {
- registers_to_clear->Set(reg, zone);
- }
- // Perform the chronologically last action (or accumulated increment)
- // for the register.
- if (store_position != -1) {
- assembler->WriteCurrentPositionToRegister(reg, store_position);
- } else if (clear) {
- assembler->ClearRegisters(reg, reg);
- } else if (absolute) {
- assembler->SetRegister(reg, value);
- } else if (value != 0) {
- assembler->AdvanceRegister(reg, value);
- }
- }
- }
- // This is called as we come into a loop choice node and some other tricky
- // nodes. It normalizes the state of the code generator to ensure we can
- // g…
Large files files are truncated, but you can click here to view the full file