jsregexp.cc - Copyright 2012 the V8 project authors. All ri…

/deps/v8/src/jsregexp.cc

https://gitlab.com/GeekSir/node · C++ · 6113 lines · 4606 code · 676 blank · 831 comment · 999 complexity · aef8ad3ebb01bc84c3cbdd8aea488a1e MD5 · raw file
Large files are truncated click here to view the full file

// Copyright 2012 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/v8.h"

#include "src/ast.h"
#include "src/base/platform/platform.h"
#include "src/compilation-cache.h"
#include "src/compiler.h"
#include "src/execution.h"
#include "src/factory.h"
#include "src/jsregexp-inl.h"
#include "src/jsregexp.h"
#include "src/ostreams.h"
#include "src/parser.h"
#include "src/regexp-macro-assembler.h"
#include "src/regexp-macro-assembler-irregexp.h"
#include "src/regexp-macro-assembler-tracer.h"
#include "src/regexp-stack.h"
#include "src/runtime.h"
#include "src/string-search.h"

#ifndef V8_INTERPRETED_REGEXP
#if V8_TARGET_ARCH_IA32
#include "src/ia32/regexp-macro-assembler-ia32.h"  // NOLINT
#elif V8_TARGET_ARCH_X64
#include "src/x64/regexp-macro-assembler-x64.h"  // NOLINT
#elif V8_TARGET_ARCH_ARM64
#include "src/arm64/regexp-macro-assembler-arm64.h"  // NOLINT
#elif V8_TARGET_ARCH_ARM
#include "src/arm/regexp-macro-assembler-arm.h"  // NOLINT
#elif V8_TARGET_ARCH_MIPS
#include "src/mips/regexp-macro-assembler-mips.h"  // NOLINT
#elif V8_TARGET_ARCH_MIPS64
#include "src/mips64/regexp-macro-assembler-mips64.h"  // NOLINT
#elif V8_TARGET_ARCH_X87
#include "src/x87/regexp-macro-assembler-x87.h"  // NOLINT
#else
#error Unsupported target architecture.
#endif
#endif

#include "src/interpreter-irregexp.h"


namespace v8 {
namespace internal {

MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
    Handle<JSFunction> constructor,
    Handle<String> pattern,
    Handle<String> flags) {
  // Call the construct code with 2 arguments.
  Handle<Object> argv[] = { pattern, flags };
  return Execution::New(constructor, ARRAY_SIZE(argv), argv);
}


static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
  int flags = JSRegExp::NONE;
  for (int i = 0; i < str->length(); i++) {
    switch (str->Get(i)) {
      case 'i':
        flags |= JSRegExp::IGNORE_CASE;
        break;
      case 'g':
        flags |= JSRegExp::GLOBAL;
        break;
      case 'm':
        flags |= JSRegExp::MULTILINE;
        break;
    }
  }
  return JSRegExp::Flags(flags);
}


MUST_USE_RESULT
static inline MaybeHandle<Object> ThrowRegExpException(
    Handle<JSRegExp> re,
    Handle<String> pattern,
    Handle<String> error_text,
    const char* message) {
  Isolate* isolate = re->GetIsolate();
  Factory* factory = isolate->factory();
  Handle<FixedArray> elements = factory->NewFixedArray(2);
  elements->set(0, *pattern);
  elements->set(1, *error_text);
  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
  Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
  return isolate->Throw<Object>(regexp_err);
}


ContainedInLattice AddRange(ContainedInLattice containment,
                            const int* ranges,
                            int ranges_length,
                            Interval new_range) {
  DCHECK((ranges_length & 1) == 1);
  DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
  if (containment == kLatticeUnknown) return containment;
  bool inside = false;
  int last = 0;
  for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
    // Consider the range from last to ranges[i].
    // We haven't got to the new range yet.
    if (ranges[i] <= new_range.from()) continue;
    // New range is wholly inside last-ranges[i].  Note that new_range.to() is
    // inclusive, but the values in ranges are not.
    if (last <= new_range.from() && new_range.to() < ranges[i]) {
      return Combine(containment, inside ? kLatticeIn : kLatticeOut);
    }
    return kLatticeUnknown;
  }
  return containment;
}


// More makes code generation slower, less makes V8 benchmark score lower.
const int kMaxLookaheadForBoyerMoore = 8;
// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
const int kPatternTooShortForBoyerMoore = 2;


// Identifies the sort of regexps where the regexp engine is faster
// than the code used for atom matches.
static bool HasFewDifferentCharacters(Handle<String> pattern) {
  int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
  if (length <= kPatternTooShortForBoyerMoore) return false;
  const int kMod = 128;
  bool character_found[kMod];
  int different = 0;
  memset(&character_found[0], 0, sizeof(character_found));
  for (int i = 0; i < length; i++) {
    int ch = (pattern->Get(i) & (kMod - 1));
    if (!character_found[ch]) {
      character_found[ch] = true;
      different++;
      // We declare a regexp low-alphabet if it has at least 3 times as many
      // characters as it has different characters.
      if (different * 3 > length) return false;
    }
  }
  return true;
}


// Generic RegExp methods. Dispatches to implementation specific methods.


MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
                                        Handle<String> pattern,
                                        Handle<String> flag_str) {
  Isolate* isolate = re->GetIsolate();
  Zone zone(isolate);
  JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
  CompilationCache* compilation_cache = isolate->compilation_cache();
  MaybeHandle<FixedArray> maybe_cached =
      compilation_cache->LookupRegExp(pattern, flags);
  Handle<FixedArray> cached;
  bool in_cache = maybe_cached.ToHandle(&cached);
  LOG(isolate, RegExpCompileEvent(re, in_cache));

  Handle<Object> result;
  if (in_cache) {
    re->set_data(*cached);
    return re;
  }
  pattern = String::Flatten(pattern);
  PostponeInterruptsScope postpone(isolate);
  RegExpCompileData parse_result;
  FlatStringReader reader(isolate, pattern);
  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
                                 &parse_result, &zone)) {
    // Throw an exception if we fail to parse the pattern.
    return ThrowRegExpException(re,
                                pattern,
                                parse_result.error,
                                "malformed_regexp");
  }

  bool has_been_compiled = false;

  if (parse_result.simple &&
      !flags.is_ignore_case() &&
      !HasFewDifferentCharacters(pattern)) {
    // Parse-tree is a single atom that is equal to the pattern.
    AtomCompile(re, pattern, flags, pattern);
    has_been_compiled = true;
  } else if (parse_result.tree->IsAtom() &&
      !flags.is_ignore_case() &&
      parse_result.capture_count == 0) {
    RegExpAtom* atom = parse_result.tree->AsAtom();
    Vector<const uc16> atom_pattern = atom->data();
    Handle<String> atom_string;
    ASSIGN_RETURN_ON_EXCEPTION(
        isolate, atom_string,
        isolate->factory()->NewStringFromTwoByte(atom_pattern),
        Object);
    if (!HasFewDifferentCharacters(atom_string)) {
      AtomCompile(re, pattern, flags, atom_string);
      has_been_compiled = true;
    }
  }
  if (!has_been_compiled) {
    IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
  }
  DCHECK(re->data()->IsFixedArray());
  // Compilation succeeded so the data is set on the regexp
  // and we can store it in the cache.
  Handle<FixedArray> data(FixedArray::cast(re->data()));
  compilation_cache->PutRegExp(pattern, flags, data);

  return re;
}


MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
                                     Handle<String> subject,
                                     int index,
                                     Handle<JSArray> last_match_info) {
  switch (regexp->TypeTag()) {
    case JSRegExp::ATOM:
      return AtomExec(regexp, subject, index, last_match_info);
    case JSRegExp::IRREGEXP: {
      return IrregexpExec(regexp, subject, index, last_match_info);
    }
    default:
      UNREACHABLE();
      return MaybeHandle<Object>();
  }
}


// RegExp Atom implementation: Simple string search using indexOf.


void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
                             Handle<String> pattern,
                             JSRegExp::Flags flags,
                             Handle<String> match_pattern) {
  re->GetIsolate()->factory()->SetRegExpAtomData(re,
                                                 JSRegExp::ATOM,
                                                 pattern,
                                                 flags,
                                                 match_pattern);
}


static void SetAtomLastCapture(FixedArray* array,
                               String* subject,
                               int from,
                               int to) {
  SealHandleScope shs(array->GetIsolate());
  RegExpImpl::SetLastCaptureCount(array, 2);
  RegExpImpl::SetLastSubject(array, subject);
  RegExpImpl::SetLastInput(array, subject);
  RegExpImpl::SetCapture(array, 0, from);
  RegExpImpl::SetCapture(array, 1, to);
}


int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
                            Handle<String> subject,
                            int index,
                            int32_t* output,
                            int output_size) {
  Isolate* isolate = regexp->GetIsolate();

  DCHECK(0 <= index);
  DCHECK(index <= subject->length());

  subject = String::Flatten(subject);
  DisallowHeapAllocation no_gc;  // ensure vectors stay valid

  String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
  int needle_len = needle->length();
  DCHECK(needle->IsFlat());
  DCHECK_LT(0, needle_len);

  if (index + needle_len > subject->length()) {
    return RegExpImpl::RE_FAILURE;
  }

  for (int i = 0; i < output_size; i += 2) {
    String::FlatContent needle_content = needle->GetFlatContent();
    String::FlatContent subject_content = subject->GetFlatContent();
    DCHECK(needle_content.IsFlat());
    DCHECK(subject_content.IsFlat());
    // dispatch on type of strings
    index = (needle_content.IsAscii()
             ? (subject_content.IsAscii()
                ? SearchString(isolate,
                               subject_content.ToOneByteVector(),
                               needle_content.ToOneByteVector(),
                               index)
                : SearchString(isolate,
                               subject_content.ToUC16Vector(),
                               needle_content.ToOneByteVector(),
                               index))
             : (subject_content.IsAscii()
                ? SearchString(isolate,
                               subject_content.ToOneByteVector(),
                               needle_content.ToUC16Vector(),
                               index)
                : SearchString(isolate,
                               subject_content.ToUC16Vector(),
                               needle_content.ToUC16Vector(),
                               index)));
    if (index == -1) {
      return i / 2;  // Return number of matches.
    } else {
      output[i] = index;
      output[i+1] = index + needle_len;
      index += needle_len;
    }
  }
  return output_size / 2;
}


Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
                                    Handle<String> subject,
                                    int index,
                                    Handle<JSArray> last_match_info) {
  Isolate* isolate = re->GetIsolate();

  static const int kNumRegisters = 2;
  STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
  int32_t* output_registers = isolate->jsregexp_static_offsets_vector();

  int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);

  if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();

  DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
  SealHandleScope shs(isolate);
  FixedArray* array = FixedArray::cast(last_match_info->elements());
  SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
  return last_match_info;
}


// Irregexp implementation.

// Ensures that the regexp object contains a compiled version of the
// source for either ASCII or non-ASCII strings.
// If the compiled version doesn't already exist, it is compiled
// from the source pattern.
// If compilation fails, an exception is thrown and this function
// returns false.
bool RegExpImpl::EnsureCompiledIrregexp(
    Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
  Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
#ifdef V8_INTERPRETED_REGEXP
  if (compiled_code->IsByteArray()) return true;
#else  // V8_INTERPRETED_REGEXP (RegExp native code)
  if (compiled_code->IsCode()) return true;
#endif
  // We could potentially have marked this as flushable, but have kept
  // a saved version if we did not flush it yet.
  Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
  if (saved_code->IsCode()) {
    // Reinstate the code in the original place.
    re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
    DCHECK(compiled_code->IsSmi());
    return true;
  }
  return CompileIrregexp(re, sample_subject, is_ascii);
}


static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
                                            bool is_ascii,
                                            Handle<String> error_message,
                                            Isolate* isolate) {
  Factory* factory = isolate->factory();
  Handle<FixedArray> elements = factory->NewFixedArray(2);
  elements->set(0, re->Pattern());
  elements->set(1, *error_message);
  Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
  Handle<Object> regexp_err =
      factory->NewSyntaxError("malformed_regexp", array);
  isolate->Throw(*regexp_err);
  return false;
}


bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
                                 Handle<String> sample_subject,
                                 bool is_ascii) {
  // Compile the RegExp.
  Isolate* isolate = re->GetIsolate();
  Zone zone(isolate);
  PostponeInterruptsScope postpone(isolate);
  // If we had a compilation error the last time this is saved at the
  // saved code index.
  Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
  // When arriving here entry can only be a smi, either representing an
  // uncompiled regexp, a previous compilation error, or code that has
  // been flushed.
  DCHECK(entry->IsSmi());
  int entry_value = Smi::cast(entry)->value();
  DCHECK(entry_value == JSRegExp::kUninitializedValue ||
         entry_value == JSRegExp::kCompilationErrorValue ||
         (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));

  if (entry_value == JSRegExp::kCompilationErrorValue) {
    // A previous compilation failed and threw an error which we store in
    // the saved code index (we store the error message, not the actual
    // error). Recreate the error object and throw it.
    Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
    DCHECK(error_string->IsString());
    Handle<String> error_message(String::cast(error_string));
    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    return false;
  }

  JSRegExp::Flags flags = re->GetFlags();

  Handle<String> pattern(re->Pattern());
  pattern = String::Flatten(pattern);
  RegExpCompileData compile_data;
  FlatStringReader reader(isolate, pattern);
  if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
                                 &compile_data,
                                 &zone)) {
    // Throw an exception if we fail to parse the pattern.
    // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
    USE(ThrowRegExpException(re,
                             pattern,
                             compile_data.error,
                             "malformed_regexp"));
    return false;
  }
  RegExpEngine::CompilationResult result =
      RegExpEngine::Compile(&compile_data,
                            flags.is_ignore_case(),
                            flags.is_global(),
                            flags.is_multiline(),
                            pattern,
                            sample_subject,
                            is_ascii,
                            &zone);
  if (result.error_message != NULL) {
    // Unable to compile regexp.
    Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
        CStrVector(result.error_message)).ToHandleChecked();
    CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
    return false;
  }

  Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
  data->set(JSRegExp::code_index(is_ascii), result.code);
  int register_max = IrregexpMaxRegisterCount(*data);
  if (result.num_registers > register_max) {
    SetIrregexpMaxRegisterCount(*data, result.num_registers);
  }

  return true;
}


int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
  return Smi::cast(
      re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
}


void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
  re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
}


int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
  return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
}


int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
  return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
}


ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
  return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
}


Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
  return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
}


void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
                                    Handle<String> pattern,
                                    JSRegExp::Flags flags,
                                    int capture_count) {
  // Initialize compiled code entries to null.
  re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
                                                     JSRegExp::IRREGEXP,
                                                     pattern,
                                                     flags,
                                                     capture_count);
}


int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
                                Handle<String> subject) {
  subject = String::Flatten(subject);

  // Check the asciiness of the underlying storage.
  bool is_ascii = subject->IsOneByteRepresentationUnderneath();
  if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;

#ifdef V8_INTERPRETED_REGEXP
  // Byte-code regexp needs space allocated for all its registers.
  // The result captures are copied to the start of the registers array
  // if the match succeeds.  This way those registers are not clobbered
  // when we set the last match info from last successful match.
  return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
         (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
#else  // V8_INTERPRETED_REGEXP
  // Native regexp only needs room to output captures. Registers are handled
  // internally.
  return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
#endif  // V8_INTERPRETED_REGEXP
}


int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
                                Handle<String> subject,
                                int index,
                                int32_t* output,
                                int output_size) {
  Isolate* isolate = regexp->GetIsolate();

  Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);

  DCHECK(index >= 0);
  DCHECK(index <= subject->length());
  DCHECK(subject->IsFlat());

  bool is_ascii = subject->IsOneByteRepresentationUnderneath();

#ifndef V8_INTERPRETED_REGEXP
  DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
  do {
    EnsureCompiledIrregexp(regexp, subject, is_ascii);
    Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
    // The stack is used to allocate registers for the compiled regexp code.
    // This means that in case of failure, the output registers array is left
    // untouched and contains the capture results from the previous successful
    // match.  We can use that to set the last match info lazily.
    NativeRegExpMacroAssembler::Result res =
        NativeRegExpMacroAssembler::Match(code,
                                          subject,
                                          output,
                                          output_size,
                                          index,
                                          isolate);
    if (res != NativeRegExpMacroAssembler::RETRY) {
      DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
             isolate->has_pending_exception());
      STATIC_ASSERT(
          static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
      STATIC_ASSERT(
          static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
      STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
                    == RE_EXCEPTION);
      return static_cast<IrregexpResult>(res);
    }
    // If result is RETRY, the string has changed representation, and we
    // must restart from scratch.
    // In this case, it means we must make sure we are prepared to handle
    // the, potentially, different subject (the string can switch between
    // being internal and external, and even between being ASCII and UC16,
    // but the characters are always the same).
    IrregexpPrepare(regexp, subject);
    is_ascii = subject->IsOneByteRepresentationUnderneath();
  } while (true);
  UNREACHABLE();
  return RE_EXCEPTION;
#else  // V8_INTERPRETED_REGEXP

  DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
  // We must have done EnsureCompiledIrregexp, so we can get the number of
  // registers.
  int number_of_capture_registers =
      (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
  int32_t* raw_output = &output[number_of_capture_registers];
  // We do not touch the actual capture result registers until we know there
  // has been a match so that we can use those capture results to set the
  // last match info.
  for (int i = number_of_capture_registers - 1; i >= 0; i--) {
    raw_output[i] = -1;
  }
  Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);

  IrregexpResult result = IrregexpInterpreter::Match(isolate,
                                                     byte_codes,
                                                     subject,
                                                     raw_output,
                                                     index);
  if (result == RE_SUCCESS) {
    // Copy capture results to the start of the registers array.
    MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
  }
  if (result == RE_EXCEPTION) {
    DCHECK(!isolate->has_pending_exception());
    isolate->StackOverflow();
  }
  return result;
#endif  // V8_INTERPRETED_REGEXP
}


MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
                                             Handle<String> subject,
                                             int previous_index,
                                             Handle<JSArray> last_match_info) {
  Isolate* isolate = regexp->GetIsolate();
  DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);

  // Prepare space for the return values.
#if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
  if (FLAG_trace_regexp_bytecodes) {
    String* pattern = regexp->Pattern();
    PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
    PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
  }
#endif
  int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
  if (required_registers < 0) {
    // Compiling failed with an exception.
    DCHECK(isolate->has_pending_exception());
    return MaybeHandle<Object>();
  }

  int32_t* output_registers = NULL;
  if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    output_registers = NewArray<int32_t>(required_registers);
  }
  SmartArrayPointer<int32_t> auto_release(output_registers);
  if (output_registers == NULL) {
    output_registers = isolate->jsregexp_static_offsets_vector();
  }

  int res = RegExpImpl::IrregexpExecRaw(
      regexp, subject, previous_index, output_registers, required_registers);
  if (res == RE_SUCCESS) {
    int capture_count =
        IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
    return SetLastMatchInfo(
        last_match_info, subject, capture_count, output_registers);
  }
  if (res == RE_EXCEPTION) {
    DCHECK(isolate->has_pending_exception());
    return MaybeHandle<Object>();
  }
  DCHECK(res == RE_FAILURE);
  return isolate->factory()->null_value();
}


Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
                                             Handle<String> subject,
                                             int capture_count,
                                             int32_t* match) {
  DCHECK(last_match_info->HasFastObjectElements());
  int capture_register_count = (capture_count + 1) * 2;
  JSArray::EnsureSize(last_match_info,
                      capture_register_count + kLastMatchOverhead);
  DisallowHeapAllocation no_allocation;
  FixedArray* array = FixedArray::cast(last_match_info->elements());
  if (match != NULL) {
    for (int i = 0; i < capture_register_count; i += 2) {
      SetCapture(array, i, match[i]);
      SetCapture(array, i + 1, match[i + 1]);
    }
  }
  SetLastCaptureCount(array, capture_register_count);
  SetLastSubject(array, *subject);
  SetLastInput(array, *subject);
  return last_match_info;
}


RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
                                     Handle<String> subject,
                                     bool is_global,
                                     Isolate* isolate)
  : register_array_(NULL),
    register_array_size_(0),
    regexp_(regexp),
    subject_(subject) {
#ifdef V8_INTERPRETED_REGEXP
  bool interpreted = true;
#else
  bool interpreted = false;
#endif  // V8_INTERPRETED_REGEXP

  if (regexp_->TypeTag() == JSRegExp::ATOM) {
    static const int kAtomRegistersPerMatch = 2;
    registers_per_match_ = kAtomRegistersPerMatch;
    // There is no distinction between interpreted and native for atom regexps.
    interpreted = false;
  } else {
    registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
    if (registers_per_match_ < 0) {
      num_matches_ = -1;  // Signal exception.
      return;
    }
  }

  if (is_global && !interpreted) {
    register_array_size_ =
        Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
    max_matches_ = register_array_size_ / registers_per_match_;
  } else {
    // Global loop in interpreted regexp is not implemented.  We choose
    // the size of the offsets vector so that it can only store one match.
    register_array_size_ = registers_per_match_;
    max_matches_ = 1;
  }

  if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
    register_array_ = NewArray<int32_t>(register_array_size_);
  } else {
    register_array_ = isolate->jsregexp_static_offsets_vector();
  }

  // Set state so that fetching the results the first time triggers a call
  // to the compiled regexp.
  current_match_index_ = max_matches_ - 1;
  num_matches_ = max_matches_;
  DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
  DCHECK_GE(register_array_size_, registers_per_match_);
  int32_t* last_match =
      &register_array_[current_match_index_ * registers_per_match_];
  last_match[0] = -1;
  last_match[1] = 0;
}


// -------------------------------------------------------------------
// Implementation of the Irregexp regular expression engine.
//
// The Irregexp regular expression engine is intended to be a complete
// implementation of ECMAScript regular expressions.  It generates either
// bytecodes or native code.

//   The Irregexp regexp engine is structured in three steps.
//   1) The parser generates an abstract syntax tree.  See ast.cc.
//   2) From the AST a node network is created.  The nodes are all
//      subclasses of RegExpNode.  The nodes represent states when
//      executing a regular expression.  Several optimizations are
//      performed on the node network.
//   3) From the nodes we generate either byte codes or native code
//      that can actually execute the regular expression (perform
//      the search).  The code generation step is described in more
//      detail below.

// Code generation.
//
//   The nodes are divided into four main categories.
//   * Choice nodes
//        These represent places where the regular expression can
//        match in more than one way.  For example on entry to an
//        alternation (foo|bar) or a repetition (*, +, ? or {}).
//   * Action nodes
//        These represent places where some action should be
//        performed.  Examples include recording the current position
//        in the input string to a register (in order to implement
//        captures) or other actions on register for example in order
//        to implement the counters needed for {} repetitions.
//   * Matching nodes
//        These attempt to match some element part of the input string.
//        Examples of elements include character classes, plain strings
//        or back references.
//   * End nodes
//        These are used to implement the actions required on finding
//        a successful match or failing to find a match.
//
//   The code generated (whether as byte codes or native code) maintains
//   some state as it runs.  This consists of the following elements:
//
//   * The capture registers.  Used for string captures.
//   * Other registers.  Used for counters etc.
//   * The current position.
//   * The stack of backtracking information.  Used when a matching node
//     fails to find a match and needs to try an alternative.
//
// Conceptual regular expression execution model:
//
//   There is a simple conceptual model of regular expression execution
//   which will be presented first.  The actual code generated is a more
//   efficient simulation of the simple conceptual model:
//
//   * Choice nodes are implemented as follows:
//     For each choice except the last {
//       push current position
//       push backtrack code location
//       <generate code to test for choice>
//       backtrack code location:
//       pop current position
//     }
//     <generate code to test for last choice>
//
//   * Actions nodes are generated as follows
//     <push affected registers on backtrack stack>
//     <generate code to perform action>
//     push backtrack code location
//     <generate code to test for following nodes>
//     backtrack code location:
//     <pop affected registers to restore their state>
//     <pop backtrack location from stack and go to it>
//
//   * Matching nodes are generated as follows:
//     if input string matches at current position
//       update current position
//       <generate code to test for following nodes>
//     else
//       <pop backtrack location from stack and go to it>
//
//   Thus it can be seen that the current position is saved and restored
//   by the choice nodes, whereas the registers are saved and restored by
//   by the action nodes that manipulate them.
//
//   The other interesting aspect of this model is that nodes are generated
//   at the point where they are needed by a recursive call to Emit().  If
//   the node has already been code generated then the Emit() call will
//   generate a jump to the previously generated code instead.  In order to
//   limit recursion it is possible for the Emit() function to put the node
//   on a work list for later generation and instead generate a jump.  The
//   destination of the jump is resolved later when the code is generated.
//
// Actual regular expression code generation.
//
//   Code generation is actually more complicated than the above.  In order
//   to improve the efficiency of the generated code some optimizations are
//   performed
//
//   * Choice nodes have 1-character lookahead.
//     A choice node looks at the following character and eliminates some of
//     the choices immediately based on that character.  This is not yet
//     implemented.
//   * Simple greedy loops store reduced backtracking information.
//     A quantifier like /.*foo/m will greedily match the whole input.  It will
//     then need to backtrack to a point where it can match "foo".  The naive
//     implementation of this would push each character position onto the
//     backtracking stack, then pop them off one by one.  This would use space
//     proportional to the length of the input string.  However since the "."
//     can only match in one way and always has a constant length (in this case
//     of 1) it suffices to store the current position on the top of the stack
//     once.  Matching now becomes merely incrementing the current position and
//     backtracking becomes decrementing the current position and checking the
//     result against the stored current position.  This is faster and saves
//     space.
//   * The current state is virtualized.
//     This is used to defer expensive operations until it is clear that they
//     are needed and to generate code for a node more than once, allowing
//     specialized an efficient versions of the code to be created. This is
//     explained in the section below.
//
// Execution state virtualization.
//
//   Instead of emitting code, nodes that manipulate the state can record their
//   manipulation in an object called the Trace.  The Trace object can record a
//   current position offset, an optional backtrack code location on the top of
//   the virtualized backtrack stack and some register changes.  When a node is
//   to be emitted it can flush the Trace or update it.  Flushing the Trace
//   will emit code to bring the actual state into line with the virtual state.
//   Avoiding flushing the state can postpone some work (e.g. updates of capture
//   registers).  Postponing work can save time when executing the regular
//   expression since it may be found that the work never has to be done as a
//   failure to match can occur.  In addition it is much faster to jump to a
//   known backtrack code location than it is to pop an unknown backtrack
//   location from the stack and jump there.
//
//   The virtual state found in the Trace affects code generation.  For example
//   the virtual state contains the difference between the actual current
//   position and the virtual current position, and matching code needs to use
//   this offset to attempt a match in the correct location of the input
//   string.  Therefore code generated for a non-trivial trace is specialized
//   to that trace.  The code generator therefore has the ability to generate
//   code for each node several times.  In order to limit the size of the
//   generated code there is an arbitrary limit on how many specialized sets of
//   code may be generated for a given node.  If the limit is reached, the
//   trace is flushed and a generic version of the code for a node is emitted.
//   This is subsequently used for that node.  The code emitted for non-generic
//   trace is not recorded in the node and so it cannot currently be reused in
//   the event that code generation is requested for an identical trace.


void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
  UNREACHABLE();
}


void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
  text->AddElement(TextElement::Atom(this), zone);
}


void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
  text->AddElement(TextElement::CharClass(this), zone);
}


void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
  for (int i = 0; i < elements()->length(); i++)
    text->AddElement(elements()->at(i), zone);
}


TextElement TextElement::Atom(RegExpAtom* atom) {
  return TextElement(ATOM, atom);
}


TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
  return TextElement(CHAR_CLASS, char_class);
}


int TextElement::length() const {
  switch (text_type()) {
    case ATOM:
      return atom()->length();

    case CHAR_CLASS:
      return 1;
  }
  UNREACHABLE();
  return 0;
}


DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
  if (table_ == NULL) {
    table_ = new(zone()) DispatchTable(zone());
    DispatchTableConstructor cons(table_, ignore_case, zone());
    cons.BuildTable(this);
  }
  return table_;
}


class FrequencyCollator {
 public:
  FrequencyCollator() : total_samples_(0) {
    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
      frequencies_[i] = CharacterFrequency(i);
    }
  }

  void CountCharacter(int character) {
    int index = (character & RegExpMacroAssembler::kTableMask);
    frequencies_[index].Increment();
    total_samples_++;
  }

  // Does not measure in percent, but rather per-128 (the table size from the
  // regexp macro assembler).
  int Frequency(int in_character) {
    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    if (total_samples_ < 1) return 1;  // Division by zero.
    int freq_in_per128 =
        (frequencies_[in_character].counter() * 128) / total_samples_;
    return freq_in_per128;
  }

 private:
  class CharacterFrequency {
   public:
    CharacterFrequency() : counter_(0), character_(-1) { }
    explicit CharacterFrequency(int character)
        : counter_(0), character_(character) { }

    void Increment() { counter_++; }
    int counter() { return counter_; }
    int character() { return character_; }

   private:
    int counter_;
    int character_;
  };


 private:
  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
  int total_samples_;
};


class RegExpCompiler {
 public:
  RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
                 Zone* zone);

  int AllocateRegister() {
    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
      reg_exp_too_big_ = true;
      return next_register_;
    }
    return next_register_++;
  }

  RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
                                           RegExpNode* start,
                                           int capture_count,
                                           Handle<String> pattern);

  inline void AddWork(RegExpNode* node) { work_list_->Add(node); }

  static const int kImplementationOffset = 0;
  static const int kNumberOfRegistersOffset = 0;
  static const int kCodeOffset = 1;

  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
  EndNode* accept() { return accept_; }

  static const int kMaxRecursion = 100;
  inline int recursion_depth() { return recursion_depth_; }
  inline void IncrementRecursionDepth() { recursion_depth_++; }
  inline void DecrementRecursionDepth() { recursion_depth_--; }

  void SetRegExpTooBig() { reg_exp_too_big_ = true; }

  inline bool ignore_case() { return ignore_case_; }
  inline bool ascii() { return ascii_; }
  FrequencyCollator* frequency_collator() { return &frequency_collator_; }

  int current_expansion_factor() { return current_expansion_factor_; }
  void set_current_expansion_factor(int value) {
    current_expansion_factor_ = value;
  }

  Zone* zone() const { return zone_; }

  static const int kNoRegister = -1;

 private:
  EndNode* accept_;
  int next_register_;
  List<RegExpNode*>* work_list_;
  int recursion_depth_;
  RegExpMacroAssembler* macro_assembler_;
  bool ignore_case_;
  bool ascii_;
  bool reg_exp_too_big_;
  int current_expansion_factor_;
  FrequencyCollator frequency_collator_;
  Zone* zone_;
};


class RecursionCheck {
 public:
  explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    compiler->IncrementRecursionDepth();
  }
  ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
 private:
  RegExpCompiler* compiler_;
};


static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
  return RegExpEngine::CompilationResult(isolate, "RegExp too big");
}


// Attempts to compile the regexp using an Irregexp code generator.  Returns
// a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
                               Zone* zone)
    : next_register_(2 * (capture_count + 1)),
      work_list_(NULL),
      recursion_depth_(0),
      ignore_case_(ignore_case),
      ascii_(ascii),
      reg_exp_too_big_(false),
      current_expansion_factor_(1),
      frequency_collator_(),
      zone_(zone) {
  accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
  DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
}


RegExpEngine::CompilationResult RegExpCompiler::Assemble(
    RegExpMacroAssembler* macro_assembler,
    RegExpNode* start,
    int capture_count,
    Handle<String> pattern) {
  Heap* heap = pattern->GetHeap();

  bool use_slow_safe_regexp_compiler = false;
  if (heap->total_regexp_code_generated() >
          RegExpImpl::kRegWxpCompiledLimit &&
      heap->isolate()->memory_allocator()->SizeExecutable() >
          RegExpImpl::kRegExpExecutableMemoryLimit) {
    use_slow_safe_regexp_compiler = true;
  }

  macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);

#ifdef DEBUG
  if (FLAG_trace_regexp_assembler)
    macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
  else
#endif
    macro_assembler_ = macro_assembler;

  List <RegExpNode*> work_list(0);
  work_list_ = &work_list;
  Label fail;
  macro_assembler_->PushBacktrack(&fail);
  Trace new_trace;
  start->Emit(this, &new_trace);
  macro_assembler_->Bind(&fail);
  macro_assembler_->Fail();
  while (!work_list.is_empty()) {
    work_list.RemoveLast()->Emit(this, &new_trace);
  }
  if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());

  Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
  heap->IncreaseTotalRegexpCodeGenerated(code->Size());
  work_list_ = NULL;
#ifdef DEBUG
  if (FLAG_print_code) {
    CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
    OFStream os(trace_scope.file());
    Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
  }
  if (FLAG_trace_regexp_assembler) {
    delete macro_assembler_;
  }
#endif
  return RegExpEngine::CompilationResult(*code, next_register_);
}


bool Trace::DeferredAction::Mentions(int that) {
  if (action_type() == ActionNode::CLEAR_CAPTURES) {
    Interval range = static_cast<DeferredClearCaptures*>(this)->range();
    return range.Contains(that);
  } else {
    return reg() == that;
  }
}


bool Trace::mentions_reg(int reg) {
  for (DeferredAction* action = actions_;
       action != NULL;
       action = action->next()) {
    if (action->Mentions(reg))
      return true;
  }
  return false;
}


bool Trace::GetStoredPosition(int reg, int* cp_offset) {
  DCHECK_EQ(0, *cp_offset);
  for (DeferredAction* action = actions_;
       action != NULL;
       action = action->next()) {
    if (action->Mentions(reg)) {
      if (action->action_type() == ActionNode::STORE_POSITION) {
        *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
        return true;
      } else {
        return false;
      }
    }
  }
  return false;
}


int Trace::FindAffectedRegisters(OutSet* affected_registers,
                                 Zone* zone) {
  int max_register = RegExpCompiler::kNoRegister;
  for (DeferredAction* action = actions_;
       action != NULL;
       action = action->next()) {
    if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
      Interval range = static_cast<DeferredClearCaptures*>(action)->range();
      for (int i = range.from(); i <= range.to(); i++)
        affected_registers->Set(i, zone);
      if (range.to() > max_register) max_register = range.to();
    } else {
      affected_registers->Set(action->reg(), zone);
      if (action->reg() > max_register) max_register = action->reg();
    }
  }
  return max_register;
}


void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
                                     int max_register,
                                     const OutSet& registers_to_pop,
                                     const OutSet& registers_to_clear) {
  for (int reg = max_register; reg >= 0; reg--) {
    if (registers_to_pop.Get(reg)) {
      assembler->PopRegister(reg);
    } else if (registers_to_clear.Get(reg)) {
      int clear_to = reg;
      while (reg > 0 && registers_to_clear.Get(reg - 1)) {
        reg--;
      }
      assembler->ClearRegisters(reg, clear_to);
    }
  }
}


void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
                                   int max_register,
                                   const OutSet& affected_registers,
                                   OutSet* registers_to_pop,
                                   OutSet* registers_to_clear,
                                   Zone* zone) {
  // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
  const int push_limit = (assembler->stack_limit_slack() + 1) / 2;

  // Count pushes performed to force a stack limit check occasionally.
  int pushes = 0;

  for (int reg = 0; reg <= max_register; reg++) {
    if (!affected_registers.Get(reg)) {
      continue;
    }

    // The chronologically first deferred action in the trace
    // is used to infer the action needed to restore a register
    // to its previous state (or not, if it's safe to ignore it).
    enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
    DeferredActionUndoType undo_action = IGNORE;

    int value = 0;
    bool absolute = false;
    bool clear = false;
    int store_position = -1;
    // This is a little tricky because we are scanning the actions in reverse
    // historical order (newest first).
    for (DeferredAction* action = actions_;
         action != NULL;
         action = action->next()) {
      if (action->Mentions(reg)) {
        switch (action->action_type()) {
          case ActionNode::SET_REGISTER: {
            Trace::DeferredSetRegister* psr =
                static_cast<Trace::DeferredSetRegister*>(action);
            if (!absolute) {
              value += psr->value();
              absolute = true;
            }
            // SET_REGISTER is currently only used for newly introduced loop
            // counters. They can have a significant previous value if they
            // occour in a loop. TODO(lrn): Propagate this information, so
            // we can set undo_action to IGNORE if we know there is no value to
            // restore.
            undo_action = RESTORE;
            DCHECK_EQ(store_position, -1);
            DCHECK(!clear);
            break;
          }
          case ActionNode::INCREMENT_REGISTER:
            if (!absolute) {
              value++;
            }
            DCHECK_EQ(store_position, -1);
            DCHECK(!clear);
            undo_action = RESTORE;
            break;
          case ActionNode::STORE_POSITION: {
            Trace::DeferredCapture* pc =
                static_cast<Trace::DeferredCapture*>(action);
            if (!clear && store_position == -1) {
              store_position = pc->cp_offset();
            }

            // For captures we know that stores and clears alternate.
            // Other register, are never cleared, and if the occur
            // inside a loop, they might be assigned more than once.
            if (reg <= 1) {
              // Registers zero and one, aka "capture zero", is
              // always set correctly if we succeed. There is no
              // need to undo a setting on backtrack, because we
              // will set it again or fail.
              undo_action = IGNORE;
            } else {
              undo_action = pc->is_capture() ? CLEAR : RESTORE;
            }
            DCHECK(!absolute);
            DCHECK_EQ(value, 0);
            break;
          }
          case ActionNode::CLEAR_CAPTURES: {
            // Since we're scanning in reverse order, if we've already
            // set the position we have to ignore historically earlier
            // clearing operations.
            if (store_position == -1) {
              clear = true;
            }
            undo_action = RESTORE;
            DCHECK(!absolute);
            DCHECK_EQ(value, 0);
            break;
          }
          default:
            UNREACHABLE();
            break;
        }
      }
    }
    // Prepare for the undo-action (e.g., push if it's going to be popped).
    if (undo_action == RESTORE) {
      pushes++;
      RegExpMacroAssembler::StackCheckFlag stack_check =
          RegExpMacroAssembler::kNoStackLimitCheck;
      if (pushes == push_limit) {
        stack_check = RegExpMacroAssembler::kCheckStackLimit;
        pushes = 0;
      }

      assembler->PushRegister(reg, stack_check);
      registers_to_pop->Set(reg, zone);
    } else if (undo_action == CLEAR) {
      registers_to_clear->Set(reg, zone);
    }
    // Perform the chronologically last action (or accumulated increment)
    // for the register.
    if (store_position != -1) {
      assembler->WriteCurrentPositionToRegister(reg, store_position);
    } else if (clear) {
      assembler->ClearRegisters(reg, reg);
    } else if (absolute) {
      assembler->SetRegister(reg, value);
    } else if (value != 0) {
      assembler->AdvanceRegister(reg, value);
    }
  }
}


// This is called as we come into a loop choice node and some other tricky
// nodes.  It normalizes the state of the code generator to ensure we can
// g…