regexp-parser.cc - mozsearch

mozilla-central/js/src/irregexp/imported/regexp-parser.cc (file symbol)

Enable keyboard shortcuts

Source code

Revision control

Copy as Markdown

Other Tools

// Copyright 2016 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

#include "irregexp/imported/regexp-parser.h"

#include "irregexp/imported/regexp-ast.h"

#include "irregexp/imported/regexp-macro-assembler.h"

#include "irregexp/imported/regexp.h"

#ifdef V8_INTL_SUPPORT

#include "unicode/uniset.h"

#include "unicode/unistr.h"

#include "unicode/usetiter.h"

#include "unicode/utf16.h"  // For U16_NEXT

#endif                      // V8_INTL_SUPPORT

namespace v8 {

namespace internal {

namespace {

// Whether we're currently inside the ClassEscape production

// (tc39.es/ecma262/#prod-annexB-CharacterEscape).

enum class InClassEscapeState {

  kInClass,

  kNotInClass,

};

// The production used to derive ClassSetOperand.

enum class ClassSetOperandType {

  kClassSetCharacter,

  kClassStringDisjunction,

  kNestedClass,

  kCharacterClassEscape,  // \ CharacterClassEscape is a special nested class,

                          // as we can fold it directly into another range.

  kClassSetRange

};

class RegExpTextBuilder {

 public:

  using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;

  RegExpTextBuilder(Zone* zone, SmallRegExpTreeVector* terms_storage,

                    RegExpFlags flags)

      : zone_(zone), flags_(flags), terms_(terms_storage), text_(zone) {}

  void AddCharacter(base::uc16 character);

  void AddUnicodeCharacter(base::uc32 character);

  void AddEscapedUnicodeCharacter(base::uc32 character);

  void AddAtom(RegExpTree* atom);

  void AddTerm(RegExpTree* term);

  void AddClassRanges(RegExpClassRanges* cc);

  void FlushPendingSurrogate();

  void FlushText();

  RegExpTree* PopLastAtom();

  RegExpTree* ToRegExp();

 private:

  static const base::uc16 kNoPendingSurrogate = 0;

  void AddLeadSurrogate(base::uc16 lead_surrogate);

  void AddTrailSurrogate(base::uc16 trail_surrogate);

  void FlushCharacters();

  bool NeedsDesugaringForUnicode(RegExpClassRanges* cc);

  bool NeedsDesugaringForIgnoreCase(base::uc32 c);

  void AddClassRangesForDesugaring(base::uc32 c);

  bool ignore_case() const { return IsIgnoreCase(flags_); }

  bool IsUnicodeMode() const {

    // Either /v or /u enable UnicodeMode

    // https://tc39.es/ecma262/#sec-parsepattern

    return IsUnicode(flags_) || IsUnicodeSets(flags_);

  Zone* zone() const { return zone_; }

  Zone* const zone_;

  const RegExpFlags flags_;

  ZoneList<base::uc16>* characters_ = nullptr;

  base::uc16 pending_surrogate_ = kNoPendingSurrogate;

  SmallRegExpTreeVector* terms_;

  SmallRegExpTreeVector text_;

};

void RegExpTextBuilder::AddLeadSurrogate(base::uc16 lead_surrogate) {

  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

  FlushPendingSurrogate();

  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

  pending_surrogate_ = lead_surrogate;

void RegExpTextBuilder::AddTrailSurrogate(base::uc16 trail_surrogate) {

  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

  if (pending_surrogate_ != kNoPendingSurrogate) {

    base::uc16 lead_surrogate = pending_surrogate_;

    pending_surrogate_ = kNoPendingSurrogate;

    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

    base::uc32 combined =

        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);

    if (NeedsDesugaringForIgnoreCase(combined)) {

      AddClassRangesForDesugaring(combined);

    } else {

      ZoneList<base::uc16> surrogate_pair(2, zone());

      surrogate_pair.Add(lead_surrogate, zone());

      surrogate_pair.Add(trail_surrogate, zone());

      RegExpAtom* atom =

          zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());

      AddAtom(atom);

  } else {

    pending_surrogate_ = trail_surrogate;

    FlushPendingSurrogate();

void RegExpTextBuilder::FlushPendingSurrogate() {

  if (pending_surrogate_ != kNoPendingSurrogate) {

    DCHECK(IsUnicodeMode());

    base::uc32 c = pending_surrogate_;

    pending_surrogate_ = kNoPendingSurrogate;

    AddClassRangesForDesugaring(c);

void RegExpTextBuilder::FlushCharacters() {

  FlushPendingSurrogate();

  if (characters_ != nullptr) {

    RegExpTree* atom = zone()->New<RegExpAtom>(characters_->ToConstVector());

    characters_ = nullptr;

    text_.emplace_back(atom);

void RegExpTextBuilder::FlushText() {

  FlushCharacters();

  size_t num_text = text_.size();

  if (num_text == 0) {

    return;

  } else if (num_text == 1) {

    terms_->emplace_back(text_.back());

  } else {

    RegExpText* text = zone()->New<RegExpText>(zone());

    for (size_t i = 0; i < num_text; i++) {

      text_[i]->AppendToText(text, zone());

    terms_->emplace_back(text);

  text_.clear();

void RegExpTextBuilder::AddCharacter(base::uc16 c) {

  FlushPendingSurrogate();

  if (NeedsDesugaringForIgnoreCase(c)) {

    AddClassRangesForDesugaring(c);

  } else {

    if (characters_ == nullptr) {

      characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());

    characters_->Add(c, zone());

void RegExpTextBuilder::AddUnicodeCharacter(base::uc32 c) {

  if (c > static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {

    DCHECK(IsUnicodeMode());

    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));

    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

  } else if (IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(c)) {

    AddLeadSurrogate(c);

  } else if (IsUnicodeMode() && unibrow::Utf16::IsTrailSurrogate(c)) {

    AddTrailSurrogate(c);

  } else {

    AddCharacter(static_cast<base::uc16>(c));

void RegExpTextBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {

  // A lead or trail surrogate parsed via escape sequence will not

  // pair up with any preceding lead or following trail surrogate.

  FlushPendingSurrogate();

  AddUnicodeCharacter(character);

  FlushPendingSurrogate();

void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {

  if (NeedsDesugaringForUnicode(cr)) {

    // With /u or /v, character class needs to be desugared, so it

    // must be a standalone term instead of being part of a RegExpText.

    AddTerm(cr);

  } else {

    AddAtom(cr);

void RegExpTextBuilder::AddClassRangesForDesugaring(base::uc32 c) {

  AddTerm(zone()->New<RegExpClassRanges>(

      zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));

void RegExpTextBuilder::AddAtom(RegExpTree* atom) {

  DCHECK(atom->IsTextElement());

  FlushCharacters();

  text_.emplace_back(atom);

void RegExpTextBuilder::AddTerm(RegExpTree* term) {

  DCHECK(term->IsTextElement());

  FlushText();

  terms_->emplace_back(term);

bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges* cc) {

  if (!IsUnicodeMode()) return false;

  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not

  // necessarily mean that we need to desugar. It's probably nicer to have a

  // separate pass to figure out unicode desugarings.

  if (ignore_case()) return true;

  ZoneList<CharacterRange>* ranges = cc->ranges(zone());

  CharacterRange::Canonicalize(ranges);

  if (cc->is_negated()) {

    ZoneList<CharacterRange>* negated_ranges =

        zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());

    CharacterRange::Negate(ranges, negated_ranges, zone());

    ranges = negated_ranges;

  for (int i = ranges->length() - 1; i >= 0; i--) {

    base::uc32 from = ranges->at(i).from();

    base::uc32 to = ranges->at(i).to();

    // Check for non-BMP characters.

    if (to >= kNonBmpStart) return true;

    // Check for lone surrogates.

    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

  return false;

bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(base::uc32 c) {

#ifdef V8_INTL_SUPPORT

  if (IsUnicodeMode() && ignore_case()) {

    icu::UnicodeSet set(c, c);

    set.closeOver(USET_CASE_INSENSITIVE);

    set.removeAllStrings();

    return set.size() > 1;

  // In the case where ICU is not included, we act as if the unicode flag is

  // not set, and do not desugar.

#endif  // V8_INTL_SUPPORT

  return false;

RegExpTree* RegExpTextBuilder::PopLastAtom() {

  FlushPendingSurrogate();

  RegExpTree* atom;

  if (characters_ != nullptr) {

    base::Vector<const base::uc16> char_vector = characters_->ToConstVector();

    int num_chars = char_vector.length();

    if (num_chars > 1) {

      base::Vector<const base::uc16> prefix =

          char_vector.SubVector(0, num_chars - 1);

      text_.emplace_back(zone()->New<RegExpAtom>(prefix));

      char_vector = char_vector.SubVector(num_chars - 1, num_chars);

    characters_ = nullptr;

    atom = zone()->New<RegExpAtom>(char_vector);

    return atom;

  } else if (!text_.empty()) {

    atom = text_.back();

    text_.pop_back();

    return atom;

  return nullptr;

RegExpTree* RegExpTextBuilder::ToRegExp() {

  FlushText();

  size_t num_alternatives = terms_->size();

  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();

  if (num_alternatives == 1) return terms_->back();

  return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(

      base::VectorOf(terms_->begin(), terms_->size()), zone()));

// Accumulates RegExp atoms and assertions into lists of terms and alternatives.

class RegExpBuilder {

 public:

  RegExpBuilder(Zone* zone, RegExpFlags flags)

      : zone_(zone),

        flags_(flags),

        terms_(zone),

        alternatives_(zone),

        text_builder_(RegExpTextBuilder{zone, &terms_, flags}) {}

  void AddCharacter(base::uc16 character);

  void AddUnicodeCharacter(base::uc32 character);

  void AddEscapedUnicodeCharacter(base::uc32 character);

  // "Adds" an empty expression. Does nothing except consume a

  // following quantifier

  void AddEmpty();

  void AddClassRanges(RegExpClassRanges* cc);

  void AddAtom(RegExpTree* tree);

  void AddTerm(RegExpTree* tree);

  void AddAssertion(RegExpTree* tree);

  void NewAlternative();  // '|'

  bool AddQuantifierToAtom(int min, int max,

                           RegExpQuantifier::QuantifierType type);

  void FlushText();

  RegExpTree* ToRegExp();

  RegExpFlags flags() const { return flags_; }

  bool ignore_case() const { return IsIgnoreCase(flags_); }

  bool multiline() const { return IsMultiline(flags_); }

  bool dotall() const { return IsDotAll(flags_); }

 private:

  void FlushTerms();

  bool IsUnicodeMode() const {

    // Either /v or /u enable UnicodeMode

    // https://tc39.es/ecma262/#sec-parsepattern

    return IsUnicode(flags_) || IsUnicodeSets(flags_);

  Zone* zone() const { return zone_; }

  RegExpTextBuilder& text_builder() { return text_builder_; }

  Zone* const zone_;

  bool pending_empty_ = false;

  const RegExpFlags flags_;

  using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;

  SmallRegExpTreeVector terms_;

  SmallRegExpTreeVector alternatives_;

  RegExpTextBuilder text_builder_;

};

enum SubexpressionType {

  INITIAL,

  CAPTURE,  // All positive values represent captures.

  POSITIVE_LOOKAROUND,

  NEGATIVE_LOOKAROUND,

  GROUPING

};

class RegExpParserState : public ZoneObject {

 public:

  // Push a state on the stack.

  RegExpParserState(RegExpParserState* previous_state,

                    SubexpressionType group_type,

                    RegExpLookaround::Type lookaround_type,

                    int disjunction_capture_index,

                    const ZoneVector<base::uc16>* capture_name,

                    RegExpFlags flags, Zone* zone)

      : previous_state_(previous_state),

        builder_(zone, flags),

        group_type_(group_type),

        lookaround_type_(lookaround_type),

        disjunction_capture_index_(disjunction_capture_index),

        capture_name_(capture_name) {

    if (previous_state != nullptr) {

      non_participating_capture_group_interval_ =

          previous_state->non_participating_capture_group_interval();

  // Parser state of containing expression, if any.

  RegExpParserState* previous_state() const { return previous_state_; }

  bool IsSubexpression() { return previous_state_ != nullptr; }

  // RegExpBuilder building this regexp's AST.

  RegExpBuilder* builder() { return &builder_; }

  // Type of regexp being parsed (parenthesized group or entire regexp).

  SubexpressionType group_type() const { return group_type_; }

  // Lookahead or Lookbehind.

  RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }

  // Index in captures array of first capture in this sub-expression, if any.

  // Also the capture index of this sub-expression itself, if group_type

  // is CAPTURE.

  int capture_index() const { return disjunction_capture_index_; }

  // The name of the current sub-expression, if group_type is CAPTURE. Only

  // used for named captures.

  const ZoneVector<base::uc16>* capture_name() const { return capture_name_; }

  std::pair<int, int> non_participating_capture_group_interval() const {

    return non_participating_capture_group_interval_;

  bool IsNamedCapture() const { return capture_name_ != nullptr; }

  // Check whether the parser is inside a capture group with the given index.

  bool IsInsideCaptureGroup(int index) const {

    for (const RegExpParserState* s = this; s != nullptr;

         s = s->previous_state()) {

      if (s->group_type() != CAPTURE) continue;

      // Return true if we found the matching capture index.

      if (index == s->capture_index()) return true;

      // Abort if index is larger than what has been parsed up till this state.

      if (index > s->capture_index()) return false;

    return false;

  // Check whether the parser is inside a capture group with the given name.

  bool IsInsideCaptureGroup(const ZoneVector<base::uc16>* name) const {

    DCHECK_NOT_NULL(name);

    for (const RegExpParserState* s = this; s != nullptr;

         s = s->previous_state()) {

      if (s->capture_name() == nullptr) continue;

      if (*s->capture_name() == *name) return true;

    return false;

  void NewAlternative(int captures_started) {

    if (non_participating_capture_group_interval().second != 0) {

      // Extend the non-participating interval.

      non_participating_capture_group_interval_.second = captures_started;

    } else {

      // Create new non-participating interval from the start of the current

      // enclosing group to all captures created within that group so far.

      non_participating_capture_group_interval_ =

          std::make_pair(capture_index(), captures_started);

 private:

  // Linked list implementation of stack of states.

  RegExpParserState* const previous_state_;

  // Builder for the stored disjunction.

  RegExpBuilder builder_;

  // Stored disjunction type (capture, look-ahead or grouping), if any.

  const SubexpressionType group_type_;

  // Stored read direction.

  const RegExpLookaround::Type lookaround_type_;

  // Stored disjunction's capture index (if any).

  const int disjunction_capture_index_;

  // Stored capture name (if any).

  const ZoneVector<base::uc16>* const capture_name_;

  // Interval of (named) capture indices ]from, to] that are not participating

  // in the current state (i.e. they cannot match).

  // Capture indices are not participating if they were created in a different

  // alternative.

  std::pair<int, int> non_participating_capture_group_interval_;

};

template <class CharT>

class RegExpParserImpl final {

 private:

  RegExpParserImpl(const CharT* input, int input_length, RegExpFlags flags,

                   uintptr_t stack_limit, Zone* zone,

                   const DisallowGarbageCollection& no_gc);

  bool Parse(RegExpCompileData* result);

  RegExpTree* ParsePattern();

  RegExpTree* ParseDisjunction();

  RegExpTree* ParseGroup();

  // Parses a {...,...} quantifier and stores the range in the given

  // out parameters.

  bool ParseIntervalQuantifier(int* min_out, int* max_out);

  // Checks whether the following is a length-digit hexadecimal number,

  // and sets the value if it is.

  bool ParseHexEscape(int length, base::uc32* value);

  bool ParseUnicodeEscape(base::uc32* value);

  bool ParseUnlimitedLengthHexNumber(int max_value, base::uc32* value);

  bool ParsePropertyClassName(ZoneVector<char>* name_1,

                              ZoneVector<char>* name_2);

  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,

                             CharacterClassStrings* add_to_strings, bool negate,

                             const ZoneVector<char>& name_1,

                             const ZoneVector<char>& name_2);

  RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,

                               bool add_unicode_case_equivalents);

  // Parse inside a class. Either add escaped class to the range, or return

  // false and pass parsed single character through |char_out|.

  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,

                        bool add_unicode_case_equivalents, base::uc32* char_out,

                        bool* is_class_escape);

  // Returns true iff parsing was successful.

  bool TryParseCharacterClassEscape(base::uc32 next,

                                    InClassEscapeState in_class_escape_state,

                                    ZoneList<CharacterRange>* ranges,

                                    CharacterClassStrings* strings, Zone* zone,

                                    bool add_unicode_case_equivalents);

  RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,

                                          CharacterClassStrings* strings);

  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,

                                   ClassSetOperandType* type_out);

  RegExpTree* ParseClassSetOperand(const RegExpBuilder* builder,

                                   ClassSetOperandType* type_out,

                                   ZoneList<CharacterRange>* ranges,

                                   CharacterClassStrings* strings,

                                   base::uc32* character);

  base::uc32 ParseClassSetCharacter();

  // Parses and returns a single escaped character.

  base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,

                                  bool* is_escaped_unicode_character);

  void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges,

                                     CharacterRange new_range);

  RegExpTree* ParseClassUnion(const RegExpBuilder* builder, bool is_negated,

                              RegExpTree* first_operand,

                              ClassSetOperandType first_operand_type,

                              ZoneList<CharacterRange>* ranges,

                              CharacterClassStrings* strings,

                              base::uc32 first_character);

  RegExpTree* ParseClassIntersection(const RegExpBuilder* builder,

                                     bool is_negated, RegExpTree* first_operand,

                                     ClassSetOperandType first_operand_type);

  RegExpTree* ParseClassSubtraction(const RegExpBuilder* builder,

                                    bool is_negated, RegExpTree* first_operand,

                                    ClassSetOperandType first_operand_type);

  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);

  base::uc32 ParseOctalLiteral();

  // Tries to parse the input as a back reference.  If successful it

  // stores the result in the output parameter and returns true.  If

  // it fails it will push back the characters read so the same characters

  // can be reparsed.

  bool ParseBackReferenceIndex(int* index_out);

  RegExpTree* ReportError(RegExpError error);

  void Advance();

  void Advance(int dist);

  void RewindByOneCodepoint();  // Rewinds to before the previous Advance().

  void Reset(int pos);

  // Reports whether the pattern might be used as a literal search string.

  // Only use if the result of the parse is a single atom node.

  bool simple() const { return simple_; }

  bool contains_anchor() const { return contains_anchor_; }

  void set_contains_anchor() { contains_anchor_ = true; }

  int captures_started() const { return captures_started_; }

  int position() const { return next_pos_ - 1; }

  bool failed() const { return failed_; }

  RegExpFlags flags() const { return flags_; }

  bool IsUnicodeMode() const {

    // Either /v or /u enable UnicodeMode

    // https://tc39.es/ecma262/#sec-parsepattern

    return IsUnicode(flags()) || IsUnicodeSets(flags()) || force_unicode_;

  bool unicode_sets() const { return IsUnicodeSets(flags()); }

  bool ignore_case() const { return IsIgnoreCase(flags()); }

  static bool IsSyntaxCharacterOrSlash(base::uc32 c);

  static bool IsClassSetSyntaxCharacter(base::uc32 c);

  static bool IsClassSetReservedPunctuator(base::uc32 c);

  bool IsClassSetReservedDoublePunctuator(base::uc32 c);

  static const base::uc32 kEndMarker = (1 << 21);

 private:

  // Return the 1-indexed RegExpCapture object, allocate if necessary.

  RegExpCapture* GetCapture(int index);

  // Creates a new named capture at the specified index. Must be called exactly

  // once for each named capture. Fails if a capture with the same name is

  // encountered.

  bool CreateNamedCaptureAtIndex(const RegExpParserState* state, int index);

  // Parses the name of a capture group (?<name>pattern). The name must adhere

  // to IdentifierName in the ECMAScript standard.

  const ZoneVector<base::uc16>* ParseCaptureGroupName();

  bool ParseNamedBackReference(RegExpBuilder* builder,

                               RegExpParserState* state);

  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);

  // After the initial parsing pass, patch corresponding RegExpCapture objects

  // into all RegExpBackReferences. This is done after initial parsing in order

  // to avoid complicating cases in which references comes before the capture.

  void PatchNamedBackReferences();

  ZoneVector<RegExpCapture*>* GetNamedCaptures();

  // Returns true iff the pattern contains named captures. May call

  // ScanForCaptures to look ahead at the remaining pattern.

  bool HasNamedCaptures(InClassEscapeState in_class_escape_state);

  Zone* zone() const { return zone_; }

  base::uc32 current() const { return current_; }

  bool has_more() const { return has_more_; }

  bool has_next() const { return next_pos_ < input_length(); }

  base::uc32 Next();

  template <bool update_position>

  base::uc32 ReadNext();

  CharT InputAt(int index) const {

    DCHECK(0 <= index && index < input_length());

    return input_[index];

  int input_length() const { return input_length_; }

  void ScanForCaptures(InClassEscapeState in_class_escape_state);

  struct RegExpCaptureNameLess {

    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {

      DCHECK_NOT_NULL(lhs);

      DCHECK_NOT_NULL(rhs);

      return *lhs->name() < *rhs->name();

};

  class ForceUnicodeScope final {

   public:

    explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)

        : parser_(parser) {

      DCHECK(!parser_->force_unicode_);

      parser_->force_unicode_ = true;

    ~ForceUnicodeScope() {

      DCHECK(parser_->force_unicode_);

      parser_->force_unicode_ = false;

   private:

    RegExpParserImpl<CharT>* const parser_;

};

  const DisallowGarbageCollection no_gc_;

  Zone* const zone_;

  RegExpError error_ = RegExpError::kNone;

  int error_pos_ = 0;

  ZoneList<RegExpCapture*>* captures_;

  // Maps capture names to a list of capture indices with this name.

  ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>*

      named_captures_;

  ZoneList<RegExpBackReference*>* named_back_references_;

  ZoneList<CharacterRange>* temp_ranges_;

  const CharT* const input_;

  const int input_length_;

  base::uc32 current_;

  RegExpFlags flags_;

  bool force_unicode_ = false;  // Force parser to act as if unicode were set.

  int next_pos_;

  int captures_started_;

  int capture_count_;  // Only valid after we have scanned for captures.

  int lookaround_count_;  // Only valid after we have scanned for lookbehinds.

  bool has_more_;

  bool simple_;

  bool contains_anchor_;

  bool is_scanned_for_captures_;

  bool has_named_captures_;  // Only valid after we have scanned for captures.

  bool failed_;

  const uintptr_t stack_limit_;

  friend class v8::internal::RegExpParser;

};

template <class CharT>

RegExpParserImpl<CharT>::RegExpParserImpl(

    const CharT* input, int input_length, RegExpFlags flags,

    uintptr_t stack_limit, Zone* zone, const DisallowGarbageCollection& no_gc)

    : zone_(zone),

      captures_(nullptr),

      named_captures_(nullptr),

      named_back_references_(nullptr),

      input_(input),

      input_length_(input_length),

      current_(kEndMarker),

      flags_(flags),

      next_pos_(0),

      captures_started_(0),

      capture_count_(0),

      lookaround_count_(0),

      has_more_(true),

      simple_(false),

      contains_anchor_(false),

      is_scanned_for_captures_(false),

      has_named_captures_(false),

      failed_(false),

      stack_limit_(stack_limit) {

  Advance();

template <>

template <bool update_position>

inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {

  int position = next_pos_;

  base::uc16 c0 = InputAt(position);

  position++;

  DCHECK(!unibrow::Utf16::IsLeadSurrogate(c0));

  if (update_position) next_pos_ = position;

  return c0;

template <>

template <bool update_position>

inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {

  int position = next_pos_;

  base::uc16 c0 = InputAt(position);

  base::uc32 result = c0;

  position++;

  // Read the whole surrogate pair in case of unicode mode, if possible.

  if (IsUnicodeMode() && position < input_length() &&

      unibrow::Utf16::IsLeadSurrogate(c0)) {

    base::uc16 c1 = InputAt(position);

    if (unibrow::Utf16::IsTrailSurrogate(c1)) {

      result = unibrow::Utf16::CombineSurrogatePair(c0, c1);

      position++;

  if (update_position) next_pos_ = position;

  return result;

template <class CharT>

base::uc32 RegExpParserImpl<CharT>::Next() {

  if (has_next()) {

    return ReadNext<false>();

  } else {

    return kEndMarker;

template <class CharT>

void RegExpParserImpl<CharT>::Advance() {

  if (has_next()) {

    if (GetCurrentStackPosition() < stack_limit_) {

      if (v8_flags.correctness_fuzzer_suppressions) {

        FATAL("Aborting on stack overflow");

      ReportError(RegExpError::kStackOverflow);

    } else {

      current_ = ReadNext<true>();

  } else {

    current_ = kEndMarker;

    // Advance so that position() points to 1-after-the-last-character. This is

    // important so that Reset() to this position works correctly.

    next_pos_ = input_length() + 1;

    has_more_ = false;

template <class CharT>

void RegExpParserImpl<CharT>::RewindByOneCodepoint() {

  if (!has_more()) return;

  // Rewinds by one code point, i.e.: two code units if `current` is outside

  // the basic multilingual plane (= composed of a lead and trail surrogate),

  // or one code unit otherwise.

  const int rewind_by =

      current() > unibrow::Utf16::kMaxNonSurrogateCharCode ? -2 : -1;

  Advance(rewind_by);  // Undo the last Advance.

template <class CharT>

void RegExpParserImpl<CharT>::Reset(int pos) {

  next_pos_ = pos;

  has_more_ = (pos < input_length());

  Advance();

template <class CharT>

void RegExpParserImpl<CharT>::Advance(int dist) {

  next_pos_ += dist - 1;

  Advance();

// static

template <class CharT>

bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(base::uc32 c) {

  switch (c) {

    case '^':

    case '$':

    case '\\':

    case '.':

    case '*':

    case '+':

    case '?':

    case '(':

    case ')':

    case '[':

    case ']':

    case '{':

    case '}':

    case '|':

    case '/':

      return true;

    default:

      break;

  return false;

// static

template <class CharT>

bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(base::uc32 c) {

  switch (c) {

    case '(':

    case ')':

    case '[':

    case ']':

    case '{':

    case '}':

    case '/':

    case '-':

    case '\\':

    case '|':

      return true;

    default:

      break;

  return false;

// static

template <class CharT>

bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(base::uc32 c) {

  switch (c) {

    case '&':

    case '-':

    case '!':

    case '#':

    case '%':

    case ',':

    case ':':

    case ';':

    case '<':

    case '=':

    case '>':

    case '@':

    case '`':

    case '~':

      return true;

    default:

      break;

  return false;

template <class CharT>

bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(base::uc32 c) {

#define DOUBLE_PUNCTUATOR_CASE(Char) \

  case Char:                         \

    return Next() == Char

  switch (c) {

    DOUBLE_PUNCTUATOR_CASE('&');

    DOUBLE_PUNCTUATOR_CASE('!');

    DOUBLE_PUNCTUATOR_CASE('#');

    DOUBLE_PUNCTUATOR_CASE('$');

    DOUBLE_PUNCTUATOR_CASE('%');

    DOUBLE_PUNCTUATOR_CASE('*');

    DOUBLE_PUNCTUATOR_CASE('+');

    DOUBLE_PUNCTUATOR_CASE(',');

    DOUBLE_PUNCTUATOR_CASE('.');

    DOUBLE_PUNCTUATOR_CASE(':');

    DOUBLE_PUNCTUATOR_CASE(';');

    DOUBLE_PUNCTUATOR_CASE('<');

    DOUBLE_PUNCTUATOR_CASE('=');

    DOUBLE_PUNCTUATOR_CASE('>');

    DOUBLE_PUNCTUATOR_CASE('?');

    DOUBLE_PUNCTUATOR_CASE('@');

    DOUBLE_PUNCTUATOR_CASE('^');

    DOUBLE_PUNCTUATOR_CASE('`');

    DOUBLE_PUNCTUATOR_CASE('~');

    default:

      break;

#undef DOUBLE_PUNCTUATOR_CASE

  return false;

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ReportError(RegExpError error) {

  if (failed_) return nullptr;  // Do not overwrite any existing error.

  failed_ = true;

  error_ = error;

  error_pos_ = position();

  // Zip to the end to make sure no more input is read.

  current_ = kEndMarker;

  next_pos_ = input_length();

  has_more_ = false;

  return nullptr;

#define CHECK_FAILED /**/);    \

  if (failed_) return nullptr; \

  ((void)0

// Pattern ::

//   Disjunction

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {

  RegExpTree* result = ParseDisjunction(CHECK_FAILED);

  PatchNamedBackReferences(CHECK_FAILED);

  DCHECK(!has_more());

  // If the result of parsing is a literal string atom, and it has the

  // same length as the input, then the atom is identical to the input.

  if (result->IsAtom() && result->AsAtom()->length() == input_length()) {

    simple_ = true;

  return result;

// Disjunction ::

//   Alternative

//   Alternative | Disjunction

// Alternative ::

//   [empty]

//   Term Alternative

// Term ::

//   Assertion

//   Atom

//   Atom Quantifier

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {

  // Used to store current state while parsing subexpressions.

  RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,

                                  0, nullptr, flags(), zone());

  RegExpParserState* state = &initial_state;

  // Cache the builder in a local variable for quick access.

  RegExpBuilder* builder = initial_state.builder();

  while (true) {

    switch (current()) {

      case kEndMarker:

        if (failed()) return nullptr;  // E.g. the initial Advance failed.

        if (state->IsSubexpression()) {

          // Inside a parenthesized group when hitting end of input.

          return ReportError(RegExpError::kUnterminatedGroup);

        DCHECK_EQ(INITIAL, state->group_type());

        // Parsing completed successfully.

        return builder->ToRegExp();

      case ')': {

        if (!state->IsSubexpression()) {

          return ReportError(RegExpError::kUnmatchedParen);

        DCHECK_NE(INITIAL, state->group_type());

        Advance();

        // End disjunction parsing and convert builder content to new single

        // regexp atom.

        RegExpTree* body = builder->ToRegExp();

        int end_capture_index = captures_started();

        int capture_index = state->capture_index();

        SubexpressionType group_type = state->group_type();

        // Build result of subexpression.

        if (group_type == CAPTURE) {

          if (state->IsNamedCapture()) {

            CreateNamedCaptureAtIndex(state, capture_index CHECK_FAILED);

          RegExpCapture* capture = GetCapture(capture_index);

          capture->set_body(body);

          body = capture;

        } else if (group_type == GROUPING) {

          body = zone()->template New<RegExpGroup>(body, builder->flags());

        } else {

          DCHECK(group_type == POSITIVE_LOOKAROUND ||

                 group_type == NEGATIVE_LOOKAROUND);

          bool is_positive = (group_type == POSITIVE_LOOKAROUND);

          body = zone()->template New<RegExpLookaround>(

              body, is_positive, end_capture_index - capture_index,

              capture_index, state->lookaround_type(), lookaround_count_);

          lookaround_count_++;

        // Restore previous state.

        state = state->previous_state();

        builder = state->builder();

        builder->AddAtom(body);

        // For compatibility with JSC and ES3, we allow quantifiers after

        // lookaheads, and break in all cases.

        break;

      case '|': {

        Advance();

        state->NewAlternative(captures_started());

        builder->NewAlternative();

        continue;

      case '*':

      case '+':

      case '?':

        return ReportError(RegExpError::kNothingToRepeat);

      case '^': {

        Advance();

        builder->AddAssertion(zone()->template New<RegExpAssertion>(

            builder->multiline() ? RegExpAssertion::Type::START_OF_LINE

                                 : RegExpAssertion::Type::START_OF_INPUT));

        set_contains_anchor();

        continue;

      case '$': {

        Advance();

        RegExpAssertion::Type assertion_type =

            builder->multiline() ? RegExpAssertion::Type::END_OF_LINE

                                 : RegExpAssertion::Type::END_OF_INPUT;

        builder->AddAssertion(

            zone()->template New<RegExpAssertion>(assertion_type));

        continue;

      case '.': {

        Advance();

        ZoneList<CharacterRange>* ranges =

            zone()->template New<ZoneList<CharacterRange>>(2, zone());

        if (builder->dotall()) {

          // Everything.

          CharacterRange::AddClassEscape(StandardCharacterSet::kEverything,

                                         ranges, false, zone());

        } else {

          // Everything except \x0A, \x0D, \u2028 and \u2029.

          CharacterRange::AddClassEscape(

              StandardCharacterSet::kNotLineTerminator, ranges, false, zone());

        RegExpClassRanges* cc =

            zone()->template New<RegExpClassRanges>(zone(), ranges);

        builder->AddClassRanges(cc);

        break;

      case '(': {

        state = ParseOpenParenthesis(state CHECK_FAILED);

        builder = state->builder();

        flags_ = builder->flags();

        continue;

      case '[': {

        RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);

        if (cc->IsClassRanges()) {

          builder->AddClassRanges(cc->AsClassRanges());

        } else {

          DCHECK(cc->IsClassSetExpression());

          builder->AddTerm(cc);

        break;

      // Atom ::

      //   \ AtomEscape

      case '\\':

        switch (Next()) {

          case kEndMarker:

            return ReportError(RegExpError::kEscapeAtEndOfPattern);

          // AtomEscape ::

          //   [+UnicodeMode] DecimalEscape

          //   [~UnicodeMode] DecimalEscape but only if the CapturingGroupNumber

          //                  of DecimalEscape is ≤ NcapturingParens

          //   CharacterEscape (some cases of this mixed in too)

//

          // TODO(jgruber): It may make sense to disentangle all the different

          // cases and make the structure mirror the spec, e.g. for AtomEscape:

//

          //  if (TryParseDecimalEscape(...)) return;

          //  if (TryParseCharacterClassEscape(...)) return;

          //  if (TryParseCharacterEscape(...)) return;

          //  if (TryParseGroupName(...)) return;

          case '1':

          case '2':

          case '3':

          case '4':

          case '5':

          case '6':

          case '7':

          case '8':

          case '9': {

            int index = 0;

            const bool is_backref =

                ParseBackReferenceIndex(&index CHECK_FAILED);

            if (is_backref) {

              if (state->IsInsideCaptureGroup(index)) {

                // The back reference is inside the capture group it refers to.

                // Nothing can possibly have been captured yet, so we use empty

                // instead. This ensures that, when checking a back reference,

                // the capture registers of the referenced capture are either

                // both set or both cleared.

                builder->AddEmpty();

              } else {

                RegExpCapture* capture = GetCapture(index);

                RegExpTree* atom =

                    zone()->template New<RegExpBackReference>(capture, zone());

                builder->AddAtom(atom);

              break;

            // With /u and /v, no identity escapes except for syntax characters

            // are allowed. Otherwise, all identity escapes are allowed.

            if (IsUnicodeMode()) {

              return ReportError(RegExpError::kInvalidEscape);

            base::uc32 first_digit = Next();

            if (first_digit == '8' || first_digit == '9') {

              builder->AddCharacter(first_digit);

              Advance(2);

              break;

            [[fallthrough]];

          case '0': {

            Advance();

            if (IsUnicodeMode() && Next() >= '0' && Next() <= '9') {

              // Decimal escape with leading 0 are not parsed as octal.

              return ReportError(RegExpError::kInvalidDecimalEscape);

            base::uc32 octal = ParseOctalLiteral();

            builder->AddCharacter(octal);

            break;

          case 'b':

            Advance(2);

            builder->AddAssertion(zone()->template New<RegExpAssertion>(

                RegExpAssertion::Type::BOUNDARY));

            continue;

          case 'B':

            Advance(2);

            builder->AddAssertion(zone()->template New<RegExpAssertion>(

                RegExpAssertion::Type::NON_BOUNDARY));

            continue;

          // AtomEscape ::

          //   CharacterClassEscape

          case 'd':

          case 'D':

          case 's':

          case 'S':

          case 'w':

          case 'W': {

            base::uc32 next = Next();

            ZoneList<CharacterRange>* ranges =

                zone()->template New<ZoneList<CharacterRange>>(2, zone());

            bool add_unicode_case_equivalents =

                IsUnicodeMode() && ignore_case();

            bool parsed_character_class_escape = TryParseCharacterClassEscape(

                next, InClassEscapeState::kNotInClass, ranges, nullptr, zone(),

                add_unicode_case_equivalents CHECK_FAILED);

            if (parsed_character_class_escape) {

              RegExpClassRanges* cc =

                  zone()->template New<RegExpClassRanges>(zone(), ranges);

              builder->AddClassRanges(cc);

            } else {

              CHECK(!IsUnicodeMode());

              Advance(2);

              builder->AddCharacter(next);  // IdentityEscape.

            break;

          case 'p':

          case 'P': {

            base::uc32 next = Next();

            ZoneList<CharacterRange>* ranges =

                zone()->template New<ZoneList<CharacterRange>>(2, zone());

            CharacterClassStrings* strings = nullptr;

            if (unicode_sets()) {

              strings = zone()->template New<CharacterClassStrings>(zone());

            bool add_unicode_case_equivalents = ignore_case();

            bool parsed_character_class_escape = TryParseCharacterClassEscape(

                next, InClassEscapeState::kNotInClass, ranges, strings, zone(),

                add_unicode_case_equivalents CHECK_FAILED);

            if (parsed_character_class_escape) {

              if (unicode_sets()) {

                RegExpClassSetOperand* op =

                    zone()->template New<RegExpClassSetOperand>(ranges,

                                                                strings);

                builder->AddTerm(op);

              } else {

                RegExpClassRanges* cc =

                    zone()->template New<RegExpClassRanges>(zone(), ranges);

                builder->AddClassRanges(cc);

            } else {

              CHECK(!IsUnicodeMode());

              Advance(2);

              builder->AddCharacter(next);  // IdentityEscape.

            break;

          // AtomEscape ::

          //   k GroupName

          case 'k': {

            // Either an identity escape or a named back-reference.  The two

            // interpretations are mutually exclusive: '\k' is interpreted as

            // an identity escape for non-Unicode patterns without named

            // capture groups, and as the beginning of a named back-reference

            // in all other cases.

            const bool has_named_captures =

                HasNamedCaptures(InClassEscapeState::kNotInClass CHECK_FAILED);

            if (IsUnicodeMode() || has_named_captures) {

              Advance(2);

              ParseNamedBackReference(builder, state CHECK_FAILED);

              break;

            [[fallthrough]];

          // AtomEscape ::

          //   CharacterEscape

          default: {

            bool is_escaped_unicode_character = false;

            base::uc32 c = ParseCharacterEscape(

                InClassEscapeState::kNotInClass,

                &is_escaped_unicode_character CHECK_FAILED);

            if (is_escaped_unicode_character) {

              builder->AddEscapedUnicodeCharacter(c);

            } else {

              builder->AddCharacter(c);

            break;

        break;

      case '{': {

        int dummy;

        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);

        if (parsed) return ReportError(RegExpError::kNothingToRepeat);

        [[fallthrough]];

      case '}':

      case ']':

        if (IsUnicodeMode()) {

          return ReportError(RegExpError::kLoneQuantifierBrackets);

        [[fallthrough]];

      default:

        builder->AddUnicodeCharacter(current());

        Advance();

        break;

    }  // end switch(current())

    int min;

    int max;

    switch (current()) {

      // QuantifierPrefix ::

      //   *

      //   +

      //   ?

      //   {

      case '*':

        min = 0;

        max = RegExpTree::kInfinity;

        Advance();

        break;

      case '+':

        min = 1;

        max = RegExpTree::kInfinity;

        Advance();

        break;

      case '?':

        min = 0;

        max = 1;

        Advance();

        break;

      case '{':

        if (ParseIntervalQuantifier(&min, &max)) {

          if (max < min) {

            return ReportError(RegExpError::kRangeOutOfOrder);

          break;

        } else if (IsUnicodeMode()) {

          // Incomplete quantifiers are not allowed.

          return ReportError(RegExpError::kIncompleteQuantifier);

        continue;

      default:

        continue;

    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

    if (current() == '?') {

      quantifier_type = RegExpQuantifier::NON_GREEDY;

      Advance();

    } else if (v8_flags.regexp_possessive_quantifier && current() == '+') {

      // v8_flags.regexp_possessive_quantifier is a debug-only flag.

      quantifier_type = RegExpQuantifier::POSSESSIVE;

      Advance();

    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {

      return ReportError(RegExpError::kInvalidQuantifier);

template <class CharT>

RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(

    RegExpParserState* state) {

  RegExpLookaround::Type lookaround_type = state->lookaround_type();

  bool is_named_capture = false;

  const ZoneVector<base::uc16>* capture_name = nullptr;

  SubexpressionType subexpr_type = CAPTURE;

  RegExpFlags flags = state->builder()->flags();

  bool parsing_modifiers = false;

  bool modifiers_polarity = true;

  RegExpFlags modifiers;

  Advance();

  if (current() == '?') {

    do {

      switch (Next()) {

        case '-':

          if (!v8_flags.js_regexp_modifiers) {

            ReportError(RegExpError::kInvalidGroup);

            return nullptr;

          Advance();

          parsing_modifiers = true;

          if (modifiers_polarity == false) {

            ReportError(RegExpError::kMultipleFlagDashes);

            return nullptr;

          modifiers_polarity = false;

          break;

        case 'm':

        case 'i':

        case 's': {

          if (!v8_flags.js_regexp_modifiers) {

            ReportError(RegExpError::kInvalidGroup);

            return nullptr;

          Advance();

          parsing_modifiers = true;

          RegExpFlag flag = TryRegExpFlagFromChar(current()).value();

          if ((modifiers & flag) != 0) {

            ReportError(RegExpError::kRepeatedFlag);

            return nullptr;

          modifiers |= flag;

          flags.set(flag, modifiers_polarity);

          break;

        case ':':

          Advance(2);

          parsing_modifiers = false;

          subexpr_type = GROUPING;

          break;

        case '=':

          Advance(2);

          if (parsing_modifiers) {

            DCHECK(v8_flags.js_regexp_modifiers);

            ReportError(RegExpError::kInvalidGroup);

            return nullptr;

          lookaround_type = RegExpLookaround::LOOKAHEAD;

          subexpr_type = POSITIVE_LOOKAROUND;

          break;

        case '!':

          Advance(2);

          if (parsing_modifiers) {

            DCHECK(v8_flags.js_regexp_modifiers);

            ReportError(RegExpError::kInvalidGroup);

            return nullptr;

          lookaround_type = RegExpLookaround::LOOKAHEAD;

          subexpr_type = NEGATIVE_LOOKAROUND;

          break;

        case '<':

          Advance();

          if (parsing_modifiers) {

            DCHECK(v8_flags.js_regexp_modifiers);

            ReportError(RegExpError::kInvalidGroup);

            return nullptr;

          if (Next() == '=') {

            Advance(2);

            lookaround_type = RegExpLookaround::LOOKBEHIND;

            subexpr_type = POSITIVE_LOOKAROUND;

            break;

          } else if (Next() == '!') {

            Advance(2);

            lookaround_type = RegExpLookaround::LOOKBEHIND;

            subexpr_type = NEGATIVE_LOOKAROUND;

            break;

          is_named_capture = true;

          has_named_captures_ = true;

          Advance();

          break;

        default:

          ReportError(RegExpError::kInvalidGroup);

          return nullptr;

    } while (parsing_modifiers);

  if (modifiers_polarity == false) {

    // We encountered a dash.

    if (modifiers == 0) {

      ReportError(RegExpError::kInvalidFlagGroup);

      return nullptr;

  if (subexpr_type == CAPTURE) {

    if (captures_started_ >= RegExpMacroAssembler::kMaxCaptures) {

      ReportError(RegExpError::kTooManyCaptures);

      return nullptr;

    captures_started_++;

    if (is_named_capture) {

      capture_name = ParseCaptureGroupName(CHECK_FAILED);

  // Store current state and begin new disjunction parsing.

  return zone()->template New<RegExpParserState>(

      state, subexpr_type, lookaround_type, captures_started_, capture_name,

      flags, zone());

// In order to know whether an escape is a backreference or not we have to scan

// the entire regexp and find the number of capturing parentheses.  However we

// don't want to scan the regexp twice unless it is necessary.  This mini-parser

// is called when needed.  It can see the difference between capturing and

// noncapturing parentheses and can skip character classes and backslash-escaped

// characters.

//

// Important: The scanner has to be in a consistent state when calling

// ScanForCaptures, e.g. not in the middle of an escape sequence '\[' or while

// parsing a nested class.

template <class CharT>

void RegExpParserImpl<CharT>::ScanForCaptures(

    InClassEscapeState in_class_escape_state) {

  DCHECK(!is_scanned_for_captures_);

  const int saved_position = position();

  // Start with captures started previous to current position

  int capture_count = captures_started();

  // When we start inside a character class, skip everything inside the class.

  if (in_class_escape_state == InClassEscapeState::kInClass) {

    // \k is always invalid within a class in unicode mode, thus we should never

    // call ScanForCaptures within a class.

    DCHECK(!IsUnicodeMode());

    int c;

    while ((c = current()) != kEndMarker) {

      Advance();

      if (c == '\\') {

        Advance();

      } else {

        if (c == ']') break;

  // Add count of captures after this position.

  int n;

  while ((n = current()) != kEndMarker) {

    Advance();

    switch (n) {

      case '\\':

        Advance();

        break;

      case '[': {

        int class_nest_level = 0;

        int c;

        while ((c = current()) != kEndMarker) {

          Advance();

          if (c == '\\') {

            Advance();

          } else if (c == '[') {

            // With /v, '[' inside a class is treated as a nested class.

            // Without /v, '[' is a normal character.

            if (unicode_sets()) class_nest_level++;

          } else if (c == ']') {

            if (class_nest_level == 0) break;

            class_nest_level--;

        break;

      case '(':

        if (current() == '?') {

          // At this point we could be in

          // * a non-capturing group '(:',

          // * a lookbehind assertion '(?<=' '(?<!'

          // * or a named capture '(?<'.

//

          // Of these, only named captures are capturing groups.

          Advance();

          if (current() != '<') break;

          Advance();

          if (current() == '=' || current() == '!') break;

          // Found a possible named capture. It could turn out to be a syntax

          // error (e.g. an unterminated or invalid name), but that distinction

          // does not matter for our purposes.

          has_named_captures_ = true;

        capture_count++;

        break;

  capture_count_ = capture_count;

  is_scanned_for_captures_ = true;

  Reset(saved_position);

template <class CharT>

bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(int* index_out) {

  DCHECK_EQ('\\', current());

  DCHECK('1' <= Next() && Next() <= '9');

  // Try to parse a decimal literal that is no greater than the total number

  // of left capturing parentheses in the input.

  int start = position();

  int value = Next() - '0';

  Advance(2);

  while (true) {

    base::uc32 c = current();

    if (IsDecimalDigit(c)) {

      value = 10 * value + (c - '0');

      if (value > RegExpMacroAssembler::kMaxCaptures) {

        Reset(start);

        return false;

      Advance();

    } else {

      break;

  if (value > captures_started()) {

    if (!is_scanned_for_captures_) {

      ScanForCaptures(InClassEscapeState::kNotInClass);

    if (value > capture_count_) {

      Reset(start);

      return false;

  *index_out = value;

  return true;

namespace {

void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {

  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {

    v->push_back(code_unit);

  } else {

    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));

    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));

}  // namespace

template <class CharT>

const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {

  // Due to special Advance requirements (see the next comment), rewind by one

  // such that names starting with a surrogate pair are parsed correctly for

  // patterns where the unicode flag is unset.

//

  // Note that we use this odd pattern of rewinding the last advance in order

  // to adhere to the common parser behavior of expecting `current` to point at

  // the first candidate character for a function (e.g. when entering ParseFoo,

  // `current` should point at the first character of Foo).

  RewindByOneCodepoint();

  ZoneVector<base::uc16>* name =

      zone()->template New<ZoneVector<base::uc16>>(zone());

    // Advance behavior inside this function is tricky since

    // RegExpIdentifierName explicitly enables unicode (in spec terms, sets +U)

    // and thus allows surrogate pairs and \u{}-style escapes even in

    // non-unicode patterns. Therefore Advance within the capture group name

    // has to force-enable unicode, and outside the name revert to default

    // behavior.

    ForceUnicodeScope force_unicode(this);

    bool at_start = true;

    while (true) {

      Advance();

      base::uc32 c = current();

      // Convert unicode escapes.

      if (c == '\\' && Next() == 'u') {

        Advance(2);

        if (!ParseUnicodeEscape(&c)) {

          ReportError(RegExpError::kInvalidUnicodeEscape);

          return nullptr;

        RewindByOneCodepoint();

      // The backslash char is misclassified as both ID_Start and ID_Continue.

      if (c == '\\') {

        ReportError(RegExpError::kInvalidCaptureGroupName);

        return nullptr;

      if (at_start) {

        if (!IsIdentifierStart(c)) {

          ReportError(RegExpError::kInvalidCaptureGroupName);

          return nullptr;

        push_code_unit(name, c);

        at_start = false;

      } else {

        if (c == '>') {

          break;

        } else if (IsIdentifierPart(c)) {

          push_code_unit(name, c);

        } else {

          ReportError(RegExpError::kInvalidCaptureGroupName);

          return nullptr;

  // This final advance goes back into the state of pointing at the next

  // relevant char, which the rest of the parser expects. See also the previous

  // comments in this function.

  Advance();

  return name;

template <class CharT>

bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(

    const RegExpParserState* state, int index) {

  const ZoneVector<base::uc16>* name = state->capture_name();

  const std::pair<int, int> non_participating_capture_group_interval =

      state->non_participating_capture_group_interval();

  DCHECK(0 < index && index <= captures_started_);

  DCHECK_NOT_NULL(name);

  RegExpCapture* capture = GetCapture(index);

  DCHECK_NULL(capture->name());

  capture->set_name(name);

  if (named_captures_ == nullptr) {

    named_captures_ = zone_->template New<

        ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone());

  } else {

    // Check for duplicates and bail if we find any.

    const auto& named_capture_it = named_captures_->find(capture);

    if (named_capture_it != named_captures_->end()) {

      if (v8_flags.js_regexp_duplicate_named_groups) {

        ZoneList<int>* named_capture_indices = named_capture_it->second;

        DCHECK_NOT_NULL(named_capture_indices);

        DCHECK(!named_capture_indices->is_empty());

        for (int named_index : *named_capture_indices) {

          if (named_index < non_participating_capture_group_interval.first ||

              named_index > non_participating_capture_group_interval.second) {

            ReportError(RegExpError::kDuplicateCaptureGroupName);

            return false;

      } else {

        ReportError(RegExpError::kDuplicateCaptureGroupName);

        return false;

  auto entry = named_captures_->try_emplace(

      capture, zone()->template New<ZoneList<int>>(1, zone()));

  entry.first->second->Add(index, zone());

  return true;

template <class CharT>

bool RegExpParserImpl<CharT>::ParseNamedBackReference(

    RegExpBuilder* builder, RegExpParserState* state) {

  // The parser is assumed to be on the '<' in \k<name>.

  if (current() != '<') {

    ReportError(RegExpError::kInvalidNamedReference);

    return false;

  Advance();

  const ZoneVector<base::uc16>* name = ParseCaptureGroupName();

  if (name == nullptr) {

    return false;

  if (state->IsInsideCaptureGroup(name)) {

    builder->AddEmpty();

  } else {

    RegExpBackReference* atom =

        zone()->template New<RegExpBackReference>(zone());

    atom->set_name(name);

    builder->AddAtom(atom);

    if (named_back_references_ == nullptr) {

      named_back_references_ =

          zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());

    named_back_references_->Add(atom, zone());

  return true;

template <class CharT>

void RegExpParserImpl<CharT>::PatchNamedBackReferences() {

  if (named_back_references_ == nullptr) return;

  if (named_captures_ == nullptr) {

    ReportError(RegExpError::kInvalidNamedCaptureReference);

    return;

  // Look up and patch the actual capture for each named back reference.

  for (int i = 0; i < named_back_references_->length(); i++) {

    RegExpBackReference* ref = named_back_references_->at(i);

    // Capture used to search the named_captures_ by name, index of the

    // capture is never used.

    static const int kInvalidIndex = 0;

    RegExpCapture* search_capture =

        zone()->template New<RegExpCapture>(kInvalidIndex);

    DCHECK_NULL(search_capture->name());

    search_capture->set_name(ref->name());

    const auto& capture_it = named_captures_->find(search_capture);

    if (capture_it == named_captures_->end()) {

      ReportError(RegExpError::kInvalidNamedCaptureReference);

      return;

    DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,

                   capture_it->second->length() == 1);

    for (int index : *capture_it->second) {

      ref->add_capture(GetCapture(index), zone());

template <class CharT>

RegExpCapture* RegExpParserImpl<CharT>::GetCapture(int index) {

  // The index for the capture groups are one-based. Its index in the list is

  // zero-based.

  const int known_captures =

      is_scanned_for_captures_ ? capture_count_ : captures_started_;

  DCHECK(index <= known_captures);

  if (captures_ == nullptr) {

    captures_ =

        zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());

  while (captures_->length() < known_captures) {

    captures_->Add(zone()->template New<RegExpCapture>(captures_->length() + 1),

                   zone());

  return captures_->at(index - 1);

template <class CharT>

ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() {

  if (named_captures_ == nullptr) {

    return nullptr;

  DCHECK(!named_captures_->empty());

  ZoneVector<RegExpCapture*>* flattened_named_captures =

      zone()->template New<ZoneVector<RegExpCapture*>>(zone());

  for (auto capture : *named_captures_) {

    DCHECK_IMPLIES(!v8_flags.js_regexp_duplicate_named_groups,

                   capture.second->length() == 1);

    for (int index : *capture.second) {

      flattened_named_captures->push_back(GetCapture(index));

  return flattened_named_captures;

template <class CharT>

bool RegExpParserImpl<CharT>::HasNamedCaptures(

    InClassEscapeState in_class_escape_state) {

  if (has_named_captures_ || is_scanned_for_captures_) {

    return has_named_captures_;

  ScanForCaptures(in_class_escape_state);

  DCHECK(is_scanned_for_captures_);

  return has_named_captures_;

// QuantifierPrefix ::

//   { DecimalDigits }

//   { DecimalDigits , }

//   { DecimalDigits , DecimalDigits }

//

// Returns true if parsing succeeds, and set the min_out and max_out

// values. Values are truncated to RegExpTree::kInfinity if they overflow.

template <class CharT>

bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(int* min_out,

                                                      int* max_out) {

  DCHECK_EQ(current(), '{');

  int start = position();

  Advance();

  int min = 0;

  if (!IsDecimalDigit(current())) {

    Reset(start);

    return false;

  while (IsDecimalDigit(current())) {

    int next = current() - '0';

    if (min > (RegExpTree::kInfinity - next) / 10) {

      // Overflow. Skip past remaining decimal digits and return -1.

      do {

        Advance();

      } while (IsDecimalDigit(current()));

      min = RegExpTree::kInfinity;

      break;

    min = 10 * min + next;

    Advance();

  int max = 0;

  if (current() == '}') {

    max = min;

    Advance();

  } else if (current() == ',') {

    Advance();

    if (current() == '}') {

      max = RegExpTree::kInfinity;

      Advance();

    } else {

      while (IsDecimalDigit(current())) {

        int next = current() - '0';

        if (max > (RegExpTree::kInfinity - next) / 10) {

          do {

            Advance();

          } while (IsDecimalDigit(current()));

          max = RegExpTree::kInfinity;

          break;

        max = 10 * max + next;

        Advance();

      if (current() != '}') {

        Reset(start);

        return false;

      Advance();

  } else {

    Reset(start);

    return false;

  *min_out = min;

  *max_out = max;

  return true;

template <class CharT>

base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {

  DCHECK(('0' <= current() && current() <= '7') || !has_more());

  // For compatibility with some other browsers (not all), we parse

  // up to three octal digits with a value below 256.

  // ES#prod-annexB-LegacyOctalEscapeSequence

  base::uc32 value = current() - '0';

  Advance();

  if ('0' <= current() && current() <= '7') {

    value = value * 8 + current() - '0';

    Advance();

    if (value < 32 && '0' <= current() && current() <= '7') {

      value = value * 8 + current() - '0';

      Advance();

  return value;

template <class CharT>

bool RegExpParserImpl<CharT>::ParseHexEscape(int length, base::uc32* value) {

  int start = position();

  base::uc32 val = 0;

  for (int i = 0; i < length; ++i) {

    base::uc32 c = current();

    int d = base::HexValue(c);

    if (d < 0) {

      Reset(start);

      return false;

    val = val * 16 + d;

    Advance();

  *value = val;

  return true;

// This parses RegExpUnicodeEscapeSequence as described in ECMA262.

template <class CharT>

bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {

  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

  // allowed). In the latter case, the number of hex digits between { } is

  // arbitrary. \ and u have already been read.

  if (current() == '{' && IsUnicodeMode()) {

    int start = position();

    Advance();

    if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {

      if (current() == '}') {

        Advance();

        return true;

    Reset(start);

    return false;

  // \u but no {, or \u{...} escapes not allowed.

  bool result = ParseHexEscape(4, value);

  if (result && IsUnicodeMode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

      current() == '\\') {

    // Attempt to read trail surrogate.

    int start = position();

    if (Next() == 'u') {

      Advance(2);

      base::uc32 trail;

      if (ParseHexEscape(4, &trail) &&

          unibrow::Utf16::IsTrailSurrogate(trail)) {

        *value = unibrow::Utf16::CombineSurrogatePair(

            static_cast<base::uc16>(*value), static_cast<base::uc16>(trail));

        return true;

    Reset(start);

  return result;

#ifdef V8_INTL_SUPPORT

namespace {

bool IsExactPropertyAlias(const char* property_name, UProperty property) {

  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);

  if (short_name != nullptr && strcmp(property_name, short_name) == 0)

    return true;

  for (int i = 0;; i++) {

    const char* long_name = u_getPropertyName(

        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));

    if (long_name == nullptr) break;

    if (strcmp(property_name, long_name) == 0) return true;

  return false;

bool IsExactPropertyValueAlias(const char* property_value_name,

                               UProperty property, int32_t property_value) {

  const char* short_name =

      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);

  if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {

    return true;

  for (int i = 0;; i++) {

    const char* long_name = u_getPropertyValueName(

        property, property_value,

        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));

    if (long_name == nullptr) break;

    if (strcmp(property_value_name, long_name) == 0) return true;

  return false;

void ExtractStringsFromUnicodeSet(const icu::UnicodeSet& set,

                                  CharacterClassStrings* strings,

                                  RegExpFlags flags, Zone* zone) {

  DCHECK(set.hasStrings());

  DCHECK(IsUnicodeSets(flags));

  DCHECK_NOT_NULL(strings);

  RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone);

  RegExpTextBuilder string_builder(zone, &string_storage, flags);

  const bool needs_case_folding = IsIgnoreCase(flags);

  icu::UnicodeSetIterator iter(set);

  iter.skipToStrings();

  while (iter.next()) {

    const icu::UnicodeString& s = iter.getString();

    const char16_t* p = s.getBuffer();

    int32_t length = s.length();

    ZoneList<base::uc32>* string =

        zone->template New<ZoneList<base::uc32>>(length, zone);

    for (int32_t i = 0; i < length;) {

      UChar32 c;

      U16_NEXT(p, i, length, c);

      string_builder.AddUnicodeCharacter(c);

      if (needs_case_folding) {

        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);

      string->Add(c, zone);

    strings->emplace(string->ToVector(), string_builder.ToRegExp());

    string_storage.clear();

bool LookupPropertyValueName(UProperty property,

                             const char* property_value_name, bool negate,

                             ZoneList<CharacterRange>* result_ranges,

                             CharacterClassStrings* result_strings,

                             RegExpFlags flags, Zone* zone) {

  UProperty property_for_lookup = property;

  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {

    // For the property Script_Extensions, we have to do the property value

    // name lookup as if the property is Script.

    property_for_lookup = UCHAR_SCRIPT;

  int32_t property_value =

      u_getPropertyValueEnum(property_for_lookup, property_value_name);

  if (property_value == UCHAR_INVALID_CODE) return false;

  // We require the property name to match exactly to one of the property value

  // aliases. However, u_getPropertyValueEnum uses loose matching.

  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,

                                 property_value)) {

    return false;

  UErrorCode ec = U_ZERO_ERROR;

  icu::UnicodeSet set;

  set.applyIntPropertyValue(property, property_value, ec);

  bool success = ec == U_ZERO_ERROR && !set.isEmpty();

  if (success) {

    if (set.hasStrings()) {

      ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);

    const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);

    if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);

    set.removeAllStrings();

    if (negate) set.complement();

    for (int i = 0; i < set.getRangeCount(); i++) {

      result_ranges->Add(

          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),

          zone);

  return success;

template <size_t N>

inline bool NameEquals(const char* name, const char (&literal)[N]) {

  return strncmp(name, literal, N + 1) == 0;

bool LookupSpecialPropertyValueName(const char* name,

                                    ZoneList<CharacterRange>* result,

                                    bool negate, RegExpFlags flags,

                                    Zone* zone) {

  if (NameEquals(name, "Any")) {

    if (negate) {

      // Leave the list of character ranges empty, since the negation of 'Any'

      // is the empty set.

    } else {

      result->Add(CharacterRange::Everything(), zone);

  } else if (NameEquals(name, "ASCII")) {

    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)

                       : CharacterRange::Range(0x0, 0x7F),

                zone);

  } else if (NameEquals(name, "Assigned")) {

    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",

                                   !negate, result, nullptr, flags, zone);

  } else {

    return false;

  return true;

// Explicitly allowlist supported binary properties. The spec forbids supporting

// properties outside of this set to ensure interoperability.

bool IsSupportedBinaryProperty(UProperty property, bool unicode_sets) {

  switch (property) {

    case UCHAR_ALPHABETIC:

    // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.

    // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.

    case UCHAR_ASCII_HEX_DIGIT:

    // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.

    case UCHAR_BIDI_CONTROL:

    case UCHAR_BIDI_MIRRORED:

    case UCHAR_CASE_IGNORABLE:

    case UCHAR_CASED:

    case UCHAR_CHANGES_WHEN_CASEFOLDED:

    case UCHAR_CHANGES_WHEN_CASEMAPPED:

    case UCHAR_CHANGES_WHEN_LOWERCASED:

    case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:

    case UCHAR_CHANGES_WHEN_TITLECASED:

    case UCHAR_CHANGES_WHEN_UPPERCASED:

    case UCHAR_DASH:

    case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:

    case UCHAR_DEPRECATED:

    case UCHAR_DIACRITIC:

    case UCHAR_EMOJI:

    case UCHAR_EMOJI_COMPONENT:

    case UCHAR_EMOJI_MODIFIER_BASE:

    case UCHAR_EMOJI_MODIFIER:

    case UCHAR_EMOJI_PRESENTATION:

    case UCHAR_EXTENDED_PICTOGRAPHIC:

    case UCHAR_EXTENDER:

    case UCHAR_GRAPHEME_BASE:

    case UCHAR_GRAPHEME_EXTEND:

    case UCHAR_HEX_DIGIT:

    case UCHAR_ID_CONTINUE:

    case UCHAR_ID_START:

    case UCHAR_IDEOGRAPHIC:

    case UCHAR_IDS_BINARY_OPERATOR:

    case UCHAR_IDS_TRINARY_OPERATOR:

    case UCHAR_JOIN_CONTROL:

    case UCHAR_LOGICAL_ORDER_EXCEPTION:

    case UCHAR_LOWERCASE:

    case UCHAR_MATH:

    case UCHAR_NONCHARACTER_CODE_POINT:

    case UCHAR_PATTERN_SYNTAX:

    case UCHAR_PATTERN_WHITE_SPACE:

    case UCHAR_QUOTATION_MARK:

    case UCHAR_RADICAL:

    case UCHAR_REGIONAL_INDICATOR:

    case UCHAR_S_TERM:

    case UCHAR_SOFT_DOTTED:

    case UCHAR_TERMINAL_PUNCTUATION:

    case UCHAR_UNIFIED_IDEOGRAPH:

    case UCHAR_UPPERCASE:

    case UCHAR_VARIATION_SELECTOR:

    case UCHAR_WHITE_SPACE:

    case UCHAR_XID_CONTINUE:

    case UCHAR_XID_START:

      return true;

    case UCHAR_BASIC_EMOJI:

    case UCHAR_EMOJI_KEYCAP_SEQUENCE:

    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:

    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:

    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:

    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:

    case UCHAR_RGI_EMOJI:

      return unicode_sets;

    default:

      break;

  return false;

bool IsBinaryPropertyOfStrings(UProperty property) {

  switch (property) {

    case UCHAR_BASIC_EMOJI:

    case UCHAR_EMOJI_KEYCAP_SEQUENCE:

    case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:

    case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:

    case UCHAR_RGI_EMOJI_TAG_SEQUENCE:

    case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:

    case UCHAR_RGI_EMOJI:

      return true;

    default:

      break;

  return false;

bool IsUnicodePropertyValueCharacter(char c) {

  // https://tc39.github.io/proposal-regexp-unicode-property-escapes/

//

  // Note that using this to validate each parsed char is quite conservative.

  // A possible alternative solution would be to only ensure the parsed

  // property name/value candidate string does not contain '\0' characters and

  // let ICU lookups trigger the final failure.

  if ('a' <= c && c <= 'z') return true;

  if ('A' <= c && c <= 'Z') return true;

  if ('0' <= c && c <= '9') return true;

  return (c == '_');

}  // namespace

template <class CharT>

bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,

                                                     ZoneVector<char>* name_2) {

  DCHECK(name_1->empty());

  DCHECK(name_2->empty());

  // Parse the property class as follows:

  // - In \p{name}, 'name' is interpreted

  //   - either as a general category property value name.

  //   - or as a binary property name.

  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,

  //   and 'value' is interpreted as one of the available property value names.

  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.

  // - Loose matching is not applied.

  if (current() == '{') {

    // Parse \p{[PropertyName=]PropertyNameValue}

    for (Advance(); current() != '}' && current() != '='; Advance()) {

      if (!IsUnicodePropertyValueCharacter(current())) return false;

      if (!has_next()) return false;

      name_1->push_back(static_cast<char>(current()));

    if (current() == '=') {

      for (Advance(); current() != '}'; Advance()) {

        if (!IsUnicodePropertyValueCharacter(current())) return false;

        if (!has_next()) return false;

        name_2->push_back(static_cast<char>(current()));

      name_2->push_back(0);  // null-terminate string.

  } else {

    return false;

  Advance();

  name_1->push_back(0);  // null-terminate string.

  DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));

  DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));

  return true;

template <class CharT>

bool RegExpParserImpl<CharT>::AddPropertyClassRange(

    ZoneList<CharacterRange>* add_to_ranges,

    CharacterClassStrings* add_to_strings, bool negate,

    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {

  if (name_2.empty()) {

    // First attempt to interpret as general category property value name.

    const char* name = name_1.data();

    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,

                                add_to_ranges, add_to_strings, flags(),

                                zone())) {

      return true;

    // Interpret "Any", "ASCII", and "Assigned".

    if (LookupSpecialPropertyValueName(name, add_to_ranges, negate, flags(),

                                       zone())) {

      return true;

    // Then attempt to interpret as binary property name with value name 'Y'.

    UProperty property = u_getPropertyEnum(name);

    if (!IsSupportedBinaryProperty(property, unicode_sets())) return false;

    if (!IsExactPropertyAlias(name, property)) return false;

    // Negation of properties with strings is not allowed.

    // See

    // https://tc39.es/ecma262/#sec-static-semantics-maycontainstrings

    if (negate && IsBinaryPropertyOfStrings(property)) return false;

    if (unicode_sets()) {

      // In /v mode we can't simple lookup the "false" binary property values,

      // as the spec requires us to perform case folding before calculating the

      // complement.

      // See https://tc39.es/ecma262/#sec-compiletocharset

      // UnicodePropertyValueExpression :: LoneUnicodePropertyNameOrValue

      return LookupPropertyValueName(property, "Y", negate, add_to_ranges,

                                     add_to_strings, flags(), zone());

    } else {

      return LookupPropertyValueName(property, negate ? "N" : "Y", false,

                                     add_to_ranges, add_to_strings, flags(),

                                     zone());

  } else {

    // Both property name and value name are specified. Attempt to interpret

    // the property name as enumerated property.

    const char* property_name = name_1.data();

    const char* value_name = name_2.data();

    UProperty property = u_getPropertyEnum(property_name);

    if (!IsExactPropertyAlias(property_name, property)) return false;

    if (property == UCHAR_GENERAL_CATEGORY) {

      // We want to allow aggregate value names such as "Letter".

      property = UCHAR_GENERAL_CATEGORY_MASK;

    } else if (property != UCHAR_SCRIPT &&

               property != UCHAR_SCRIPT_EXTENSIONS) {

      return false;

    return LookupPropertyValueName(property, value_name, negate, add_to_ranges,

                                   add_to_strings, flags(), zone());

#else  // V8_INTL_SUPPORT

template <class CharT>

bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,

                                                     ZoneVector<char>* name_2) {

  return false;

template <class CharT>

bool RegExpParserImpl<CharT>::AddPropertyClassRange(

    ZoneList<CharacterRange>* add_to_ranges,

    CharacterClassStrings* add_to_strings, bool negate,

    const ZoneVector<char>& name_1, const ZoneVector<char>& name_2) {

  return false;

#endif  // V8_INTL_SUPPORT

template <class CharT>

bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(int max_value,

                                                            base::uc32* value) {

  base::uc32 x = 0;

  int d = base::HexValue(current());

  if (d < 0) {

    return false;

  while (d >= 0) {

    x = x * 16 + d;

    if (x > static_cast<base::uc32>(max_value)) {

      return false;

    Advance();

    d = base::HexValue(current());

  *value = x;

  return true;

// https://tc39.es/ecma262/#prod-CharacterEscape

template <class CharT>

base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(

    InClassEscapeState in_class_escape_state,

    bool* is_escaped_unicode_character) {

  DCHECK_EQ('\\', current());

  DCHECK(has_next());

  Advance();

  const base::uc32 c = current();

  switch (c) {

    // CharacterEscape ::

    //   ControlEscape :: one of

    //     f n r t v

    case 'f':

      Advance();

      return '\f';

    case 'n':

      Advance();

      return '\n';

    case 'r':

      Advance();

      return '\r';

    case 't':

      Advance();

      return '\t';

    case 'v':

      Advance();

      return '\v';

    // CharacterEscape ::

    //   c ControlLetter

    case 'c': {

      base::uc32 controlLetter = Next();

      base::uc32 letter = controlLetter & ~('A' ^ 'a');

      if (letter >= 'A' && letter <= 'Z') {

        Advance(2);

        // Control letters mapped to ASCII control characters in the range

        // 0x00-0x1F.

        return controlLetter & 0x1F;

      if (IsUnicodeMode()) {

        // With /u and /v, invalid escapes are not treated as identity escapes.

        ReportError(RegExpError::kInvalidUnicodeEscape);

        return 0;

      if (in_class_escape_state == InClassEscapeState::kInClass) {

        // Inside a character class, we also accept digits and underscore as

        // control characters, unless with /u or /v. See Annex B:

        // ES#prod-annexB-ClassControlLetter

        if ((controlLetter >= '0' && controlLetter <= '9') ||

            controlLetter == '_') {

          Advance(2);

          return controlLetter & 0x1F;

      // We match JSC in reading the backslash as a literal

      // character instead of as starting an escape.

      return '\\';

    // CharacterEscape ::

    //   0 [lookahead ∉ DecimalDigit]

    //   [~UnicodeMode] LegacyOctalEscapeSequence

    case '0':

      // \0 is interpreted as NUL if not followed by another digit.

      if (Next() < '0' || Next() > '9') {

        Advance();

        return 0;

      [[fallthrough]];

    case '1':

    case '2':

    case '3':

    case '4':

    case '5':

    case '6':

    case '7':

      // For compatibility, we interpret a decimal escape that isn't

      // a back reference (and therefore either \0 or not valid according

      // to the specification) as a 1..3 digit octal character code.

      // ES#prod-annexB-LegacyOctalEscapeSequence

      if (IsUnicodeMode()) {

        // With /u or /v, decimal escape is not interpreted as octal character

        // code.

        ReportError(RegExpError::kInvalidClassEscape);

        return 0;

      return ParseOctalLiteral();

    // CharacterEscape ::

    //   HexEscapeSequence

    case 'x': {

      Advance();

      base::uc32 value;

      if (ParseHexEscape(2, &value)) return value;

      if (IsUnicodeMode()) {

        // With /u or /v, invalid escapes are not treated as identity escapes.

        ReportError(RegExpError::kInvalidEscape);

        return 0;

      // If \x is not followed by a two-digit hexadecimal, treat it

      // as an identity escape.

      return 'x';

    // CharacterEscape ::

    //   RegExpUnicodeEscapeSequence [?UnicodeMode]

    case 'u': {

      Advance();

      base::uc32 value;

      if (ParseUnicodeEscape(&value)) {

        *is_escaped_unicode_character = true;

        return value;

      if (IsUnicodeMode()) {

        // With /u or /v, invalid escapes are not treated as identity escapes.

        ReportError(RegExpError::kInvalidUnicodeEscape);

        return 0;

      // If \u is not followed by a two-digit hexadecimal, treat it

      // as an identity escape.

      return 'u';

    default:

      break;

  // CharacterEscape ::

  //   IdentityEscape[?UnicodeMode, ?N]

//

  // * With /u, no identity escapes except for syntax characters are

  //   allowed.

  // * With /v, no identity escapes except for syntax characters and

  //   ClassSetReservedPunctuators (if within a class) are allowed.

  // * Without /u or /v:

  //   * '\c' is not an IdentityEscape.

  //   * '\k' is not an IdentityEscape when named captures exist.

  //   * Otherwise, all identity escapes are allowed.

  if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {

    if (IsClassSetReservedPunctuator(c)) {

      Advance();

      return c;

  if (IsUnicodeMode()) {

    if (!IsSyntaxCharacterOrSlash(c)) {

      ReportError(RegExpError::kInvalidEscape);

      return 0;

    Advance();

    return c;

  DCHECK(!IsUnicodeMode());

  if (c == 'c') {

    ReportError(RegExpError::kInvalidEscape);

    return 0;

  Advance();

  // Note: It's important to Advance before the HasNamedCaptures call s.t. we

  // don't start scanning in the middle of an escape.

  if (c == 'k' && HasNamedCaptures(in_class_escape_state)) {

    ReportError(RegExpError::kInvalidEscape);

    return 0;

  return c;

// https://tc39.es/ecma262/#prod-ClassRanges

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(

    ZoneList<CharacterRange>* ranges, bool add_unicode_case_equivalents) {

  base::uc32 char_1, char_2;

  bool is_class_1, is_class_2;

  while (has_more() && current() != ']') {

    ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,

                     &is_class_1 CHECK_FAILED);

    // ClassAtom

    if (current() == '-') {

      Advance();

      if (!has_more()) {

        // If we reach the end we break out of the loop and let the

        // following code report an error.

        break;

      } else if (current() == ']') {

        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());

        ranges->Add(CharacterRange::Singleton('-'), zone());

        break;

      ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,

                       &is_class_2 CHECK_FAILED);

      if (is_class_1 || is_class_2) {

        // Either end is an escaped character class. Treat the '-' verbatim.

        if (IsUnicodeMode()) {

          // ES2015 21.2.2.15.1 step 1.

          return ReportError(RegExpError::kInvalidCharacterClass);

        if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());

        ranges->Add(CharacterRange::Singleton('-'), zone());

        if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());

        continue;

      // ES2015 21.2.2.15.1 step 6.

      if (char_1 > char_2) {

        return ReportError(RegExpError::kOutOfOrderCharacterClass);

      ranges->Add(CharacterRange::Range(char_1, char_2), zone());

    } else {

      if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());

  return nullptr;

// https://tc39.es/ecma262/#prod-ClassEscape

template <class CharT>

void RegExpParserImpl<CharT>::ParseClassEscape(

    ZoneList<CharacterRange>* ranges, Zone* zone,

    bool add_unicode_case_equivalents, base::uc32* char_out,

    bool* is_class_escape) {

  *is_class_escape = false;

  if (current() != '\\') {

    // Not a ClassEscape.

    *char_out = current();

    Advance();

    return;

  const base::uc32 next = Next();

  switch (next) {

    case 'b':

      *char_out = '\b';

      Advance(2);

      return;

    case '-':

      if (IsUnicodeMode()) {

        *char_out = next;

        Advance(2);

        return;

      break;

    case kEndMarker:

      ReportError(RegExpError::kEscapeAtEndOfPattern);

      return;

    default:

      break;

  static constexpr InClassEscapeState kInClassEscape =

      InClassEscapeState::kInClass;

  *is_class_escape =

      TryParseCharacterClassEscape(next, kInClassEscape, ranges, nullptr, zone,

                                   add_unicode_case_equivalents);

  if (*is_class_escape) return;

  bool dummy = false;  // Unused.

  *char_out = ParseCharacterEscape(kInClassEscape, &dummy);

// https://tc39.es/ecma262/#prod-CharacterClassEscape

template <class CharT>

bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(

    base::uc32 next, InClassEscapeState in_class_escape_state,

    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,

    Zone* zone, bool add_unicode_case_equivalents) {

  DCHECK_EQ(current(), '\\');

  DCHECK_EQ(Next(), next);

  switch (next) {

    case 'd':

    case 'D':

    case 's':

    case 'S':

    case 'w':

    case 'W':

      CharacterRange::AddClassEscape(static_cast<StandardCharacterSet>(next),

                                     ranges, add_unicode_case_equivalents,

                                     zone);

      Advance(2);

      return true;

    case 'p':

    case 'P': {

      if (!IsUnicodeMode()) return false;

      bool negate = next == 'P';

      Advance(2);

      ZoneVector<char> name_1(zone);

      ZoneVector<char> name_2(zone);

      if (!ParsePropertyClassName(&name_1, &name_2) ||

          !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) {

        ReportError(in_class_escape_state == InClassEscapeState::kInClass

                        ? RegExpError::kInvalidClassPropertyName

                        : RegExpError::kInvalidPropertyName);

      return true;

    default:

      return false;

namespace {

// Add |string| to |ranges| if length of |string| == 1, otherwise add |string|

// to |strings|.

void AddClassString(ZoneList<base::uc32>* normalized_string,

                    RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,

                    CharacterClassStrings* strings, Zone* zone) {

  if (normalized_string->length() == 1) {

    ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);

  } else {

    strings->emplace(normalized_string->ToVector(), regexp_string);

}  // namespace

// https://tc39.es/ecma262/#prod-ClassStringDisjunction

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(

    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {

  DCHECK(unicode_sets());

  DCHECK_EQ(current(), '\\');

  DCHECK_EQ(Next(), 'q');

  Advance(2);

  if (current() != '{') {

    // Identity escape of 'q' is not allowed in unicode mode.

    return ReportError(RegExpError::kInvalidEscape);

  Advance();

  ZoneList<base::uc32>* string =

      zone()->template New<ZoneList<base::uc32>>(4, zone());

  RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone());

  RegExpTextBuilder string_builder(zone(), &string_storage, flags());

  while (has_more() && current() != '}') {

    if (current() == '|') {

      AddClassString(string, string_builder.ToRegExp(), ranges, strings,

                     zone());

      string = zone()->template New<ZoneList<base::uc32>>(4, zone());

      string_storage.clear();

      Advance();

    } else {

      base::uc32 c = ParseClassSetCharacter(CHECK_FAILED);

      if (ignore_case()) {

#ifdef V8_INTL_SUPPORT

        c = u_foldCase(c, U_FOLD_CASE_DEFAULT);

#else

        c = AsciiAlphaToLower(c);

#endif

      string->Add(c, zone());

      string_builder.AddUnicodeCharacter(c);

  AddClassString(string, string_builder.ToRegExp(), ranges, strings, zone());

  CharacterRange::Canonicalize(ranges);

  // We don't need to handle missing closing '}' here.

  // If the character class is correctly closed, ParseClassSetCharacter will

  // report an error.

  Advance();

  return nullptr;

// https://tc39.es/ecma262/#prod-ClassSetOperand

// Tree returned based on type_out:

//  * kNestedClass: RegExpClassSetExpression

//  * For all other types: RegExpClassSetOperand

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(

    const RegExpBuilder* builder, ClassSetOperandType* type_out) {

  ZoneList<CharacterRange>* ranges =

      zone()->template New<ZoneList<CharacterRange>>(1, zone());

  CharacterClassStrings* strings =

      zone()->template New<CharacterClassStrings>(zone());

  base::uc32 character;

  RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings,

                                          &character CHECK_FAILED);

  DCHECK_IMPLIES(*type_out != ClassSetOperandType::kNestedClass,

                 tree == nullptr);

  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,

                 ranges->is_empty());

  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,

                 strings->empty());

  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,

                 ranges->is_empty());

  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,

                 strings->empty());

  DCHECK_IMPLIES(*type_out == ClassSetOperandType::kNestedClass,

                 tree->IsClassSetExpression());

  // ClassSetRange is only used within ClassSetUnion().

  DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);

  // There are no restrictions for kCharacterClassEscape.

  // CharacterClassEscape includes \p{}, which can contain ranges, strings or

  // both and \P{}, which could contain nothing (i.e. \P{Any}).

  if (tree == nullptr) {

    if (*type_out == ClassSetOperandType::kClassSetCharacter) {

      AddMaybeSimpleCaseFoldedRange(ranges,

                                    CharacterRange::Singleton(character));

    tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);

  return tree;

// https://tc39.es/ecma262/#prod-ClassSetOperand

// Based on |type_out| either a tree is returned or

// |ranges|/|strings|/|character| modified. If a tree is returned,

// ranges/strings are not modified. If |type_out| is kNestedClass, a tree of

// type RegExpClassSetExpression is returned. If | type_out| is

// kClassSetCharacter, |character| is set and nullptr returned. For all other

// types, |ranges|/|strings|/|character| is modified and nullptr is returned.

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(

    const RegExpBuilder* builder, ClassSetOperandType* type_out,

    ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,

    base::uc32* character) {

  DCHECK(unicode_sets());

  base::uc32 c = current();

  if (c == '\\') {

    const base::uc32 next = Next();

    if (next == 'q') {

      *type_out = ClassSetOperandType::kClassStringDisjunction;

      ParseClassStringDisjunction(ranges, strings CHECK_FAILED);

      return nullptr;

    static constexpr InClassEscapeState kInClassEscape =

        InClassEscapeState::kInClass;

    const bool add_unicode_case_equivalents = ignore_case();

    if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,

                                     zone(), add_unicode_case_equivalents)) {

      *type_out = ClassSetOperandType::kCharacterClassEscape;

      return nullptr;

  if (c == '[') {

    *type_out = ClassSetOperandType::kNestedClass;

    return ParseCharacterClass(builder);

  *type_out = ClassSetOperandType::kClassSetCharacter;

  c = ParseClassSetCharacter(CHECK_FAILED);

  *character = c;

  return nullptr;

template <class CharT>

base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {

  DCHECK(unicode_sets());

  const base::uc32 c = current();

  if (c == '\\') {

    const base::uc32 next = Next();

    switch (next) {

      case 'b':

        Advance(2);

        return '\b';

      case kEndMarker:

        ReportError(RegExpError::kEscapeAtEndOfPattern);

        return 0;

    static constexpr InClassEscapeState kInClassEscape =

        InClassEscapeState::kInClass;

    bool dummy = false;  // Unused.

    return ParseCharacterEscape(kInClassEscape, &dummy);

  if (IsClassSetSyntaxCharacter(c)) {

    ReportError(RegExpError::kInvalidCharacterInClass);

    return 0;

  if (IsClassSetReservedDoublePunctuator(c)) {

    ReportError(RegExpError::kInvalidClassSetOperation);

    return 0;

  Advance();

  return c;

namespace {

bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {

  switch (type) {

    case ClassSetOperandType::kClassSetCharacter:

    case ClassSetOperandType::kClassSetRange:

      return false;

    case ClassSetOperandType::kCharacterClassEscape:

    case ClassSetOperandType::kClassStringDisjunction:

      return operand->AsClassSetOperand()->has_strings();

    case ClassSetOperandType::kNestedClass:

      if (operand->IsClassRanges()) return false;

      return operand->AsClassSetExpression()->may_contain_strings();

}  // namespace

template <class CharT>

void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange(

    ZoneList<CharacterRange>* ranges, CharacterRange new_range) {

  DCHECK(unicode_sets());

  if (ignore_case()) {

    ZoneList<CharacterRange>* new_ranges =

        zone()->template New<ZoneList<CharacterRange>>(2, zone());

    new_ranges->Add(new_range, zone());

    CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone());

    ranges->AddAll(*new_ranges, zone());

  } else {

    ranges->Add(new_range, zone());

  CharacterRange::Canonicalize(ranges);

// https://tc39.es/ecma262/#prod-ClassUnion

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(

    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,

    ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,

    CharacterClassStrings* strings, base::uc32 character) {

  DCHECK(unicode_sets());

  ZoneList<RegExpTree*>* operands =

      zone()->template New<ZoneList<RegExpTree*>>(2, zone());

  bool may_contain_strings = false;

  // Add the lhs to operands if necessary.

  // Either the lhs values were added to |ranges|/|strings| (in which case

  // |first_operand| is nullptr), or the lhs was evaluated to a tree and passed

  // as |first_operand| (in which case |ranges| and |strings| are empty).

  if (first_operand != nullptr) {

    may_contain_strings = MayContainStrings(first_operand_type, first_operand);

    operands->Add(first_operand, zone());

  ClassSetOperandType last_type = first_operand_type;

  while (has_more() && current() != ']') {

    if (current() == '-') {

      // Mix of ClassSetRange and ClassSubtraction is not allowed.

      if (Next() == '-') {

        return ReportError(RegExpError::kInvalidClassSetOperation);

      Advance();

      if (!has_more()) {

        // If we reach the end we break out of the loop and let the

        // following code report an error.

        break;

      // If the lhs and rhs around '-' are both ClassSetCharacters, they

      // represent a character range.

      // In case one of them is not a ClassSetCharacter, it is a syntax error,

      // as '-' can not be used unescaped within a class with /v.

      // See

      // https://tc39.es/ecma262/#prod-ClassSetRange

      if (last_type != ClassSetOperandType::kClassSetCharacter) {

        return ReportError(RegExpError::kInvalidCharacterClass);

      base::uc32 from = character;

      ParseClassSetOperand(builder, &last_type, ranges, strings,

                           &character CHECK_FAILED);

      if (last_type != ClassSetOperandType::kClassSetCharacter) {

        return ReportError(RegExpError::kInvalidCharacterClass);

      if (from > character) {

        return ReportError(RegExpError::kOutOfOrderCharacterClass);

      AddMaybeSimpleCaseFoldedRange(ranges,

                                    CharacterRange::Range(from, character));

      last_type = ClassSetOperandType::kClassSetRange;

    } else {

      DCHECK_NE(current(), '-');

      if (last_type == ClassSetOperandType::kClassSetCharacter) {

        AddMaybeSimpleCaseFoldedRange(ranges,

                                      CharacterRange::Singleton(character));

      RegExpTree* operand = ParseClassSetOperand(

          builder, &last_type, ranges, strings, &character CHECK_FAILED);

      if (operand != nullptr) {

        may_contain_strings |= MayContainStrings(last_type, operand);

        // Add the range we started building as operand and reset the current

        // range.

        if (!ranges->is_empty() || !strings->empty()) {

          may_contain_strings |= !strings->empty();

          operands->Add(

              zone()->template New<RegExpClassSetOperand>(ranges, strings),

              zone());

          ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());

          strings = zone()->template New<CharacterClassStrings>(zone());

        operands->Add(operand, zone());

  if (!has_more()) {

    return ReportError(RegExpError::kUnterminatedCharacterClass);

  if (last_type == ClassSetOperandType::kClassSetCharacter) {

    AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character));

  // Add the range we started building as operand.

  if (!ranges->is_empty() || !strings->empty()) {

    may_contain_strings |= !strings->empty();

    operands->Add(zone()->template New<RegExpClassSetOperand>(ranges, strings),

                  zone());

  DCHECK_EQ(current(), ']');

  Advance();

  if (is_negated && may_contain_strings) {

    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);

  if (operands->is_empty()) {

    // Return empty expression if no operands were added (e.g. [\P{Any}]

    // produces an empty range).

    DCHECK(ranges->is_empty());

    DCHECK(strings->empty());

    return RegExpClassSetExpression::Empty(zone(), is_negated);

  return zone()->template New<RegExpClassSetExpression>(

      RegExpClassSetExpression::OperationType::kUnion, is_negated,

      may_contain_strings, operands);

// https://tc39.es/ecma262/#prod-ClassIntersection

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(

    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,

    ClassSetOperandType first_operand_type) {

  DCHECK(unicode_sets());

  DCHECK(current() == '&' && Next() == '&');

  bool may_contain_strings =

      MayContainStrings(first_operand_type, first_operand);

  ZoneList<RegExpTree*>* operands =

      zone()->template New<ZoneList<RegExpTree*>>(2, zone());

  operands->Add(first_operand, zone());

  while (has_more() && current() != ']') {

    if (current() != '&' || Next() != '&') {

      return ReportError(RegExpError::kInvalidClassSetOperation);

    Advance(2);

    // [lookahead ≠ &]

    if (current() == '&') {

      return ReportError(RegExpError::kInvalidCharacterInClass);

    ClassSetOperandType operand_type;

    RegExpTree* operand =

        ParseClassSetOperand(builder, &operand_type CHECK_FAILED);

    may_contain_strings &= MayContainStrings(operand_type, operand);

    operands->Add(operand, zone());

  if (!has_more()) {

    return ReportError(RegExpError::kUnterminatedCharacterClass);

  if (is_negated && may_contain_strings) {

    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);

  DCHECK_EQ(current(), ']');

  Advance();

  return zone()->template New<RegExpClassSetExpression>(

      RegExpClassSetExpression::OperationType::kIntersection, is_negated,

      may_contain_strings, operands);

// https://tc39.es/ecma262/#prod-ClassSubtraction

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(

    const RegExpBuilder* builder, bool is_negated, RegExpTree* first_operand,

    ClassSetOperandType first_operand_type) {

  DCHECK(unicode_sets());

  DCHECK(current() == '-' && Next() == '-');

  const bool may_contain_strings =

      MayContainStrings(first_operand_type, first_operand);

  if (is_negated && may_contain_strings) {

    return ReportError(RegExpError::kNegatedCharacterClassWithStrings);

  ZoneList<RegExpTree*>* operands =

      zone()->template New<ZoneList<RegExpTree*>>(2, zone());

  operands->Add(first_operand, zone());

  while (has_more() && current() != ']') {

    if (current() != '-' || Next() != '-') {

      return ReportError(RegExpError::kInvalidClassSetOperation);

    Advance(2);

    ClassSetOperandType dummy;  // unused

    RegExpTree* operand = ParseClassSetOperand(builder, &dummy CHECK_FAILED);

    operands->Add(operand, zone());

  if (!has_more()) {

    return ReportError(RegExpError::kUnterminatedCharacterClass);

  DCHECK_EQ(current(), ']');

  Advance();

  return zone()->template New<RegExpClassSetExpression>(

      RegExpClassSetExpression::OperationType::kSubtraction, is_negated,

      may_contain_strings, operands);

// https://tc39.es/ecma262/#prod-CharacterClass

template <class CharT>

RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(

    const RegExpBuilder* builder) {

  DCHECK_EQ(current(), '[');

  Advance();

  bool is_negated = false;

  if (current() == '^') {

    is_negated = true;

    Advance();

  ZoneList<CharacterRange>* ranges =

      zone()->template New<ZoneList<CharacterRange>>(2, zone());

  if (current() == ']') {

    Advance();

    if (unicode_sets()) {

      return RegExpClassSetExpression::Empty(zone(), is_negated);

    } else {

      RegExpClassRanges::ClassRangesFlags class_ranges_flags;

      if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;

      return zone()->template New<RegExpClassRanges>(zone(), ranges,

                                                     class_ranges_flags);

  if (!unicode_sets()) {

    bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();

    ParseClassRanges(ranges, add_unicode_case_equivalents CHECK_FAILED);

    if (!has_more()) {

      return ReportError(RegExpError::kUnterminatedCharacterClass);

    DCHECK_EQ(current(), ']');

    Advance();

    RegExpClassRanges::ClassRangesFlags character_class_flags;

    if (is_negated) character_class_flags = RegExpClassRanges::NEGATED;

    return zone()->template New<RegExpClassRanges>(zone(), ranges,

                                                   character_class_flags);

  } else {

    ClassSetOperandType operand_type;

    CharacterClassStrings* strings =

        zone()->template New<CharacterClassStrings>(zone());

    base::uc32 character;

    RegExpTree* operand = ParseClassSetOperand(

        builder, &operand_type, ranges, strings, &character CHECK_FAILED);

    switch (current()) {

      case '-':

        if (Next() == '-') {

          if (operand == nullptr) {

            if (operand_type == ClassSetOperandType::kClassSetCharacter) {

              ranges->Add(CharacterRange::Singleton(character), zone());

            operand =

                zone()->template New<RegExpClassSetOperand>(ranges, strings);

          return ParseClassSubtraction(builder, is_negated, operand,

                                       operand_type);

        // ClassSetRange is handled in ParseClassUnion().

        break;

      case '&':

        if (Next() == '&') {

          if (operand == nullptr) {

            if (operand_type == ClassSetOperandType::kClassSetCharacter) {

              ranges->Add(CharacterRange::Singleton(character), zone());

            operand =

                zone()->template New<RegExpClassSetOperand>(ranges, strings);

          return ParseClassIntersection(builder, is_negated, operand,

                                        operand_type);

    return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,

                           strings, character);

#undef CHECK_FAILED

template <class CharT>

bool RegExpParserImpl<CharT>::Parse(RegExpCompileData* result) {

  DCHECK_NOT_NULL(result);

  RegExpTree* tree = ParsePattern();

  if (failed()) {

    DCHECK_NULL(tree);

    DCHECK_NE(error_, RegExpError::kNone);

    result->error = error_;

    result->error_pos = error_pos_;

    return false;

  DCHECK_NOT_NULL(tree);

  DCHECK_EQ(error_, RegExpError::kNone);

  if (v8_flags.trace_regexp_parser) {

    StdoutStream os;

    tree->Print(os, zone());

    os << "\n";

  result->tree = tree;

  const int capture_count = captures_started();

  result->simple = tree->IsAtom() && simple() && capture_count == 0;

  result->contains_anchor = contains_anchor();

  result->capture_count = capture_count;

  result->named_captures = GetNamedCaptures();

  return true;

void RegExpBuilder::FlushText() { text_builder().FlushText(); }

void RegExpBuilder::AddCharacter(base::uc16 c) {

  pending_empty_ = false;

  text_builder().AddCharacter(c);

void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {

  pending_empty_ = false;

  text_builder().AddUnicodeCharacter(c);

void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {

  pending_empty_ = false;

  text_builder().AddEscapedUnicodeCharacter(character);

void RegExpBuilder::AddEmpty() {

  text_builder().FlushPendingSurrogate();

  pending_empty_ = true;

void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {

  pending_empty_ = false;

  text_builder().AddClassRanges(cc);

void RegExpBuilder::AddAtom(RegExpTree* term) {

  if (term->IsEmpty()) {

    AddEmpty();

    return;

  pending_empty_ = false;

  if (term->IsTextElement()) {

    text_builder().AddAtom(term);

  } else {

    FlushText();

    terms_.emplace_back(term);

void RegExpBuilder::AddTerm(RegExpTree* term) {

  DCHECK(!term->IsEmpty());

  pending_empty_ = false;

  if (term->IsTextElement()) {

    text_builder().AddTerm(term);

  } else {

    FlushText();

    terms_.emplace_back(term);

void RegExpBuilder::AddAssertion(RegExpTree* assert) {

  FlushText();

  pending_empty_ = false;

  terms_.emplace_back(assert);

void RegExpBuilder::NewAlternative() { FlushTerms(); }

void RegExpBuilder::FlushTerms() {

  FlushText();

  size_t num_terms = terms_.size();

  RegExpTree* alternative;

  if (num_terms == 0) {

    alternative = zone()->New<RegExpEmpty>();

  } else if (num_terms == 1) {

    alternative = terms_.back();

  } else {

    alternative =

        zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(

            base::VectorOf(terms_.begin(), terms_.size()), zone()));

  alternatives_.emplace_back(alternative);

  terms_.clear();

RegExpTree* RegExpBuilder::ToRegExp() {

  FlushTerms();

  size_t num_alternatives = alternatives_.size();

  if (num_alternatives == 0) return zone()->New<RegExpEmpty>();

  if (num_alternatives == 1) return alternatives_.back();

  return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(

      base::VectorOf(alternatives_.begin(), alternatives_.size()), zone()));

bool RegExpBuilder::AddQuantifierToAtom(

    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

  if (pending_empty_) {

    pending_empty_ = false;

    return true;

  RegExpTree* atom = text_builder().PopLastAtom();

  if (atom != nullptr) {

    FlushText();

  } else if (!terms_.empty()) {

    atom = terms_.back();

    terms_.pop_back();

    if (atom->IsLookaround()) {

      // With /u or /v, lookarounds are not quantifiable.

      if (IsUnicodeMode()) return false;

      // Lookbehinds are not quantifiable.

      if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {

        return false;

    if (atom->max_match() == 0) {

      // Guaranteed to only match an empty string.

      if (min == 0) {

        return true;

      terms_.emplace_back(atom);

      return true;

  } else {

    // Only call immediately after adding an atom or character!

    UNREACHABLE();

  terms_.emplace_back(

      zone()->New<RegExpQuantifier>(min, max, quantifier_type, atom));

  return true;

template class RegExpParserImpl<uint8_t>;

template class RegExpParserImpl<base::uc16>;

}  // namespace

// static

bool RegExpParser::ParseRegExpFromHeapString(Isolate* isolate, Zone* zone,

                                             Handle<String> input,

                                             RegExpFlags flags,

                                             RegExpCompileData* result) {

  DisallowGarbageCollection no_gc;

  uintptr_t stack_limit = isolate->stack_guard()->real_climit();

  String::FlatContent content = input->GetFlatContent(no_gc);

  if (content.IsOneByte()) {

    base::Vector<const uint8_t> v = content.ToOneByteVector();

    return RegExpParserImpl<uint8_t>{v.begin(),   v.length(), flags,

                                     stack_limit, zone,       no_gc}

        .Parse(result);

  } else {

    base::Vector<const base::uc16> v = content.ToUC16Vector();

    return RegExpParserImpl<base::uc16>{v.begin(),   v.length(), flags,

                                        stack_limit, zone,       no_gc}

        .Parse(result);

// static

template <class CharT>

bool RegExpParser::VerifyRegExpSyntax(Zone* zone, uintptr_t stack_limit,

                                      const CharT* input, int input_length,

                                      RegExpFlags flags,

                                      RegExpCompileData* result,

                                      const DisallowGarbageCollection& no_gc) {

  return RegExpParserImpl<CharT>{input,       input_length, flags,

                                 stack_limit, zone,         no_gc}

      .Parse(result);

template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(

    Zone*, uintptr_t, const uint8_t*, int, RegExpFlags, RegExpCompileData*,

    const DisallowGarbageCollection&);

template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(

    Zone*, uintptr_t, const base::uc16*, int, RegExpFlags, RegExpCompileData*,

    const DisallowGarbageCollection&);

}  // namespace internal

}  // namespace v8