diff options
author | Moonchild <moonchild@palemoon.org> | 2022-10-10 19:36:24 +0000 |
---|---|---|
committer | Moonchild <moonchild@palemoon.org> | 2022-10-10 19:36:24 +0000 |
commit | fb173db64bdeeca0b236a360ed79b36ad9dd2273 (patch) | |
tree | e712461d55ee46ee95b9e57372b5e1531894944a /js/src/new-regexp/regexp-parser.cc | |
parent | 4ba7e05b96a30cb2abff2db1e2f9dd8f8a56cb3e (diff) | |
download | uxp-fb173db64bdeeca0b236a360ed79b36ad9dd2273.tar.gz |
Issue #2016 - Remove non-functional V8 regex import
Diffstat (limited to 'js/src/new-regexp/regexp-parser.cc')
-rw-r--r-- | js/src/new-regexp/regexp-parser.cc | 2109 |
1 files changed, 0 insertions, 2109 deletions
diff --git a/js/src/new-regexp/regexp-parser.cc b/js/src/new-regexp/regexp-parser.cc deleted file mode 100644 index a26e354389..0000000000 --- a/js/src/new-regexp/regexp-parser.cc +++ /dev/null @@ -1,2109 +0,0 @@ -// Copyright 2016 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include "new-regexp/regexp-parser.h" - -#include <vector> - -#include "new-regexp/property-sequences.h" -#include "new-regexp/regexp-macro-assembler.h" -#include "new-regexp/regexp.h" - -#ifdef V8_INTL_SUPPORT -#include "unicode/uniset.h" -#endif // V8_INTL_SUPPORT - -namespace v8 { -namespace internal { - -RegExpParser::RegExpParser(FlatStringReader* in, JSRegExp::Flags flags, - Isolate* isolate, Zone* zone) - : isolate_(isolate), - zone_(zone), - captures_(nullptr), - named_captures_(nullptr), - named_back_references_(nullptr), - in_(in), - current_(kEndMarker), - top_level_flags_(flags), - next_pos_(0), - captures_started_(0), - capture_count_(0), - has_more_(true), - simple_(false), - contains_anchor_(false), - is_scanned_for_captures_(false), - has_named_captures_(false), - failed_(false) { - Advance(); -} - -template <bool update_position> -inline uc32 RegExpParser::ReadNext() { - int position = next_pos_; - uc32 c0 = in()->Get(position); - position++; - // Read the whole surrogate pair in case of unicode flag, if possible. - if (unicode() && position < in()->length() && - unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { - uc16 c1 = in()->Get(position); - if (unibrow::Utf16::IsTrailSurrogate(c1)) { - c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); - position++; - } - } - if (update_position) next_pos_ = position; - return c0; -} - - -uc32 RegExpParser::Next() { - if (has_next()) { - return ReadNext<false>(); - } else { - return kEndMarker; - } -} - -void RegExpParser::Advance() { - if (has_next()) { - StackLimitCheck check(isolate()); - if (check.HasOverflowed()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Aborting on stack overflow"); - } - ReportError(RegExpError::kStackOverflow); - } else if (zone()->excess_allocation()) { - if (FLAG_correctness_fuzzer_suppressions) { - FATAL("Aborting on excess zone allocation"); - } - ReportError(RegExpError::kTooLarge); - } else { - current_ = ReadNext<true>(); - } - } else { - current_ = kEndMarker; - // Advance so that position() points to 1-after-the-last-character. This is - // important so that Reset() to this position works correctly. - next_pos_ = in()->length() + 1; - has_more_ = false; - } -} - - -void RegExpParser::Reset(int pos) { - next_pos_ = pos; - has_more_ = (pos < in()->length()); - Advance(); -} - -void RegExpParser::Advance(int dist) { - next_pos_ += dist - 1; - Advance(); -} - - -bool RegExpParser::simple() { return simple_; } - -bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { - switch (c) { - case '^': - case '$': - case '\\': - case '.': - case '*': - case '+': - case '?': - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - case '|': - case '/': - return true; - default: - break; - } - return false; -} - -RegExpTree* RegExpParser::ReportError(RegExpError error) { - if (failed_) return nullptr; // Do not overwrite any existing error. - failed_ = true; - error_ = error; - error_pos_ = position(); - // Zip to the end to make sure no more input is read. - current_ = kEndMarker; - next_pos_ = in()->length(); - return nullptr; -} - -#define CHECK_FAILED /**/); \ - if (failed_) return nullptr; \ - ((void)0 - -// Pattern :: -// Disjunction -RegExpTree* RegExpParser::ParsePattern() { - RegExpTree* result = ParseDisjunction(CHECK_FAILED); - PatchNamedBackReferences(CHECK_FAILED); - DCHECK(!has_more()); - // If the result of parsing is a literal string atom, and it has the - // same length as the input, then the atom is identical to the input. - if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { - simple_ = true; - } - return result; -} - - -// Disjunction :: -// Alternative -// Alternative | Disjunction -// Alternative :: -// [empty] -// Term Alternative -// Term :: -// Assertion -// Atom -// Atom Quantifier -RegExpTree* RegExpParser::ParseDisjunction() { - // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, top_level_flags_, zone()); - RegExpParserState* state = &initial_state; - // Cache the builder in a local variable for quick access. - RegExpBuilder* builder = initial_state.builder(); - while (true) { - switch (current()) { - case kEndMarker: - if (state->IsSubexpression()) { - // Inside a parenthesized group when hitting end of input. - return ReportError(RegExpError::kUnterminatedGroup); - } - DCHECK_EQ(INITIAL, state->group_type()); - // Parsing completed successfully. - return builder->ToRegExp(); - case ')': { - if (!state->IsSubexpression()) { - return ReportError(RegExpError::kUnmatchedParen); - } - DCHECK_NE(INITIAL, state->group_type()); - - Advance(); - // End disjunction parsing and convert builder content to new single - // regexp atom. - RegExpTree* body = builder->ToRegExp(); - - int end_capture_index = captures_started(); - - int capture_index = state->capture_index(); - SubexpressionType group_type = state->group_type(); - - // Build result of subexpression. - if (group_type == CAPTURE) { - if (state->IsNamedCapture()) { - CreateNamedCaptureAtIndex(state->capture_name(), - capture_index CHECK_FAILED); - } - RegExpCapture* capture = GetCapture(capture_index); - capture->set_body(body); - body = capture; - } else if (group_type == GROUPING) { - body = new (zone()) RegExpGroup(body); - } else { - DCHECK(group_type == POSITIVE_LOOKAROUND || - group_type == NEGATIVE_LOOKAROUND); - bool is_positive = (group_type == POSITIVE_LOOKAROUND); - body = new (zone()) RegExpLookaround( - body, is_positive, end_capture_index - capture_index, - capture_index, state->lookaround_type()); - } - - // Restore previous state. - state = state->previous_state(); - builder = state->builder(); - - builder->AddAtom(body); - // For compatibility with JSC and ES3, we allow quantifiers after - // lookaheads, and break in all cases. - break; - } - case '|': { - Advance(); - builder->NewAlternative(); - continue; - } - case '*': - case '+': - case '?': - return ReportError(RegExpError::kNothingToRepeat); - case '^': { - Advance(); - if (builder->multiline()) { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_LINE, builder->flags())); - } else { - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::START_OF_INPUT, builder->flags())); - set_contains_anchor(); - } - continue; - } - case '$': { - Advance(); - RegExpAssertion::AssertionType assertion_type = - builder->multiline() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion( - new (zone()) RegExpAssertion(assertion_type, builder->flags())); - continue; - } - case '.': { - Advance(); - ZoneList<CharacterRange>* ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - - if (builder->dotall()) { - // Everything. - CharacterRange::AddClassEscape('*', ranges, false, zone()); - } else { - // Everything except \x0A, \x0D, \u2028 and \u2029 - CharacterRange::AddClassEscape('.', ranges, false, zone()); - } - - RegExpCharacterClass* cc = - new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case '(': { - state = ParseOpenParenthesis(state CHECK_FAILED); - builder = state->builder(); - continue; - } - case '[': { - RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED); - builder->AddCharacterClass(cc->AsCharacterClass()); - break; - } - // Atom :: - // \ AtomEscape - case '\\': - switch (Next()) { - case kEndMarker: - return ReportError(RegExpError::kEscapeAtEndOfPattern); - case 'b': - Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::BOUNDARY, builder->flags())); - continue; - case 'B': - Advance(2); - builder->AddAssertion(new (zone()) RegExpAssertion( - RegExpAssertion::NON_BOUNDARY, builder->flags())); - continue; - // AtomEscape :: - // CharacterClassEscape - // - // CharacterClassEscape :: one of - // d D s S w W - case 'd': - case 'D': - case 's': - case 'S': - case 'w': - case 'W': { - uc32 c = Next(); - Advance(2); - ZoneList<CharacterRange>* ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - CharacterRange::AddClassEscape( - c, ranges, unicode() && builder->ignore_case(), zone()); - RegExpCharacterClass* cc = new (zone()) - RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case 'p': - case 'P': { - uc32 p = Next(); - Advance(2); - if (unicode()) { - ZoneList<CharacterRange>* ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - ZoneVector<char> name_1(zone()); - ZoneVector<char> name_2(zone()); - if (ParsePropertyClassName(&name_1, &name_2)) { - if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { - RegExpCharacterClass* cc = new (zone()) - RegExpCharacterClass(zone(), ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - if (p == 'p' && name_2.empty()) { - RegExpTree* sequence = GetPropertySequence(name_1); - if (sequence != nullptr) { - builder->AddAtom(sequence); - break; - } - } - } - return ReportError(RegExpError::kInvalidPropertyName); - } else { - builder->AddCharacter(p); - } - break; - } - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': { - int index = 0; - bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED); - if (is_backref) { - if (state->IsInsideCaptureGroup(index)) { - // The back reference is inside the capture group it refers to. - // Nothing can possibly have been captured yet, so we use empty - // instead. This ensures that, when checking a back reference, - // the capture registers of the referenced capture are either - // both set or both cleared. - builder->AddEmpty(); - } else { - RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = - new (zone()) RegExpBackReference(capture, builder->flags()); - builder->AddAtom(atom); - } - break; - } - // With /u, no identity escapes except for syntax characters - // are allowed. Otherwise, all identity escapes are allowed. - if (unicode()) { - return ReportError(RegExpError::kInvalidEscape); - } - uc32 first_digit = Next(); - if (first_digit == '8' || first_digit == '9') { - builder->AddCharacter(first_digit); - Advance(2); - break; - } - V8_FALLTHROUGH; - } - case '0': { - Advance(); - if (unicode() && Next() >= '0' && Next() <= '9') { - // With /u, decimal escape with leading 0 are not parsed as octal. - return ReportError(RegExpError::kInvalidDecimalEscape); - } - uc32 octal = ParseOctalLiteral(); - builder->AddCharacter(octal); - break; - } - // ControlEscape :: one of - // f n r t v - case 'f': - Advance(2); - builder->AddCharacter('\f'); - break; - case 'n': - Advance(2); - builder->AddCharacter('\n'); - break; - case 'r': - Advance(2); - builder->AddCharacter('\r'); - break; - case 't': - Advance(2); - builder->AddCharacter('\t'); - break; - case 'v': - Advance(2); - builder->AddCharacter('\v'); - break; - case 'c': { - Advance(); - uc32 controlLetter = Next(); - // Special case if it is an ASCII letter. - // Convert lower case letters to uppercase. - uc32 letter = controlLetter & ~('a' ^ 'A'); - if (letter < 'A' || 'Z' < letter) { - // controlLetter is not in range 'A'-'Z' or 'a'-'z'. - // Read the backslash as a literal character instead of as - // starting an escape. - // ES#prod-annexB-ExtendedPatternCharacter - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - builder->AddCharacter('\\'); - } else { - Advance(2); - builder->AddCharacter(controlLetter & 0x1F); - } - break; - } - case 'x': { - Advance(2); - uc32 value; - if (ParseHexEscape(2, &value)) { - builder->AddCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('x'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidEscape); - } - break; - } - case 'u': { - Advance(2); - uc32 value; - if (ParseUnicodeEscape(&value)) { - builder->AddEscapedUnicodeCharacter(value); - } else if (!unicode()) { - builder->AddCharacter('u'); - } else { - // With /u, invalid escapes are not treated as identity escapes. - return ReportError(RegExpError::kInvalidUnicodeEscape); - } - break; - } - case 'k': - // Either an identity escape or a named back-reference. The two - // interpretations are mutually exclusive: '\k' is interpreted as - // an identity escape for non-Unicode patterns without named - // capture groups, and as the beginning of a named back-reference - // in all other cases. - if (unicode() || HasNamedCaptures()) { - Advance(2); - ParseNamedBackReference(builder, state CHECK_FAILED); - break; - } - V8_FALLTHROUGH; - default: - Advance(); - // With /u, no identity escapes except for syntax characters - // are allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(current())) { - builder->AddCharacter(current()); - Advance(); - } else { - return ReportError(RegExpError::kInvalidEscape); - } - break; - } - break; - case '{': { - int dummy; - bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED); - if (parsed) return ReportError(RegExpError::kNothingToRepeat); - V8_FALLTHROUGH; - } - case '}': - case ']': - if (unicode()) { - return ReportError(RegExpError::kLoneQuantifierBrackets); - } - V8_FALLTHROUGH; - default: - builder->AddUnicodeCharacter(current()); - Advance(); - break; - } // end switch(current()) - - int min; - int max; - switch (current()) { - // QuantifierPrefix :: - // * - // + - // ? - // { - case '*': - min = 0; - max = RegExpTree::kInfinity; - Advance(); - break; - case '+': - min = 1; - max = RegExpTree::kInfinity; - Advance(); - break; - case '?': - min = 0; - max = 1; - Advance(); - break; - case '{': - if (ParseIntervalQuantifier(&min, &max)) { - if (max < min) { - return ReportError(RegExpError::kRangeOutOfOrder); - } - break; - } else if (unicode()) { - // With /u, incomplete quantifiers are not allowed. - return ReportError(RegExpError::kIncompleteQuantifier); - } - continue; - default: - continue; - } - RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; - if (current() == '?') { - quantifier_type = RegExpQuantifier::NON_GREEDY; - Advance(); - } else if (FLAG_regexp_possessive_quantifier && current() == '+') { - // FLAG_regexp_possessive_quantifier is a debug-only flag. - quantifier_type = RegExpQuantifier::POSSESSIVE; - Advance(); - } - if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { - return ReportError(RegExpError::kInvalidQuantifier); - } - } -} - -RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( - RegExpParserState* state) { - RegExpLookaround::Type lookaround_type = state->lookaround_type(); - bool is_named_capture = false; - JSRegExp::Flags switch_on = JSRegExp::kNone; - JSRegExp::Flags switch_off = JSRegExp::kNone; - const ZoneVector<uc16>* capture_name = nullptr; - SubexpressionType subexpr_type = CAPTURE; - Advance(); - if (current() == '?') { - switch (Next()) { - case ':': - Advance(2); - subexpr_type = GROUPING; - break; - case '=': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = POSITIVE_LOOKAROUND; - break; - case '!': - Advance(2); - lookaround_type = RegExpLookaround::LOOKAHEAD; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - case '-': - case 'i': - case 's': - case 'm': { - if (!FLAG_regexp_mode_modifiers) { - ReportError(RegExpError::kInvalidGroup); - return nullptr; - } - Advance(); - bool flags_sense = true; // Switching on flags. - while (subexpr_type != GROUPING) { - switch (current()) { - case '-': - if (!flags_sense) { - ReportError(RegExpError::kMultipleFlagDashes); - return nullptr; - } - flags_sense = false; - Advance(); - continue; - case 's': - case 'i': - case 'm': { - JSRegExp::Flags bit = JSRegExp::kUnicode; - if (current() == 'i') bit = JSRegExp::kIgnoreCase; - if (current() == 'm') bit = JSRegExp::kMultiline; - if (current() == 's') bit = JSRegExp::kDotAll; - if (((switch_on | switch_off) & bit) != 0) { - ReportError(RegExpError::kRepeatedFlag); - return nullptr; - } - if (flags_sense) { - switch_on |= bit; - } else { - switch_off |= bit; - } - Advance(); - continue; - } - case ')': { - Advance(); - state->builder() - ->FlushText(); // Flush pending text using old flags. - // These (?i)-style flag switches don't put us in a subexpression - // at all, they just modify the flags in the rest of the current - // subexpression. - JSRegExp::Flags flags = - (state->builder()->flags() | switch_on) & ~switch_off; - state->builder()->set_flags(flags); - return state; - } - case ':': - Advance(); - subexpr_type = GROUPING; // Will break us out of the outer loop. - continue; - default: - ReportError(RegExpError::kInvalidFlagGroup); - return nullptr; - } - } - break; - } - case '<': - Advance(); - if (Next() == '=') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = POSITIVE_LOOKAROUND; - break; - } else if (Next() == '!') { - Advance(2); - lookaround_type = RegExpLookaround::LOOKBEHIND; - subexpr_type = NEGATIVE_LOOKAROUND; - break; - } - is_named_capture = true; - has_named_captures_ = true; - Advance(); - break; - default: - ReportError(RegExpError::kInvalidGroup); - return nullptr; - } - } - if (subexpr_type == CAPTURE) { - if (captures_started_ >= JSRegExp::kMaxCaptures) { - ReportError(RegExpError::kTooManyCaptures); - return nullptr; - } - captures_started_++; - - if (is_named_capture) { - capture_name = ParseCaptureGroupName(CHECK_FAILED); - } - } - JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off; - // Store current state and begin new disjunction parsing. - return new (zone()) - RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, - capture_name, flags, zone()); -} - -#ifdef DEBUG -// Currently only used in an DCHECK. -static bool IsSpecialClassEscape(uc32 c) { - switch (c) { - case 'd': - case 'D': - case 's': - case 'S': - case 'w': - case 'W': - return true; - default: - return false; - } -} -#endif - - -// In order to know whether an escape is a backreference or not we have to scan -// the entire regexp and find the number of capturing parentheses. However we -// don't want to scan the regexp twice unless it is necessary. This mini-parser -// is called when needed. It can see the difference between capturing and -// noncapturing parentheses and can skip character classes and backslash-escaped -// characters. -void RegExpParser::ScanForCaptures() { - DCHECK(!is_scanned_for_captures_); - const int saved_position = position(); - // Start with captures started previous to current position - int capture_count = captures_started(); - // Add count of captures after this position. - int n; - while ((n = current()) != kEndMarker) { - Advance(); - switch (n) { - case '\\': - Advance(); - break; - case '[': { - int c; - while ((c = current()) != kEndMarker) { - Advance(); - if (c == '\\') { - Advance(); - } else { - if (c == ']') break; - } - } - break; - } - case '(': - if (current() == '?') { - // At this point we could be in - // * a non-capturing group '(:', - // * a lookbehind assertion '(?<=' '(?<!' - // * or a named capture '(?<'. - // - // Of these, only named captures are capturing groups. - - Advance(); - if (current() != '<') break; - - Advance(); - if (current() == '=' || current() == '!') break; - - // Found a possible named capture. It could turn out to be a syntax - // error (e.g. an unterminated or invalid name), but that distinction - // does not matter for our purposes. - has_named_captures_ = true; - } - capture_count++; - break; - } - } - capture_count_ = capture_count; - is_scanned_for_captures_ = true; - Reset(saved_position); -} - - -bool RegExpParser::ParseBackReferenceIndex(int* index_out) { - DCHECK_EQ('\\', current()); - DCHECK('1' <= Next() && Next() <= '9'); - // Try to parse a decimal literal that is no greater than the total number - // of left capturing parentheses in the input. - int start = position(); - int value = Next() - '0'; - Advance(2); - while (true) { - uc32 c = current(); - if (IsDecimalDigit(c)) { - value = 10 * value + (c - '0'); - if (value > JSRegExp::kMaxCaptures) { - Reset(start); - return false; - } - Advance(); - } else { - break; - } - } - if (value > captures_started()) { - if (!is_scanned_for_captures_) ScanForCaptures(); - if (value > capture_count_) { - Reset(start); - return false; - } - } - *index_out = value; - return true; -} - -static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) { - if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) { - v->push_back(code_unit); - } else { - v->push_back(unibrow::Utf16::LeadSurrogate(code_unit)); - v->push_back(unibrow::Utf16::TrailSurrogate(code_unit)); - } -} - -const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() { - ZoneVector<uc16>* name = - new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone()); - - bool at_start = true; - while (true) { - uc32 c = current(); - Advance(); - - // Convert unicode escapes. - if (c == '\\' && current() == 'u') { - Advance(); - if (!ParseUnicodeEscape(&c)) { - ReportError(RegExpError::kInvalidUnicodeEscape); - return nullptr; - } - } - - // The backslash char is misclassified as both ID_Start and ID_Continue. - if (c == '\\') { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - - if (at_start) { - if (!IsIdentifierStart(c)) { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - push_code_unit(name, c); - at_start = false; - } else { - if (c == '>') { - break; - } else if (IsIdentifierPart(c)) { - push_code_unit(name, c); - } else { - ReportError(RegExpError::kInvalidCaptureGroupName); - return nullptr; - } - } - } - - return name; -} - -bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, - int index) { - DCHECK(0 < index && index <= captures_started_); - DCHECK_NOT_NULL(name); - - RegExpCapture* capture = GetCapture(index); - DCHECK_NULL(capture->name()); - - capture->set_name(name); - - if (named_captures_ == nullptr) { - named_captures_ = new (zone_->New(sizeof(*named_captures_))) - ZoneSet<RegExpCapture*, RegExpCaptureNameLess>(zone()); - } else { - // Check for duplicates and bail if we find any. - - const auto& named_capture_it = named_captures_->find(capture); - if (named_capture_it != named_captures_->end()) { - ReportError(RegExpError::kDuplicateCaptureGroupName); - return false; - } - } - - named_captures_->emplace(capture); - - return true; -} - -bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, - RegExpParserState* state) { - // The parser is assumed to be on the '<' in \k<name>. - if (current() != '<') { - ReportError(RegExpError::kInvalidNamedReference); - return false; - } - - Advance(); - const ZoneVector<uc16>* name = ParseCaptureGroupName(); - if (name == nullptr) { - return false; - } - - if (state->IsInsideCaptureGroup(name)) { - builder->AddEmpty(); - } else { - RegExpBackReference* atom = - new (zone()) RegExpBackReference(builder->flags()); - atom->set_name(name); - - builder->AddAtom(atom); - - if (named_back_references_ == nullptr) { - named_back_references_ = - new (zone()) ZoneList<RegExpBackReference*>(1, zone()); - } - named_back_references_->Add(atom, zone()); - } - - return true; -} - -void RegExpParser::PatchNamedBackReferences() { - if (named_back_references_ == nullptr) return; - - if (named_captures_ == nullptr) { - ReportError(RegExpError::kInvalidNamedCaptureReference); - return; - } - - // Look up and patch the actual capture for each named back reference. - - for (int i = 0; i < named_back_references_->length(); i++) { - RegExpBackReference* ref = named_back_references_->at(i); - - // Capture used to search the named_captures_ by name, index of the - // capture is never used. - static const int kInvalidIndex = 0; - RegExpCapture* search_capture = new (zone()) RegExpCapture(kInvalidIndex); - DCHECK_NULL(search_capture->name()); - search_capture->set_name(ref->name()); - - int index = -1; - const auto& capture_it = named_captures_->find(search_capture); - if (capture_it != named_captures_->end()) { - index = (*capture_it)->index(); - } else { - ReportError(RegExpError::kInvalidNamedCaptureReference); - return; - } - - ref->set_capture(GetCapture(index)); - } -} - -RegExpCapture* RegExpParser::GetCapture(int index) { - // The index for the capture groups are one-based. Its index in the list is - // zero-based. - int know_captures = - is_scanned_for_captures_ ? capture_count_ : captures_started_; - DCHECK(index <= know_captures); - if (captures_ == nullptr) { - captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); - } - while (captures_->length() < know_captures) { - captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); - } - return captures_->at(index - 1); -} - -namespace { - -struct RegExpCaptureIndexLess { - bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { - DCHECK_NOT_NULL(lhs); - DCHECK_NOT_NULL(rhs); - return lhs->index() < rhs->index(); - } -}; - -} // namespace - -Handle<FixedArray> RegExpParser::CreateCaptureNameMap() { - if (named_captures_ == nullptr || named_captures_->empty()) { - return Handle<FixedArray>(); - } - - // Named captures are sorted by name (because the set is used to ensure - // name uniqueness). But the capture name map must to be sorted by index. - - ZoneVector<RegExpCapture*> sorted_named_captures( - named_captures_->begin(), named_captures_->end(), zone()); - std::sort(sorted_named_captures.begin(), sorted_named_captures.end(), - RegExpCaptureIndexLess{}); - DCHECK_EQ(sorted_named_captures.size(), named_captures_->size()); - - Factory* factory = isolate()->factory(); - - int len = static_cast<int>(sorted_named_captures.size()) * 2; - Handle<FixedArray> array = factory->NewFixedArray(len); - - int i = 0; - for (const auto& capture : sorted_named_captures) { - Vector<const uc16> capture_name(capture->name()->data(), - capture->name()->size()); - // CSA code in ConstructNewResultFromMatchInfo requires these strings to be - // internalized so they can be used as property names in the 'exec' results. - Handle<String> name = factory->InternalizeString(capture_name); - array->set(i * 2, *name); - array->set(i * 2 + 1, Smi::FromInt(capture->index())); - - i++; - } - DCHECK_EQ(i * 2, len); - - return array; -} - -bool RegExpParser::HasNamedCaptures() { - if (has_named_captures_ || is_scanned_for_captures_) { - return has_named_captures_; - } - - ScanForCaptures(); - DCHECK(is_scanned_for_captures_); - return has_named_captures_; -} - -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->group_type() != CAPTURE) continue; - // Return true if we found the matching capture index. - if (index == s->capture_index()) return true; - // Abort if index is larger than what has been parsed up till this state. - if (index > s->capture_index()) return false; - } - return false; -} - -bool RegExpParser::RegExpParserState::IsInsideCaptureGroup( - const ZoneVector<uc16>* name) { - DCHECK_NOT_NULL(name); - for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) { - if (s->capture_name() == nullptr) continue; - if (*s->capture_name() == *name) return true; - } - return false; -} - -// QuantifierPrefix :: -// { DecimalDigits } -// { DecimalDigits , } -// { DecimalDigits , DecimalDigits } -// -// Returns true if parsing succeeds, and set the min_out and max_out -// values. Values are truncated to RegExpTree::kInfinity if they overflow. -bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { - DCHECK_EQ(current(), '{'); - int start = position(); - Advance(); - int min = 0; - if (!IsDecimalDigit(current())) { - Reset(start); - return false; - } - while (IsDecimalDigit(current())) { - int next = current() - '0'; - if (min > (RegExpTree::kInfinity - next) / 10) { - // Overflow. Skip past remaining decimal digits and return -1. - do { - Advance(); - } while (IsDecimalDigit(current())); - min = RegExpTree::kInfinity; - break; - } - min = 10 * min + next; - Advance(); - } - int max = 0; - if (current() == '}') { - max = min; - Advance(); - } else if (current() == ',') { - Advance(); - if (current() == '}') { - max = RegExpTree::kInfinity; - Advance(); - } else { - while (IsDecimalDigit(current())) { - int next = current() - '0'; - if (max > (RegExpTree::kInfinity - next) / 10) { - do { - Advance(); - } while (IsDecimalDigit(current())); - max = RegExpTree::kInfinity; - break; - } - max = 10 * max + next; - Advance(); - } - if (current() != '}') { - Reset(start); - return false; - } - Advance(); - } - } else { - Reset(start); - return false; - } - *min_out = min; - *max_out = max; - return true; -} - - -uc32 RegExpParser::ParseOctalLiteral() { - DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); - // For compatibility with some other browsers (not all), we parse - // up to three octal digits with a value below 256. - // ES#prod-annexB-LegacyOctalEscapeSequence - uc32 value = current() - '0'; - Advance(); - if ('0' <= current() && current() <= '7') { - value = value * 8 + current() - '0'; - Advance(); - if (value < 32 && '0' <= current() && current() <= '7') { - value = value * 8 + current() - '0'; - Advance(); - } - } - return value; -} - - -bool RegExpParser::ParseHexEscape(int length, uc32* value) { - int start = position(); - uc32 val = 0; - for (int i = 0; i < length; ++i) { - uc32 c = current(); - int d = HexValue(c); - if (d < 0) { - Reset(start); - return false; - } - val = val * 16 + d; - Advance(); - } - *value = val; - return true; -} - -// This parses RegExpUnicodeEscapeSequence as described in ECMA262. -bool RegExpParser::ParseUnicodeEscape(uc32* value) { - // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are - // allowed). In the latter case, the number of hex digits between { } is - // arbitrary. \ and u have already been read. - if (current() == '{' && unicode()) { - int start = position(); - Advance(); - if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { - if (current() == '}') { - Advance(); - return true; - } - } - Reset(start); - return false; - } - // \u but no {, or \u{...} escapes not allowed. - bool result = ParseHexEscape(4, value); - if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) && - current() == '\\') { - // Attempt to read trail surrogate. - int start = position(); - if (Next() == 'u') { - Advance(2); - uc32 trail; - if (ParseHexEscape(4, &trail) && - unibrow::Utf16::IsTrailSurrogate(trail)) { - *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value), - static_cast<uc16>(trail)); - return true; - } - } - Reset(start); - } - return result; -} - -#ifdef V8_INTL_SUPPORT - -namespace { - -bool IsExactPropertyAlias(const char* property_name, UProperty property) { - const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_name, short_name) == 0) - return true; - for (int i = 0;; i++) { - const char* long_name = u_getPropertyName( - property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_name, long_name) == 0) return true; - } - return false; -} - -bool IsExactPropertyValueAlias(const char* property_value_name, - UProperty property, int32_t property_value) { - const char* short_name = - u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { - return true; - } - for (int i = 0;; i++) { - const char* long_name = u_getPropertyValueName( - property, property_value, - static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_value_name, long_name) == 0) return true; - } - return false; -} - -bool LookupPropertyValueName(UProperty property, - const char* property_value_name, bool negate, - ZoneList<CharacterRange>* result, Zone* zone) { - UProperty property_for_lookup = property; - if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { - // For the property Script_Extensions, we have to do the property value - // name lookup as if the property is Script. - property_for_lookup = UCHAR_SCRIPT; - } - int32_t property_value = - u_getPropertyValueEnum(property_for_lookup, property_value_name); - if (property_value == UCHAR_INVALID_CODE) return false; - - // We require the property name to match exactly to one of the property value - // aliases. However, u_getPropertyValueEnum uses loose matching. - if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, - property_value)) { - return false; - } - - UErrorCode ec = U_ZERO_ERROR; - icu::UnicodeSet set; - set.applyIntPropertyValue(property, property_value, ec); - bool success = ec == U_ZERO_ERROR && !set.isEmpty(); - - if (success) { - set.removeAllStrings(); - if (negate) set.complement(); - for (int i = 0; i < set.getRangeCount(); i++) { - result->Add( - CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), - zone); - } - } - return success; -} - -template <size_t N> -inline bool NameEquals(const char* name, const char (&literal)[N]) { - return strncmp(name, literal, N + 1) == 0; -} - -bool LookupSpecialPropertyValueName(const char* name, - ZoneList<CharacterRange>* result, - bool negate, Zone* zone) { - if (NameEquals(name, "Any")) { - if (negate) { - // Leave the list of character ranges empty, since the negation of 'Any' - // is the empty set. - } else { - result->Add(CharacterRange::Everything(), zone); - } - } else if (NameEquals(name, "ASCII")) { - result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint) - : CharacterRange::Range(0x0, 0x7F), - zone); - } else if (NameEquals(name, "Assigned")) { - return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", - !negate, result, zone); - } else { - return false; - } - return true; -} - -// Explicitly whitelist supported binary properties. The spec forbids supporting -// properties outside of this set to ensure interoperability. -bool IsSupportedBinaryProperty(UProperty property) { - switch (property) { - case UCHAR_ALPHABETIC: - // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. - // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_ASCII_HEX_DIGIT: - // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_BIDI_CONTROL: - case UCHAR_BIDI_MIRRORED: - case UCHAR_CASE_IGNORABLE: - case UCHAR_CASED: - case UCHAR_CHANGES_WHEN_CASEFOLDED: - case UCHAR_CHANGES_WHEN_CASEMAPPED: - case UCHAR_CHANGES_WHEN_LOWERCASED: - case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: - case UCHAR_CHANGES_WHEN_TITLECASED: - case UCHAR_CHANGES_WHEN_UPPERCASED: - case UCHAR_DASH: - case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: - case UCHAR_DEPRECATED: - case UCHAR_DIACRITIC: - case UCHAR_EMOJI: - case UCHAR_EMOJI_COMPONENT: - case UCHAR_EMOJI_MODIFIER_BASE: - case UCHAR_EMOJI_MODIFIER: - case UCHAR_EMOJI_PRESENTATION: - case UCHAR_EXTENDED_PICTOGRAPHIC: - case UCHAR_EXTENDER: - case UCHAR_GRAPHEME_BASE: - case UCHAR_GRAPHEME_EXTEND: - case UCHAR_HEX_DIGIT: - case UCHAR_ID_CONTINUE: - case UCHAR_ID_START: - case UCHAR_IDEOGRAPHIC: - case UCHAR_IDS_BINARY_OPERATOR: - case UCHAR_IDS_TRINARY_OPERATOR: - case UCHAR_JOIN_CONTROL: - case UCHAR_LOGICAL_ORDER_EXCEPTION: - case UCHAR_LOWERCASE: - case UCHAR_MATH: - case UCHAR_NONCHARACTER_CODE_POINT: - case UCHAR_PATTERN_SYNTAX: - case UCHAR_PATTERN_WHITE_SPACE: - case UCHAR_QUOTATION_MARK: - case UCHAR_RADICAL: - case UCHAR_REGIONAL_INDICATOR: - case UCHAR_S_TERM: - case UCHAR_SOFT_DOTTED: - case UCHAR_TERMINAL_PUNCTUATION: - case UCHAR_UNIFIED_IDEOGRAPH: - case UCHAR_UPPERCASE: - case UCHAR_VARIATION_SELECTOR: - case UCHAR_WHITE_SPACE: - case UCHAR_XID_CONTINUE: - case UCHAR_XID_START: - return true; - default: - break; - } - return false; -} - -bool IsUnicodePropertyValueCharacter(char c) { - // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ - // - // Note that using this to validate each parsed char is quite conservative. - // A possible alternative solution would be to only ensure the parsed - // property name/value candidate string does not contain '\0' characters and - // let ICU lookups trigger the final failure. - if ('a' <= c && c <= 'z') return true; - if ('A' <= c && c <= 'Z') return true; - if ('0' <= c && c <= '9') return true; - return (c == '_'); -} - -} // anonymous namespace - -bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, - ZoneVector<char>* name_2) { - DCHECK(name_1->empty()); - DCHECK(name_2->empty()); - // Parse the property class as follows: - // - In \p{name}, 'name' is interpreted - // - either as a general category property value name. - // - or as a binary property name. - // - In \p{name=value}, 'name' is interpreted as an enumerated property name, - // and 'value' is interpreted as one of the available property value names. - // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. - // - Loose matching is not applied. - if (current() == '{') { - // Parse \p{[PropertyName=]PropertyNameValue} - for (Advance(); current() != '}' && current() != '='; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_1->push_back(static_cast<char>(current())); - } - if (current() == '=') { - for (Advance(); current() != '}'; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_2->push_back(static_cast<char>(current())); - } - name_2->push_back(0); // null-terminate string. - } - } else { - return false; - } - Advance(); - name_1->push_back(0); // null-terminate string. - - DCHECK(name_1->size() - 1 == std::strlen(name_1->data())); - DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data())); - return true; -} - -bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, - bool negate, - const ZoneVector<char>& name_1, - const ZoneVector<char>& name_2) { - if (name_2.empty()) { - // First attempt to interpret as general category property value name. - const char* name = name_1.data(); - if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, - add_to, zone())) { - return true; - } - // Interpret "Any", "ASCII", and "Assigned". - if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) { - return true; - } - // Then attempt to interpret as binary property name with value name 'Y'. - UProperty property = u_getPropertyEnum(name); - if (!IsSupportedBinaryProperty(property)) return false; - if (!IsExactPropertyAlias(name, property)) return false; - return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to, - zone()); - } else { - // Both property name and value name are specified. Attempt to interpret - // the property name as enumerated property. - const char* property_name = name_1.data(); - const char* value_name = name_2.data(); - UProperty property = u_getPropertyEnum(property_name); - if (!IsExactPropertyAlias(property_name, property)) return false; - if (property == UCHAR_GENERAL_CATEGORY) { - // We want to allow aggregate value names such as "Letter". - property = UCHAR_GENERAL_CATEGORY_MASK; - } else if (property != UCHAR_SCRIPT && - property != UCHAR_SCRIPT_EXTENSIONS) { - return false; - } - return LookupPropertyValueName(property, value_name, negate, add_to, - zone()); - } -} - -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name_1) { - if (!FLAG_harmony_regexp_sequence) return nullptr; - const char* name = name_1.data(); - const uc32* sequence_list = nullptr; - JSRegExp::Flags flags = JSRegExp::kUnicode; - if (NameEquals(name, "Emoji_Flag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiFlagSequences; - } else if (NameEquals(name, "Emoji_Tag_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiTagSequences; - } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) { - sequence_list = UnicodePropertySequences::kEmojiZWJSequences; - } - if (sequence_list != nullptr) { - // TODO(yangguo): this creates huge regexp code. Alternative to this is - // to create a new operator that checks for these sequences at runtime. - RegExpBuilder builder(zone(), flags); - while (true) { // Iterate through list of sequences. - while (*sequence_list != 0) { // Iterate through sequence. - builder.AddUnicodeCharacter(*sequence_list); - sequence_list++; - } - sequence_list++; - if (*sequence_list == 0) break; - builder.NewAlternative(); - } - return builder.ToRegExp(); - } - - if (NameEquals(name, "Emoji_Keycap_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence - // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3} - RegExpBuilder builder(zone(), flags); - ZoneList<CharacterRange>* prefix_ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - prefix_ranges->Add(CharacterRange::Range('0', '9'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('#'), zone()); - prefix_ranges->Add(CharacterRange::Singleton('*'), zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags)); - builder.AddCharacter(0xFE0F); - builder.AddCharacter(0x20E3); - return builder.ToRegExp(); - } else if (NameEquals(name, "Emoji_Modifier_Sequence")) { - // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence - // emoji_modifier_sequence := emoji_modifier_base emoji_modifier - RegExpBuilder builder(zone(), flags); - ZoneList<CharacterRange>* modifier_base_ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false, - modifier_base_ranges, zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags)); - ZoneList<CharacterRange>* modifier_ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges, - zone()); - builder.AddCharacterClass( - new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags)); - return builder.ToRegExp(); - } - - return nullptr; -} - -#else // V8_INTL_SUPPORT - -bool RegExpParser::ParsePropertyClassName(ZoneVector<char>* name_1, - ZoneVector<char>* name_2) { - return false; -} - -bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to, - bool negate, - const ZoneVector<char>& name_1, - const ZoneVector<char>& name_2) { - return false; -} - -RegExpTree* RegExpParser::GetPropertySequence(const ZoneVector<char>& name) { - return nullptr; -} - -#endif // V8_INTL_SUPPORT - -bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { - uc32 x = 0; - int d = HexValue(current()); - if (d < 0) { - return false; - } - while (d >= 0) { - x = x * 16 + d; - if (x > max_value) { - return false; - } - Advance(); - d = HexValue(current()); - } - *value = x; - return true; -} - - -uc32 RegExpParser::ParseClassCharacterEscape() { - DCHECK_EQ('\\', current()); - DCHECK(has_next() && !IsSpecialClassEscape(Next())); - Advance(); - switch (current()) { - case 'b': - Advance(); - return '\b'; - // ControlEscape :: one of - // f n r t v - case 'f': - Advance(); - return '\f'; - case 'n': - Advance(); - return '\n'; - case 'r': - Advance(); - return '\r'; - case 't': - Advance(); - return '\t'; - case 'v': - Advance(); - return '\v'; - case 'c': { - uc32 controlLetter = Next(); - uc32 letter = controlLetter & ~('A' ^ 'a'); - // Inside a character class, we also accept digits and underscore as - // control characters, unless with /u. See Annex B: - // ES#prod-annexB-ClassControlLetter - if (letter >= 'A' && letter <= 'Z') { - Advance(2); - // Control letters mapped to ASCII control characters in the range - // 0x00-0x1F. - return controlLetter & 0x1F; - } - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidClassEscape); - return 0; - } - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_') { - Advance(2); - return controlLetter & 0x1F; - } - // We match JSC in reading the backslash as a literal - // character instead of as starting an escape. - // TODO(v8:6201): Not yet covered by the spec. - return '\\'; - } - case '0': - // With /u, \0 is interpreted as NUL if not followed by another digit. - if (unicode() && !(Next() >= '0' && Next() <= '9')) { - Advance(); - return 0; - } - V8_FALLTHROUGH; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - // For compatibility, we interpret a decimal escape that isn't - // a back reference (and therefore either \0 or not valid according - // to the specification) as a 1..3 digit octal character code. - // ES#prod-annexB-LegacyOctalEscapeSequence - if (unicode()) { - // With /u, decimal escape is not interpreted as octal character code. - ReportError(RegExpError::kInvalidClassEscape); - return 0; - } - return ParseOctalLiteral(); - case 'x': { - Advance(); - uc32 value; - if (ParseHexEscape(2, &value)) return value; - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidEscape); - return 0; - } - // If \x is not followed by a two-digit hexadecimal, treat it - // as an identity escape. - return 'x'; - } - case 'u': { - Advance(); - uc32 value; - if (ParseUnicodeEscape(&value)) return value; - if (unicode()) { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(RegExpError::kInvalidUnicodeEscape); - return 0; - } - // If \u is not followed by a two-digit hexadecimal, treat it - // as an identity escape. - return 'u'; - } - default: { - uc32 result = current(); - // With /u, no identity escapes except for syntax characters and '-' are - // allowed. Otherwise, all identity escapes are allowed. - if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { - Advance(); - return result; - } - ReportError(RegExpError::kInvalidEscape); - return 0; - } - } - UNREACHABLE(); -} - -void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges, - Zone* zone, - bool add_unicode_case_equivalents, - uc32* char_out, bool* is_class_escape) { - uc32 current_char = current(); - if (current_char == '\\') { - switch (Next()) { - case 'w': - case 'W': - case 'd': - case 'D': - case 's': - case 'S': { - CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges, - add_unicode_case_equivalents, zone); - Advance(2); - *is_class_escape = true; - return; - } - case kEndMarker: - ReportError(RegExpError::kEscapeAtEndOfPattern); - return; - case 'p': - case 'P': - if (unicode()) { - bool negate = Next() == 'P'; - Advance(2); - ZoneVector<char> name_1(zone); - ZoneVector<char> name_2(zone); - if (!ParsePropertyClassName(&name_1, &name_2) || - !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError(RegExpError::kInvalidClassPropertyName); - } - *is_class_escape = true; - return; - } - break; - default: - break; - } - *char_out = ParseClassCharacterEscape(); - *is_class_escape = false; - } else { - Advance(); - *char_out = current_char; - *is_class_escape = false; - } -} - -RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { - DCHECK_EQ(current(), '['); - Advance(); - bool is_negated = false; - if (current() == '^') { - is_negated = true; - Advance(); - } - ZoneList<CharacterRange>* ranges = - new (zone()) ZoneList<CharacterRange>(2, zone()); - bool add_unicode_case_equivalents = unicode() && builder->ignore_case(); - while (has_more() && current() != ']') { - uc32 char_1, char_2; - bool is_class_1, is_class_2; - ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1, - &is_class_1 CHECK_FAILED); - if (current() == '-') { - Advance(); - if (current() == kEndMarker) { - // If we reach the end we break out of the loop and let the - // following code report an error. - break; - } else if (current() == ']') { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - ranges->Add(CharacterRange::Singleton('-'), zone()); - break; - } - ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2, - &is_class_2 CHECK_FAILED); - if (is_class_1 || is_class_2) { - // Either end is an escaped character class. Treat the '-' verbatim. - if (unicode()) { - // ES2015 21.2.2.15.1 step 1. - return ReportError(RegExpError::kInvalidCharacterClass); - } - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - ranges->Add(CharacterRange::Singleton('-'), zone()); - if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone()); - continue; - } - // ES2015 21.2.2.15.1 step 6. - if (char_1 > char_2) { - return ReportError(RegExpError::kOutOfOrderCharacterClass); - } - ranges->Add(CharacterRange::Range(char_1, char_2), zone()); - } else { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone()); - } - } - if (!has_more()) { - return ReportError(RegExpError::kUnterminatedCharacterClass); - } - Advance(); - RegExpCharacterClass::CharacterClassFlags character_class_flags; - if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED; - return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(), - character_class_flags); -} - - -#undef CHECK_FAILED - - -bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, - FlatStringReader* input, JSRegExp::Flags flags, - RegExpCompileData* result) { - DCHECK(result != nullptr); - RegExpParser parser(input, flags, isolate, zone); - RegExpTree* tree = parser.ParsePattern(); - if (parser.failed()) { - DCHECK(tree == nullptr); - DCHECK(parser.error_ != RegExpError::kNone); - result->error = parser.error_; - result->error_pos = parser.error_pos_; - } else { - DCHECK(tree != nullptr); - DCHECK(parser.error_ == RegExpError::kNone); - if (FLAG_trace_regexp_parser) { - StdoutStream os; - tree->Print(os, zone); - os << "\n"; - } - result->tree = tree; - int capture_count = parser.captures_started(); - result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; - result->contains_anchor = parser.contains_anchor(); - result->capture_name_map = parser.CreateCaptureNameMap(); - result->capture_count = capture_count; - } - return !parser.failed(); -} - -RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) - : zone_(zone), - pending_empty_(false), - flags_(flags), - characters_(nullptr), - pending_surrogate_(kNoPendingSurrogate), - terms_(), - alternatives_() -#ifdef DEBUG - , - last_added_(ADD_NONE) -#endif -{ -} - - -void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { - DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); - FlushPendingSurrogate(); - // Hold onto the lead surrogate, waiting for a trail surrogate to follow. - pending_surrogate_ = lead_surrogate; -} - - -void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { - DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); - if (pending_surrogate_ != kNoPendingSurrogate) { - uc16 lead_surrogate = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); - uc32 combined = - unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); - if (NeedsDesugaringForIgnoreCase(combined)) { - AddCharacterClassForDesugaring(combined); - } else { - ZoneList<uc16> surrogate_pair(2, zone()); - surrogate_pair.Add(lead_surrogate, zone()); - surrogate_pair.Add(trail_surrogate, zone()); - RegExpAtom* atom = - new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_); - AddAtom(atom); - } - } else { - pending_surrogate_ = trail_surrogate; - FlushPendingSurrogate(); - } -} - - -void RegExpBuilder::FlushPendingSurrogate() { - if (pending_surrogate_ != kNoPendingSurrogate) { - DCHECK(unicode()); - uc32 c = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - AddCharacterClassForDesugaring(c); - } -} - - -void RegExpBuilder::FlushCharacters() { - FlushPendingSurrogate(); - pending_empty_ = false; - if (characters_ != nullptr) { - RegExpTree* atom = - new (zone()) RegExpAtom(characters_->ToConstVector(), flags_); - characters_ = nullptr; - text_.Add(atom, zone()); - LAST(ADD_ATOM); - } -} - - -void RegExpBuilder::FlushText() { - FlushCharacters(); - int num_text = text_.length(); - if (num_text == 0) { - return; - } else if (num_text == 1) { - terms_.Add(text_.last(), zone()); - } else { - RegExpText* text = new (zone()) RegExpText(zone()); - for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); - terms_.Add(text, zone()); - } - text_.Clear(); -} - - -void RegExpBuilder::AddCharacter(uc16 c) { - FlushPendingSurrogate(); - pending_empty_ = false; - if (NeedsDesugaringForIgnoreCase(c)) { - AddCharacterClassForDesugaring(c); - } else { - if (characters_ == nullptr) { - characters_ = new (zone()) ZoneList<uc16>(4, zone()); - } - characters_->Add(c, zone()); - LAST(ADD_CHAR); - } -} - - -void RegExpBuilder::AddUnicodeCharacter(uc32 c) { - if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { - DCHECK(unicode()); - AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); - AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); - } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { - AddLeadSurrogate(c); - } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { - AddTrailSurrogate(c); - } else { - AddCharacter(static_cast<uc16>(c)); - } -} - -void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) { - // A lead or trail surrogate parsed via escape sequence will not - // pair up with any preceding lead or following trail surrogate. - FlushPendingSurrogate(); - AddUnicodeCharacter(character); - FlushPendingSurrogate(); -} - -void RegExpBuilder::AddEmpty() { pending_empty_ = true; } - - -void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { - if (NeedsDesugaringForUnicode(cc)) { - // With /u, character class needs to be desugared, so it - // must be a standalone term instead of being part of a RegExpText. - AddTerm(cc); - } else { - AddAtom(cc); - } -} - -void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { - AddTerm(new (zone()) RegExpCharacterClass( - zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)), - flags_)); -} - - -void RegExpBuilder::AddAtom(RegExpTree* term) { - if (term->IsEmpty()) { - AddEmpty(); - return; - } - if (term->IsTextElement()) { - FlushCharacters(); - text_.Add(term, zone()); - } else { - FlushText(); - terms_.Add(term, zone()); - } - LAST(ADD_ATOM); -} - - -void RegExpBuilder::AddTerm(RegExpTree* term) { - FlushText(); - terms_.Add(term, zone()); - LAST(ADD_ATOM); -} - - -void RegExpBuilder::AddAssertion(RegExpTree* assert) { - FlushText(); - terms_.Add(assert, zone()); - LAST(ADD_ASSERT); -} - - -void RegExpBuilder::NewAlternative() { FlushTerms(); } - - -void RegExpBuilder::FlushTerms() { - FlushText(); - int num_terms = terms_.length(); - RegExpTree* alternative; - if (num_terms == 0) { - alternative = new (zone()) RegExpEmpty(); - } else if (num_terms == 1) { - alternative = terms_.last(); - } else { - alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); - } - alternatives_.Add(alternative, zone()); - terms_.Clear(); - LAST(ADD_NONE); -} - - -bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { - if (!unicode()) return false; - // TODO(yangguo): we could be smarter than this. Case-insensitivity does not - // necessarily mean that we need to desugar. It's probably nicer to have a - // separate pass to figure out unicode desugarings. - if (ignore_case()) return true; - ZoneList<CharacterRange>* ranges = cc->ranges(zone()); - CharacterRange::Canonicalize(ranges); - for (int i = ranges->length() - 1; i >= 0; i--) { - uc32 from = ranges->at(i).from(); - uc32 to = ranges->at(i).to(); - // Check for non-BMP characters. - if (to >= kNonBmpStart) return true; - // Check for lone surrogates. - if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; - } - return false; -} - - -bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { -#ifdef V8_INTL_SUPPORT - if (unicode() && ignore_case()) { - icu::UnicodeSet set(c, c); - set.closeOver(USET_CASE_INSENSITIVE); - set.removeAllStrings(); - return set.size() > 1; - } - // In the case where ICU is not included, we act as if the unicode flag is - // not set, and do not desugar. -#endif // V8_INTL_SUPPORT - return false; -} - - -RegExpTree* RegExpBuilder::ToRegExp() { - FlushTerms(); - int num_alternatives = alternatives_.length(); - if (num_alternatives == 0) return new (zone()) RegExpEmpty(); - if (num_alternatives == 1) return alternatives_.last(); - return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); -} - -bool RegExpBuilder::AddQuantifierToAtom( - int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { - FlushPendingSurrogate(); - if (pending_empty_) { - pending_empty_ = false; - return true; - } - RegExpTree* atom; - if (characters_ != nullptr) { - DCHECK(last_added_ == ADD_CHAR); - // Last atom was character. - Vector<const uc16> char_vector = characters_->ToConstVector(); - int num_chars = char_vector.length(); - if (num_chars > 1) { - Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); - text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone()); - char_vector = char_vector.SubVector(num_chars - 1, num_chars); - } - characters_ = nullptr; - atom = new (zone()) RegExpAtom(char_vector, flags_); - FlushText(); - } else if (text_.length() > 0) { - DCHECK(last_added_ == ADD_ATOM); - atom = text_.RemoveLast(); - FlushText(); - } else if (terms_.length() > 0) { - DCHECK(last_added_ == ADD_ATOM); - atom = terms_.RemoveLast(); - if (atom->IsLookaround()) { - // With /u, lookarounds are not quantifiable. - if (unicode()) return false; - // Lookbehinds are not quantifiable. - if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) { - return false; - } - } - if (atom->max_match() == 0) { - // Guaranteed to only match an empty string. - LAST(ADD_TERM); - if (min == 0) { - return true; - } - terms_.Add(atom, zone()); - return true; - } - } else { - // Only call immediately after adding an atom or character! - UNREACHABLE(); - } - terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), - zone()); - LAST(ADD_TERM); - return true; -} - -} // namespace internal -} // namespace v8 |