diff options
author | Martok <martok@martoks-place.de> | 2022-12-21 18:53:27 +0100 |
---|---|---|
committer | Martok <martok@martoks-place.de> | 2022-12-21 18:53:27 +0100 |
commit | b356d64f4e69af8cf1f2eaf87d1f328bdd24127e (patch) | |
tree | f2d5df6d2bc16818cc71283363d54f56af80da39 /js | |
parent | 9a0807a62c96d72387b218d840eb6bb8afb5916d (diff) | |
download | uxp-b356d64f4e69af8cf1f2eaf87d1f328bdd24127e.tar.gz |
Issue #1285 - implement named capturing groups and named backrefs
- RegExpParser collects seen groups in named_captures_.
- After irregexp::ParsePattern has finished, RegExpParser::StoreNamedCaptureMap translates
the parser data to RegExpCompileData.capture_name_list/capture_index_list
- RegExpShared::initializeNamedCaptures takes these and builds a PlainObject map which
is kept with the compiled expression.
This is done because irregexp doesn't have access to the JS context and so can't allocate
any JSValues itself.
- for each match result, this map is used to build PlainObjects of name->match/undefined
(extremely simplified from upstream at the expense of some perf)
IonMonkey switches to non-masm code path for expressions with named groups.
Diffstat (limited to 'js')
-rw-r--r-- | js/src/builtin/RegExp.cpp | 69 | ||||
-rw-r--r-- | js/src/builtin/RegExp.h | 3 | ||||
-rw-r--r-- | js/src/irregexp/RegExpAST.h | 13 | ||||
-rw-r--r-- | js/src/irregexp/RegExpEngine.h | 17 | ||||
-rw-r--r-- | js/src/irregexp/RegExpParser.cpp | 292 | ||||
-rw-r--r-- | js/src/irregexp/RegExpParser.h | 43 | ||||
-rw-r--r-- | js/src/jit/CodeGenerator.cpp | 12 | ||||
-rw-r--r-- | js/src/js.msg | 6 | ||||
-rw-r--r-- | js/src/vm/CommonPropertyNames.h | 1 | ||||
-rw-r--r-- | js/src/vm/RegExpObject.cpp | 77 | ||||
-rw-r--r-- | js/src/vm/RegExpObject.h | 13 |
11 files changed, 512 insertions, 34 deletions
diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp index 39db78c8e9..de83b38aed 100644 --- a/js/src/builtin/RegExp.cpp +++ b/js/src/builtin/RegExp.cpp @@ -21,6 +21,7 @@ #include "vm/NativeObject-inl.h" + using namespace js; using namespace js::unicode; @@ -31,11 +32,12 @@ using mozilla::Maybe; using CapturesVector = GCVector<Value, 4>; /* - * ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad 21.2.5.2.2 - * steps 3, 16-25. + * ES 2021 draft 21.2.5.2.2: Steps 16-28 + * https://tc39.es/ecma262/#sec-regexpbuiltinexec */ bool -js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& matches, +js::CreateRegExpMatchResult(JSContext* cx, RegExpShared& re, + HandleString input, const MatchPairs& matches, MutableHandleValue rval) { MOZ_ASSERT(input); @@ -48,6 +50,7 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& * 1..pairCount-1: paren matches * input: input string * index: start index for the match + * groups: named capture groups for the match */ /* Get the templateObject that defines the shape and type of the output object */ @@ -55,15 +58,16 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& if (!templateObject) return false; + // Step 16 size_t numPairs = matches.length(); MOZ_ASSERT(numPairs > 0); - /* Step 17. */ + /* Step 18-19. */ RootedArrayObject arr(cx, NewDenseFullyAllocatedArrayWithTemplate(cx, numPairs, templateObject)); if (!arr) return false; - /* Steps 22-24. + /* Steps 22-23 and 27 a-e * Store a Value for each pair. 
*/ for (size_t i = 0; i < numPairs; i++) { const MatchPair& pair = matches[i]; @@ -81,6 +85,40 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& } } + // Step 24 (reordered) + RootedPlainObject groups(cx); + if (re.numNamedCaptures() > 0) { + // construct a new object from the template saved on RegExpShared + RootedPlainObject groupsTemplate(cx, re.getGroupsTemplate()); + groups = NewObjectWithGivenProto<PlainObject>(cx, nullptr); + groups->setGroup(groupsTemplate->group()); + + // Step 27 f. + // The groups template object stores the names of the named captures in the + // the order in which they are defined. + // Grab the index into the match vector from the template object and define the + // corresponding property on the result + AutoIdVector keys(cx); + if (!GetPropertyKeys(cx, groupsTemplate, 0, &keys)) { + return false; + } + MOZ_ASSERT(keys.length() == re.numNamedCaptures()); + RootedId key(cx); + RootedValue ival(cx); + RootedValue val(cx); + for (size_t i = 0; i < keys.length(); i++) { + key = keys[i]; + // fetch the group's match index... + if (!NativeGetProperty(cx, groupsTemplate, key, &ival)) + return false; + // ... and set it on groups + val = arr->getDenseElement(ival.toInt32()); + if (!NativeDefineProperty(cx, groups, key, val, nullptr, nullptr, JSPROP_ENUMERATE)) { + return false; + } + } + } + /* Step 20 (reordered). * Set the |index| property. (TemplateObject positions it in slot 0) */ arr->setSlot(0, Int32Value(matches[0].start)); @@ -89,6 +127,10 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& * Set the |input| property. (TemplateObject positions it in slot 1) */ arr->setSlot(1, StringValue(input)); + // Steps 25-26 (reordered) + // Set the |groups| property. + arr->setSlot(2, groups ? 
ObjectValue(*groups) : UndefinedValue()); + #ifdef DEBUG RootedValue test(cx); RootedId id(cx, NameToId(cx->names().index)); @@ -170,7 +212,7 @@ js::ExecuteRegExpLegacy(JSContext* cx, RegExpStatics* res, Handle<RegExpObject*> return true; } - return CreateRegExpMatchResult(cx, input, matches, rval); + return CreateRegExpMatchResult(cx, *shared, input, matches, rval); } static bool @@ -1027,7 +1069,11 @@ RegExpMatcherImpl(JSContext* cx, HandleObject regexp, HandleString string, } /* Steps 16-25 */ - return CreateRegExpMatchResult(cx, string, matches, rval); + Rooted<RegExpObject*> reobj(cx, &regexp->as<RegExpObject>()); + RegExpGuard shared(cx); + if (!RegExpObject::getShared(cx, reobj, &shared)) + return false; + return CreateRegExpMatchResult(cx, *shared, string, matches, rval); } /* @@ -1069,8 +1115,13 @@ js::RegExpMatcherRaw(JSContext* cx, HandleObject regexp, HandleString input, // The MatchPairs will always be passed in, but RegExp execution was // successful only if the pairs have actually been filled in. - if (maybeMatches && maybeMatches->pairsRaw()[0] >= 0) - return CreateRegExpMatchResult(cx, input, *maybeMatches, output); + if (maybeMatches && maybeMatches->pairsRaw()[0] >= 0) { + Rooted<RegExpObject*> reobj(cx, &regexp->as<RegExpObject>()); + RegExpGuard shared(cx); + if (!RegExpObject::getShared(cx, reobj, &shared)) + return false; + return CreateRegExpMatchResult(cx, *shared, input, *maybeMatches, output); + } return RegExpMatcherImpl(cx, regexp, input, lastIndex, UpdateRegExpStatics, output); } diff --git a/js/src/builtin/RegExp.h b/js/src/builtin/RegExp.h index 275efd7ce3..cb88319ac9 100644 --- a/js/src/builtin/RegExp.h +++ b/js/src/builtin/RegExp.h @@ -36,7 +36,8 @@ ExecuteRegExpLegacy(JSContext* cx, RegExpStatics* res, Handle<RegExpObject*> reo /* Translation from MatchPairs to a JS array in regexp_exec()'s output format. 
*/ MOZ_MUST_USE bool -CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& matches, +CreateRegExpMatchResult(JSContext* cx, RegExpShared& re, + HandleString input, const MatchPairs& matches, MutableHandleValue rval); extern MOZ_MUST_USE bool diff --git a/js/src/irregexp/RegExpAST.h b/js/src/irregexp/RegExpAST.h index bd01f6c6cd..9e023d537f 100644 --- a/js/src/irregexp/RegExpAST.h +++ b/js/src/irregexp/RegExpAST.h @@ -339,7 +339,7 @@ class RegExpCapture : public RegExpTree { public: explicit RegExpCapture(RegExpTree* body, int index) - : body_(body), index_(index) + : body_(body), index_(index), name_(nullptr) {} virtual void* Accept(RegExpVisitor* visitor, void* data); @@ -359,12 +359,15 @@ class RegExpCapture : public RegExpTree RegExpTree* body() { return body_; } void set_body(RegExpTree* body) { body_ = body; } int index() { return index_; } + const CharacterVector* name() const { return name_; } + void set_name(const CharacterVector* name) { name_ = name; } static int StartRegister(int index) { return index * 2; } static int EndRegister(int index) { return index * 2 + 1; } private: RegExpTree* body_; int index_; + const CharacterVector* name_; }; class RegExpLookaround : public RegExpTree @@ -413,7 +416,7 @@ class RegExpBackReference : public RegExpTree { public: explicit RegExpBackReference(RegExpCapture* capture) - : capture_(capture) + : capture_(capture), name_(nullptr) {} virtual void* Accept(RegExpVisitor* visitor, void* data); @@ -427,10 +430,16 @@ class RegExpBackReference : public RegExpTree int max_match() override { return kInfinity; } int index() { return capture_->index(); } RegExpCapture* capture() { return capture_; } + void set_capture(RegExpCapture* capture) { capture_ = capture; } + const CharacterVector* name() const { return name_; } + void set_name(const CharacterVector* name) { name_ = name; } private: RegExpCapture* capture_; + const CharacterVector* name_; }; +typedef InfallibleVector<RegExpBackReference*, 1> 
RegExpBackReferenceVector; + class RegExpEmpty : public RegExpTree { public: diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h index 7505636100..1b011458cd 100644 --- a/js/src/irregexp/RegExpEngine.h +++ b/js/src/irregexp/RegExpEngine.h @@ -57,13 +57,28 @@ struct RegExpCompileData : tree(nullptr), simple(true), contains_anchor(false), - capture_count(0) + capture_count(0), + capture_name_list(nullptr), + capture_index_list(nullptr) {} + // The parsed AST as produced by the RegExpParser. RegExpTree* tree; + // True, iff the pattern is a 'simple' atom with zero captures. In other + // words, the pattern consists of a string with no metacharacters and special + // regexp features, and can be implemented as a standard string search. bool simple; + + // True, iff the pattern is anchored at the start of the string with '^'. bool contains_anchor; + + // The number of capture groups, without the global capture \0. int capture_count; + + // Only use if the pattern contains named captures. If so, this contains a + // mapping of capture names to capture indices, as Values. 
+ CharacterVectorVector* capture_name_list; + IntegerVector* capture_index_list; }; struct RegExpCode diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp index c6b8727048..c46b8cf8ec 100644 --- a/js/src/irregexp/RegExpParser.cpp +++ b/js/src/irregexp/RegExpParser.cpp @@ -245,6 +245,8 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, : ts(ts), alloc(alloc), captures_(nullptr), + named_captures_(nullptr), + named_back_references_(nullptr), next_pos_(chars), captures_started_(0), end_(end), @@ -257,7 +259,8 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc, dotall_(dotall), simple_(false), contains_anchor_(false), - is_scanned_for_captures_(false) + is_scanned_for_captures_(false), + has_named_captures_(false) { Advance(); } @@ -272,6 +275,30 @@ RegExpParser<CharT>::ReportError(unsigned errorNumber, const char* param /* = nu } template <typename CharT> +bool +RegExpParser<CharT>::StoreNamedCaptureMap(CharacterVectorVector** names, IntegerVector** indices) +{ + // Any named captures defined at all? + if (!named_captures_ || !named_captures_->length()) { + return true; + } + + CharacterVectorVector* nv = alloc->newInfallible<CharacterVectorVector>(*alloc); + IntegerVector* iv = alloc->newInfallible<IntegerVector>(*alloc); + + for (size_t i=0; i<named_captures_->length(); i++) { + RegExpCapture* capture = (*named_captures_)[i]; + const CharacterVector* cn = capture->name(); + nv->append(const_cast<CharacterVector*>(cn)); + iv->append(capture->index()); + } + + *names = nv; + *indices = iv; + return true; +} + +template <typename CharT> void RegExpParser<CharT>::Advance() { @@ -1165,6 +1192,7 @@ template <typename CharT> void RegExpParser<CharT>::ScanForCaptures() { + const CharT* saved_position = position(); // Start with captures started previous to current position int capture_count = captures_started(); // Add count of captures after this position. 
@@ -1188,12 +1216,32 @@ RegExpParser<CharT>::ScanForCaptures() break; } case '(': - if (current() != '?') capture_count++; + if (current() == '?') { + // At this point we could be in + // * a non-capturing group '(:', + // * a lookbehind assertion '(?<=' '(?<!' + // * or a named capture '(?<'. + // + // Of these, only named captures are capturing groups. + + Advance(); + if (current() != '<') break; + + Advance(); + if (current() == '=' || current() == '!') break; + + // Found a possible named capture. It could turn out to be a syntax + // error (e.g. an unterminated or invalid name), but that distinction + // does not matter for our purposes. + has_named_captures_ = true; + } + capture_count++; break; } } capture_count_ = capture_count; is_scanned_for_captures_ = true; + Reset(saved_position); } inline bool @@ -1251,9 +1299,168 @@ RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out) return true; } +static void push_code_unit(CharacterVector* v, uint32_t code_unit) +{ + // based off of unicode::UTF16Encode + if (!unicode::IsSupplementary(code_unit)) { + v->append(char16_t(code_unit)); + } else { + v->append(unicode::LeadSurrogate(code_unit)); + v->append(unicode::TrailSurrogate(code_unit)); + } +} + +template <typename CharT> +const CharacterVector* +RegExpParser<CharT>::ParseCaptureGroupName() +{ + CharacterVector* name = alloc->newInfallible<CharacterVector>(*alloc); + + bool at_start = true; + while (true) { + widechar c = current(); + Advance(); + + // Convert unicode escapes. + if (c == '\\' && current() == 'u') { + Advance(); + if (!ParseUnicodeEscape(&c)) { + ReportError(JSMSG_INVALID_UNICODE_ESCAPE); + return nullptr; + } + } + + // The backslash char is misclassified as both ID_Start and ID_Continue. 
+ if (c == '\\') { + ReportError(JSMSG_INVALID_CAPTURE_NAME); + return nullptr; + } + + if (at_start) { + if (!unicode::IsIdentifierStart(c)) { + ReportError(JSMSG_INVALID_CAPTURE_NAME); + return nullptr; + } + push_code_unit(name, c); + at_start = false; + } else { + if (c == '>') { + break; + } else if (unicode::IsIdentifierPart(c)) { + push_code_unit(name, c); + } else { + ReportError(JSMSG_INVALID_CAPTURE_NAME); + return nullptr; + } + } + } + + return name; +} + +template <typename CharT> +bool +RegExpParser<CharT>::CreateNamedCaptureAtIndex(const CharacterVector* name, + int index) +{ + MOZ_ASSERT(0 < index && index <= captures_started_); + MOZ_ASSERT(name !== nullptr); + + RegExpCapture* capture = GetCapture(index); + MOZ_ASSERT(capture->name() == nullptr); + + capture->set_name(name); + + if (named_captures_ == nullptr) { + named_captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc); + } else { + // Check for duplicates and bail if we find any. + if (FindNamedCapture(name) != nullptr) { + ReportError(JSMSG_DUPLICATE_CAPTURE_NAME); + return false; + } + } + named_captures_->append(capture); + return true; +} + template <typename CharT> RegExpCapture* -RegExpParser<CharT>::GetCapture(int index) { +RegExpParser<CharT>::FindNamedCapture(const CharacterVector* name) +{ + // Linear search is fine since there are usually very few named groups + for (auto it=named_captures_->begin(); it<named_captures_->end(); it++) { + if (*(*it)->name() == *name) { + return *it; + } + } + return nullptr; +} + +template <typename CharT> +bool +RegExpParser<CharT>::ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state) +{ + // The parser is assumed to be on the '<' in \k<name>. 
+ if (current() != '<') { + ReportError(JSMSG_INVALID_NAMED_REF); + return false; + } + + Advance(); + const CharacterVector* name = ParseCaptureGroupName(); + if (name == nullptr) { + return false; + } + + if (state->IsInsideCaptureGroup(name)) { + builder->AddEmpty(); + } else { + RegExpBackReference* atom = alloc->newInfallible<RegExpBackReference>(nullptr); + atom->set_name(name); + + builder->AddAtom(atom); + + if (named_back_references_ == nullptr) { + named_back_references_ = alloc->newInfallible<RegExpBackReferenceVector>(*alloc); + } + named_back_references_->append(atom); + } + + return true; +} + +template <typename CharT> +void +RegExpParser<CharT>::PatchNamedBackReferences() +{ + if (named_back_references_ == nullptr) return; + + if (named_captures_ == nullptr) { + // Named backrefs but no named groups + ReportError(JSMSG_INVALID_NAMED_CAPTURE_REF); + return; + } + + // Look up and patch the actual capture for each named back reference. + for (size_t i = 0; i < named_back_references_->length(); i++) { + RegExpBackReference* ref = (*named_back_references_)[i]; + + RegExpCapture* capture = FindNamedCapture(ref->name()); + if (capture == nullptr) { + ReportError(JSMSG_INVALID_NAMED_CAPTURE_REF); + return; + } + + ref->set_capture(capture); + } +} + +template <typename CharT> +RegExpCapture* +RegExpParser<CharT>::GetCapture(int index) +{ // The index for the capture groups are one-based. Its index in the list is // zero-based. 
int known_captures = @@ -1269,10 +1476,21 @@ RegExpParser<CharT>::GetCapture(int index) { return (*captures_)[index - 1]; } +template <typename CharT> +bool +RegExpParser<CharT>::HasNamedCaptures() { + if (has_named_captures_ || is_scanned_for_captures_) { + return has_named_captures_; + } + + ScanForCaptures(); + return has_named_captures_; +} template <typename CharT> bool -RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) { +RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) +{ for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { if (s->group_type() != CAPTURE) continue; // Return true if we found the matching capture index. @@ -1283,6 +1501,18 @@ RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) { return false; } +template <typename CharT> +bool +RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(const CharacterVector* name) +{ + for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { + if (s->group_type() != CAPTURE) continue; + if (!s->IsNamedCapture()) continue; + if (*s->capture_name() == *name) return true; + } + return false; +} + // QuantifierPrefix :: // { DecimalDigits } // { DecimalDigits , } @@ -1359,6 +1589,7 @@ RegExpTree* RegExpParser<CharT>::ParsePattern() { RegExpTree* result = ParseDisjunction(); + PatchNamedBackReferences(); MOZ_ASSERT_IF(result, !has_more()); return result; } @@ -1525,7 +1756,7 @@ RegExpTree* RegExpParser<CharT>::ParseDisjunction() { // Used to store current state while parsing subexpressions. - RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0); + RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0, nullptr); RegExpParserState* state = &initial_state; // Cache the builder in a local variable for quick access. 
RegExpBuilder* builder = initial_state.builder(); @@ -1556,6 +1787,11 @@ RegExpParser<CharT>::ParseDisjunction() // Build result of subexpression. if (group_type == CAPTURE) { + if (state->IsNamedCapture()) { + if (!CreateNamedCaptureAtIndex(state->capture_name(), capture_index)) { + return nullptr; + } + } RegExpCapture* capture = GetCapture(capture_index); capture->set_body(body); body = capture; @@ -1635,6 +1871,8 @@ RegExpParser<CharT>::ParseDisjunction() case '(': { SubexpressionType subexpr_type = CAPTURE; RegExpLookaround::Type lookaround_type = state->lookaround_type(); + bool is_named_capture = false; + const CharacterVector* capture_name = nullptr; Advance(); if (current() == '?') { switch (Next()) { @@ -1659,21 +1897,30 @@ RegExpParser<CharT>::ParseDisjunction() subexpr_type = NEGATIVE_LOOKAROUND; break; } - // We didn't get a positive or negative after '<'. - // That's an error. - return ReportError(JSMSG_INVALID_GROUP); + // Not a lookbehind, continue parsing as named group + is_named_capture = true; + has_named_captures_ = true; + break; default: return ReportError(JSMSG_INVALID_GROUP); } - Advance(2); - } else { - if (captures_started() >= kMaxCaptures) - return ReportError(JSMSG_TOO_MANY_PARENS); - captures_started_++; + Advance(is_named_capture ? 1 : 2); + } + if (subexpr_type == CAPTURE) { + if (captures_started() >= kMaxCaptures) + return ReportError(JSMSG_TOO_MANY_PARENS); + captures_started_++; + + if (is_named_capture) { + capture_name = ParseCaptureGroupName(); + if (!capture_name) + return nullptr; + } } // Store current state and begin new disjunction parsing. state = alloc->newInfallible<RegExpParserState>(alloc, state, subexpr_type, - lookaround_type, captures_started_); + lookaround_type, captures_started_, + capture_name); builder = state->builder(); continue; } @@ -1834,6 +2081,22 @@ RegExpParser<CharT>::ParseDisjunction() } break; } + case 'k': { + // Either an identity escape or a named back-reference. 
The two + // interpretations are mutually exclusive: '\k' is interpreted as + // an identity escape for non-Unicode patterns without named + // capture groups, and as the beginning of a named back-reference + // in all other cases. + if (unicode_ || HasNamedCaptures()) { + Advance(2); + if (!ParseNamedBackReference(builder, state)) { + return ReportError(JSMSG_INVALID_IDENTITY_ESCAPE); + } + } else { + builder->AddCharacter('k'); + } + break; + } default: // Identity escape. if (unicode_ && !IsSyntaxCharacter(Next())) @@ -1962,6 +2225,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si data->simple = parser.simple(); data->contains_anchor = parser.contains_anchor(); data->capture_count = parser.captures_started(); + parser.StoreNamedCaptureMap(&data->capture_name_list, &data->capture_index_list); return true; } diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h index 28a2d58220..48236530ab 100644 --- a/js/src/irregexp/RegExpParser.h +++ b/js/src/irregexp/RegExpParser.h @@ -204,6 +204,10 @@ class RegExpParser bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail); widechar ParseOctalLiteral(); + + // Parses the name of a capture group (?<name>pattern). The name must adhere + // to IdentifierName in the ECMAScript standard. + const CharacterVector* ParseCaptureGroupName(); // Tries to parse the input as a back reference. If successful it // stores the result in the output parameter and returns true. If @@ -218,6 +222,11 @@ class RegExpParser next_pos_ += dist - 1; Advance(); } + + bool StoreNamedCaptureMap(CharacterVectorVector** names, IntegerVector** indices); + // Returns true iff the pattern contains named captures. May call + // ScanForCaptures to look ahead at the remaining pattern. 
+ bool HasNamedCaptures(); void Reset(const CharT* pos) { next_pos_ = pos; @@ -251,12 +260,14 @@ class RegExpParser RegExpParserState* previous_state, SubexpressionType group_type, RegExpLookaround::Type lookaround_type, - int disjunction_capture_index) + int disjunction_capture_index, + const CharacterVector* capture_name) : previous_state_(previous_state), builder_(alloc->newInfallible<RegExpBuilder>(alloc)), group_type_(group_type), lookaround_type_(lookaround_type), - disjunction_capture_index_(disjunction_capture_index) + disjunction_capture_index_(disjunction_capture_index), + capture_name_(capture_name) {} // Parser state of containing expression, if any. RegExpParserState* previous_state() { return previous_state_; } @@ -271,9 +282,15 @@ class RegExpParser // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. int capture_index() { return disjunction_capture_index_; } + // The name of the current sub-expression, if group_type is CAPTURE. Only + // used for named captures. + const CharacterVector* capture_name() const { return capture_name_; } + bool IsNamedCapture() const { return capture_name_ != nullptr; } // Check whether the parser is inside a capture group with the given index. bool IsInsideCaptureGroup(int index); + // Check whether the parser is inside a capture group with the given name. + bool IsInsideCaptureGroup(const CharacterVector* name); private: // Linked list implementation of stack of states. @@ -286,11 +303,29 @@ class RegExpParser RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). int disjunction_capture_index_; + // Stored capture name (if any). + const CharacterVector* const capture_name_; }; // Return the 1-indexed RegExpCapture object, allocate if necessary. RegExpCapture* GetCapture(int index); + // Creates a new named capture at the specified index. Must be called exactly + // once for each named capture. Fails if a capture with the same name is + // encountered. 
+ bool CreateNamedCaptureAtIndex(const CharacterVector* name, int index); + + // Find a named capture group by name, or return null if not found + RegExpCapture* FindNamedCapture(const CharacterVector* name); + + bool ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state); + + // After the initial parsing pass, patch corresponding RegExpCapture objects + // into all RegExpBackReferences. This is done after initial parsing in order + // to avoid complicating cases in which references comes before the capture. + void PatchNamedBackReferences(); + widechar current() { return current_; } bool has_more() { return has_more_; } bool has_next() { return next_pos_ < end_; } @@ -304,6 +339,9 @@ class RegExpParser frontend::TokenStream& ts; LifoAlloc* alloc; RegExpCaptureVector* captures_; + // contains the subset of captures_ that have names (for duplicate checking) + RegExpCaptureVector* named_captures_; + RegExpBackReferenceVector* named_back_references_; const CharT* next_pos_; const CharT* end_; widechar current_; @@ -318,6 +356,7 @@ class RegExpParser bool simple_; bool contains_anchor_; bool is_scanned_for_captures_; + bool has_named_captures_; // Only valid after we have scanned for captures. }; } } // namespace js::irregexp diff --git a/js/src/jit/CodeGenerator.cpp b/js/src/jit/CodeGenerator.cpp index 66e8e25ddf..3f1b7251a3 100644 --- a/js/src/jit/CodeGenerator.cpp +++ b/js/src/jit/CodeGenerator.cpp @@ -1513,6 +1513,16 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx) return nullptr; } + // If a regexp has named captures, fall back to the OOL stub, which + // will end up calling CreateRegExpMatchResults. + Register shared = temp2; + masm.loadPtr(Address(regexp, NativeObject::getFixedSlotOffset(RegExpObject::PRIVATE_SLOT)), + shared); + masm.branchPtr(Assembler::NotEqual, + Address(shared, RegExpShared::offsetOfGroupsTemplate()), + ImmWord(0), + &oolEntry); + // Construct the result. 
Register object = temp1; Label matchResultFallback, matchResultJoin; @@ -1523,6 +1533,7 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx) masm.loadPtr(Address(object, NativeObject::offsetOfSlots()), temp2); masm.storeValue(templateObject->getSlot(0), Address(temp2, 0)); masm.storeValue(templateObject->getSlot(1), Address(temp2, sizeof(Value))); + masm.storeValue(templateObject->getSlot(2), Address(temp2, 2 * sizeof(Value))); size_t elementsOffset = NativeObject::offsetOfFixedElements(); @@ -1636,6 +1647,7 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx) MOZ_ASSERT(templateObject->numFixedSlots() == 0); MOZ_ASSERT(templateObject->lookupPure(cx->names().index)->slot() == 0); MOZ_ASSERT(templateObject->lookupPure(cx->names().input)->slot() == 1); + MOZ_ASSERT(templateObject->lookupPure(cx->names().groups)->slot() == 2); masm.load32(pairsVectorAddress, temp3); masm.storeValue(JSVAL_TYPE_INT32, temp3, Address(temp2, 0)); diff --git a/js/src/js.msg b/js/src/js.msg index 51854fc398..93d8a557b1 100644 --- a/js/src/js.msg +++ b/js/src/js.msg @@ -513,6 +513,12 @@ MSG_DEF(JSMSG_TOO_MANY_PARENS, 0, JSEXN_INTERNALERR, "too many parenthes MSG_DEF(JSMSG_UNICODE_OVERFLOW, 1, JSEXN_SYNTAXERR, "Unicode codepoint must not be greater than 0x10FFFF in {0}") MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN, 0, JSEXN_SYNTAXERR, "unmatched ) in regular expression") MSG_DEF(JSMSG_UNTERM_CLASS, 0, JSEXN_SYNTAXERR, "unterminated character class") +MSG_DEF(JSMSG_INVALID_PROPERTY_NAME, 0, JSEXN_SYNTAXERR, "invalid property name in regular expression") +MSG_DEF(JSMSG_INVALID_CLASS_PROPERTY_NAME, 0, JSEXN_SYNTAXERR, "invalid class property name in regular expression") +MSG_DEF(JSMSG_INVALID_CAPTURE_NAME, 0, JSEXN_SYNTAXERR, "invalid capture group name in regular expression") +MSG_DEF(JSMSG_DUPLICATE_CAPTURE_NAME, 0, JSEXN_SYNTAXERR, "duplicate capture group name in regular expression") +MSG_DEF(JSMSG_INVALID_NAMED_REF, 0, JSEXN_SYNTAXERR, "invalid named reference in regular expression") 
+MSG_DEF(JSMSG_INVALID_NAMED_CAPTURE_REF, 0, JSEXN_SYNTAXERR, "invalid named capture reference in regular expression") // Self-hosting MSG_DEF(JSMSG_DEFAULT_LOCALE_ERROR, 0, JSEXN_ERR, "internal error getting the default locale") diff --git a/js/src/vm/CommonPropertyNames.h b/js/src/vm/CommonPropertyNames.h index 5080e6ab09..57ec80669c 100644 --- a/js/src/vm/CommonPropertyNames.h +++ b/js/src/vm/CommonPropertyNames.h @@ -162,6 +162,7 @@ macro(global, global, "global") \ macro(globalThis, globalThis, "globalThis") \ macro(group, group, "group") \ + macro(groups, groups, "groups") \ macro(Handle, Handle, "Handle") \ macro(has, has, "has") \ macro(hasOwn, hasOwn, "hasOwn") \ diff --git a/js/src/vm/RegExpObject.cpp b/js/src/vm/RegExpObject.cpp index b2375ab8f0..e96db29edb 100644 --- a/js/src/vm/RegExpObject.cpp +++ b/js/src/vm/RegExpObject.cpp @@ -951,7 +951,8 @@ js::StringHasRegExpMetaChars(JSLinearString* str) /* RegExpShared */ RegExpShared::RegExpShared(JSAtom* source, RegExpFlag flags) - : source(source), flags(flags), parenCount(0), canStringMatch(false), marked_(false) + : source(source), flags(flags), parenCount(0), canStringMatch(false), marked_(false), + numNamedCaptures_(0), groupsTemplate_(nullptr) {} RegExpShared::~RegExpShared() @@ -1006,6 +1007,56 @@ RegExpShared::compile(JSContext* cx, HandleLinearString input, } bool +RegExpShared::initializeNamedCaptures(JSContext* cx, irregexp::CharacterVectorVector* names, irregexp::IntegerVector* indices) +{ + MOZ_ASSERT(!groupsTemplate_); + MOZ_ASSERT(names); + MOZ_ASSERT(indices); + MOZ_ASSERT(names->length() == indices->length()); + + // The irregexp parser returns named capture information in the form + // of two arrays. We create a template object with a property for each + // capture name, and store the capture index as Integer in the corresponding value. + uint32_t numNamedCaptures = names->length(); + + // Create a plain template object. 
+ RootedPlainObject templateObject(cx, NewObjectWithGivenProto<PlainObject>(cx, nullptr, TenuredObject)); + if (!templateObject) { + return false; + } + + // Create a new group for the template. + Rooted<TaggedProto> proto(cx, templateObject->taggedProto()); + ObjectGroup* group = ObjectGroupCompartment::makeGroup(cx, templateObject->getClass(), proto); + if (!group) { + return false; + } + templateObject->setGroup(group); + + // Initialize the properties of the template. + RootedId id(cx); + for (uint32_t i = 0; i < numNamedCaptures; i++) { + irregexp::CharacterVector* cv = (*names)[i]; + // Need to explicitly create an Atom (not a String) or it won't get added to the atom table + JSAtom* atom = AtomizeChars(cx, cv->begin(), cv->length()); + if (!atom) { + return false; + } + id = NameToId(atom->asPropertyName()); + RootedValue idx(cx, Int32Value((*indices)[i])); + if (!NativeDefineProperty(cx, templateObject, id, idx, + nullptr, nullptr, JSPROP_ENUMERATE)) { + return false; + } + AddTypePropertyId(cx, templateObject, id, TypeSet::Int32Type()); + } + + groupsTemplate_ = templateObject; + numNamedCaptures_ = numNamedCaptures; + return true; +} + +bool RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString input, CompilationMode mode, ForceByteCodeEnum force) { @@ -1027,6 +1078,12 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu } this->parenCount = data.capture_count; + if (data.capture_name_list) { + // convert LifoAlloc'd named capture info to NativeObject + if (!initializeNamedCaptures(cx, data.capture_name_list, data.capture_index_list)) { + return false; + } + } irregexp::RegExpCode code = irregexp::CompilePattern(cx, this, &data, input, false /* global() */, @@ -1260,17 +1317,27 @@ RegExpCompartment::createMatchResultTemplateObject(JSContext* cx) return matchResultTemplateObject_; // = nullptr } + /* Set dummy groups property */ + RootedValue groupsVal(cx, UndefinedValue()); + if (!NativeDefineProperty( 
+ cx, templateObject, cx->names().groups, groupsVal, nullptr, nullptr, JSPROP_ENUMERATE)) { + return nullptr; + } + // Make sure that the properties are in the right slots. DebugOnly<Shape*> shape = templateObject->lastProperty(); - MOZ_ASSERT(shape->previous()->slot() == 0 && - shape->previous()->propidRef() == NameToId(cx->names().index)); - MOZ_ASSERT(shape->slot() == 1 && - shape->propidRef() == NameToId(cx->names().input)); + MOZ_ASSERT(shape->slot() == 2 && + shape->propidRef() == NameToId(cx->names().groups)); + MOZ_ASSERT(shape->previous()->slot() == 1 && + shape->previous()->propidRef() == NameToId(cx->names().input)); + MOZ_ASSERT(shape->previous()->previous()->slot() == 0 && + shape->previous()->previous()->propidRef() == NameToId(cx->names().index)); // Make sure type information reflects the indexed properties which might // be added. AddTypePropertyId(cx, templateObject, JSID_VOID, TypeSet::StringType()); AddTypePropertyId(cx, templateObject, JSID_VOID, TypeSet::UndefinedType()); + AddTypePropertyId(cx, templateObject, NameToId(cx->names().groups), TypeSet::AnyObjectType()); matchResultTemplateObject_.set(templateObject); diff --git a/js/src/vm/RegExpObject.h b/js/src/vm/RegExpObject.h index ca7a39ec65..17d961eede 100644 --- a/js/src/vm/RegExpObject.h +++ b/js/src/vm/RegExpObject.h @@ -17,6 +17,7 @@ #include "proxy/Proxy.h" #include "vm/ArrayObject.h" #include "vm/Shape.h" +#include "irregexp/InfallibleVector.h" /* * JavaScript Regular Expressions @@ -133,6 +134,9 @@ class RegExpShared bool canStringMatch; bool marked_; + uint32_t numNamedCaptures_; + GCPtr<PlainObject*> groupsTemplate_; + RegExpCompilation compilationArray[4]; static int CompilationIndex(CompilationMode mode, bool latin1) { @@ -187,6 +191,11 @@ class RegExpShared /* Accounts for the "0" (whole match) pair. 
*/ size_t pairCount() const { return getParenCount() + 1; } + // not public due to circular inclusion problems + bool initializeNamedCaptures(JSContext* cx, irregexp::CharacterVectorVector* names, irregexp::IntegerVector* indices); + PlainObject* getGroupsTemplate() { return groupsTemplate_; } + uint32_t numNamedCaptures() const { return numNamedCaptures_; } + JSAtom* getSource() const { return source; } RegExpFlag getFlags() const { return flags; } bool ignoreCase() const { return flags & IgnoreCaseFlag; } @@ -238,6 +247,10 @@ class RegExpShared + offsetof(RegExpCompilation, jitCode); } + static size_t offsetOfGroupsTemplate() { + return offsetof(RegExpShared, groupsTemplate_); + } + size_t sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf); #ifdef DEBUG |