summaryrefslogtreecommitdiff
path: root/js
diff options
context:
space:
mode:
authorMartok <martok@martoks-place.de>2022-12-21 18:53:27 +0100
committerMartok <martok@martoks-place.de>2022-12-21 18:53:27 +0100
commitb356d64f4e69af8cf1f2eaf87d1f328bdd24127e (patch)
treef2d5df6d2bc16818cc71283363d54f56af80da39 /js
parent9a0807a62c96d72387b218d840eb6bb8afb5916d (diff)
downloaduxp-b356d64f4e69af8cf1f2eaf87d1f328bdd24127e.tar.gz
Issue #1285 - implement named capturing groups and named backrefs
- RegExpParser collects seen groups in named_captures_.
- After irregexp::ParsePattern has finished, RegExpParser::StoreNamedCaptureMap translates the parser data to RegExpCompileData.capture_name_list/capture_index_list.
- RegExpShared::initializeNamedCaptures takes these and builds a PlainObject map which is kept with the compiled expression. This is done because irregexp doesn't have access to the JS context and so can't allocate any JSValues itself.
- For each match result, this map is used to build PlainObjects of name->match/undefined (extremely simplified from upstream at the expense of some perf).
- IonMonkey switches to the non-masm code path for expressions with named groups.
Diffstat (limited to 'js')
-rw-r--r--js/src/builtin/RegExp.cpp69
-rw-r--r--js/src/builtin/RegExp.h3
-rw-r--r--js/src/irregexp/RegExpAST.h13
-rw-r--r--js/src/irregexp/RegExpEngine.h17
-rw-r--r--js/src/irregexp/RegExpParser.cpp292
-rw-r--r--js/src/irregexp/RegExpParser.h43
-rw-r--r--js/src/jit/CodeGenerator.cpp12
-rw-r--r--js/src/js.msg6
-rw-r--r--js/src/vm/CommonPropertyNames.h1
-rw-r--r--js/src/vm/RegExpObject.cpp77
-rw-r--r--js/src/vm/RegExpObject.h13
11 files changed, 512 insertions, 34 deletions
diff --git a/js/src/builtin/RegExp.cpp b/js/src/builtin/RegExp.cpp
index 39db78c8e9..de83b38aed 100644
--- a/js/src/builtin/RegExp.cpp
+++ b/js/src/builtin/RegExp.cpp
@@ -21,6 +21,7 @@
#include "vm/NativeObject-inl.h"
+
using namespace js;
using namespace js::unicode;
@@ -31,11 +32,12 @@ using mozilla::Maybe;
using CapturesVector = GCVector<Value, 4>;
/*
- * ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad 21.2.5.2.2
- * steps 3, 16-25.
+ * ES 2021 draft 21.2.5.2.2: Steps 16-28
+ * https://tc39.es/ecma262/#sec-regexpbuiltinexec
*/
bool
-js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& matches,
+js::CreateRegExpMatchResult(JSContext* cx, RegExpShared& re,
+ HandleString input, const MatchPairs& matches,
MutableHandleValue rval)
{
MOZ_ASSERT(input);
@@ -48,6 +50,7 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs&
* 1..pairCount-1: paren matches
* input: input string
* index: start index for the match
+ * groups: named capture groups for the match
*/
/* Get the templateObject that defines the shape and type of the output object */
@@ -55,15 +58,16 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs&
if (!templateObject)
return false;
+ // Step 16
size_t numPairs = matches.length();
MOZ_ASSERT(numPairs > 0);
- /* Step 17. */
+ /* Step 18-19. */
RootedArrayObject arr(cx, NewDenseFullyAllocatedArrayWithTemplate(cx, numPairs, templateObject));
if (!arr)
return false;
- /* Steps 22-24.
+ /* Steps 22-23 and 27 a-e
* Store a Value for each pair. */
for (size_t i = 0; i < numPairs; i++) {
const MatchPair& pair = matches[i];
@@ -81,6 +85,40 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs&
}
}
+    // Step 24 (reordered)
+    // |groups| stays null when the pattern has no named captures; the |groups|
+    // property of the result is then stored as undefined.
+    RootedPlainObject groups(cx);
+    if (re.numNamedCaptures() > 0) {
+        // construct a new object from the template saved on RegExpShared
+        RootedPlainObject groupsTemplate(cx, re.getGroupsTemplate());
+        groups = NewObjectWithGivenProto<PlainObject>(cx, nullptr);
+        // Object creation can fail; bail out before dereferencing the result.
+        if (!groups)
+            return false;
+        groups->setGroup(groupsTemplate->group());
+
+        // Step 27 f.
+        // The groups template object stores the names of the named captures in
+        // the order in which they are defined.
+        // Grab the index into the match vector from the template object and define the
+        // corresponding property on the result
+        AutoIdVector keys(cx);
+        if (!GetPropertyKeys(cx, groupsTemplate, 0, &keys)) {
+            return false;
+        }
+        MOZ_ASSERT(keys.length() == re.numNamedCaptures());
+        RootedId key(cx);
+        RootedValue ival(cx);
+        RootedValue val(cx);
+        for (size_t i = 0; i < keys.length(); i++) {
+            key = keys[i];
+            // fetch the group's match index...
+            if (!NativeGetProperty(cx, groupsTemplate, key, &ival))
+                return false;
+            // ... and set it on groups. The value comes straight from the
+            // already-populated result array, so an unmatched group yields
+            // whatever was stored for that pair above.
+            val = arr->getDenseElement(ival.toInt32());
+            if (!NativeDefineProperty(cx, groups, key, val, nullptr, nullptr, JSPROP_ENUMERATE)) {
+                return false;
+            }
+        }
+    }
+
/* Step 20 (reordered).
* Set the |index| property. (TemplateObject positions it in slot 0) */
arr->setSlot(0, Int32Value(matches[0].start));
@@ -89,6 +127,10 @@ js::CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs&
* Set the |input| property. (TemplateObject positions it in slot 1) */
arr->setSlot(1, StringValue(input));
+ // Steps 25-26 (reordered)
+ // Set the |groups| property.
+ arr->setSlot(2, groups ? ObjectValue(*groups) : UndefinedValue());
+
#ifdef DEBUG
RootedValue test(cx);
RootedId id(cx, NameToId(cx->names().index));
@@ -170,7 +212,7 @@ js::ExecuteRegExpLegacy(JSContext* cx, RegExpStatics* res, Handle<RegExpObject*>
return true;
}
- return CreateRegExpMatchResult(cx, input, matches, rval);
+ return CreateRegExpMatchResult(cx, *shared, input, matches, rval);
}
static bool
@@ -1027,7 +1069,11 @@ RegExpMatcherImpl(JSContext* cx, HandleObject regexp, HandleString string,
}
/* Steps 16-25 */
- return CreateRegExpMatchResult(cx, string, matches, rval);
+ Rooted<RegExpObject*> reobj(cx, &regexp->as<RegExpObject>());
+ RegExpGuard shared(cx);
+ if (!RegExpObject::getShared(cx, reobj, &shared))
+ return false;
+ return CreateRegExpMatchResult(cx, *shared, string, matches, rval);
}
/*
@@ -1069,8 +1115,13 @@ js::RegExpMatcherRaw(JSContext* cx, HandleObject regexp, HandleString input,
// The MatchPairs will always be passed in, but RegExp execution was
// successful only if the pairs have actually been filled in.
- if (maybeMatches && maybeMatches->pairsRaw()[0] >= 0)
- return CreateRegExpMatchResult(cx, input, *maybeMatches, output);
+ if (maybeMatches && maybeMatches->pairsRaw()[0] >= 0) {
+ Rooted<RegExpObject*> reobj(cx, &regexp->as<RegExpObject>());
+ RegExpGuard shared(cx);
+ if (!RegExpObject::getShared(cx, reobj, &shared))
+ return false;
+ return CreateRegExpMatchResult(cx, *shared, input, *maybeMatches, output);
+ }
return RegExpMatcherImpl(cx, regexp, input, lastIndex,
UpdateRegExpStatics, output);
}
diff --git a/js/src/builtin/RegExp.h b/js/src/builtin/RegExp.h
index 275efd7ce3..cb88319ac9 100644
--- a/js/src/builtin/RegExp.h
+++ b/js/src/builtin/RegExp.h
@@ -36,7 +36,8 @@ ExecuteRegExpLegacy(JSContext* cx, RegExpStatics* res, Handle<RegExpObject*> reo
/* Translation from MatchPairs to a JS array in regexp_exec()'s output format. */
MOZ_MUST_USE bool
-CreateRegExpMatchResult(JSContext* cx, HandleString input, const MatchPairs& matches,
+CreateRegExpMatchResult(JSContext* cx, RegExpShared& re,
+ HandleString input, const MatchPairs& matches,
MutableHandleValue rval);
extern MOZ_MUST_USE bool
diff --git a/js/src/irregexp/RegExpAST.h b/js/src/irregexp/RegExpAST.h
index bd01f6c6cd..9e023d537f 100644
--- a/js/src/irregexp/RegExpAST.h
+++ b/js/src/irregexp/RegExpAST.h
@@ -339,7 +339,7 @@ class RegExpCapture : public RegExpTree
{
public:
explicit RegExpCapture(RegExpTree* body, int index)
- : body_(body), index_(index)
+ : body_(body), index_(index), name_(nullptr)
{}
virtual void* Accept(RegExpVisitor* visitor, void* data);
@@ -359,12 +359,15 @@ class RegExpCapture : public RegExpTree
RegExpTree* body() { return body_; }
void set_body(RegExpTree* body) { body_ = body; }
int index() { return index_; }
+ const CharacterVector* name() const { return name_; }
+ void set_name(const CharacterVector* name) { name_ = name; }
static int StartRegister(int index) { return index * 2; }
static int EndRegister(int index) { return index * 2 + 1; }
private:
RegExpTree* body_;
int index_;
+ const CharacterVector* name_;
};
class RegExpLookaround : public RegExpTree
@@ -413,7 +416,7 @@ class RegExpBackReference : public RegExpTree
{
public:
explicit RegExpBackReference(RegExpCapture* capture)
- : capture_(capture)
+ : capture_(capture), name_(nullptr)
{}
virtual void* Accept(RegExpVisitor* visitor, void* data);
@@ -427,10 +430,16 @@ class RegExpBackReference : public RegExpTree
int max_match() override { return kInfinity; }
int index() { return capture_->index(); }
RegExpCapture* capture() { return capture_; }
+ void set_capture(RegExpCapture* capture) { capture_ = capture; }
+ const CharacterVector* name() const { return name_; }
+ void set_name(const CharacterVector* name) { name_ = name; }
private:
RegExpCapture* capture_;
+ const CharacterVector* name_;
};
+typedef InfallibleVector<RegExpBackReference*, 1> RegExpBackReferenceVector;
+
class RegExpEmpty : public RegExpTree
{
public:
diff --git a/js/src/irregexp/RegExpEngine.h b/js/src/irregexp/RegExpEngine.h
index 7505636100..1b011458cd 100644
--- a/js/src/irregexp/RegExpEngine.h
+++ b/js/src/irregexp/RegExpEngine.h
@@ -57,13 +57,28 @@ struct RegExpCompileData
: tree(nullptr),
simple(true),
contains_anchor(false),
- capture_count(0)
+ capture_count(0),
+ capture_name_list(nullptr),
+ capture_index_list(nullptr)
{}
+ // The parsed AST as produced by the RegExpParser.
RegExpTree* tree;
+ // True, iff the pattern is a 'simple' atom with zero captures. In other
+ // words, the pattern consists of a string with no metacharacters and special
+ // regexp features, and can be implemented as a standard string search.
bool simple;
+
+ // True, iff the pattern is anchored at the start of the string with '^'.
bool contains_anchor;
+
+ // The number of capture groups, without the global capture \0.
int capture_count;
+
+ // Only used if the pattern contains named captures. If so, these hold the
+ // mapping of capture names to capture indices as two parallel lists:
+ // capture_index_list[i] is the index of the group named capture_name_list[i].
+ CharacterVectorVector* capture_name_list;
+ IntegerVector* capture_index_list;
};
struct RegExpCode
diff --git a/js/src/irregexp/RegExpParser.cpp b/js/src/irregexp/RegExpParser.cpp
index c6b8727048..c46b8cf8ec 100644
--- a/js/src/irregexp/RegExpParser.cpp
+++ b/js/src/irregexp/RegExpParser.cpp
@@ -245,6 +245,8 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
: ts(ts),
alloc(alloc),
captures_(nullptr),
+ named_captures_(nullptr),
+ named_back_references_(nullptr),
next_pos_(chars),
captures_started_(0),
end_(end),
@@ -257,7 +259,8 @@ RegExpParser<CharT>::RegExpParser(frontend::TokenStream& ts, LifoAlloc* alloc,
dotall_(dotall),
simple_(false),
contains_anchor_(false),
- is_scanned_for_captures_(false)
+ is_scanned_for_captures_(false),
+ has_named_captures_(false)
{
Advance();
}
@@ -272,6 +275,30 @@ RegExpParser<CharT>::ReportError(unsigned errorNumber, const char* param /* = nu
}
template <typename CharT>
+bool
+RegExpParser<CharT>::StoreNamedCaptureMap(CharacterVectorVector** names, IntegerVector** indices)
+{
+ // Any named captures defined at all?
+ if (!named_captures_ || !named_captures_->length()) {
+ return true;
+ }
+
+ CharacterVectorVector* nv = alloc->newInfallible<CharacterVectorVector>(*alloc);
+ IntegerVector* iv = alloc->newInfallible<IntegerVector>(*alloc);
+
+ for (size_t i=0; i<named_captures_->length(); i++) {
+ RegExpCapture* capture = (*named_captures_)[i];
+ const CharacterVector* cn = capture->name();
+ nv->append(const_cast<CharacterVector*>(cn));
+ iv->append(capture->index());
+ }
+
+ *names = nv;
+ *indices = iv;
+ return true;
+}
+
+template <typename CharT>
void
RegExpParser<CharT>::Advance()
{
@@ -1165,6 +1192,7 @@ template <typename CharT>
void
RegExpParser<CharT>::ScanForCaptures()
{
+ const CharT* saved_position = position();
// Start with captures started previous to current position
int capture_count = captures_started();
// Add count of captures after this position.
@@ -1188,12 +1216,32 @@ RegExpParser<CharT>::ScanForCaptures()
break;
}
case '(':
- if (current() != '?') capture_count++;
+ if (current() == '?') {
+ // At this point we could be in
+ // * a non-capturing group '(?:',
+ // * a lookbehind assertion '(?<=' '(?<!'
+ // * or a named capture '(?<'.
+ //
+ // Of these, only named captures are capturing groups.
+
+ Advance();
+ if (current() != '<') break;
+
+ Advance();
+ if (current() == '=' || current() == '!') break;
+
+ // Found a possible named capture. It could turn out to be a syntax
+ // error (e.g. an unterminated or invalid name), but that distinction
+ // does not matter for our purposes.
+ has_named_captures_ = true;
+ }
+ capture_count++;
break;
}
}
capture_count_ = capture_count;
is_scanned_for_captures_ = true;
+ Reset(saved_position);
}
inline bool
@@ -1251,9 +1299,168 @@ RegExpParser<CharT>::ParseBackReferenceIndex(int* index_out)
return true;
}
+static void push_code_unit(CharacterVector* v, uint32_t code_unit)
+{
+ // based off of unicode::UTF16Encode
+ if (!unicode::IsSupplementary(code_unit)) {
+ v->append(char16_t(code_unit));
+ } else {
+ v->append(unicode::LeadSurrogate(code_unit));
+ v->append(unicode::TrailSurrogate(code_unit));
+ }
+}
+
+template <typename CharT>
+const CharacterVector*
+RegExpParser<CharT>::ParseCaptureGroupName()
+{
+ CharacterVector* name = alloc->newInfallible<CharacterVector>(*alloc);
+
+ bool at_start = true;
+ while (true) {
+ widechar c = current();
+ Advance();
+
+ // Convert unicode escapes.
+ if (c == '\\' && current() == 'u') {
+ Advance();
+ if (!ParseUnicodeEscape(&c)) {
+ ReportError(JSMSG_INVALID_UNICODE_ESCAPE);
+ return nullptr;
+ }
+ }
+
+ // The backslash char is misclassified as both ID_Start and ID_Continue.
+ if (c == '\\') {
+ ReportError(JSMSG_INVALID_CAPTURE_NAME);
+ return nullptr;
+ }
+
+ if (at_start) {
+ if (!unicode::IsIdentifierStart(c)) {
+ ReportError(JSMSG_INVALID_CAPTURE_NAME);
+ return nullptr;
+ }
+ push_code_unit(name, c);
+ at_start = false;
+ } else {
+ if (c == '>') {
+ break;
+ } else if (unicode::IsIdentifierPart(c)) {
+ push_code_unit(name, c);
+ } else {
+ ReportError(JSMSG_INVALID_CAPTURE_NAME);
+ return nullptr;
+ }
+ }
+ }
+
+ return name;
+}
+
+// Attaches |name| to the capture group with the given 1-based |index| and
+// records that group in named_captures_.
+//
+// Returns true on success. If a previously recorded group already carries the
+// same name, reports JSMSG_DUPLICATE_CAPTURE_NAME and returns false.
+template <typename CharT>
+bool
+RegExpParser<CharT>::CreateNamedCaptureAtIndex(const CharacterVector* name,
+                                               int index)
+{
+    MOZ_ASSERT(0 < index && index <= captures_started_);
+    // Note: this used to read `name !== nullptr`; `!==` is not a C++ operator.
+    MOZ_ASSERT(name != nullptr);
+
+    RegExpCapture* capture = GetCapture(index);
+    MOZ_ASSERT(capture->name() == nullptr);
+
+    capture->set_name(name);
+
+    if (named_captures_ == nullptr) {
+        // First named group: nothing to collide with, just create the list.
+        named_captures_ = alloc->newInfallible<RegExpCaptureVector>(*alloc);
+    } else {
+        // Check for duplicates and bail if we find any. |capture| itself has
+        // not been appended yet, so it cannot match its own name here.
+        if (FindNamedCapture(name) != nullptr) {
+            ReportError(JSMSG_DUPLICATE_CAPTURE_NAME);
+            return false;
+        }
+    }
+    named_captures_->append(capture);
+    return true;
+}
+
template <typename CharT>
RegExpCapture*
-RegExpParser<CharT>::GetCapture(int index) {
+RegExpParser<CharT>::FindNamedCapture(const CharacterVector* name)
+{
+ // Linear search is fine since there are usually very few named groups
+ for (auto it=named_captures_->begin(); it<named_captures_->end(); it++) {
+ if (*(*it)->name() == *name) {
+ return *it;
+ }
+ }
+ return nullptr;
+}
+
+template <typename CharT>
+bool
+RegExpParser<CharT>::ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state)
+{
+ // The parser is assumed to be on the '<' in \k<name>.
+ if (current() != '<') {
+ ReportError(JSMSG_INVALID_NAMED_REF);
+ return false;
+ }
+
+ Advance();
+ const CharacterVector* name = ParseCaptureGroupName();
+ if (name == nullptr) {
+ return false;
+ }
+
+ if (state->IsInsideCaptureGroup(name)) {
+ builder->AddEmpty();
+ } else {
+ RegExpBackReference* atom = alloc->newInfallible<RegExpBackReference>(nullptr);
+ atom->set_name(name);
+
+ builder->AddAtom(atom);
+
+ if (named_back_references_ == nullptr) {
+ named_back_references_ = alloc->newInfallible<RegExpBackReferenceVector>(*alloc);
+ }
+ named_back_references_->append(atom);
+ }
+
+ return true;
+}
+
+template <typename CharT>
+void
+RegExpParser<CharT>::PatchNamedBackReferences()
+{
+ if (named_back_references_ == nullptr) return;
+
+ if (named_captures_ == nullptr) {
+ // Named backrefs but no named groups
+ ReportError(JSMSG_INVALID_NAMED_CAPTURE_REF);
+ return;
+ }
+
+ // Look up and patch the actual capture for each named back reference.
+ for (size_t i = 0; i < named_back_references_->length(); i++) {
+ RegExpBackReference* ref = (*named_back_references_)[i];
+
+ RegExpCapture* capture = FindNamedCapture(ref->name());
+ if (capture == nullptr) {
+ ReportError(JSMSG_INVALID_NAMED_CAPTURE_REF);
+ return;
+ }
+
+ ref->set_capture(capture);
+ }
+}
+
+template <typename CharT>
+RegExpCapture*
+RegExpParser<CharT>::GetCapture(int index)
+{
// The index for the capture groups are one-based. Its index in the list is
// zero-based.
int known_captures =
@@ -1269,10 +1476,21 @@ RegExpParser<CharT>::GetCapture(int index) {
return (*captures_)[index - 1];
}
+template <typename CharT>
+bool
+RegExpParser<CharT>::HasNamedCaptures() {
+ if (has_named_captures_ || is_scanned_for_captures_) {
+ return has_named_captures_;
+ }
+
+ ScanForCaptures();
+ return has_named_captures_;
+}
template <typename CharT>
bool
-RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) {
+RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index)
+{
for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue;
// Return true if we found the matching capture index.
@@ -1283,6 +1501,18 @@ RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(int index) {
return false;
}
+template <typename CharT>
+bool
+RegExpParser<CharT>::RegExpParserState::IsInsideCaptureGroup(const CharacterVector* name)
+{
+ for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
+ if (s->group_type() != CAPTURE) continue;
+ if (!s->IsNamedCapture()) continue;
+ if (*s->capture_name() == *name) return true;
+ }
+ return false;
+}
+
// QuantifierPrefix ::
// { DecimalDigits }
// { DecimalDigits , }
@@ -1359,6 +1589,7 @@ RegExpTree*
RegExpParser<CharT>::ParsePattern()
{
RegExpTree* result = ParseDisjunction();
+ PatchNamedBackReferences();
MOZ_ASSERT_IF(result, !has_more());
return result;
}
@@ -1525,7 +1756,7 @@ RegExpTree*
RegExpParser<CharT>::ParseDisjunction()
{
// Used to store current state while parsing subexpressions.
- RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0);
+ RegExpParserState initial_state(alloc, nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, 0, nullptr);
RegExpParserState* state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@@ -1556,6 +1787,11 @@ RegExpParser<CharT>::ParseDisjunction()
// Build result of subexpression.
if (group_type == CAPTURE) {
+ if (state->IsNamedCapture()) {
+ if (!CreateNamedCaptureAtIndex(state->capture_name(), capture_index)) {
+ return nullptr;
+ }
+ }
RegExpCapture* capture = GetCapture(capture_index);
capture->set_body(body);
body = capture;
@@ -1635,6 +1871,8 @@ RegExpParser<CharT>::ParseDisjunction()
case '(': {
SubexpressionType subexpr_type = CAPTURE;
RegExpLookaround::Type lookaround_type = state->lookaround_type();
+ bool is_named_capture = false;
+ const CharacterVector* capture_name = nullptr;
Advance();
if (current() == '?') {
switch (Next()) {
@@ -1659,21 +1897,30 @@ RegExpParser<CharT>::ParseDisjunction()
subexpr_type = NEGATIVE_LOOKAROUND;
break;
}
- // We didn't get a positive or negative after '<'.
- // That's an error.
- return ReportError(JSMSG_INVALID_GROUP);
+ // Not a lookbehind, continue parsing as named group
+ is_named_capture = true;
+ has_named_captures_ = true;
+ break;
default:
return ReportError(JSMSG_INVALID_GROUP);
}
- Advance(2);
- } else {
- if (captures_started() >= kMaxCaptures)
- return ReportError(JSMSG_TOO_MANY_PARENS);
- captures_started_++;
+ Advance(is_named_capture ? 1 : 2);
+ }
+ if (subexpr_type == CAPTURE) {
+ if (captures_started() >= kMaxCaptures)
+ return ReportError(JSMSG_TOO_MANY_PARENS);
+ captures_started_++;
+
+ if (is_named_capture) {
+ capture_name = ParseCaptureGroupName();
+ if (!capture_name)
+ return nullptr;
+ }
}
// Store current state and begin new disjunction parsing.
state = alloc->newInfallible<RegExpParserState>(alloc, state, subexpr_type,
- lookaround_type, captures_started_);
+ lookaround_type, captures_started_,
+ capture_name);
builder = state->builder();
continue;
}
@@ -1834,6 +2081,22 @@ RegExpParser<CharT>::ParseDisjunction()
}
break;
}
+              case 'k': {
+                // Either an identity escape or a named back-reference. The two
+                // interpretations are mutually exclusive: '\k' is interpreted as
+                // an identity escape for non-Unicode patterns without named
+                // capture groups, and as the beginning of a named back-reference
+                // in all other cases.
+                if (unicode_ || HasNamedCaptures()) {
+                    // Step over '\k' so the parser sits on the expected '<'.
+                    Advance(2);
+                    // ParseNamedBackReference reports its own, more specific
+                    // error on every failure path; just propagate the failure
+                    // instead of reporting a second, generic one.
+                    if (!ParseNamedBackReference(builder, state))
+                        return nullptr;
+                } else {
+                    // Identity escape: emit 'k' and consume the '\k' pair.
+                    // Without the Advance the parser would still be positioned
+                    // on the backslash and would see the same escape again.
+                    builder->AddCharacter('k');
+                    Advance(2);
+                }
+                break;
+              }
default:
// Identity escape.
if (unicode_ && !IsSyntaxCharacter(Next()))
@@ -1962,6 +2225,7 @@ ParsePattern(frontend::TokenStream& ts, LifoAlloc& alloc, const CharT* chars, si
data->simple = parser.simple();
data->contains_anchor = parser.contains_anchor();
data->capture_count = parser.captures_started();
+ parser.StoreNamedCaptureMap(&data->capture_name_list, &data->capture_index_list);
return true;
}
diff --git a/js/src/irregexp/RegExpParser.h b/js/src/irregexp/RegExpParser.h
index 28a2d58220..48236530ab 100644
--- a/js/src/irregexp/RegExpParser.h
+++ b/js/src/irregexp/RegExpParser.h
@@ -204,6 +204,10 @@ class RegExpParser
bool ParseRawSurrogatePair(char16_t* lead, char16_t* trail);
widechar ParseOctalLiteral();
+
+ // Parses the name of a capture group (?<name>pattern). The name must adhere
+ // to IdentifierName in the ECMAScript standard.
+ const CharacterVector* ParseCaptureGroupName();
// Tries to parse the input as a back reference. If successful it
// stores the result in the output parameter and returns true. If
@@ -218,6 +222,11 @@ class RegExpParser
next_pos_ += dist - 1;
Advance();
}
+
+ bool StoreNamedCaptureMap(CharacterVectorVector** names, IntegerVector** indices);
+ // Returns true iff the pattern contains named captures. May call
+ // ScanForCaptures to look ahead at the remaining pattern.
+ bool HasNamedCaptures();
void Reset(const CharT* pos) {
next_pos_ = pos;
@@ -251,12 +260,14 @@ class RegExpParser
RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
- int disjunction_capture_index)
+ int disjunction_capture_index,
+ const CharacterVector* capture_name)
: previous_state_(previous_state),
builder_(alloc->newInfallible<RegExpBuilder>(alloc)),
group_type_(group_type),
lookaround_type_(lookaround_type),
- disjunction_capture_index_(disjunction_capture_index)
+ disjunction_capture_index_(disjunction_capture_index),
+ capture_name_(capture_name)
{}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
@@ -271,9 +282,15 @@ class RegExpParser
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
int capture_index() { return disjunction_capture_index_; }
+ // The name of the current sub-expression, if group_type is CAPTURE. Only
+ // used for named captures.
+ const CharacterVector* capture_name() const { return capture_name_; }
+ bool IsNamedCapture() const { return capture_name_ != nullptr; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(int index);
+ // Check whether the parser is inside a capture group with the given name.
+ bool IsInsideCaptureGroup(const CharacterVector* name);
private:
// Linked list implementation of stack of states.
@@ -286,11 +303,29 @@ class RegExpParser
RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
int disjunction_capture_index_;
+ // Stored capture name (if any).
+ const CharacterVector* const capture_name_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(int index);
+ // Creates a new named capture at the specified index. Must be called exactly
+ // once for each named capture. Fails if a capture with the same name is
+ // encountered.
+ bool CreateNamedCaptureAtIndex(const CharacterVector* name, int index);
+
+ // Find a named capture group by name, or return null if not found
+ RegExpCapture* FindNamedCapture(const CharacterVector* name);
+
+ bool ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state);
+
+ // After the initial parsing pass, patch corresponding RegExpCapture objects
+ // into all RegExpBackReferences. This is done after initial parsing in order
+ // to avoid complicating cases in which references come before the capture.
+ void PatchNamedBackReferences();
+
widechar current() { return current_; }
bool has_more() { return has_more_; }
bool has_next() { return next_pos_ < end_; }
@@ -304,6 +339,9 @@ class RegExpParser
frontend::TokenStream& ts;
LifoAlloc* alloc;
RegExpCaptureVector* captures_;
+ // contains the subset of captures_ that have names (for duplicate checking)
+ RegExpCaptureVector* named_captures_;
+ RegExpBackReferenceVector* named_back_references_;
const CharT* next_pos_;
const CharT* end_;
widechar current_;
@@ -318,6 +356,7 @@ class RegExpParser
bool simple_;
bool contains_anchor_;
bool is_scanned_for_captures_;
+ bool has_named_captures_; // Only valid after we have scanned for captures.
};
} } // namespace js::irregexp
diff --git a/js/src/jit/CodeGenerator.cpp b/js/src/jit/CodeGenerator.cpp
index 66e8e25ddf..3f1b7251a3 100644
--- a/js/src/jit/CodeGenerator.cpp
+++ b/js/src/jit/CodeGenerator.cpp
@@ -1513,6 +1513,16 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx)
return nullptr;
}
+ // If a regexp has named captures, fall back to the OOL stub, which
+ // will end up calling CreateRegExpMatchResult.
+ Register shared = temp2;
+ masm.loadPtr(Address(regexp, NativeObject::getFixedSlotOffset(RegExpObject::PRIVATE_SLOT)),
+ shared);
+ masm.branchPtr(Assembler::NotEqual,
+ Address(shared, RegExpShared::offsetOfGroupsTemplate()),
+ ImmWord(0),
+ &oolEntry);
+
// Construct the result.
Register object = temp1;
Label matchResultFallback, matchResultJoin;
@@ -1523,6 +1533,7 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx)
masm.loadPtr(Address(object, NativeObject::offsetOfSlots()), temp2);
masm.storeValue(templateObject->getSlot(0), Address(temp2, 0));
masm.storeValue(templateObject->getSlot(1), Address(temp2, sizeof(Value)));
+ masm.storeValue(templateObject->getSlot(2), Address(temp2, 2 * sizeof(Value)));
size_t elementsOffset = NativeObject::offsetOfFixedElements();
@@ -1636,6 +1647,7 @@ JitCompartment::generateRegExpMatcherStub(JSContext* cx)
MOZ_ASSERT(templateObject->numFixedSlots() == 0);
MOZ_ASSERT(templateObject->lookupPure(cx->names().index)->slot() == 0);
MOZ_ASSERT(templateObject->lookupPure(cx->names().input)->slot() == 1);
+ MOZ_ASSERT(templateObject->lookupPure(cx->names().groups)->slot() == 2);
masm.load32(pairsVectorAddress, temp3);
masm.storeValue(JSVAL_TYPE_INT32, temp3, Address(temp2, 0));
diff --git a/js/src/js.msg b/js/src/js.msg
index 51854fc398..93d8a557b1 100644
--- a/js/src/js.msg
+++ b/js/src/js.msg
@@ -513,6 +513,12 @@ MSG_DEF(JSMSG_TOO_MANY_PARENS, 0, JSEXN_INTERNALERR, "too many parenthes
MSG_DEF(JSMSG_UNICODE_OVERFLOW, 1, JSEXN_SYNTAXERR, "Unicode codepoint must not be greater than 0x10FFFF in {0}")
MSG_DEF(JSMSG_UNMATCHED_RIGHT_PAREN, 0, JSEXN_SYNTAXERR, "unmatched ) in regular expression")
MSG_DEF(JSMSG_UNTERM_CLASS, 0, JSEXN_SYNTAXERR, "unterminated character class")
+MSG_DEF(JSMSG_INVALID_PROPERTY_NAME, 0, JSEXN_SYNTAXERR, "invalid property name in regular expression")
+MSG_DEF(JSMSG_INVALID_CLASS_PROPERTY_NAME, 0, JSEXN_SYNTAXERR, "invalid class property name in regular expression")
+MSG_DEF(JSMSG_INVALID_CAPTURE_NAME, 0, JSEXN_SYNTAXERR, "invalid capture group name in regular expression")
+MSG_DEF(JSMSG_DUPLICATE_CAPTURE_NAME, 0, JSEXN_SYNTAXERR, "duplicate capture group name in regular expression")
+MSG_DEF(JSMSG_INVALID_NAMED_REF, 0, JSEXN_SYNTAXERR, "invalid named reference in regular expression")
+MSG_DEF(JSMSG_INVALID_NAMED_CAPTURE_REF, 0, JSEXN_SYNTAXERR, "invalid named capture reference in regular expression")
// Self-hosting
MSG_DEF(JSMSG_DEFAULT_LOCALE_ERROR, 0, JSEXN_ERR, "internal error getting the default locale")
diff --git a/js/src/vm/CommonPropertyNames.h b/js/src/vm/CommonPropertyNames.h
index 5080e6ab09..57ec80669c 100644
--- a/js/src/vm/CommonPropertyNames.h
+++ b/js/src/vm/CommonPropertyNames.h
@@ -162,6 +162,7 @@
macro(global, global, "global") \
macro(globalThis, globalThis, "globalThis") \
macro(group, group, "group") \
+ macro(groups, groups, "groups") \
macro(Handle, Handle, "Handle") \
macro(has, has, "has") \
macro(hasOwn, hasOwn, "hasOwn") \
diff --git a/js/src/vm/RegExpObject.cpp b/js/src/vm/RegExpObject.cpp
index b2375ab8f0..e96db29edb 100644
--- a/js/src/vm/RegExpObject.cpp
+++ b/js/src/vm/RegExpObject.cpp
@@ -951,7 +951,8 @@ js::StringHasRegExpMetaChars(JSLinearString* str)
/* RegExpShared */
RegExpShared::RegExpShared(JSAtom* source, RegExpFlag flags)
- : source(source), flags(flags), parenCount(0), canStringMatch(false), marked_(false)
+ : source(source), flags(flags), parenCount(0), canStringMatch(false), marked_(false),
+ numNamedCaptures_(0), groupsTemplate_(nullptr)
{}
RegExpShared::~RegExpShared()
@@ -1006,6 +1007,56 @@ RegExpShared::compile(JSContext* cx, HandleLinearString input,
}
bool
+RegExpShared::initializeNamedCaptures(JSContext* cx, irregexp::CharacterVectorVector* names, irregexp::IntegerVector* indices)
+{
+ MOZ_ASSERT(!groupsTemplate_);
+ MOZ_ASSERT(names);
+ MOZ_ASSERT(indices);
+ MOZ_ASSERT(names->length() == indices->length());
+
+ // The irregexp parser returns named capture information in the form
+ // of two arrays. We create a template object with a property for each
+ // capture name, and store the capture index as Integer in the corresponding value.
+ uint32_t numNamedCaptures = names->length();
+
+ // Create a plain template object.
+ RootedPlainObject templateObject(cx, NewObjectWithGivenProto<PlainObject>(cx, nullptr, TenuredObject));
+ if (!templateObject) {
+ return false;
+ }
+
+ // Create a new group for the template.
+ Rooted<TaggedProto> proto(cx, templateObject->taggedProto());
+ ObjectGroup* group = ObjectGroupCompartment::makeGroup(cx, templateObject->getClass(), proto);
+ if (!group) {
+ return false;
+ }
+ templateObject->setGroup(group);
+
+ // Initialize the properties of the template.
+ RootedId id(cx);
+ for (uint32_t i = 0; i < numNamedCaptures; i++) {
+ irregexp::CharacterVector* cv = (*names)[i];
+ // Need to explicitly create an Atom (not a String) or it won't get added to the atom table
+ JSAtom* atom = AtomizeChars(cx, cv->begin(), cv->length());
+ if (!atom) {
+ return false;
+ }
+ id = NameToId(atom->asPropertyName());
+ RootedValue idx(cx, Int32Value((*indices)[i]));
+ if (!NativeDefineProperty(cx, templateObject, id, idx,
+ nullptr, nullptr, JSPROP_ENUMERATE)) {
+ return false;
+ }
+ AddTypePropertyId(cx, templateObject, id, TypeSet::Int32Type());
+ }
+
+ groupsTemplate_ = templateObject;
+ numNamedCaptures_ = numNamedCaptures;
+ return true;
+}
+
+bool
RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString input,
CompilationMode mode, ForceByteCodeEnum force)
{
@@ -1027,6 +1078,12 @@ RegExpShared::compile(JSContext* cx, HandleAtom pattern, HandleLinearString inpu
}
this->parenCount = data.capture_count;
+ if (data.capture_name_list) {
+ // convert LifoAlloc'd named capture info to NativeObject
+ if (!initializeNamedCaptures(cx, data.capture_name_list, data.capture_index_list)) {
+ return false;
+ }
+ }
irregexp::RegExpCode code = irregexp::CompilePattern(cx, this, &data, input,
false /* global() */,
@@ -1260,17 +1317,27 @@ RegExpCompartment::createMatchResultTemplateObject(JSContext* cx)
return matchResultTemplateObject_; // = nullptr
}
+ /* Set dummy groups property */
+ RootedValue groupsVal(cx, UndefinedValue());
+ if (!NativeDefineProperty(
+ cx, templateObject, cx->names().groups, groupsVal, nullptr, nullptr, JSPROP_ENUMERATE)) {
+ return nullptr;
+ }
+
// Make sure that the properties are in the right slots.
DebugOnly<Shape*> shape = templateObject->lastProperty();
- MOZ_ASSERT(shape->previous()->slot() == 0 &&
- shape->previous()->propidRef() == NameToId(cx->names().index));
- MOZ_ASSERT(shape->slot() == 1 &&
- shape->propidRef() == NameToId(cx->names().input));
+ MOZ_ASSERT(shape->slot() == 2 &&
+ shape->propidRef() == NameToId(cx->names().groups));
+ MOZ_ASSERT(shape->previous()->slot() == 1 &&
+ shape->previous()->propidRef() == NameToId(cx->names().input));
+ MOZ_ASSERT(shape->previous()->previous()->slot() == 0 &&
+ shape->previous()->previous()->propidRef() == NameToId(cx->names().index));
// Make sure type information reflects the indexed properties which might
// be added.
AddTypePropertyId(cx, templateObject, JSID_VOID, TypeSet::StringType());
AddTypePropertyId(cx, templateObject, JSID_VOID, TypeSet::UndefinedType());
+ AddTypePropertyId(cx, templateObject, NameToId(cx->names().groups), TypeSet::AnyObjectType());
matchResultTemplateObject_.set(templateObject);
diff --git a/js/src/vm/RegExpObject.h b/js/src/vm/RegExpObject.h
index ca7a39ec65..17d961eede 100644
--- a/js/src/vm/RegExpObject.h
+++ b/js/src/vm/RegExpObject.h
@@ -17,6 +17,7 @@
#include "proxy/Proxy.h"
#include "vm/ArrayObject.h"
#include "vm/Shape.h"
+#include "irregexp/InfallibleVector.h"
/*
* JavaScript Regular Expressions
@@ -133,6 +134,9 @@ class RegExpShared
bool canStringMatch;
bool marked_;
+ uint32_t numNamedCaptures_;
+ GCPtr<PlainObject*> groupsTemplate_;
+
RegExpCompilation compilationArray[4];
static int CompilationIndex(CompilationMode mode, bool latin1) {
@@ -187,6 +191,11 @@ class RegExpShared
/* Accounts for the "0" (whole match) pair. */
size_t pairCount() const { return getParenCount() + 1; }
+ // not public due to circular inclusion problems
+ bool initializeNamedCaptures(JSContext* cx, irregexp::CharacterVectorVector* names, irregexp::IntegerVector* indices);
+ PlainObject* getGroupsTemplate() { return groupsTemplate_; }
+ uint32_t numNamedCaptures() const { return numNamedCaptures_; }
+
JSAtom* getSource() const { return source; }
RegExpFlag getFlags() const { return flags; }
bool ignoreCase() const { return flags & IgnoreCaseFlag; }
@@ -238,6 +247,10 @@ class RegExpShared
+ offsetof(RegExpCompilation, jitCode);
}
+ static size_t offsetOfGroupsTemplate() {
+ return offsetof(RegExpShared, groupsTemplate_);
+ }
+
size_t sizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf);
#ifdef DEBUG