summaryrefslogtreecommitdiff
path: root/js/src/regexp/regexp-parser.h
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/regexp/regexp-parser.h')
-rw-r--r--js/src/regexp/regexp-parser.h359
1 files changed, 359 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h
new file mode 100644
index 0000000000..91677d6c35
--- /dev/null
+++ b/js/src/regexp/regexp-parser.h
@@ -0,0 +1,359 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_PARSER_H_
+#define V8_REGEXP_REGEXP_PARSER_H_
+
+#include "regexp/regexp-ast.h"
+
+namespace v8 {
+namespace internal {
+
+struct RegExpCompileData;
+
+// A BufferedZoneList is an automatically growing list, just like (and backed
+// by) a ZoneList, that is optimized for the case of adding and removing
+// a single element. The last element added is stored outside the backing list,
+// and if no more than one element is ever added, the ZoneList isn't even
+// allocated.
+// Elements must not be nullptr pointers.
+template <typename T, int initial_size>
+class BufferedZoneList {
+ public:
+ BufferedZoneList() : list_(nullptr), last_(nullptr) {}
+
+ // Adds element at end of list. This element is buffered and can
+ // be read using last() or removed using RemoveLast until a new Add or until
+ // RemoveLast or GetList has been called.
+ void Add(T* value, Zone* zone) {
+ if (last_ != nullptr) {
+ if (list_ == nullptr) {
+ list_ = new (zone) ZoneList<T*>(initial_size, zone);
+ }
+ list_->Add(last_, zone);
+ }
+ last_ = value;
+ }
+
+ T* last() {
+ DCHECK(last_ != nullptr);
+ return last_;
+ }
+
+ T* RemoveLast() {
+ DCHECK(last_ != nullptr);
+ T* result = last_;
+ if ((list_ != nullptr) && (list_->length() > 0))
+ last_ = list_->RemoveLast();
+ else
+ last_ = nullptr;
+ return result;
+ }
+
+ T* Get(int i) {
+ DCHECK((0 <= i) && (i < length()));
+ if (list_ == nullptr) {
+ DCHECK_EQ(0, i);
+ return last_;
+ } else {
+ if (i == list_->length()) {
+ DCHECK(last_ != nullptr);
+ return last_;
+ } else {
+ return list_->at(i);
+ }
+ }
+ }
+
+ void Clear() {
+ list_ = nullptr;
+ last_ = nullptr;
+ }
+
+ int length() {
+ int length = (list_ == nullptr) ? 0 : list_->length();
+ return length + ((last_ == nullptr) ? 0 : 1);
+ }
+
+ ZoneList<T*>* GetList(Zone* zone) {
+ if (list_ == nullptr) {
+ list_ = new (zone) ZoneList<T*>(initial_size, zone);
+ }
+ if (last_ != nullptr) {
+ list_->Add(last_, zone);
+ last_ = nullptr;
+ }
+ return list_;
+ }
+
+ private:
+ ZoneList<T*>* list_;
+ T* last_;
+};
+
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+class RegExpBuilder : public ZoneObject {
+ public:
+ RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
+ void AddCharacter(uc16 character);
+ void AddUnicodeCharacter(uc32 character);
+ void AddEscapedUnicodeCharacter(uc32 character);
+ // "Adds" an empty expression. Does nothing except consume a
+ // following quantifier
+ void AddEmpty();
+ void AddCharacterClass(RegExpCharacterClass* cc);
+ void AddCharacterClassForDesugaring(uc32 c);
+ void AddAtom(RegExpTree* tree);
+ void AddTerm(RegExpTree* tree);
+ void AddAssertion(RegExpTree* tree);
+ void NewAlternative(); // '|'
+ bool AddQuantifierToAtom(int min, int max,
+ RegExpQuantifier::QuantifierType type);
+ void FlushText();
+ RegExpTree* ToRegExp();
+ JSRegExp::Flags flags() const { return flags_; }
+ void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
+
+ bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+ bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
+ bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
+
+ private:
+ static const uc16 kNoPendingSurrogate = 0;
+ void AddLeadSurrogate(uc16 lead_surrogate);
+ void AddTrailSurrogate(uc16 trail_surrogate);
+ void FlushPendingSurrogate();
+ void FlushCharacters();
+ void FlushTerms();
+ bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
+ bool NeedsDesugaringForIgnoreCase(uc32 c);
+ Zone* zone() const { return zone_; }
+ bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
+
+ Zone* zone_;
+ bool pending_empty_;
+ JSRegExp::Flags flags_;
+ ZoneList<uc16>* characters_;
+ uc16 pending_surrogate_;
+ BufferedZoneList<RegExpTree, 2> terms_;
+ BufferedZoneList<RegExpTree, 2> text_;
+ BufferedZoneList<RegExpTree, 2> alternatives_;
+#ifdef DEBUG
+ enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
+#define LAST(x) last_added_ = x;
+#else
+#define LAST(x)
+#endif
+};
+
+class V8_EXPORT_PRIVATE RegExpParser {
+ public:
+ RegExpParser(FlatStringReader* in, Handle<String>* error,
+ JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
+
+ static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
+ JSRegExp::Flags flags, RegExpCompileData* result);
+
+ RegExpTree* ParsePattern();
+ RegExpTree* ParseDisjunction();
+ RegExpTree* ParseGroup();
+
+ // Parses a {...,...} quantifier and stores the range in the given
+ // out parameters.
+ bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+ // Parses and returns a single escaped character. The character
+ // must not be 'b' or 'B' since they are usually handle specially.
+ uc32 ParseClassCharacterEscape();
+
+ // Checks whether the following is a length-digit hexadecimal number,
+ // and sets the value if it is.
+ bool ParseHexEscape(int length, uc32* value);
+ bool ParseUnicodeEscape(uc32* value);
+ bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
+
+ bool ParsePropertyClassName(std::vector<char>* name_1,
+ std::vector<char>* name_2);
+ bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
+ const std::vector<char>& name_1,
+ const std::vector<char>& name_2);
+
+ RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
+ RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
+
+ uc32 ParseOctalLiteral();
+
+ // Tries to parse the input as a back reference. If successful it
+ // stores the result in the output parameter and returns true. If
+ // it fails it will push back the characters read so the same characters
+ // can be reparsed.
+ bool ParseBackReferenceIndex(int* index_out);
+
+ // Parse inside a class. Either add escaped class to the range, or return
+ // false and pass parsed single character through |char_out|.
+ void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
+ bool add_unicode_case_equivalents, uc32* char_out,
+ bool* is_class_escape);
+
+ char ParseClassEscape();
+
+ RegExpTree* ReportError(Vector<const char> message);
+ void Advance();
+ void Advance(int dist);
+ void Reset(int pos);
+
+ // Reports whether the pattern might be used as a literal search string.
+ // Only use if the result of the parse is a single atom node.
+ bool simple();
+ bool contains_anchor() { return contains_anchor_; }
+ void set_contains_anchor() { contains_anchor_ = true; }
+ int captures_started() { return captures_started_; }
+ int position() { return next_pos_ - 1; }
+ bool failed() { return failed_; }
+ // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
+ // just read the initial flag value here.
+ bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
+
+ static bool IsSyntaxCharacterOrSlash(uc32 c);
+
+ static const uc32 kEndMarker = (1 << 21);
+
+ private:
+ enum SubexpressionType {
+ INITIAL,
+ CAPTURE, // All positive values represent captures.
+ POSITIVE_LOOKAROUND,
+ NEGATIVE_LOOKAROUND,
+ GROUPING
+ };
+
+ class RegExpParserState : public ZoneObject {
+ public:
+ // Push a state on the stack.
+ RegExpParserState(RegExpParserState* previous_state,
+ SubexpressionType group_type,
+ RegExpLookaround::Type lookaround_type,
+ int disjunction_capture_index,
+ const ZoneVector<uc16>* capture_name,
+ JSRegExp::Flags flags, Zone* zone)
+ : previous_state_(previous_state),
+ builder_(new (zone) RegExpBuilder(zone, flags)),
+ group_type_(group_type),
+ lookaround_type_(lookaround_type),
+ disjunction_capture_index_(disjunction_capture_index),
+ capture_name_(capture_name) {}
+ // Parser state of containing expression, if any.
+ RegExpParserState* previous_state() const { return previous_state_; }
+ bool IsSubexpression() { return previous_state_ != nullptr; }
+ // RegExpBuilder building this regexp's AST.
+ RegExpBuilder* builder() const { return builder_; }
+ // Type of regexp being parsed (parenthesized group or entire regexp).
+ SubexpressionType group_type() const { return group_type_; }
+ // Lookahead or Lookbehind.
+ RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
+ // Index in captures array of first capture in this sub-expression, if any.
+ // Also the capture index of this sub-expression itself, if group_type
+ // is CAPTURE.
+ int capture_index() const { return disjunction_capture_index_; }
+ // The name of the current sub-expression, if group_type is CAPTURE. Only
+ // used for named captures.
+ const ZoneVector<uc16>* capture_name() const { return capture_name_; }
+
+ bool IsNamedCapture() const { return capture_name_ != nullptr; }
+
+ // Check whether the parser is inside a capture group with the given index.
+ bool IsInsideCaptureGroup(int index);
+ // Check whether the parser is inside a capture group with the given name.
+ bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
+
+ private:
+ // Linked list implementation of stack of states.
+ RegExpParserState* const previous_state_;
+ // Builder for the stored disjunction.
+ RegExpBuilder* const builder_;
+ // Stored disjunction type (capture, look-ahead or grouping), if any.
+ const SubexpressionType group_type_;
+ // Stored read direction.
+ const RegExpLookaround::Type lookaround_type_;
+ // Stored disjunction's capture index (if any).
+ const int disjunction_capture_index_;
+ // Stored capture name (if any).
+ const ZoneVector<uc16>* const capture_name_;
+ };
+
+ // Return the 1-indexed RegExpCapture object, allocate if necessary.
+ RegExpCapture* GetCapture(int index);
+
+ // Creates a new named capture at the specified index. Must be called exactly
+ // once for each named capture. Fails if a capture with the same name is
+ // encountered.
+ bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
+
+ // Parses the name of a capture group (?<name>pattern). The name must adhere
+ // to IdentifierName in the ECMAScript standard.
+ const ZoneVector<uc16>* ParseCaptureGroupName();
+
+ bool ParseNamedBackReference(RegExpBuilder* builder,
+ RegExpParserState* state);
+ RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
+
+ // After the initial parsing pass, patch corresponding RegExpCapture objects
+ // into all RegExpBackReferences. This is done after initial parsing in order
+ // to avoid complicating cases in which references comes before the capture.
+ void PatchNamedBackReferences();
+
+ Handle<FixedArray> CreateCaptureNameMap();
+
+ // Returns true iff the pattern contains named captures. May call
+ // ScanForCaptures to look ahead at the remaining pattern.
+ bool HasNamedCaptures();
+
+ Isolate* isolate() { return isolate_; }
+ Zone* zone() const { return zone_; }
+
+ uc32 current() { return current_; }
+ bool has_more() { return has_more_; }
+ bool has_next() { return next_pos_ < in()->length(); }
+ uc32 Next();
+ template <bool update_position>
+ uc32 ReadNext();
+ FlatStringReader* in() { return in_; }
+ void ScanForCaptures();
+
+ struct RegExpCaptureNameLess {
+ bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
+ DCHECK_NOT_NULL(lhs);
+ DCHECK_NOT_NULL(rhs);
+ return *lhs->name() < *rhs->name();
+ }
+ };
+
+ Isolate* isolate_;
+ Zone* zone_;
+ Handle<String>* error_;
+ ZoneList<RegExpCapture*>* captures_;
+ ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
+ ZoneList<RegExpBackReference*>* named_back_references_;
+ FlatStringReader* in_;
+ uc32 current_;
+ // These are the flags specified outside the regexp syntax ie after the
+ // terminating '/' or in the second argument to the constructor. The current
+ // flags are stored on the RegExpBuilder.
+ JSRegExp::Flags top_level_flags_;
+ int next_pos_;
+ int captures_started_;
+ int capture_count_; // Only valid after we have scanned for captures.
+ bool has_more_;
+ bool simple_;
+ bool contains_anchor_;
+ bool is_scanned_for_captures_;
+ bool has_named_captures_; // Only valid after we have scanned for captures.
+ bool failed_;
+};
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_PARSER_H_