diff options
Diffstat (limited to 'js/src/regexp/regexp-parser.h')
-rw-r--r-- | js/src/regexp/regexp-parser.h | 359 |
1 files changed, 359 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h new file mode 100644 index 0000000000..91677d6c35 --- /dev/null +++ b/js/src/regexp/regexp-parser.h @@ -0,0 +1,359 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_PARSER_H_ +#define V8_REGEXP_REGEXP_PARSER_H_ + +#include "regexp/regexp-ast.h" + +namespace v8 { +namespace internal { + +struct RegExpCompileData; + +// A BufferedZoneList is an automatically growing list, just like (and backed +// by) a ZoneList, that is optimized for the case of adding and removing +// a single element. The last element added is stored outside the backing list, +// and if no more than one element is ever added, the ZoneList isn't even +// allocated. +// Elements must not be nullptr pointers. +template <typename T, int initial_size> +class BufferedZoneList { + public: + BufferedZoneList() : list_(nullptr), last_(nullptr) {} + + // Adds element at end of list. This element is buffered and can + // be read using last() or removed using RemoveLast until a new Add or until + // RemoveLast or GetList has been called. + void Add(T* value, Zone* zone) { + if (last_ != nullptr) { + if (list_ == nullptr) { + list_ = new (zone) ZoneList<T*>(initial_size, zone); + } + list_->Add(last_, zone); + } + last_ = value; + } + + T* last() { + DCHECK(last_ != nullptr); + return last_; + } + + T* RemoveLast() { + DCHECK(last_ != nullptr); + T* result = last_; + if ((list_ != nullptr) && (list_->length() > 0)) + last_ = list_->RemoveLast(); + else + last_ = nullptr; + return result; + } + + T* Get(int i) { + DCHECK((0 <= i) && (i < length())); + if (list_ == nullptr) { + DCHECK_EQ(0, i); + return last_; + } else { + if (i == list_->length()) { + DCHECK(last_ != nullptr); + return last_; + } else { + return list_->at(i); + } + } + } + + void Clear() { + list_ = nullptr; + last_ = nullptr; + } + + int length() { + int length = (list_ == nullptr) ? 0 : list_->length(); + return length + ((last_ == nullptr) ? 0 : 1); + } + + ZoneList<T*>* GetList(Zone* zone) { + if (list_ == nullptr) { + list_ = new (zone) ZoneList<T*>(initial_size, zone); + } + if (last_ != nullptr) { + list_->Add(last_, zone); + last_ = nullptr; + } + return list_; + } + + private: + ZoneList<T*>* list_; + T* last_; +}; + + +// Accumulates RegExp atoms and assertions into lists of terms and alternatives. +class RegExpBuilder : public ZoneObject { + public: + RegExpBuilder(Zone* zone, JSRegExp::Flags flags); + void AddCharacter(uc16 character); + void AddUnicodeCharacter(uc32 character); + void AddEscapedUnicodeCharacter(uc32 character); + // "Adds" an empty expression. Does nothing except consume a + // following quantifier + void AddEmpty(); + void AddCharacterClass(RegExpCharacterClass* cc); + void AddCharacterClassForDesugaring(uc32 c); + void AddAtom(RegExpTree* tree); + void AddTerm(RegExpTree* tree); + void AddAssertion(RegExpTree* tree); + void NewAlternative(); // '|' + bool AddQuantifierToAtom(int min, int max, + RegExpQuantifier::QuantifierType type); + void FlushText(); + RegExpTree* ToRegExp(); + JSRegExp::Flags flags() const { return flags_; } + void set_flags(JSRegExp::Flags flags) { flags_ = flags; } + + bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } + bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; } + bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; } + + private: + static const uc16 kNoPendingSurrogate = 0; + void AddLeadSurrogate(uc16 lead_surrogate); + void AddTrailSurrogate(uc16 trail_surrogate); + void FlushPendingSurrogate(); + void FlushCharacters(); + void FlushTerms(); + bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); + bool NeedsDesugaringForIgnoreCase(uc32 c); + Zone* zone() const { return zone_; } + bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; } + + Zone* zone_; + bool pending_empty_; + JSRegExp::Flags flags_; + ZoneList<uc16>* characters_; + uc16 pending_surrogate_; + BufferedZoneList<RegExpTree, 2> terms_; + BufferedZoneList<RegExpTree, 2> text_; + BufferedZoneList<RegExpTree, 2> alternatives_; +#ifdef DEBUG + enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; +#define LAST(x) last_added_ = x; +#else +#define LAST(x) +#endif +}; + +class V8_EXPORT_PRIVATE RegExpParser { + public: + RegExpParser(FlatStringReader* in, Handle<String>* error, + JSRegExp::Flags flags, Isolate* isolate, Zone* zone); + + static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, + JSRegExp::Flags flags, RegExpCompileData* result); + + RegExpTree* ParsePattern(); + RegExpTree* ParseDisjunction(); + RegExpTree* ParseGroup(); + + // Parses a {...,...} quantifier and stores the range in the given + // out parameters. + bool ParseIntervalQuantifier(int* min_out, int* max_out); + + // Parses and returns a single escaped character. The character + // must not be 'b' or 'B' since they are usually handle specially. + uc32 ParseClassCharacterEscape(); + + // Checks whether the following is a length-digit hexadecimal number, + // and sets the value if it is. + bool ParseHexEscape(int length, uc32* value); + bool ParseUnicodeEscape(uc32* value); + bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); + + bool ParsePropertyClassName(std::vector<char>* name_1, + std::vector<char>* name_2); + bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate, + const std::vector<char>& name_1, + const std::vector<char>& name_2); + + RegExpTree* GetPropertySequence(const std::vector<char>& name_1); + RegExpTree* ParseCharacterClass(const RegExpBuilder* state); + + uc32 ParseOctalLiteral(); + + // Tries to parse the input as a back reference. If successful it + // stores the result in the output parameter and returns true. If + // it fails it will push back the characters read so the same characters + // can be reparsed. + bool ParseBackReferenceIndex(int* index_out); + + // Parse inside a class. Either add escaped class to the range, or return + // false and pass parsed single character through |char_out|. + void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone, + bool add_unicode_case_equivalents, uc32* char_out, + bool* is_class_escape); + + char ParseClassEscape(); + + RegExpTree* ReportError(Vector<const char> message); + void Advance(); + void Advance(int dist); + void Reset(int pos); + + // Reports whether the pattern might be used as a literal search string. + // Only use if the result of the parse is a single atom node. + bool simple(); + bool contains_anchor() { return contains_anchor_; } + void set_contains_anchor() { contains_anchor_ = true; } + int captures_started() { return captures_started_; } + int position() { return next_pos_ - 1; } + bool failed() { return failed_; } + // The Unicode flag can't be changed using in-regexp syntax, so it's OK to + // just read the initial flag value here. + bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; } + + static bool IsSyntaxCharacterOrSlash(uc32 c); + + static const uc32 kEndMarker = (1 << 21); + + private: + enum SubexpressionType { + INITIAL, + CAPTURE, // All positive values represent captures. + POSITIVE_LOOKAROUND, + NEGATIVE_LOOKAROUND, + GROUPING + }; + + class RegExpParserState : public ZoneObject { + public: + // Push a state on the stack. + RegExpParserState(RegExpParserState* previous_state, + SubexpressionType group_type, + RegExpLookaround::Type lookaround_type, + int disjunction_capture_index, + const ZoneVector<uc16>* capture_name, + JSRegExp::Flags flags, Zone* zone) + : previous_state_(previous_state), + builder_(new (zone) RegExpBuilder(zone, flags)), + group_type_(group_type), + lookaround_type_(lookaround_type), + disjunction_capture_index_(disjunction_capture_index), + capture_name_(capture_name) {} + // Parser state of containing expression, if any. + RegExpParserState* previous_state() const { return previous_state_; } + bool IsSubexpression() { return previous_state_ != nullptr; } + // RegExpBuilder building this regexp's AST. + RegExpBuilder* builder() const { return builder_; } + // Type of regexp being parsed (parenthesized group or entire regexp). + SubexpressionType group_type() const { return group_type_; } + // Lookahead or Lookbehind. + RegExpLookaround::Type lookaround_type() const { return lookaround_type_; } + // Index in captures array of first capture in this sub-expression, if any. + // Also the capture index of this sub-expression itself, if group_type + // is CAPTURE. + int capture_index() const { return disjunction_capture_index_; } + // The name of the current sub-expression, if group_type is CAPTURE. Only + // used for named captures. + const ZoneVector<uc16>* capture_name() const { return capture_name_; } + + bool IsNamedCapture() const { return capture_name_ != nullptr; } + + // Check whether the parser is inside a capture group with the given index. + bool IsInsideCaptureGroup(int index); + // Check whether the parser is inside a capture group with the given name. + bool IsInsideCaptureGroup(const ZoneVector<uc16>* name); + + private: + // Linked list implementation of stack of states. + RegExpParserState* const previous_state_; + // Builder for the stored disjunction. + RegExpBuilder* const builder_; + // Stored disjunction type (capture, look-ahead or grouping), if any. + const SubexpressionType group_type_; + // Stored read direction. + const RegExpLookaround::Type lookaround_type_; + // Stored disjunction's capture index (if any). + const int disjunction_capture_index_; + // Stored capture name (if any). + const ZoneVector<uc16>* const capture_name_; + }; + + // Return the 1-indexed RegExpCapture object, allocate if necessary. + RegExpCapture* GetCapture(int index); + + // Creates a new named capture at the specified index. Must be called exactly + // once for each named capture. Fails if a capture with the same name is + // encountered. + bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index); + + // Parses the name of a capture group (?<name>pattern). The name must adhere + // to IdentifierName in the ECMAScript standard. + const ZoneVector<uc16>* ParseCaptureGroupName(); + + bool ParseNamedBackReference(RegExpBuilder* builder, + RegExpParserState* state); + RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); + + // After the initial parsing pass, patch corresponding RegExpCapture objects + // into all RegExpBackReferences. This is done after initial parsing in order + // to avoid complicating cases in which references comes before the capture. + void PatchNamedBackReferences(); + + Handle<FixedArray> CreateCaptureNameMap(); + + // Returns true iff the pattern contains named captures. May call + // ScanForCaptures to look ahead at the remaining pattern. + bool HasNamedCaptures(); + + Isolate* isolate() { return isolate_; } + Zone* zone() const { return zone_; } + + uc32 current() { return current_; } + bool has_more() { return has_more_; } + bool has_next() { return next_pos_ < in()->length(); } + uc32 Next(); + template <bool update_position> + uc32 ReadNext(); + FlatStringReader* in() { return in_; } + void ScanForCaptures(); + + struct RegExpCaptureNameLess { + bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const { + DCHECK_NOT_NULL(lhs); + DCHECK_NOT_NULL(rhs); + return *lhs->name() < *rhs->name(); + } + }; + + Isolate* isolate_; + Zone* zone_; + Handle<String>* error_; + ZoneList<RegExpCapture*>* captures_; + ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_; + ZoneList<RegExpBackReference*>* named_back_references_; + FlatStringReader* in_; + uc32 current_; + // These are the flags specified outside the regexp syntax ie after the + // terminating '/' or in the second argument to the constructor. The current + // flags are stored on the RegExpBuilder. + JSRegExp::Flags top_level_flags_; + int next_pos_; + int captures_started_; + int capture_count_; // Only valid after we have scanned for captures. + bool has_more_; + bool simple_; + bool contains_anchor_; + bool is_scanned_for_captures_; + bool has_named_captures_; // Only valid after we have scanned for captures. + bool failed_; +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_PARSER_H_ |