diff options
Diffstat (limited to 'js/src/regexp/regexp-ast.h')
-rw-r--r-- | js/src/regexp/regexp-ast.h | 599 |
1 files changed, 599 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-ast.h b/js/src/regexp/regexp-ast.h new file mode 100644 index 0000000000..540fdce80a --- /dev/null +++ b/js/src/regexp/regexp-ast.h @@ -0,0 +1,599 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_AST_H_ +#define V8_REGEXP_REGEXP_AST_H_ + + +namespace v8 { +namespace internal { + +#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ + VISIT(Disjunction) \ + VISIT(Alternative) \ + VISIT(Assertion) \ + VISIT(CharacterClass) \ + VISIT(Atom) \ + VISIT(Quantifier) \ + VISIT(Capture) \ + VISIT(Group) \ + VISIT(Lookaround) \ + VISIT(BackReference) \ + VISIT(Empty) \ + VISIT(Text) + +#define FORWARD_DECLARE(Name) class RegExp##Name; +FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) +#undef FORWARD_DECLARE + +class RegExpCompiler; +class RegExpNode; +class RegExpTree; + +class RegExpVisitor { + public: + virtual ~RegExpVisitor() = default; +#define MAKE_CASE(Name) \ + virtual void* Visit##Name(RegExp##Name*, void* data) = 0; + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE +}; + + +// A simple closed interval. +class Interval { + public: + Interval() : from_(kNone), to_(kNone - 1) {} // '- 1' for branchless size(). + Interval(int from, int to) : from_(from), to_(to) {} + Interval Union(Interval that) { + if (that.from_ == kNone) + return *this; + else if (from_ == kNone) + return that; + else + return Interval(Min(from_, that.from_), Max(to_, that.to_)); + } + + bool Contains(int value) { return (from_ <= value) && (value <= to_); } + bool is_empty() { return from_ == kNone; } + int from() const { return from_; } + int to() const { return to_; } + int size() const { return to_ - from_ + 1; } + + static Interval Empty() { return Interval(); } + + static constexpr int kNone = -1; + + private: + int from_; + int to_; +}; + + +// Represents code units in the range from from_ to to_, both ends are +// inclusive. +class CharacterRange { + public: + CharacterRange() : from_(0), to_(0) {} + // For compatibility with the CHECK_OK macro + CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT + V8_EXPORT_PRIVATE static void AddClassEscape(char type, + ZoneList<CharacterRange>* ranges, + Zone* zone); + // Add class escapes. Add case equivalent closure for \w and \W if necessary. + V8_EXPORT_PRIVATE static void AddClassEscape( + char type, ZoneList<CharacterRange>* ranges, + bool add_unicode_case_equivalents, Zone* zone); + static Vector<const int> GetWordBounds(); + static inline CharacterRange Singleton(uc32 value) { + return CharacterRange(value, value); + } + static inline CharacterRange Range(uc32 from, uc32 to) { + DCHECK(0 <= from && to <= String::kMaxCodePoint); + DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); + return CharacterRange(from, to); + } + static inline CharacterRange Everything() { + return CharacterRange(0, String::kMaxCodePoint); + } + static inline ZoneList<CharacterRange>* List(Zone* zone, + CharacterRange range) { + ZoneList<CharacterRange>* list = + new (zone) ZoneList<CharacterRange>(1, zone); + list->Add(range, zone); + return list; + } + bool Contains(uc32 i) { return from_ <= i && i <= to_; } + uc32 from() const { return from_; } + void set_from(uc32 value) { from_ = value; } + uc32 to() const { return to_; } + void set_to(uc32 value) { to_ = value; } + bool is_valid() { return from_ <= to_; } + bool IsEverything(uc32 max) { return from_ == 0 && to_ >= max; } + bool IsSingleton() { return (from_ == to_); } + V8_EXPORT_PRIVATE static void AddCaseEquivalents( + Isolate* isolate, Zone* zone, ZoneList<CharacterRange>* ranges, + bool is_one_byte); + // Whether a range list is in canonical form: Ranges ordered by from value, + // and ranges non-overlapping and non-adjacent. + V8_EXPORT_PRIVATE static bool IsCanonical(ZoneList<CharacterRange>* ranges); + // Convert range list to canonical form. The characters covered by the ranges + // will still be the same, but no character is in more than one range, and + // adjacent ranges are merged. The resulting list may be shorter than the + // original, but cannot be longer. + static void Canonicalize(ZoneList<CharacterRange>* ranges); + // Negate the contents of a character range in canonical form. + static void Negate(ZoneList<CharacterRange>* src, + ZoneList<CharacterRange>* dst, Zone* zone); + static const int kStartMarker = (1 << 24); + static const int kPayloadMask = (1 << 24) - 1; + + private: + CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {} + + uc32 from_; + uc32 to_; +}; + +class CharacterSet final { + public: + explicit CharacterSet(uc16 standard_set_type) + : ranges_(nullptr), standard_set_type_(standard_set_type) {} + explicit CharacterSet(ZoneList<CharacterRange>* ranges) + : ranges_(ranges), standard_set_type_(0) {} + ZoneList<CharacterRange>* ranges(Zone* zone); + uc16 standard_set_type() const { return standard_set_type_; } + void set_standard_set_type(uc16 special_set_type) { + standard_set_type_ = special_set_type; + } + bool is_standard() { return standard_set_type_ != 0; } + V8_EXPORT_PRIVATE void Canonicalize(); + + private: + ZoneList<CharacterRange>* ranges_; + // If non-zero, the value represents a standard set (e.g., all whitespace + // characters) without having to expand the ranges. + uc16 standard_set_type_; +}; + +class TextElement final { + public: + enum TextType { ATOM, CHAR_CLASS }; + + static TextElement Atom(RegExpAtom* atom); + static TextElement CharClass(RegExpCharacterClass* char_class); + + int cp_offset() const { return cp_offset_; } + void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } + int length() const; + + TextType text_type() const { return text_type_; } + + RegExpTree* tree() const { return tree_; } + + RegExpAtom* atom() const { + DCHECK(text_type() == ATOM); + return reinterpret_cast<RegExpAtom*>(tree()); + } + + RegExpCharacterClass* char_class() const { + DCHECK(text_type() == CHAR_CLASS); + return reinterpret_cast<RegExpCharacterClass*>(tree()); + } + + private: + TextElement(TextType text_type, RegExpTree* tree) + : cp_offset_(-1), text_type_(text_type), tree_(tree) {} + + int cp_offset_; + TextType text_type_; + RegExpTree* tree_; +}; + + +class RegExpTree : public ZoneObject { + public: + static const int kInfinity = kMaxInt; + virtual ~RegExpTree() = default; + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) = 0; + virtual bool IsTextElement() { return false; } + virtual bool IsAnchoredAtStart() { return false; } + virtual bool IsAnchoredAtEnd() { return false; } + virtual int min_match() = 0; + virtual int max_match() = 0; + // Returns the interval of registers used for captures within this + // expression. + virtual Interval CaptureRegisters() { return Interval::Empty(); } + virtual void AppendToText(RegExpText* text, Zone* zone); + V8_EXPORT_PRIVATE std::ostream& Print(std::ostream& os, + Zone* zone); // NOLINT +#define MAKE_ASTYPE(Name) \ + virtual RegExp##Name* As##Name(); \ + virtual bool Is##Name(); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) +#undef MAKE_ASTYPE +}; + + +class RegExpDisjunction final : public RegExpTree { + public: + explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpDisjunction* AsDisjunction() override; + Interval CaptureRegisters() override; + bool IsDisjunction() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* alternatives() { return alternatives_; } + + private: + bool SortConsecutiveAtoms(RegExpCompiler* compiler); + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); + ZoneList<RegExpTree*>* alternatives_; + int min_match_; + int max_match_; +}; + + +class RegExpAlternative final : public RegExpTree { + public: + explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAlternative* AsAlternative() override; + Interval CaptureRegisters() override; + bool IsAlternative() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* nodes() { return nodes_; } + + private: + ZoneList<RegExpTree*>* nodes_; + int min_match_; + int max_match_; +}; + + +class RegExpAssertion final : public RegExpTree { + public: + enum AssertionType { + START_OF_LINE = 0, + START_OF_INPUT = 1, + END_OF_LINE = 2, + END_OF_INPUT = 3, + BOUNDARY = 4, + NON_BOUNDARY = 5, + LAST_TYPE = NON_BOUNDARY, + }; + RegExpAssertion(AssertionType type, JSRegExp::Flags flags) + : assertion_type_(type), flags_(flags) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAssertion* AsAssertion() override; + bool IsAssertion() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + AssertionType assertion_type() const { return assertion_type_; } + JSRegExp::Flags flags() const { return flags_; } + + private: + const AssertionType assertion_type_; + const JSRegExp::Flags flags_; +}; + + +class RegExpCharacterClass final : public RegExpTree { + public: + // NEGATED: The character class is negated and should match everything but + // the specified ranges. + // CONTAINS_SPLIT_SURROGATE: The character class contains part of a split + // surrogate and should not be unicode-desugared (crbug.com/641091). + enum Flag { + NEGATED = 1 << 0, + CONTAINS_SPLIT_SURROGATE = 1 << 1, + }; + using CharacterClassFlags = base::Flags<Flag>; + + RegExpCharacterClass( + Zone* zone, ZoneList<CharacterRange>* ranges, JSRegExp::Flags flags, + CharacterClassFlags character_class_flags = CharacterClassFlags()) + : set_(ranges), + flags_(flags), + character_class_flags_(character_class_flags) { + // Convert the empty set of ranges to the negated Everything() range. + if (ranges->is_empty()) { + ranges->Add(CharacterRange::Everything(), zone); + character_class_flags_ ^= NEGATED; + } + } + RegExpCharacterClass(uc16 type, JSRegExp::Flags flags) + : set_(type), + flags_(flags), + character_class_flags_(CharacterClassFlags()) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpCharacterClass* AsCharacterClass() override; + bool IsCharacterClass() override; + bool IsTextElement() override { return true; } + int min_match() override { return 1; } + // The character class may match two code units for unicode regexps. + // TODO(yangguo): we should split this class for usage in TextElement, and + // make max_match() dependent on the character class content. + int max_match() override { return 2; } + void AppendToText(RegExpText* text, Zone* zone) override; + CharacterSet character_set() { return set_; } + // TODO(lrn): Remove need for complex version if is_standard that + // recognizes a mangled standard set and just do { return set_.is_special(); } + bool is_standard(Zone* zone); + // Returns a value representing the standard character set if is_standard() + // returns true. + // Currently used values are: + // s : unicode whitespace + // S : unicode non-whitespace + // w : ASCII word character (digit, letter, underscore) + // W : non-ASCII word character + // d : ASCII digit + // D : non-ASCII digit + // . : non-newline + // * : All characters, for advancing unanchored regexp + uc16 standard_type() const { return set_.standard_set_type(); } + ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } + bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; } + JSRegExp::Flags flags() const { return flags_; } + bool contains_split_surrogate() const { + return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0; + } + + private: + CharacterSet set_; + const JSRegExp::Flags flags_; + CharacterClassFlags character_class_flags_; +}; + + +class RegExpAtom final : public RegExpTree { + public: + explicit RegExpAtom(Vector<const uc16> data, JSRegExp::Flags flags) + : data_(data), flags_(flags) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAtom* AsAtom() override; + bool IsAtom() override; + bool IsTextElement() override { return true; } + int min_match() override { return data_.length(); } + int max_match() override { return data_.length(); } + void AppendToText(RegExpText* text, Zone* zone) override; + Vector<const uc16> data() { return data_; } + int length() { return data_.length(); } + JSRegExp::Flags flags() const { return flags_; } + bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; } + + private: + Vector<const uc16> data_; + const JSRegExp::Flags flags_; +}; + + +class RegExpText final : public RegExpTree { + public: + explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpText* AsText() override; + bool IsText() override; + bool IsTextElement() override { return true; } + int min_match() override { return length_; } + int max_match() override { return length_; } + void AppendToText(RegExpText* text, Zone* zone) override; + void AddElement(TextElement elm, Zone* zone) { + elements_.Add(elm, zone); + length_ += elm.length(); + } + ZoneList<TextElement>* elements() { return &elements_; } + + private: + ZoneList<TextElement> elements_; + int length_; +}; + + +class RegExpQuantifier final : public RegExpTree { + public: + enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; + RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) + : body_(body), + min_(min), + max_(max), + quantifier_type_(type) { + if (min > 0 && body->min_match() > kInfinity / min) { + min_match_ = kInfinity; + } else { + min_match_ = min * body->min_match(); + } + if (max > 0 && body->max_match() > kInfinity / max) { + max_match_ = kInfinity; + } else { + max_match_ = max * body->max_match(); + } + } + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, + RegExpCompiler* compiler, RegExpNode* on_success, + bool not_at_start = false); + RegExpQuantifier* AsQuantifier() override; + Interval CaptureRegisters() override; + bool IsQuantifier() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + int min() { return min_; } + int max() { return max_; } + bool is_possessive() { return quantifier_type_ == POSSESSIVE; } + bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } + bool is_greedy() { return quantifier_type_ == GREEDY; } + RegExpTree* body() { return body_; } + + private: + RegExpTree* body_; + int min_; + int max_; + int min_match_; + int max_match_; + QuantifierType quantifier_type_; +}; + + +class RegExpCapture final : public RegExpTree { + public: + explicit RegExpCapture(int index) + : body_(nullptr), index_(index), name_(nullptr) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + static RegExpNode* ToNode(RegExpTree* body, int index, + RegExpCompiler* compiler, RegExpNode* on_success); + RegExpCapture* AsCapture() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + Interval CaptureRegisters() override; + bool IsCapture() override; + int min_match() override { return body_->min_match(); } + int max_match() override { return body_->max_match(); } + RegExpTree* body() { return body_; } + void set_body(RegExpTree* body) { body_ = body; } + int index() const { return index_; } + const ZoneVector<uc16>* name() const { return name_; } + void set_name(const ZoneVector<uc16>* name) { name_ = name; } + static int StartRegister(int index) { return index * 2; } + static int EndRegister(int index) { return index * 2 + 1; } + + private: + RegExpTree* body_; + int index_; + const ZoneVector<uc16>* name_; +}; + +class RegExpGroup final : public RegExpTree { + public: + explicit RegExpGroup(RegExpTree* body) : body_(body) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) override { + return body_->ToNode(compiler, on_success); + } + RegExpGroup* AsGroup() override; + bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); } + bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); } + bool IsGroup() override; + int min_match() override { return body_->min_match(); } + int max_match() override { return body_->max_match(); } + Interval CaptureRegisters() override { return body_->CaptureRegisters(); } + RegExpTree* body() { return body_; } + + private: + RegExpTree* body_; +}; + +class RegExpLookaround final : public RegExpTree { + public: + enum Type { LOOKAHEAD, LOOKBEHIND }; + + RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, + int capture_from, Type type) + : body_(body), + is_positive_(is_positive), + capture_count_(capture_count), + capture_from_(capture_from), + type_(type) {} + + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpLookaround* AsLookaround() override; + Interval CaptureRegisters() override; + bool IsLookaround() override; + bool IsAnchoredAtStart() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + RegExpTree* body() { return body_; } + bool is_positive() { return is_positive_; } + int capture_count() { return capture_count_; } + int capture_from() { return capture_from_; } + Type type() { return type_; } + + class Builder { + public: + Builder(bool is_positive, RegExpNode* on_success, + int stack_pointer_register, int position_register, + int capture_register_count = 0, int capture_register_start = 0); + RegExpNode* on_match_success() { return on_match_success_; } + RegExpNode* ForMatch(RegExpNode* match); + + private: + bool is_positive_; + RegExpNode* on_match_success_; + RegExpNode* on_success_; + int stack_pointer_register_; + int position_register_; + }; + + private: + RegExpTree* body_; + bool is_positive_; + int capture_count_; + int capture_from_; + Type type_; +}; + + +class RegExpBackReference final : public RegExpTree { + public: + explicit RegExpBackReference(JSRegExp::Flags flags) + : capture_(nullptr), name_(nullptr), flags_(flags) {} + RegExpBackReference(RegExpCapture* capture, JSRegExp::Flags flags) + : capture_(capture), name_(nullptr), flags_(flags) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpBackReference* AsBackReference() override; + bool IsBackReference() override; + int min_match() override { return 0; } + // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite + // recursion, we give up. Ignorance is bliss. + int max_match() override { return kInfinity; } + int index() { return capture_->index(); } + RegExpCapture* capture() { return capture_; } + void set_capture(RegExpCapture* capture) { capture_ = capture; } + const ZoneVector<uc16>* name() const { return name_; } + void set_name(const ZoneVector<uc16>* name) { name_ = name; } + + private: + RegExpCapture* capture_; + const ZoneVector<uc16>* name_; + const JSRegExp::Flags flags_; +}; + + +class RegExpEmpty final : public RegExpTree { + public: + RegExpEmpty() = default; + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpEmpty* AsEmpty() override; + bool IsEmpty() override; + int min_match() override { return 0; } + int max_match() override { return 0; } +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_AST_H_ |