1 files changed, 359 insertions, 0 deletions
diff --git a/js/src/regexp/regexp-parser.h b/js/src/regexp/regexp-parser.h
new file mode 100644
index 0000000000..91677d6c35
--- /dev/null
+++ b/js/src/regexp/regexp-parser.h
@@ -0,0 +1,359 @@
+// Copyright 2016 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_PARSER_H_
+#define V8_REGEXP_REGEXP_PARSER_H_
+
+#include "regexp/regexp-ast.h"
+
+namespace v8 {
+namespace internal {
+
+struct RegExpCompileData;
+
+// A BufferedZoneList is an automatically growing list, just like (and backed
+// by) a ZoneList, that is optimized for the case of adding and removing
+// a single element. The last element added is stored outside the backing list,
+// and if no more than one element is ever added, the ZoneList isn't even
+// allocated.
+// Elements must not be nullptr pointers.
+template <typename T, int initial_size>
+class BufferedZoneList {
+ public:
+  BufferedZoneList() : list_(nullptr), last_(nullptr) {}
+
+  // Adds element at end of list. This element is buffered and can
+  // be read using last() or removed using RemoveLast until a new Add or until
+  // RemoveLast or GetList has been called.
+  void Add(T* value, Zone* zone) {
+    if (last_ != nullptr) {
+      if (list_ == nullptr) {
+        list_ = new (zone) ZoneList<T*>(initial_size, zone);
+      }
+      list_->Add(last_, zone);
+    }
+    last_ = value;
+  }
+
+  T* last() {
+    DCHECK(last_ != nullptr);
+    return last_;
+  }
+
+  T* RemoveLast() {
+    DCHECK(last_ != nullptr);
+    T* result = last_;
+    if ((list_ != nullptr) && (list_->length() > 0))
+      last_ = list_->RemoveLast();
+    else
+      last_ = nullptr;
+    return result;
+  }
+
+  T* Get(int i) {
+    DCHECK((0 <= i) && (i < length()));
+    if (list_ == nullptr) {
+      DCHECK_EQ(0, i);
+      return last_;
+    } else {
+      if (i == list_->length()) {
+        DCHECK(last_ != nullptr);
+        return last_;
+      } else {
+        return list_->at(i);
+      }
+    }
+  }
+
+  void Clear() {
+    list_ = nullptr;
+    last_ = nullptr;
+  }
+
+  int length() {
+    int length = (list_ == nullptr) ? 0 : list_->length();
+    return length + ((last_ == nullptr) ? 0 : 1);
+  }
+
+  ZoneList<T*>* GetList(Zone* zone) {
+    if (list_ == nullptr) {
+      list_ = new (zone) ZoneList<T*>(initial_size, zone);
+    }
+    if (last_ != nullptr) {
+      list_->Add(last_, zone);
+      last_ = nullptr;
+    }
+    return list_;
+  }
+
+ private:
+  ZoneList<T*>* list_;
+  T* last_;
+};
+
+
+// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
+class RegExpBuilder : public ZoneObject {
+ public:
+  RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
+  void AddCharacter(uc16 character);
+  void AddUnicodeCharacter(uc32 character);
+  void AddEscapedUnicodeCharacter(uc32 character);
+  // "Adds" an empty expression. Does nothing except consume a
+  // following quantifier
+  void AddEmpty();
+  void AddCharacterClass(RegExpCharacterClass* cc);
+  void AddCharacterClassForDesugaring(uc32 c);
+  void AddAtom(RegExpTree* tree);
+  void AddTerm(RegExpTree* tree);
+  void AddAssertion(RegExpTree* tree);
+  void NewAlternative();  // '|'
+  bool AddQuantifierToAtom(int min, int max,
+                           RegExpQuantifier::QuantifierType type);
+  void FlushText();
+  RegExpTree* ToRegExp();
+  JSRegExp::Flags flags() const { return flags_; }
+  void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
+
+  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
+  bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
+  bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
+
+ private:
+  static const uc16 kNoPendingSurrogate = 0;
+  void AddLeadSurrogate(uc16 lead_surrogate);
+  void AddTrailSurrogate(uc16 trail_surrogate);
+  void FlushPendingSurrogate();
+  void FlushCharacters();
+  void FlushTerms();
+  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
+  bool NeedsDesugaringForIgnoreCase(uc32 c);
+  Zone* zone() const { return zone_; }
+  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
+
+  Zone* zone_;
+  bool pending_empty_;
+  JSRegExp::Flags flags_;
+  ZoneList<uc16>* characters_;
+  uc16 pending_surrogate_;
+  BufferedZoneList<RegExpTree, 2> terms_;
+  BufferedZoneList<RegExpTree, 2> text_;
+  BufferedZoneList<RegExpTree, 2> alternatives_;
+#ifdef DEBUG
+  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
+#define LAST(x) last_added_ = x;
+#else
+#define LAST(x)
+#endif
+};
+
+class V8_EXPORT_PRIVATE RegExpParser {
+ public:
+  RegExpParser(FlatStringReader* in, Handle<String>* error,
+               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
+
+  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
+                          JSRegExp::Flags flags, RegExpCompileData* result);
+
+  RegExpTree* ParsePattern();
+  RegExpTree* ParseDisjunction();
+  RegExpTree* ParseGroup();
+
+  // Parses a {...,...} quantifier and stores the range in the given
+  // out parameters.
+  bool ParseIntervalQuantifier(int* min_out, int* max_out);
+
+  // Parses and returns a single escaped character.  The character
+  // must not be 'b' or 'B' since they are usually handle specially.
+  uc32 ParseClassCharacterEscape();
+
+  // Checks whether the following is a length-digit hexadecimal number,
+  // and sets the value if it is.
+  bool ParseHexEscape(int length, uc32* value);
+  bool ParseUnicodeEscape(uc32* value);
+  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
+
+  bool ParsePropertyClassName(std::vector<char>* name_1,
+                              std::vector<char>* name_2);
+  bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
+                             const std::vector<char>& name_1,
+                             const std::vector<char>& name_2);
+
+  RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
+  RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
+
+  uc32 ParseOctalLiteral();
+
+  // Tries to parse the input as a back reference.  If successful it
+  // stores the result in the output parameter and returns true.  If
+  // it fails it will push back the characters read so the same characters
+  // can be reparsed.
+  bool ParseBackReferenceIndex(int* index_out);
+
+  // Parse inside a class. Either add escaped class to the range, or return
+  // false and pass parsed single character through |char_out|.
+  void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
+                        bool add_unicode_case_equivalents, uc32* char_out,
+                        bool* is_class_escape);
+
+  char ParseClassEscape();
+
+  RegExpTree* ReportError(Vector<const char> message);
+  void Advance();
+  void Advance(int dist);
+  void Reset(int pos);
+
+  // Reports whether the pattern might be used as a literal search string.
+  // Only use if the result of the parse is a single atom node.
+  bool simple();
+  bool contains_anchor() { return contains_anchor_; }
+  void set_contains_anchor() { contains_anchor_ = true; }
+  int captures_started() { return captures_started_; }
+  int position() { return next_pos_ - 1; }
+  bool failed() { return failed_; }
+  // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
+  // just read the initial flag value here.
+  bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
+
+  static bool IsSyntaxCharacterOrSlash(uc32 c);
+
+  static const uc32 kEndMarker = (1 << 21);
+
+ private:
+  enum SubexpressionType {
+    INITIAL,
+    CAPTURE,  // All positive values represent captures.
+    POSITIVE_LOOKAROUND,
+    NEGATIVE_LOOKAROUND,
+    GROUPING
+  };
+
+  class RegExpParserState : public ZoneObject {
+   public:
+    // Push a state on the stack.
+    RegExpParserState(RegExpParserState* previous_state,
+                      SubexpressionType group_type,
+                      RegExpLookaround::Type lookaround_type,
+                      int disjunction_capture_index,
+                      const ZoneVector<uc16>* capture_name,
+                      JSRegExp::Flags flags, Zone* zone)
+        : previous_state_(previous_state),
+          builder_(new (zone) RegExpBuilder(zone, flags)),
+          group_type_(group_type),
+          lookaround_type_(lookaround_type),
+          disjunction_capture_index_(disjunction_capture_index),
+          capture_name_(capture_name) {}
+    // Parser state of containing expression, if any.
+    RegExpParserState* previous_state() const { return previous_state_; }
+    bool IsSubexpression() { return previous_state_ != nullptr; }
+    // RegExpBuilder building this regexp's AST.
+    RegExpBuilder* builder() const { return builder_; }
+    // Type of regexp being parsed (parenthesized group or entire regexp).
+    SubexpressionType group_type() const { return group_type_; }
+    // Lookahead or Lookbehind.
+    RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
+    // Index in captures array of first capture in this sub-expression, if any.
+    // Also the capture index of this sub-expression itself, if group_type
+    // is CAPTURE.
+    int capture_index() const { return disjunction_capture_index_; }
+    // The name of the current sub-expression, if group_type is CAPTURE. Only
+    // used for named captures.
+    const ZoneVector<uc16>* capture_name() const { return capture_name_; }
+
+    bool IsNamedCapture() const { return capture_name_ != nullptr; }
+
+    // Check whether the parser is inside a capture group with the given index.
+    bool IsInsideCaptureGroup(int index);
+    // Check whether the parser is inside a capture group with the given name.
+    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
+
+   private:
+    // Linked list implementation of stack of states.
+    RegExpParserState* const previous_state_;
+    // Builder for the stored disjunction.
+    RegExpBuilder* const builder_;
+    // Stored disjunction type (capture, look-ahead or grouping), if any.
+    const SubexpressionType group_type_;
+    // Stored read direction.
+    const RegExpLookaround::Type lookaround_type_;
+    // Stored disjunction's capture index (if any).
+    const int disjunction_capture_index_;
+    // Stored capture name (if any).
+    const ZoneVector<uc16>* const capture_name_;
+  };
+
+  // Return the 1-indexed RegExpCapture object, allocate if necessary.
+  RegExpCapture* GetCapture(int index);
+
+  // Creates a new named capture at the specified index. Must be called exactly
+  // once for each named capture. Fails if a capture with the same name is
+  // encountered.
+  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
+
+  // Parses the name of a capture group (?<name>pattern). The name must adhere
+  // to IdentifierName in the ECMAScript standard.
+  const ZoneVector<uc16>* ParseCaptureGroupName();
+
+  bool ParseNamedBackReference(RegExpBuilder* builder,
+                               RegExpParserState* state);
+  RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
+
+  // After the initial parsing pass, patch corresponding RegExpCapture objects
+  // into all RegExpBackReferences. This is done after initial parsing in order
+  // to avoid complicating cases in which references comes before the capture.
+  void PatchNamedBackReferences();
+
+  Handle<FixedArray> CreateCaptureNameMap();
+
+  // Returns true iff the pattern contains named captures. May call
+  // ScanForCaptures to look ahead at the remaining pattern.
+  bool HasNamedCaptures();
+
+  Isolate* isolate() { return isolate_; }
+  Zone* zone() const { return zone_; }
+
+  uc32 current() { return current_; }
+  bool has_more() { return has_more_; }
+  bool has_next() { return next_pos_ < in()->length(); }
+  uc32 Next();
+  template <bool update_position>
+  uc32 ReadNext();
+  FlatStringReader* in() { return in_; }
+  void ScanForCaptures();
+
+  struct RegExpCaptureNameLess {
+    bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
+      DCHECK_NOT_NULL(lhs);
+      DCHECK_NOT_NULL(rhs);
+      return *lhs->name() < *rhs->name();
+    }
+  };
+
+  Isolate* isolate_;
+  Zone* zone_;
+  Handle<String>* error_;
+  ZoneList<RegExpCapture*>* captures_;
+  ZoneSet<RegExpCapture*, RegExpCaptureNameLess>* named_captures_;
+  ZoneList<RegExpBackReference*>* named_back_references_;
+  FlatStringReader* in_;
+  uc32 current_;
+  // These are the flags specified outside the regexp syntax ie after the
+  // terminating '/' or in the second argument to the constructor.  The current
+  // flags are stored on the RegExpBuilder.
+  JSRegExp::Flags top_level_flags_;
+  int next_pos_;
+  int captures_started_;
+  int capture_count_;  // Only valid after we have scanned for captures.
+  bool has_more_;
+  bool simple_;
+  bool contains_anchor_;
+  bool is_scanned_for_captures_;
+  bool has_named_captures_;  // Only valid after we have scanned for captures.
+  bool failed_;
+};
+
+}  // namespace internal
+}  // namespace v8
+
+#endif  // V8_REGEXP_REGEXP_PARSER_H_