summaryrefslogtreecommitdiff
path: root/js/src/regexp/regexp-compiler.h
blob: 1954f1a4c473f853830c48eaf4fe42c3a6d29ed3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
// Copyright 2019 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_REGEXP_REGEXP_COMPILER_H_
#define V8_REGEXP_REGEXP_COMPILER_H_

#include <bitset>

#include "regexp/regexp-nodes.h"

namespace v8 {
namespace internal {

class DynamicBitSet;
class Isolate;

namespace regexp_compiler_constants {

// The '2' variant is has inclusive from and exclusive to.
// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
constexpr uc32 kRangeEndMarker = 0x110000;
constexpr int kSpaceRanges[] = {
    '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
    0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
    0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);

constexpr int kWordRanges[] = {'0',     '9' + 1, 'A',     'Z' + 1,        '_',
                               '_' + 1, 'a',     'z' + 1, kRangeEndMarker};
constexpr int kWordRangeCount = arraysize(kWordRanges);
constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
constexpr int kDigitRangeCount = arraysize(kDigitRanges);
constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
                                    kLeadSurrogateStart + 1, kRangeEndMarker};
constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D,         0x000E,
                                         0x2028, 0x202A, kRangeEndMarker};
constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);

// More makes code generation slower, less makes V8 benchmark score lower.
constexpr int kMaxLookaheadForBoyerMoore = 8;
// In a 3-character pattern you can maximally step forwards 3 characters
// at a time, which is not always enough to pay for the extra logic.
constexpr int kPatternTooShortForBoyerMoore = 2;

}  // namespace regexp_compiler_constants

inline bool IgnoreCase(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kIgnoreCase) != 0;
}

inline bool IsUnicode(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kUnicode) != 0;
}

inline bool IsSticky(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kSticky) != 0;
}

inline bool IsGlobal(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kGlobal) != 0;
}

inline bool DotAll(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kDotAll) != 0;
}

inline bool Multiline(JSRegExp::Flags flags) {
  return (flags & JSRegExp::kMultiline) != 0;
}

inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
  // Both unicode and ignore_case flags are set. We need to use ICU to find
  // the closure over case equivalents.
  return IsUnicode(flags) && IgnoreCase(flags);
}

// Details of a quick mask-compare check that can look ahead in the
// input stream.
class QuickCheckDetails {
 public:
  QuickCheckDetails()
      : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
  explicit QuickCheckDetails(int characters)
      : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
  bool Rationalize(bool one_byte);
  // Merge in the information from another branch of an alternation.
  void Merge(QuickCheckDetails* other, int from_index);
  // Advance the current position by some amount.
  void Advance(int by, bool one_byte);
  void Clear();
  bool cannot_match() { return cannot_match_; }
  void set_cannot_match() { cannot_match_ = true; }
  struct Position {
    Position() : mask(0), value(0), determines_perfectly(false) {}
    uc16 mask;
    uc16 value;
    bool determines_perfectly;
  };
  int characters() { return characters_; }
  void set_characters(int characters) { characters_ = characters; }
  Position* positions(int index) {
    DCHECK_LE(0, index);
    DCHECK_GT(characters_, index);
    return positions_ + index;
  }
  uint32_t mask() { return mask_; }
  uint32_t value() { return value_; }

 private:
  // How many characters do we have quick check information from.  This is
  // the same for all branches of a choice node.
  int characters_;
  Position positions_[4];
  // These values are the condensate of the above array after Rationalize().
  uint32_t mask_;
  uint32_t value_;
  // If set to true, there is no way this quick check can match at all.
  // E.g., if it requires to be at the start of the input, and isn't.
  bool cannot_match_;
};

// Improve the speed that we scan for an initial point where a non-anchored
// regexp can match by using a Boyer-Moore-like table. This is done by
// identifying non-greedy non-capturing loops in the nodes that eat any
// character one at a time.  For example in the middle of the regexp
// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
// inserted at the start of any non-anchored regexp.
//
// When we have found such a loop we look ahead in the nodes to find the set of
// characters that can come at given distances. For example for the regexp
// /.?foo/ we know that there are at least 3 characters ahead of us, and the
// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
// the lookahead info where the set of characters is reasonably constrained. In
// our example this is from index 1 to 2 (0 is not constrained). We can now
// look 3 characters ahead and if we don't find one of [f, o] (the union of
// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
//
// For Unicode input strings we do the same, but modulo 128.
//
// We also look at the first string fed to the regexp and use that to get a hint
// of the character frequencies in the inputs. This affects the assessment of
// whether the set of characters is 'reasonably constrained'.
//
// We also have another lookahead mechanism (called quick check in the code),
// which uses a wide load of multiple characters followed by a mask and compare
// to determine whether a match is possible at this point.
enum ContainedInLattice {
  kNotYet = 0,
  kLatticeIn = 1,
  kLatticeOut = 2,
  kLatticeUnknown = 3  // Can also mean both in and out.
};

inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
  return static_cast<ContainedInLattice>(a | b);
}

class BoyerMoorePositionInfo : public ZoneObject {
 public:
  bool at(int i) const { return map_[i]; }

  static constexpr int kMapSize = 128;
  static constexpr int kMask = kMapSize - 1;

  int map_count() const { return map_count_; }

  void Set(int character);
  void SetInterval(const Interval& interval);
  void SetAll();

  bool is_non_word() { return w_ == kLatticeOut; }
  bool is_word() { return w_ == kLatticeIn; }

  using Bitset = std::bitset<kMapSize>;
  Bitset raw_bitset() const { return map_; }

 private:
  Bitset map_;
  int map_count_ = 0;               // Number of set bits in the map.
  ContainedInLattice w_ = kNotYet;  // The \w character class.
};

class BoyerMooreLookahead : public ZoneObject {
 public:
  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);

  int length() { return length_; }
  int max_char() { return max_char_; }
  RegExpCompiler* compiler() { return compiler_; }

  int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }

  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }

  void Set(int map_number, int character) {
    if (character > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    info->Set(character);
  }

  void SetInterval(int map_number, const Interval& interval) {
    if (interval.from() > max_char_) return;
    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    if (interval.to() > max_char_) {
      info->SetInterval(Interval(interval.from(), max_char_));
    } else {
      info->SetInterval(interval);
    }
  }

  void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }

  void SetRest(int from_map) {
    for (int i = from_map; i < length_; i++) SetAll(i);
  }
  void EmitSkipInstructions(RegExpMacroAssembler* masm);

 private:
  // This is the value obtained by EatsAtLeast.  If we do not have at least this
  // many characters left in the sample string then the match is bound to fail.
  // Therefore it is OK to read a character this far ahead of the current match
  // point.
  int length_;
  RegExpCompiler* compiler_;
  // 0xff for Latin1, 0xffff for UTF-16.
  int max_char_;
  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;

  int GetSkipTable(int min_lookahead, int max_lookahead,
                   Handle<ByteArray> boolean_skip_table);
  bool FindWorthwhileInterval(int* from, int* to);
  int FindBestInterval(int max_number_of_chars, int old_biggest_points,
                       int* from, int* to);
};

// There are many ways to generate code for a node.  This class encapsulates
// the current way we should be generating.  In other words it encapsulates
// the current state of the code generator.  The effect of this is that we
// generate code for paths that the matcher can take through the regular
// expression.  A given node in the regexp can be code-generated several times
// as it can be part of several traces.  For example for the regexp:
// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
// to match foo is generated only once (the traces have a common prefix).  The
// code to store the capture is deferred and generated (twice) after the places
// where baz has been matched.
class Trace {
 public:
  // A value for a property that is either known to be true, know to be false,
  // or not known.
  enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };

  class DeferredAction {
   public:
    DeferredAction(ActionNode::ActionType action_type, int reg)
        : action_type_(action_type), reg_(reg), next_(nullptr) {}
    DeferredAction* next() { return next_; }
    bool Mentions(int reg);
    int reg() { return reg_; }
    ActionNode::ActionType action_type() { return action_type_; }

   private:
    ActionNode::ActionType action_type_;
    int reg_;
    DeferredAction* next_;
    friend class Trace;
  };

  class DeferredCapture : public DeferredAction {
   public:
    DeferredCapture(int reg, bool is_capture, Trace* trace)
        : DeferredAction(ActionNode::STORE_POSITION, reg),
          cp_offset_(trace->cp_offset()),
          is_capture_(is_capture) {}
    int cp_offset() { return cp_offset_; }
    bool is_capture() { return is_capture_; }

   private:
    int cp_offset_;
    bool is_capture_;
    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
  };

  class DeferredSetRegisterForLoop : public DeferredAction {
   public:
    DeferredSetRegisterForLoop(int reg, int value)
        : DeferredAction(ActionNode::SET_REGISTER_FOR_LOOP, reg),
          value_(value) {}
    int value() { return value_; }

   private:
    int value_;
  };

  class DeferredClearCaptures : public DeferredAction {
   public:
    explicit DeferredClearCaptures(Interval range)
        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
    Interval range() { return range_; }

   private:
    Interval range_;
  };

  class DeferredIncrementRegister : public DeferredAction {
   public:
    explicit DeferredIncrementRegister(int reg)
        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
  };

  Trace()
      : cp_offset_(0),
        actions_(nullptr),
        backtrack_(nullptr),
        stop_node_(nullptr),
        loop_label_(nullptr),
        characters_preloaded_(0),
        bound_checked_up_to_(0),
        flush_budget_(100),
        at_start_(UNKNOWN) {}

  // End the trace.  This involves flushing the deferred actions in the trace
  // and pushing a backtrack location onto the backtrack stack.  Once this is
  // done we can start a new trace or go to one that has already been
  // generated.
  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
  int cp_offset() { return cp_offset_; }
  DeferredAction* actions() { return actions_; }
  // A trivial trace is one that has no deferred actions or other state that
  // affects the assumptions used when generating code.  There is no recorded
  // backtrack location in a trivial trace, so with a trivial trace we will
  // generate code that, on a failure to match, gets the backtrack location
  // from the backtrack stack rather than using a direct jump instruction.  We
  // always start code generation with a trivial trace and non-trivial traces
  // are created as we emit code for nodes or add to the list of deferred
  // actions in the trace.  The location of the code generated for a node using
  // a trivial trace is recorded in a label in the node so that gotos can be
  // generated to that code.
  bool is_trivial() {
    return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
           characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
           quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
  }
  TriBool at_start() { return at_start_; }
  void set_at_start(TriBool at_start) { at_start_ = at_start; }
  Label* backtrack() { return backtrack_; }
  Label* loop_label() { return loop_label_; }
  RegExpNode* stop_node() { return stop_node_; }
  int characters_preloaded() { return characters_preloaded_; }
  int bound_checked_up_to() { return bound_checked_up_to_; }
  int flush_budget() { return flush_budget_; }
  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
  bool mentions_reg(int reg);
  // Returns true if a deferred position store exists to the specified
  // register and stores the offset in the out-parameter.  Otherwise
  // returns false.
  bool GetStoredPosition(int reg, int* cp_offset);
  // These set methods and AdvanceCurrentPositionInTrace should be used only on
  // new traces - the intention is that traces are immutable after creation.
  void add_action(DeferredAction* new_action) {
    DCHECK(new_action->next_ == nullptr);
    new_action->next_ = actions_;
    actions_ = new_action;
  }
  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
  void set_loop_label(Label* label) { loop_label_ = label; }
  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
  void set_flush_budget(int to) { flush_budget_ = to; }
  void set_quick_check_performed(QuickCheckDetails* d) {
    quick_check_performed_ = *d;
  }
  void InvalidateCurrentCharacter();
  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);

 private:
  int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
  void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
                              const DynamicBitSet& affected_registers,
                              DynamicBitSet* registers_to_pop,
                              DynamicBitSet* registers_to_clear, Zone* zone);
  void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
                                const DynamicBitSet& registers_to_pop,
                                const DynamicBitSet& registers_to_clear);
  int cp_offset_;
  DeferredAction* actions_;
  Label* backtrack_;
  RegExpNode* stop_node_;
  Label* loop_label_;
  int characters_preloaded_;
  int bound_checked_up_to_;
  QuickCheckDetails quick_check_performed_;
  int flush_budget_;
  TriBool at_start_;
};

class GreedyLoopState {
 public:
  explicit GreedyLoopState(bool not_at_start);

  Label* label() { return &label_; }
  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }

 private:
  Label label_;
  Trace counter_backtrack_trace_;
};

struct PreloadState {
  static const int kEatsAtLeastNotYetInitialized = -1;
  bool preload_is_current_;
  bool preload_has_checked_bounds_;
  int preload_characters_;
  int eats_at_least_;
  void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
};

// Analysis performs assertion propagation and computes eats_at_least_ values.
// See the comments on AssertionPropagator and EatsAtLeastPropagator for more
// details.
RegExpError AnalyzeRegExp(Isolate* isolate, bool is_one_byte, RegExpNode* node);

class FrequencyCollator {
 public:
  FrequencyCollator() : total_samples_(0) {
    for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
      frequencies_[i] = CharacterFrequency(i);
    }
  }

  void CountCharacter(int character) {
    int index = (character & RegExpMacroAssembler::kTableMask);
    frequencies_[index].Increment();
    total_samples_++;
  }

  // Does not measure in percent, but rather per-128 (the table size from the
  // regexp macro assembler).
  int Frequency(int in_character) {
    DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
    if (total_samples_ < 1) return 1;  // Division by zero.
    int freq_in_per128 =
        (frequencies_[in_character].counter() * 128) / total_samples_;
    return freq_in_per128;
  }

 private:
  class CharacterFrequency {
   public:
    CharacterFrequency() : counter_(0), character_(-1) {}
    explicit CharacterFrequency(int character)
        : counter_(0), character_(character) {}

    void Increment() { counter_++; }
    int counter() { return counter_; }
    int character() { return character_; }

   private:
    int counter_;
    int character_;
  };

 private:
  CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
  int total_samples_;
};

class RegExpCompiler {
 public:
  RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
                 bool is_one_byte);

  int AllocateRegister() {
    if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
      reg_exp_too_big_ = true;
      return next_register_;
    }
    return next_register_++;
  }

  // Lookarounds to match lone surrogates for unicode character class matches
  // are never nested. We can therefore reuse registers.
  int UnicodeLookaroundStackRegister() {
    if (unicode_lookaround_stack_register_ == kNoRegister) {
      unicode_lookaround_stack_register_ = AllocateRegister();
    }
    return unicode_lookaround_stack_register_;
  }

  int UnicodeLookaroundPositionRegister() {
    if (unicode_lookaround_position_register_ == kNoRegister) {
      unicode_lookaround_position_register_ = AllocateRegister();
    }
    return unicode_lookaround_position_register_;
  }

  struct CompilationResult final {
    explicit CompilationResult(RegExpError err) : error(err) {}
    CompilationResult(Object code, int registers)
        : code(code), num_registers(registers) {}

    static CompilationResult RegExpTooBig() {
      return CompilationResult(RegExpError::kTooLarge);
    }

    bool Succeeded() const { return error == RegExpError::kNone; }

    const RegExpError error = RegExpError::kNone;
    Object code;
    int num_registers = 0;
  };

  CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
                             RegExpNode* start, int capture_count,
                             Handle<String> pattern);

  // If the regexp matching starts within a surrogate pair, step back to the
  // lead surrogate and start matching from there.
  static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
                                                       RegExpNode* on_success,
                                                       JSRegExp::Flags flags);

  inline void AddWork(RegExpNode* node) {
    if (!node->on_work_list() && !node->label()->is_bound()) {
      node->set_on_work_list(true);
      work_list_->push_back(node);
    }
  }

  static const int kImplementationOffset = 0;
  static const int kNumberOfRegistersOffset = 0;
  static const int kCodeOffset = 1;

  RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
  EndNode* accept() { return accept_; }

  static const int kMaxRecursion = 100;
  inline int recursion_depth() { return recursion_depth_; }
  inline void IncrementRecursionDepth() { recursion_depth_++; }
  inline void DecrementRecursionDepth() { recursion_depth_--; }

  void SetRegExpTooBig() { reg_exp_too_big_ = true; }

  inline bool one_byte() { return one_byte_; }
  inline bool optimize() { return optimize_; }
  inline void set_optimize(bool value) { optimize_ = value; }
  inline bool limiting_recursion() { return limiting_recursion_; }
  inline void set_limiting_recursion(bool value) {
    limiting_recursion_ = value;
  }
  bool read_backward() { return read_backward_; }
  void set_read_backward(bool value) { read_backward_ = value; }
  FrequencyCollator* frequency_collator() { return &frequency_collator_; }

  int current_expansion_factor() { return current_expansion_factor_; }
  void set_current_expansion_factor(int value) {
    current_expansion_factor_ = value;
  }

  Isolate* isolate() const { return isolate_; }
  Zone* zone() const { return zone_; }

  static const int kNoRegister = -1;

 private:
  EndNode* accept_;
  int next_register_;
  int unicode_lookaround_stack_register_;
  int unicode_lookaround_position_register_;
  ZoneVector<RegExpNode*>* work_list_;
  int recursion_depth_;
  RegExpMacroAssembler* macro_assembler_;
  bool one_byte_;
  bool reg_exp_too_big_;
  bool limiting_recursion_;
  bool optimize_;
  bool read_backward_;
  int current_expansion_factor_;
  FrequencyCollator frequency_collator_;
  Isolate* isolate_;
  Zone* zone_;
};

// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
class UnicodeRangeSplitter {
 public:
  V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);

  static constexpr int kInitialSize = 8;
  using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;

  const CharacterRangeVector* bmp() const { return &bmp_; }
  const CharacterRangeVector* lead_surrogates() const {
    return &lead_surrogates_;
  }
  const CharacterRangeVector* trail_surrogates() const {
    return &trail_surrogates_;
  }
  const CharacterRangeVector* non_bmp() const { return &non_bmp_; }

 private:
  void AddRange(CharacterRange range);

  CharacterRangeVector bmp_;
  CharacterRangeVector lead_surrogates_;
  CharacterRangeVector trail_surrogates_;
  CharacterRangeVector non_bmp_;
};

// We need to check for the following characters: 0x39C 0x3BC 0x178.
// TODO(jgruber): Move to CharacterRange.
bool RangeContainsLatin1Equivalents(CharacterRange range);

}  // namespace internal
}  // namespace v8

#endif  // V8_REGEXP_REGEXP_COMPILER_H_