summaryrefslogtreecommitdiff
path: root/deps/v8/src/regexp/regexp-compiler.h
diff options
context:
space:
mode:
Diffstat (limited to 'deps/v8/src/regexp/regexp-compiler.h')
-rw-r--r--deps/v8/src/regexp/regexp-compiler.h657
1 files changed, 657 insertions, 0 deletions
diff --git a/deps/v8/src/regexp/regexp-compiler.h b/deps/v8/src/regexp/regexp-compiler.h
new file mode 100644
index 0000000000..1b70abfd98
--- /dev/null
+++ b/deps/v8/src/regexp/regexp-compiler.h
@@ -0,0 +1,657 @@
+// Copyright 2019 the V8 project authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef V8_REGEXP_REGEXP_COMPILER_H_
+#define V8_REGEXP_REGEXP_COMPILER_H_
+
+#include <bitset>
+
+#include "src/base/small-vector.h"
+#include "src/regexp/regexp-nodes.h"
+
+namespace v8 {
+namespace internal {
+
+class DynamicBitSet;
+class Isolate;
+
+namespace regexp_compiler_constants {
+
+// The '2' variant is has inclusive from and exclusive to.
+// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
+// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
+constexpr uc32 kRangeEndMarker = 0x110000;
+constexpr int kSpaceRanges[] = {
+ '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
+ 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
+ 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
+constexpr int kSpaceRangeCount = arraysize(kSpaceRanges);
+
+constexpr int kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_',
+ '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
+constexpr int kWordRangeCount = arraysize(kWordRanges);
+constexpr int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
+constexpr int kDigitRangeCount = arraysize(kDigitRanges);
+constexpr int kSurrogateRanges[] = {kLeadSurrogateStart,
+ kLeadSurrogateStart + 1, kRangeEndMarker};
+constexpr int kSurrogateRangeCount = arraysize(kSurrogateRanges);
+constexpr int kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E,
+ 0x2028, 0x202A, kRangeEndMarker};
+constexpr int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
+
+// More makes code generation slower, less makes V8 benchmark score lower.
+constexpr int kMaxLookaheadForBoyerMoore = 8;
+// In a 3-character pattern you can maximally step forwards 3 characters
+// at a time, which is not always enough to pay for the extra logic.
+constexpr int kPatternTooShortForBoyerMoore = 2;
+
+} // namespace regexp_compiler_constants
+
+inline bool IgnoreCase(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kIgnoreCase) != 0;
+}
+
+inline bool IsUnicode(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kUnicode) != 0;
+}
+
+inline bool IsSticky(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kSticky) != 0;
+}
+
+inline bool IsGlobal(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kGlobal) != 0;
+}
+
+inline bool DotAll(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kDotAll) != 0;
+}
+
+inline bool Multiline(JSRegExp::Flags flags) {
+ return (flags & JSRegExp::kMultiline) != 0;
+}
+
+inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
+ // Both unicode and ignore_case flags are set. We need to use ICU to find
+ // the closure over case equivalents.
+ return IsUnicode(flags) && IgnoreCase(flags);
+}
+
+// Details of a quick mask-compare check that can look ahead in the
+// input stream.
+class QuickCheckDetails {
+ public:
+ QuickCheckDetails()
+ : characters_(0), mask_(0), value_(0), cannot_match_(false) {}
+ explicit QuickCheckDetails(int characters)
+ : characters_(characters), mask_(0), value_(0), cannot_match_(false) {}
+ bool Rationalize(bool one_byte);
+ // Merge in the information from another branch of an alternation.
+ void Merge(QuickCheckDetails* other, int from_index);
+ // Advance the current position by some amount.
+ void Advance(int by, bool one_byte);
+ void Clear();
+ bool cannot_match() { return cannot_match_; }
+ void set_cannot_match() { cannot_match_ = true; }
+ struct Position {
+ Position() : mask(0), value(0), determines_perfectly(false) {}
+ uc16 mask;
+ uc16 value;
+ bool determines_perfectly;
+ };
+ int characters() { return characters_; }
+ void set_characters(int characters) { characters_ = characters; }
+ Position* positions(int index) {
+ DCHECK_LE(0, index);
+ DCHECK_GT(characters_, index);
+ return positions_ + index;
+ }
+ uint32_t mask() { return mask_; }
+ uint32_t value() { return value_; }
+
+ private:
+ // How many characters do we have quick check information from. This is
+ // the same for all branches of a choice node.
+ int characters_;
+ Position positions_[4];
+ // These values are the condensate of the above array after Rationalize().
+ uint32_t mask_;
+ uint32_t value_;
+ // If set to true, there is no way this quick check can match at all.
+ // E.g., if it requires to be at the start of the input, and isn't.
+ bool cannot_match_;
+};
+
+// Improve the speed that we scan for an initial point where a non-anchored
+// regexp can match by using a Boyer-Moore-like table. This is done by
+// identifying non-greedy non-capturing loops in the nodes that eat any
+// character one at a time. For example in the middle of the regexp
+// /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
+// inserted at the start of any non-anchored regexp.
+//
+// When we have found such a loop we look ahead in the nodes to find the set of
+// characters that can come at given distances. For example for the regexp
+// /.?foo/ we know that there are at least 3 characters ahead of us, and the
+// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
+// the lookahead info where the set of characters is reasonably constrained. In
+// our example this is from index 1 to 2 (0 is not constrained). We can now
+// look 3 characters ahead and if we don't find one of [f, o] (the union of
+// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
+//
+// For Unicode input strings we do the same, but modulo 128.
+//
+// We also look at the first string fed to the regexp and use that to get a hint
+// of the character frequencies in the inputs. This affects the assessment of
+// whether the set of characters is 'reasonably constrained'.
+//
+// We also have another lookahead mechanism (called quick check in the code),
+// which uses a wide load of multiple characters followed by a mask and compare
+// to determine whether a match is possible at this point.
+enum ContainedInLattice {
+ kNotYet = 0,
+ kLatticeIn = 1,
+ kLatticeOut = 2,
+ kLatticeUnknown = 3 // Can also mean both in and out.
+};
+
+inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
+ return static_cast<ContainedInLattice>(a | b);
+}
+
+class BoyerMoorePositionInfo : public ZoneObject {
+ public:
+ bool at(int i) const { return map_[i]; }
+
+ static constexpr int kMapSize = 128;
+ static constexpr int kMask = kMapSize - 1;
+
+ int map_count() const { return map_count_; }
+
+ void Set(int character);
+ void SetInterval(const Interval& interval);
+ void SetAll();
+
+ bool is_non_word() { return w_ == kLatticeOut; }
+ bool is_word() { return w_ == kLatticeIn; }
+
+ using Bitset = std::bitset<kMapSize>;
+ Bitset raw_bitset() const { return map_; }
+
+ private:
+ Bitset map_;
+ int map_count_ = 0; // Number of set bits in the map.
+ ContainedInLattice w_ = kNotYet; // The \w character class.
+};
+
+class BoyerMooreLookahead : public ZoneObject {
+ public:
+ BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
+
+ int length() { return length_; }
+ int max_char() { return max_char_; }
+ RegExpCompiler* compiler() { return compiler_; }
+
+ int Count(int map_number) { return bitmaps_->at(map_number)->map_count(); }
+
+ BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
+
+ void Set(int map_number, int character) {
+ if (character > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ info->Set(character);
+ }
+
+ void SetInterval(int map_number, const Interval& interval) {
+ if (interval.from() > max_char_) return;
+ BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
+ if (interval.to() > max_char_) {
+ info->SetInterval(Interval(interval.from(), max_char_));
+ } else {
+ info->SetInterval(interval);
+ }
+ }
+
+ void SetAll(int map_number) { bitmaps_->at(map_number)->SetAll(); }
+
+ void SetRest(int from_map) {
+ for (int i = from_map; i < length_; i++) SetAll(i);
+ }
+ void EmitSkipInstructions(RegExpMacroAssembler* masm);
+
+ private:
+ // This is the value obtained by EatsAtLeast. If we do not have at least this
+ // many characters left in the sample string then the match is bound to fail.
+ // Therefore it is OK to read a character this far ahead of the current match
+ // point.
+ int length_;
+ RegExpCompiler* compiler_;
+ // 0xff for Latin1, 0xffff for UTF-16.
+ int max_char_;
+ ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
+
+ int GetSkipTable(int min_lookahead, int max_lookahead,
+ Handle<ByteArray> boolean_skip_table);
+ bool FindWorthwhileInterval(int* from, int* to);
+ int FindBestInterval(int max_number_of_chars, int old_biggest_points,
+ int* from, int* to);
+};
+
+// There are many ways to generate code for a node. This class encapsulates
+// the current way we should be generating. In other words it encapsulates
+// the current state of the code generator. The effect of this is that we
+// generate code for paths that the matcher can take through the regular
+// expression. A given node in the regexp can be code-generated several times
+// as it can be part of several traces. For example for the regexp:
+// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
+// of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
+// to match foo is generated only once (the traces have a common prefix). The
+// code to store the capture is deferred and generated (twice) after the places
+// where baz has been matched.
+class Trace {
+ public:
+ // A value for a property that is either known to be true, know to be false,
+ // or not known.
+ enum TriBool { UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1 };
+
+ class DeferredAction {
+ public:
+ DeferredAction(ActionNode::ActionType action_type, int reg)
+ : action_type_(action_type), reg_(reg), next_(nullptr) {}
+ DeferredAction* next() { return next_; }
+ bool Mentions(int reg);
+ int reg() { return reg_; }
+ ActionNode::ActionType action_type() { return action_type_; }
+
+ private:
+ ActionNode::ActionType action_type_;
+ int reg_;
+ DeferredAction* next_;
+ friend class Trace;
+ };
+
+ class DeferredCapture : public DeferredAction {
+ public:
+ DeferredCapture(int reg, bool is_capture, Trace* trace)
+ : DeferredAction(ActionNode::STORE_POSITION, reg),
+ cp_offset_(trace->cp_offset()),
+ is_capture_(is_capture) {}
+ int cp_offset() { return cp_offset_; }
+ bool is_capture() { return is_capture_; }
+
+ private:
+ int cp_offset_;
+ bool is_capture_;
+ void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
+ };
+
+ class DeferredSetRegister : public DeferredAction {
+ public:
+ DeferredSetRegister(int reg, int value)
+ : DeferredAction(ActionNode::SET_REGISTER, reg), value_(value) {}
+ int value() { return value_; }
+
+ private:
+ int value_;
+ };
+
+ class DeferredClearCaptures : public DeferredAction {
+ public:
+ explicit DeferredClearCaptures(Interval range)
+ : DeferredAction(ActionNode::CLEAR_CAPTURES, -1), range_(range) {}
+ Interval range() { return range_; }
+
+ private:
+ Interval range_;
+ };
+
+ class DeferredIncrementRegister : public DeferredAction {
+ public:
+ explicit DeferredIncrementRegister(int reg)
+ : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) {}
+ };
+
+ Trace()
+ : cp_offset_(0),
+ actions_(nullptr),
+ backtrack_(nullptr),
+ stop_node_(nullptr),
+ loop_label_(nullptr),
+ characters_preloaded_(0),
+ bound_checked_up_to_(0),
+ flush_budget_(100),
+ at_start_(UNKNOWN) {}
+
+ // End the trace. This involves flushing the deferred actions in the trace
+ // and pushing a backtrack location onto the backtrack stack. Once this is
+ // done we can start a new trace or go to one that has already been
+ // generated.
+ void Flush(RegExpCompiler* compiler, RegExpNode* successor);
+ int cp_offset() { return cp_offset_; }
+ DeferredAction* actions() { return actions_; }
+ // A trivial trace is one that has no deferred actions or other state that
+ // affects the assumptions used when generating code. There is no recorded
+ // backtrack location in a trivial trace, so with a trivial trace we will
+ // generate code that, on a failure to match, gets the backtrack location
+ // from the backtrack stack rather than using a direct jump instruction. We
+ // always start code generation with a trivial trace and non-trivial traces
+ // are created as we emit code for nodes or add to the list of deferred
+ // actions in the trace. The location of the code generated for a node using
+ // a trivial trace is recorded in a label in the node so that gotos can be
+ // generated to that code.
+ bool is_trivial() {
+ return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
+ characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
+ quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
+ }
+ TriBool at_start() { return at_start_; }
+ void set_at_start(TriBool at_start) { at_start_ = at_start; }
+ Label* backtrack() { return backtrack_; }
+ Label* loop_label() { return loop_label_; }
+ RegExpNode* stop_node() { return stop_node_; }
+ int characters_preloaded() { return characters_preloaded_; }
+ int bound_checked_up_to() { return bound_checked_up_to_; }
+ int flush_budget() { return flush_budget_; }
+ QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
+ bool mentions_reg(int reg);
+ // Returns true if a deferred position store exists to the specified
+ // register and stores the offset in the out-parameter. Otherwise
+ // returns false.
+ bool GetStoredPosition(int reg, int* cp_offset);
+ // These set methods and AdvanceCurrentPositionInTrace should be used only on
+ // new traces - the intention is that traces are immutable after creation.
+ void add_action(DeferredAction* new_action) {
+ DCHECK(new_action->next_ == nullptr);
+ new_action->next_ = actions_;
+ actions_ = new_action;
+ }
+ void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
+ void set_stop_node(RegExpNode* node) { stop_node_ = node; }
+ void set_loop_label(Label* label) { loop_label_ = label; }
+ void set_characters_preloaded(int count) { characters_preloaded_ = count; }
+ void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
+ void set_flush_budget(int to) { flush_budget_ = to; }
+ void set_quick_check_performed(QuickCheckDetails* d) {
+ quick_check_performed_ = *d;
+ }
+ void InvalidateCurrentCharacter();
+ void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
+
+ private:
+ int FindAffectedRegisters(DynamicBitSet* affected_registers, Zone* zone);
+ void PerformDeferredActions(RegExpMacroAssembler* macro, int max_register,
+ const DynamicBitSet& affected_registers,
+ DynamicBitSet* registers_to_pop,
+ DynamicBitSet* registers_to_clear, Zone* zone);
+ void RestoreAffectedRegisters(RegExpMacroAssembler* macro, int max_register,
+ const DynamicBitSet& registers_to_pop,
+ const DynamicBitSet& registers_to_clear);
+ int cp_offset_;
+ DeferredAction* actions_;
+ Label* backtrack_;
+ RegExpNode* stop_node_;
+ Label* loop_label_;
+ int characters_preloaded_;
+ int bound_checked_up_to_;
+ QuickCheckDetails quick_check_performed_;
+ int flush_budget_;
+ TriBool at_start_;
+};
+
+class GreedyLoopState {
+ public:
+ explicit GreedyLoopState(bool not_at_start);
+
+ Label* label() { return &label_; }
+ Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
+
+ private:
+ Label label_;
+ Trace counter_backtrack_trace_;
+};
+
+struct PreloadState {
+ static const int kEatsAtLeastNotYetInitialized = -1;
+ bool preload_is_current_;
+ bool preload_has_checked_bounds_;
+ int preload_characters_;
+ int eats_at_least_;
+ void init() { eats_at_least_ = kEatsAtLeastNotYetInitialized; }
+};
+
+// Assertion propagation moves information about assertions such as
+// \b to the affected nodes. For instance, in /.\b./ information must
+// be propagated to the first '.' that whatever follows needs to know
+// if it matched a word or a non-word, and to the second '.' that it
+// has to check if it succeeds a word or non-word. In this case the
+// result will be something like:
+//
+// +-------+ +------------+
+// | . | | . |
+// +-------+ ---> +------------+
+// | word? | | check word |
+// +-------+ +------------+
+class Analysis : public NodeVisitor {
+ public:
+ Analysis(Isolate* isolate, bool is_one_byte)
+ : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
+ void EnsureAnalyzed(RegExpNode* node);
+
+#define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
+ FOR_EACH_NODE_TYPE(DECLARE_VISIT)
+#undef DECLARE_VISIT
+ void VisitLoopChoice(LoopChoiceNode* that) override;
+
+ bool has_failed() { return error_message_ != nullptr; }
+ const char* error_message() {
+ DCHECK(error_message_ != nullptr);
+ return error_message_;
+ }
+ void fail(const char* error_message) { error_message_ = error_message; }
+
+ Isolate* isolate() const { return isolate_; }
+
+ private:
+ Isolate* isolate_;
+ bool is_one_byte_;
+ const char* error_message_;
+
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
+};
+
+class FrequencyCollator {
+ public:
+ FrequencyCollator() : total_samples_(0) {
+ for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
+ frequencies_[i] = CharacterFrequency(i);
+ }
+ }
+
+ void CountCharacter(int character) {
+ int index = (character & RegExpMacroAssembler::kTableMask);
+ frequencies_[index].Increment();
+ total_samples_++;
+ }
+
+ // Does not measure in percent, but rather per-128 (the table size from the
+ // regexp macro assembler).
+ int Frequency(int in_character) {
+ DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
+ if (total_samples_ < 1) return 1; // Division by zero.
+ int freq_in_per128 =
+ (frequencies_[in_character].counter() * 128) / total_samples_;
+ return freq_in_per128;
+ }
+
+ private:
+ class CharacterFrequency {
+ public:
+ CharacterFrequency() : counter_(0), character_(-1) {}
+ explicit CharacterFrequency(int character)
+ : counter_(0), character_(character) {}
+
+ void Increment() { counter_++; }
+ int counter() { return counter_; }
+ int character() { return character_; }
+
+ private:
+ int counter_;
+ int character_;
+ };
+
+ private:
+ CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
+ int total_samples_;
+};
+
+class RegExpCompiler {
+ public:
+ RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
+ bool is_one_byte);
+
+ int AllocateRegister() {
+ if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
+ reg_exp_too_big_ = true;
+ return next_register_;
+ }
+ return next_register_++;
+ }
+
+ // Lookarounds to match lone surrogates for unicode character class matches
+ // are never nested. We can therefore reuse registers.
+ int UnicodeLookaroundStackRegister() {
+ if (unicode_lookaround_stack_register_ == kNoRegister) {
+ unicode_lookaround_stack_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_stack_register_;
+ }
+
+ int UnicodeLookaroundPositionRegister() {
+ if (unicode_lookaround_position_register_ == kNoRegister) {
+ unicode_lookaround_position_register_ = AllocateRegister();
+ }
+ return unicode_lookaround_position_register_;
+ }
+
+ struct CompilationResult final {
+ explicit CompilationResult(const char* error_message)
+ : error_message(error_message) {}
+ CompilationResult(Object code, int registers)
+ : code(code), num_registers(registers) {}
+
+ static CompilationResult RegExpTooBig() {
+ return CompilationResult("RegExp too big");
+ }
+
+ bool Succeeded() const { return error_message == nullptr; }
+
+ const char* const error_message = nullptr;
+ Object code;
+ int num_registers = 0;
+ };
+
+ CompilationResult Assemble(Isolate* isolate, RegExpMacroAssembler* assembler,
+ RegExpNode* start, int capture_count,
+ Handle<String> pattern);
+
+ // If the regexp matching starts within a surrogate pair, step back to the
+ // lead surrogate and start matching from there.
+ static RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
+ RegExpNode* on_success,
+ JSRegExp::Flags flags);
+
+ inline void AddWork(RegExpNode* node) {
+ if (!node->on_work_list() && !node->label()->is_bound()) {
+ node->set_on_work_list(true);
+ work_list_->push_back(node);
+ }
+ }
+
+ static const int kImplementationOffset = 0;
+ static const int kNumberOfRegistersOffset = 0;
+ static const int kCodeOffset = 1;
+
+ RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
+ EndNode* accept() { return accept_; }
+
+ static const int kMaxRecursion = 100;
+ inline int recursion_depth() { return recursion_depth_; }
+ inline void IncrementRecursionDepth() { recursion_depth_++; }
+ inline void DecrementRecursionDepth() { recursion_depth_--; }
+
+ void SetRegExpTooBig() { reg_exp_too_big_ = true; }
+
+ inline bool one_byte() { return one_byte_; }
+ inline bool optimize() { return optimize_; }
+ inline void set_optimize(bool value) { optimize_ = value; }
+ inline bool limiting_recursion() { return limiting_recursion_; }
+ inline void set_limiting_recursion(bool value) {
+ limiting_recursion_ = value;
+ }
+ bool read_backward() { return read_backward_; }
+ void set_read_backward(bool value) { read_backward_ = value; }
+ FrequencyCollator* frequency_collator() { return &frequency_collator_; }
+
+ int current_expansion_factor() { return current_expansion_factor_; }
+ void set_current_expansion_factor(int value) {
+ current_expansion_factor_ = value;
+ }
+
+ Isolate* isolate() const { return isolate_; }
+ Zone* zone() const { return zone_; }
+
+ static const int kNoRegister = -1;
+
+ private:
+ EndNode* accept_;
+ int next_register_;
+ int unicode_lookaround_stack_register_;
+ int unicode_lookaround_position_register_;
+ std::vector<RegExpNode*>* work_list_;
+ int recursion_depth_;
+ RegExpMacroAssembler* macro_assembler_;
+ bool one_byte_;
+ bool reg_exp_too_big_;
+ bool limiting_recursion_;
+ bool optimize_;
+ bool read_backward_;
+ int current_expansion_factor_;
+ FrequencyCollator frequency_collator_;
+ Isolate* isolate_;
+ Zone* zone_;
+};
+
+// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
+class UnicodeRangeSplitter {
+ public:
+ V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList<CharacterRange>* base);
+
+ static constexpr int kInitialSize = 8;
+ using CharacterRangeVector = base::SmallVector<CharacterRange, kInitialSize>;
+
+ const CharacterRangeVector* bmp() const { return &bmp_; }
+ const CharacterRangeVector* lead_surrogates() const {
+ return &lead_surrogates_;
+ }
+ const CharacterRangeVector* trail_surrogates() const {
+ return &trail_surrogates_;
+ }
+ const CharacterRangeVector* non_bmp() const { return &non_bmp_; }
+
+ private:
+ void AddRange(CharacterRange range);
+
+ CharacterRangeVector bmp_;
+ CharacterRangeVector lead_surrogates_;
+ CharacterRangeVector trail_surrogates_;
+ CharacterRangeVector non_bmp_;
+};
+
+// We need to check for the following characters: 0x39C 0x3BC 0x178.
+// TODO(jgruber): Move to CharacterRange.
+bool RangeContainsLatin1Equivalents(CharacterRange range);
+
+} // namespace internal
+} // namespace v8
+
+#endif // V8_REGEXP_REGEXP_COMPILER_H_