diff options
Diffstat (limited to 'deps/v8/src/regexp/regexp-ast.h')
-rw-r--r-- | deps/v8/src/regexp/regexp-ast.h | 496 |
1 files changed, 496 insertions, 0 deletions
diff --git a/deps/v8/src/regexp/regexp-ast.h b/deps/v8/src/regexp/regexp-ast.h new file mode 100644 index 0000000000..f87778596a --- /dev/null +++ b/deps/v8/src/regexp/regexp-ast.h @@ -0,0 +1,496 @@ +// Copyright 2016 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_REGEXP_AST_H_ +#define V8_REGEXP_REGEXP_AST_H_ + +#include "src/utils.h" +#include "src/zone.h" + +namespace v8 { +namespace internal { + +#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ + VISIT(Disjunction) \ + VISIT(Alternative) \ + VISIT(Assertion) \ + VISIT(CharacterClass) \ + VISIT(Atom) \ + VISIT(Quantifier) \ + VISIT(Capture) \ + VISIT(Lookaround) \ + VISIT(BackReference) \ + VISIT(Empty) \ + VISIT(Text) + + +#define FORWARD_DECLARE(Name) class RegExp##Name; +FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) +#undef FORWARD_DECLARE + +class RegExpCompiler; +class RegExpNode; +class RegExpTree; + + +class RegExpVisitor BASE_EMBEDDED { + public: + virtual ~RegExpVisitor() {} +#define MAKE_CASE(Name) \ + virtual void* Visit##Name(RegExp##Name*, void* data) = 0; + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) +#undef MAKE_CASE +}; + + +// A simple closed interval. +class Interval { + public: + Interval() : from_(kNone), to_(kNone) {} + Interval(int from, int to) : from_(from), to_(to) {} + Interval Union(Interval that) { + if (that.from_ == kNone) + return *this; + else if (from_ == kNone) + return that; + else + return Interval(Min(from_, that.from_), Max(to_, that.to_)); + } + bool Contains(int value) { return (from_ <= value) && (value <= to_); } + bool is_empty() { return from_ == kNone; } + int from() const { return from_; } + int to() const { return to_; } + static Interval Empty() { return Interval(); } + static const int kNone = -1; + + private: + int from_; + int to_; +}; + + +// Represents code units in the range from from_ to to_, both ends are +// inclusive. +class CharacterRange { + public: + CharacterRange() : from_(0), to_(0) {} + // For compatibility with the CHECK_OK macro + CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT + CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {} + static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, + Zone* zone); + static Vector<const int> GetWordBounds(); + static inline CharacterRange Singleton(uc16 value) { + return CharacterRange(value, value); + } + static inline CharacterRange Range(uc16 from, uc16 to) { + DCHECK(from <= to); + return CharacterRange(from, to); + } + static inline CharacterRange Everything() { + return CharacterRange(0, 0xFFFF); + } + bool Contains(uc16 i) { return from_ <= i && i <= to_; } + uc16 from() const { return from_; } + void set_from(uc16 value) { from_ = value; } + uc16 to() const { return to_; } + void set_to(uc16 value) { to_ = value; } + bool is_valid() { return from_ <= to_; } + bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } + bool IsSingleton() { return (from_ == to_); } + void AddCaseEquivalents(Isolate* isolate, Zone* zone, + ZoneList<CharacterRange>* ranges, bool is_one_byte); + static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay, + ZoneList<CharacterRange>** included, + ZoneList<CharacterRange>** excluded, Zone* zone); + // Whether a range list is in canonical form: Ranges ordered by from value, + // and ranges non-overlapping and non-adjacent. + static bool IsCanonical(ZoneList<CharacterRange>* ranges); + // Convert range list to canonical form. The characters covered by the ranges + // will still be the same, but no character is in more than one range, and + // adjacent ranges are merged. The resulting list may be shorter than the + // original, but cannot be longer. + static void Canonicalize(ZoneList<CharacterRange>* ranges); + // Negate the contents of a character range in canonical form. + static void Negate(ZoneList<CharacterRange>* src, + ZoneList<CharacterRange>* dst, Zone* zone); + static const int kStartMarker = (1 << 24); + static const int kPayloadMask = (1 << 24) - 1; + + private: + uc16 from_; + uc16 to_; +}; + + +class CharacterSet final BASE_EMBEDDED { + public: + explicit CharacterSet(uc16 standard_set_type) + : ranges_(NULL), standard_set_type_(standard_set_type) {} + explicit CharacterSet(ZoneList<CharacterRange>* ranges) + : ranges_(ranges), standard_set_type_(0) {} + ZoneList<CharacterRange>* ranges(Zone* zone); + uc16 standard_set_type() { return standard_set_type_; } + void set_standard_set_type(uc16 special_set_type) { + standard_set_type_ = special_set_type; + } + bool is_standard() { return standard_set_type_ != 0; } + void Canonicalize(); + + private: + ZoneList<CharacterRange>* ranges_; + // If non-zero, the value represents a standard set (e.g., all whitespace + // characters) without having to expand the ranges. + uc16 standard_set_type_; +}; + + +class TextElement final BASE_EMBEDDED { + public: + enum TextType { ATOM, CHAR_CLASS }; + + static TextElement Atom(RegExpAtom* atom); + static TextElement CharClass(RegExpCharacterClass* char_class); + + int cp_offset() const { return cp_offset_; } + void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } + int length() const; + + TextType text_type() const { return text_type_; } + + RegExpTree* tree() const { return tree_; } + + RegExpAtom* atom() const { + DCHECK(text_type() == ATOM); + return reinterpret_cast<RegExpAtom*>(tree()); + } + + RegExpCharacterClass* char_class() const { + DCHECK(text_type() == CHAR_CLASS); + return reinterpret_cast<RegExpCharacterClass*>(tree()); + } + + private: + TextElement(TextType text_type, RegExpTree* tree) + : cp_offset_(-1), text_type_(text_type), tree_(tree) {} + + int cp_offset_; + TextType text_type_; + RegExpTree* tree_; +}; + + +class RegExpTree : public ZoneObject { + public: + static const int kInfinity = kMaxInt; + virtual ~RegExpTree() {} + virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; + virtual RegExpNode* ToNode(RegExpCompiler* compiler, + RegExpNode* on_success) = 0; + virtual bool IsTextElement() { return false; } + virtual bool IsAnchoredAtStart() { return false; } + virtual bool IsAnchoredAtEnd() { return false; } + virtual int min_match() = 0; + virtual int max_match() = 0; + // Returns the interval of registers used for captures within this + // expression. + virtual Interval CaptureRegisters() { return Interval::Empty(); } + virtual void AppendToText(RegExpText* text, Zone* zone); + std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT +#define MAKE_ASTYPE(Name) \ + virtual RegExp##Name* As##Name(); \ + virtual bool Is##Name(); + FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) +#undef MAKE_ASTYPE +}; + + +class RegExpDisjunction final : public RegExpTree { + public: + explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpDisjunction* AsDisjunction() override; + Interval CaptureRegisters() override; + bool IsDisjunction() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* alternatives() { return alternatives_; } + + private: + bool SortConsecutiveAtoms(RegExpCompiler* compiler); + void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); + void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); + ZoneList<RegExpTree*>* alternatives_; + int min_match_; + int max_match_; +}; + + +class RegExpAlternative final : public RegExpTree { + public: + explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAlternative* AsAlternative() override; + Interval CaptureRegisters() override; + bool IsAlternative() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + ZoneList<RegExpTree*>* nodes() { return nodes_; } + + private: + ZoneList<RegExpTree*>* nodes_; + int min_match_; + int max_match_; +}; + + +class RegExpAssertion final : public RegExpTree { + public: + enum AssertionType { + START_OF_LINE, + START_OF_INPUT, + END_OF_LINE, + END_OF_INPUT, + BOUNDARY, + NON_BOUNDARY + }; + explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAssertion* AsAssertion() override; + bool IsAssertion() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + AssertionType assertion_type() { return assertion_type_; } + + private: + AssertionType assertion_type_; +}; + + +class RegExpCharacterClass final : public RegExpTree { + public: + RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) + : set_(ranges), is_negated_(is_negated) {} + explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpCharacterClass* AsCharacterClass() override; + bool IsCharacterClass() override; + bool IsTextElement() override { return true; } + int min_match() override { return 1; } + int max_match() override { return 1; } + void AppendToText(RegExpText* text, Zone* zone) override; + CharacterSet character_set() { return set_; } + // TODO(lrn): Remove need for complex version if is_standard that + // recognizes a mangled standard set and just do { return set_.is_special(); } + bool is_standard(Zone* zone); + // Returns a value representing the standard character set if is_standard() + // returns true. + // Currently used values are: + // s : unicode whitespace + // S : unicode non-whitespace + // w : ASCII word character (digit, letter, underscore) + // W : non-ASCII word character + // d : ASCII digit + // D : non-ASCII digit + // . : non-unicode non-newline + // * : All characters + uc16 standard_type() { return set_.standard_set_type(); } + ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } + bool is_negated() { return is_negated_; } + + private: + CharacterSet set_; + bool is_negated_; +}; + + +class RegExpAtom final : public RegExpTree { + public: + explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpAtom* AsAtom() override; + bool IsAtom() override; + bool IsTextElement() override { return true; } + int min_match() override { return data_.length(); } + int max_match() override { return data_.length(); } + void AppendToText(RegExpText* text, Zone* zone) override; + Vector<const uc16> data() { return data_; } + int length() { return data_.length(); } + + private: + Vector<const uc16> data_; +}; + + +class RegExpText final : public RegExpTree { + public: + explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpText* AsText() override; + bool IsText() override; + bool IsTextElement() override { return true; } + int min_match() override { return length_; } + int max_match() override { return length_; } + void AppendToText(RegExpText* text, Zone* zone) override; + void AddElement(TextElement elm, Zone* zone) { + elements_.Add(elm, zone); + length_ += elm.length(); + } + ZoneList<TextElement>* elements() { return &elements_; } + + private: + ZoneList<TextElement> elements_; + int length_; +}; + + +class RegExpQuantifier final : public RegExpTree { + public: + enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; + RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) + : body_(body), + min_(min), + max_(max), + min_match_(min * body->min_match()), + quantifier_type_(type) { + if (max > 0 && body->max_match() > kInfinity / max) { + max_match_ = kInfinity; + } else { + max_match_ = max * body->max_match(); + } + } + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, + RegExpCompiler* compiler, RegExpNode* on_success, + bool not_at_start = false); + RegExpQuantifier* AsQuantifier() override; + Interval CaptureRegisters() override; + bool IsQuantifier() override; + int min_match() override { return min_match_; } + int max_match() override { return max_match_; } + int min() { return min_; } + int max() { return max_; } + bool is_possessive() { return quantifier_type_ == POSSESSIVE; } + bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } + bool is_greedy() { return quantifier_type_ == GREEDY; } + RegExpTree* body() { return body_; } + + private: + RegExpTree* body_; + int min_; + int max_; + int min_match_; + int max_match_; + QuantifierType quantifier_type_; +}; + + +class RegExpCapture final : public RegExpTree { + public: + explicit RegExpCapture(int index) : body_(NULL), index_(index) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + static RegExpNode* ToNode(RegExpTree* body, int index, + RegExpCompiler* compiler, RegExpNode* on_success); + RegExpCapture* AsCapture() override; + bool IsAnchoredAtStart() override; + bool IsAnchoredAtEnd() override; + Interval CaptureRegisters() override; + bool IsCapture() override; + int min_match() override { return body_->min_match(); } + int max_match() override { return body_->max_match(); } + RegExpTree* body() { return body_; } + void set_body(RegExpTree* body) { body_ = body; } + int index() { return index_; } + static int StartRegister(int index) { return index * 2; } + static int EndRegister(int index) { return index * 2 + 1; } + + private: + RegExpTree* body_; + int index_; +}; + + +class RegExpLookaround final : public RegExpTree { + public: + enum Type { LOOKAHEAD, LOOKBEHIND }; + + RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, + int capture_from, Type type) + : body_(body), + is_positive_(is_positive), + capture_count_(capture_count), + capture_from_(capture_from), + type_(type) {} + + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpLookaround* AsLookaround() override; + Interval CaptureRegisters() override; + bool IsLookaround() override; + bool IsAnchoredAtStart() override; + int min_match() override { return 0; } + int max_match() override { return 0; } + RegExpTree* body() { return body_; } + bool is_positive() { return is_positive_; } + int capture_count() { return capture_count_; } + int capture_from() { return capture_from_; } + Type type() { return type_; } + + private: + RegExpTree* body_; + bool is_positive_; + int capture_count_; + int capture_from_; + Type type_; +}; + + +class RegExpBackReference final : public RegExpTree { + public: + explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpBackReference* AsBackReference() override; + bool IsBackReference() override; + int min_match() override { return 0; } + // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite + // recursion, we give up. Ignorance is bliss. + int max_match() override { return kInfinity; } + int index() { return capture_->index(); } + RegExpCapture* capture() { return capture_; } + + private: + RegExpCapture* capture_; +}; + + +class RegExpEmpty final : public RegExpTree { + public: + RegExpEmpty() {} + void* Accept(RegExpVisitor* visitor, void* data) override; + RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; + RegExpEmpty* AsEmpty() override; + bool IsEmpty() override; + int min_match() override { return 0; } + int max_match() override { return 0; } +}; + +} // namespace internal +} // namespace v8 + +#endif // V8_REGEXP_REGEXP_AST_H_ |