mirror of
https://github.com/dart-lang/sdk
synced 2024-10-14 04:50:22 +00:00
a64b06ec83
Issue https://dart-review.googlesource.com/c/sdk/+/115701 Change-Id: Ib579f0bbc8d694aec74afd837217316a10baf910 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/115707 Commit-Queue: Martin Kustermann <kustermann@google.com> Reviewed-by: Ryan Macnak <rmacnak@google.com> Reviewed-by: Daco Harkes <dacoharkes@google.com>
449 lines
16 KiB
C++
449 lines
16 KiB
C++
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
|
|
// for details. All rights reserved. Use of this source code is governed by a
|
|
// BSD-style license that can be found in the LICENSE file.
|
|
|
|
#ifndef RUNTIME_VM_REGEXP_AST_H_
|
|
#define RUNTIME_VM_REGEXP_AST_H_
|
|
|
|
#include "platform/globals.h"
|
|
#include "platform/utils.h"
|
|
#include "vm/allocation.h"
|
|
#include "vm/regexp.h"
|
|
|
|
namespace dart {
|
|
|
|
class RegExpAlternative;
|
|
class RegExpAssertion;
|
|
class RegExpAtom;
|
|
class RegExpBackReference;
|
|
class RegExpCapture;
|
|
class RegExpCharacterClass;
|
|
class RegExpCompiler;
|
|
class RegExpDisjunction;
|
|
class RegExpEmpty;
|
|
class RegExpLookaround;
|
|
class RegExpQuantifier;
|
|
class RegExpText;
|
|
|
|
class RegExpVisitor : public ValueObject {
|
|
public:
|
|
virtual ~RegExpVisitor() {}
|
|
#define MAKE_CASE(Name) \
|
|
virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
|
|
#undef MAKE_CASE
|
|
};
|
|
|
|
class RegExpTree : public ZoneAllocated {
|
|
public:
|
|
static const intptr_t kInfinity = kMaxInt32;
|
|
virtual ~RegExpTree() {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler,
|
|
RegExpNode* on_success) = 0;
|
|
virtual bool IsTextElement() const { return false; }
|
|
virtual bool IsAnchoredAtStart() const { return false; }
|
|
virtual bool IsAnchoredAtEnd() const { return false; }
|
|
virtual intptr_t min_match() const = 0;
|
|
virtual intptr_t max_match() const = 0;
|
|
// Returns the interval of registers used for captures within this
|
|
// expression.
|
|
virtual Interval CaptureRegisters() const { return Interval::Empty(); }
|
|
virtual void AppendToText(RegExpText* text);
|
|
void Print();
|
|
#define MAKE_ASTYPE(Name) \
|
|
virtual RegExp##Name* As##Name(); \
|
|
virtual bool Is##Name() const;
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
|
|
#undef MAKE_ASTYPE
|
|
};
|
|
|
|
class RegExpDisjunction : public RegExpTree {
|
|
public:
|
|
explicit RegExpDisjunction(ZoneGrowableArray<RegExpTree*>* alternatives);
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpDisjunction* AsDisjunction();
|
|
virtual Interval CaptureRegisters() const;
|
|
virtual bool IsDisjunction() const;
|
|
virtual bool IsAnchoredAtStart() const;
|
|
virtual bool IsAnchoredAtEnd() const;
|
|
virtual intptr_t min_match() const { return min_match_; }
|
|
virtual intptr_t max_match() const { return max_match_; }
|
|
ZoneGrowableArray<RegExpTree*>* alternatives() const { return alternatives_; }
|
|
|
|
private:
|
|
ZoneGrowableArray<RegExpTree*>* alternatives_;
|
|
intptr_t min_match_;
|
|
intptr_t max_match_;
|
|
};
|
|
|
|
class RegExpAlternative : public RegExpTree {
|
|
public:
|
|
explicit RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes);
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpAlternative* AsAlternative();
|
|
virtual Interval CaptureRegisters() const;
|
|
virtual bool IsAlternative() const;
|
|
virtual bool IsAnchoredAtStart() const;
|
|
virtual bool IsAnchoredAtEnd() const;
|
|
virtual intptr_t min_match() const { return min_match_; }
|
|
virtual intptr_t max_match() const { return max_match_; }
|
|
ZoneGrowableArray<RegExpTree*>* nodes() const { return nodes_; }
|
|
|
|
private:
|
|
ZoneGrowableArray<RegExpTree*>* nodes_;
|
|
intptr_t min_match_;
|
|
intptr_t max_match_;
|
|
};
|
|
|
|
class RegExpAssertion : public RegExpTree {
|
|
public:
|
|
enum AssertionType {
|
|
START_OF_LINE,
|
|
START_OF_INPUT,
|
|
END_OF_LINE,
|
|
END_OF_INPUT,
|
|
BOUNDARY,
|
|
NON_BOUNDARY
|
|
};
|
|
RegExpAssertion(AssertionType type, RegExpFlags flags)
|
|
: assertion_type_(type), flags_(flags) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpAssertion* AsAssertion();
|
|
virtual bool IsAssertion() const;
|
|
virtual bool IsAnchoredAtStart() const;
|
|
virtual bool IsAnchoredAtEnd() const;
|
|
virtual intptr_t min_match() const { return 0; }
|
|
virtual intptr_t max_match() const { return 0; }
|
|
AssertionType assertion_type() const { return assertion_type_; }
|
|
|
|
private:
|
|
AssertionType assertion_type_;
|
|
RegExpFlags flags_;
|
|
};
|
|
|
|
class CharacterSet : public ValueObject {
|
|
public:
|
|
explicit CharacterSet(uint16_t standard_set_type)
|
|
: ranges_(NULL), standard_set_type_(standard_set_type) {}
|
|
explicit CharacterSet(ZoneGrowableArray<CharacterRange>* ranges)
|
|
: ranges_(ranges), standard_set_type_(0) {}
|
|
CharacterSet(const CharacterSet& that)
|
|
: ValueObject(),
|
|
ranges_(that.ranges_),
|
|
standard_set_type_(that.standard_set_type_) {}
|
|
ZoneGrowableArray<CharacterRange>* ranges();
|
|
uint16_t standard_set_type() const { return standard_set_type_; }
|
|
void set_standard_set_type(uint16_t special_set_type) {
|
|
standard_set_type_ = special_set_type;
|
|
}
|
|
bool is_standard() { return standard_set_type_ != 0; }
|
|
void Canonicalize();
|
|
|
|
private:
|
|
ZoneGrowableArray<CharacterRange>* ranges_;
|
|
// If non-zero, the value represents a standard set (e.g., all whitespace
|
|
// characters) without having to expand the ranges.
|
|
uint16_t standard_set_type_;
|
|
};
|
|
|
|
class RegExpCharacterClass : public RegExpTree {
|
|
public:
|
|
enum Flag {
|
|
// The character class is negated and should match everything but the
|
|
// specified ranges.
|
|
NEGATED = 1 << 0,
|
|
// The character class contains part of a split surrogate and should not
|
|
// be unicode-desugared.
|
|
CONTAINS_SPLIT_SURROGATE = 1 << 1,
|
|
};
|
|
using CharacterClassFlags = intptr_t;
|
|
static inline CharacterClassFlags DefaultFlags() { return 0; }
|
|
|
|
RegExpCharacterClass(
|
|
ZoneGrowableArray<CharacterRange>* ranges,
|
|
RegExpFlags flags,
|
|
CharacterClassFlags character_class_flags = DefaultFlags())
|
|
: set_(ranges),
|
|
flags_(flags),
|
|
character_class_flags_(character_class_flags) {
|
|
// Convert the empty set of ranges to the negated Everything() range.
|
|
if (ranges->is_empty()) {
|
|
ranges->Add(CharacterRange::Everything());
|
|
character_class_flags_ ^= NEGATED;
|
|
}
|
|
}
|
|
RegExpCharacterClass(uint16_t type, RegExpFlags flags)
|
|
: set_(type), flags_(flags), character_class_flags_(0) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpCharacterClass* AsCharacterClass();
|
|
virtual bool IsCharacterClass() const;
|
|
virtual bool IsTextElement() const { return true; }
|
|
virtual intptr_t min_match() const { return 1; }
|
|
// The character class may match two code units for unicode regexps.
|
|
virtual intptr_t max_match() const { return 2; }
|
|
virtual void AppendToText(RegExpText* text);
|
|
CharacterSet character_set() const { return set_; }
|
|
// TODO(lrn): Remove need for complex version if is_standard that
|
|
// recognizes a mangled standard set and just do { return set_.is_special(); }
|
|
bool is_standard();
|
|
// Returns a value representing the standard character set if is_standard()
|
|
// returns true.
|
|
// Currently used values are:
|
|
// s : unicode whitespace
|
|
// S : unicode non-whitespace
|
|
// w : ASCII word character (digit, letter, underscore)
|
|
// W : non-ASCII word character
|
|
// d : ASCII digit
|
|
// D : non-ASCII digit
|
|
// . : non-unicode non-newline
|
|
// * : All characters
|
|
uint16_t standard_type() const { return set_.standard_set_type(); }
|
|
ZoneGrowableArray<CharacterRange>* ranges() { return set_.ranges(); }
|
|
bool is_negated() const { return (character_class_flags_ & NEGATED) != 0; }
|
|
RegExpFlags flags() const { return flags_; }
|
|
bool contains_split_surrogate() const {
|
|
return (character_class_flags_ & CONTAINS_SPLIT_SURROGATE) != 0;
|
|
}
|
|
|
|
private:
|
|
CharacterSet set_;
|
|
RegExpFlags flags_;
|
|
CharacterClassFlags character_class_flags_;
|
|
};
|
|
|
|
class RegExpAtom : public RegExpTree {
|
|
public:
|
|
RegExpAtom(ZoneGrowableArray<uint16_t>* data, RegExpFlags flags)
|
|
: data_(data), flags_(flags) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpAtom* AsAtom();
|
|
virtual bool IsAtom() const;
|
|
virtual bool IsTextElement() const { return true; }
|
|
virtual intptr_t min_match() const { return data_->length(); }
|
|
virtual intptr_t max_match() const { return data_->length(); }
|
|
virtual void AppendToText(RegExpText* text);
|
|
ZoneGrowableArray<uint16_t>* data() const { return data_; }
|
|
intptr_t length() const { return data_->length(); }
|
|
RegExpFlags flags() const { return flags_; }
|
|
bool ignore_case() const { return flags_.IgnoreCase(); }
|
|
|
|
private:
|
|
ZoneGrowableArray<uint16_t>* data_;
|
|
const RegExpFlags flags_;
|
|
};
|
|
|
|
class RegExpText : public RegExpTree {
|
|
public:
|
|
RegExpText() : elements_(2), length_(0) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpText* AsText();
|
|
virtual bool IsText() const;
|
|
virtual bool IsTextElement() const { return true; }
|
|
virtual intptr_t min_match() const { return length_; }
|
|
virtual intptr_t max_match() const { return length_; }
|
|
virtual void AppendToText(RegExpText* text);
|
|
void AddElement(TextElement elm) {
|
|
elements_.Add(elm);
|
|
length_ += elm.length();
|
|
}
|
|
GrowableArray<TextElement>* elements() { return &elements_; }
|
|
|
|
private:
|
|
GrowableArray<TextElement> elements_;
|
|
intptr_t length_;
|
|
};
|
|
|
|
class RegExpQuantifier : public RegExpTree {
|
|
public:
|
|
enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
|
|
RegExpQuantifier(intptr_t min,
|
|
intptr_t max,
|
|
QuantifierType type,
|
|
RegExpTree* body)
|
|
: body_(body),
|
|
min_(min),
|
|
max_(max),
|
|
min_match_(min * body->min_match()),
|
|
quantifier_type_(type) {
|
|
if (max > 0 && body->max_match() > kInfinity / max) {
|
|
max_match_ = kInfinity;
|
|
} else {
|
|
max_match_ = max * body->max_match();
|
|
}
|
|
}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
static RegExpNode* ToNode(intptr_t min,
|
|
intptr_t max,
|
|
bool is_greedy,
|
|
RegExpTree* body,
|
|
RegExpCompiler* compiler,
|
|
RegExpNode* on_success,
|
|
bool not_at_start = false);
|
|
virtual RegExpQuantifier* AsQuantifier();
|
|
virtual Interval CaptureRegisters() const;
|
|
virtual bool IsQuantifier() const;
|
|
virtual intptr_t min_match() const { return min_match_; }
|
|
virtual intptr_t max_match() const { return max_match_; }
|
|
intptr_t min() const { return min_; }
|
|
intptr_t max() const { return max_; }
|
|
bool is_possessive() const { return quantifier_type_ == POSSESSIVE; }
|
|
bool is_non_greedy() const { return quantifier_type_ == NON_GREEDY; }
|
|
bool is_greedy() const { return quantifier_type_ == GREEDY; }
|
|
RegExpTree* body() const { return body_; }
|
|
|
|
private:
|
|
RegExpTree* body_;
|
|
intptr_t min_;
|
|
intptr_t max_;
|
|
intptr_t min_match_;
|
|
intptr_t max_match_;
|
|
QuantifierType quantifier_type_;
|
|
};
|
|
|
|
class RegExpCapture : public RegExpTree {
|
|
public:
|
|
explicit RegExpCapture(intptr_t index)
|
|
: body_(nullptr), index_(index), name_(nullptr) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
static RegExpNode* ToNode(RegExpTree* body,
|
|
intptr_t index,
|
|
RegExpCompiler* compiler,
|
|
RegExpNode* on_success);
|
|
virtual RegExpCapture* AsCapture();
|
|
virtual bool IsAnchoredAtStart() const;
|
|
virtual bool IsAnchoredAtEnd() const;
|
|
virtual Interval CaptureRegisters() const;
|
|
virtual bool IsCapture() const;
|
|
virtual intptr_t min_match() const { return body_->min_match(); }
|
|
virtual intptr_t max_match() const { return body_->max_match(); }
|
|
RegExpTree* body() const { return body_; }
|
|
// When a backreference is parsed before the corresponding capture group,
|
|
// which can happen because of lookbehind, we create the capture object when
|
|
// we create the backreference, and fill in the body later when the actual
|
|
// capture group is parsed.
|
|
void set_body(RegExpTree* body) { body_ = body; }
|
|
intptr_t index() const { return index_; }
|
|
const ZoneGrowableArray<uint16_t>* name() { return name_; }
|
|
void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }
|
|
static intptr_t StartRegister(intptr_t index) { return index * 2; }
|
|
static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; }
|
|
|
|
private:
|
|
RegExpTree* body_;
|
|
intptr_t index_;
|
|
const ZoneGrowableArray<uint16_t>* name_;
|
|
};
|
|
|
|
class RegExpLookaround : public RegExpTree {
|
|
public:
|
|
enum Type { LOOKAHEAD, LOOKBEHIND };
|
|
RegExpLookaround(RegExpTree* body,
|
|
bool is_positive,
|
|
intptr_t capture_count,
|
|
intptr_t capture_from,
|
|
Type type)
|
|
: body_(body),
|
|
is_positive_(is_positive),
|
|
capture_count_(capture_count),
|
|
capture_from_(capture_from),
|
|
type_(type) {}
|
|
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpLookaround* AsLookaround();
|
|
virtual Interval CaptureRegisters() const;
|
|
virtual bool IsLookaround() const;
|
|
virtual bool IsAnchoredAtStart() const;
|
|
virtual intptr_t min_match() const { return 0; }
|
|
virtual intptr_t max_match() const { return 0; }
|
|
RegExpTree* body() const { return body_; }
|
|
bool is_positive() const { return is_positive_; }
|
|
intptr_t capture_count() const { return capture_count_; }
|
|
intptr_t capture_from() const { return capture_from_; }
|
|
Type type() const { return type_; }
|
|
|
|
// The RegExpLookaround::Builder class abstracts out the process of building
|
|
// the compiling a RegExpLookaround object by splitting it into two phases,
|
|
// represented by the provided methods.
|
|
class Builder : public ValueObject {
|
|
public:
|
|
Builder(bool is_positive,
|
|
RegExpNode* on_success,
|
|
intptr_t stack_pointer_register,
|
|
intptr_t position_register,
|
|
intptr_t capture_register_count = 0,
|
|
intptr_t capture_register_start = 0);
|
|
RegExpNode* on_match_success() { return on_match_success_; }
|
|
RegExpNode* ForMatch(RegExpNode* match);
|
|
|
|
private:
|
|
bool is_positive_;
|
|
RegExpNode* on_match_success_;
|
|
RegExpNode* on_success_;
|
|
intptr_t stack_pointer_register_;
|
|
intptr_t position_register_;
|
|
};
|
|
|
|
private:
|
|
RegExpTree* body_;
|
|
bool is_positive_;
|
|
intptr_t capture_count_;
|
|
intptr_t capture_from_;
|
|
Type type_;
|
|
};
|
|
|
|
class RegExpBackReference : public RegExpTree {
|
|
public:
|
|
explicit RegExpBackReference(RegExpFlags flags)
|
|
: capture_(nullptr), name_(nullptr), flags_(flags) {}
|
|
RegExpBackReference(RegExpCapture* capture, RegExpFlags flags)
|
|
: capture_(capture), name_(nullptr), flags_(flags) {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpBackReference* AsBackReference();
|
|
virtual bool IsBackReference() const;
|
|
virtual intptr_t min_match() const { return 0; }
|
|
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
|
|
// recursion, we give up and just assume arbitrary length, which matches v8's
|
|
// behavior.
|
|
virtual intptr_t max_match() const { return kInfinity; }
|
|
intptr_t index() const { return capture_->index(); }
|
|
RegExpCapture* capture() const { return capture_; }
|
|
void set_capture(RegExpCapture* capture) { capture_ = capture; }
|
|
const ZoneGrowableArray<uint16_t>* name() { return name_; }
|
|
void set_name(const ZoneGrowableArray<uint16_t>* name) { name_ = name; }
|
|
|
|
private:
|
|
RegExpCapture* capture_;
|
|
const ZoneGrowableArray<uint16_t>* name_;
|
|
RegExpFlags flags_;
|
|
};
|
|
|
|
class RegExpEmpty : public RegExpTree {
|
|
public:
|
|
RegExpEmpty() {}
|
|
virtual void* Accept(RegExpVisitor* visitor, void* data);
|
|
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
|
|
virtual RegExpEmpty* AsEmpty();
|
|
virtual bool IsEmpty() const;
|
|
virtual intptr_t min_match() const { return 0; }
|
|
virtual intptr_t max_match() const { return 0; }
|
|
static RegExpEmpty* GetInstance() {
|
|
static RegExpEmpty* instance = ::new RegExpEmpty();
|
|
return instance;
|
|
}
|
|
};
|
|
|
|
} // namespace dart
|
|
|
|
#endif // RUNTIME_VM_REGEXP_AST_H_
|