mirror of
https://github.com/dart-lang/sdk
synced 2024-09-19 20:41:45 +00:00
f31b7928ae
See https://github.com/tc39/proposal-regexp-lookbehind for a high-level description of the feature and examples. This is one of the features requested in https://github.com/dart-lang/sdk/issues/34935. This work takes the feature as present in the v8 engine and appropriately merges it into our irregexp fork. Notable changes to the irregexp codebase to introduce this feature: ----- We can no longer assume that all matching proceeds forwards, since lookbehind matching proceeds backwards. Similarly, we cannot assume that we can only be at the start of a string if we started matching from that point. The direction of matching must also be taken into consideration when doing bounds checking, which previously assumed the engine would never attempt to look before the start of a string. ----- We may now parse backreferences to captures before the capture they reference, since we parse regular expressions left to right, but lookbehinds perform captures as they evaluate the string from right to left. Since RegExpBackReference objects contain a pointer to their corresponding capture, this means that we may need to create RegExpCapture objects prior to the parsing of the corresponding captured subexpression. Thus, RegExpCapture objects are now only initialized with their index, and the body is set later when the subexpression is encountered and parsed. This means any method that operates on the body of a RegExpCapture can no longer be const, which also affects the rest of the RegExpTree class hierarchy. This also means that we don't have a valid max_match length for backreferences based off the capture body, and must assume they can end up being any length. ----- Change-Id: Iffe0e71b17b1a0c6fea77235e8aee5c093005811 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/94540 Commit-Queue: Stevie Strickland <sstrickl@google.com> Reviewed-by: Martin Kustermann <kustermann@google.com>
306 lines
8.9 KiB
C++
306 lines
8.9 KiB
C++
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
|
|
// for details. All rights reserved. Use of this source code is governed by a
|
|
// BSD-style license that can be found in the LICENSE file.
|
|
|
|
#include "vm/regexp_ast.h"
|
|
|
|
#include "platform/utils.h"
|
|
#include "vm/os.h"
|
|
|
|
namespace dart {
|
|
|
|
#define MAKE_ACCEPT(Name) \
|
|
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
|
|
return visitor->Visit##Name(this, data); \
|
|
}
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
|
|
#undef MAKE_ACCEPT
|
|
|
|
#define MAKE_TYPE_CASE(Name) \
|
|
RegExp##Name* RegExpTree::As##Name() { return NULL; } \
|
|
bool RegExpTree::Is##Name() const { return false; }
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
|
#undef MAKE_TYPE_CASE
|
|
|
|
#define MAKE_TYPE_CASE(Name) \
|
|
RegExp##Name* RegExp##Name::As##Name() { return this; } \
|
|
bool RegExp##Name::Is##Name() const { return true; }
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
|
|
#undef MAKE_TYPE_CASE
|
|
|
|
// Returns the union of the capture-register intervals reported by every tree
// in |children|. The result is Interval::Empty() when the list has no entries.
static Interval ListCaptureRegisters(ZoneGrowableArray<RegExpTree*>* children) {
  Interval combined = Interval::Empty();
  intptr_t idx = 0;
  while (idx < children->length()) {
    RegExpTree* child = children->At(idx);
    combined = combined.Union(child->CaptureRegisters());
    ++idx;
  }
  return combined;
}
|
|
|
|
// The registers used by a sequence are the union of the registers used by
// each of its terms.
Interval RegExpAlternative::CaptureRegisters() const {
  ZoneGrowableArray<RegExpTree*>* terms = this->nodes();
  return ListCaptureRegisters(terms);
}
|
|
|
|
// The registers used by a disjunction are the union of the registers used by
// each of its alternatives.
Interval RegExpDisjunction::CaptureRegisters() const {
  ZoneGrowableArray<RegExpTree*>* choices = this->alternatives();
  return ListCaptureRegisters(choices);
}
|
|
|
|
// A lookaround adds no registers of its own; it reports whatever its body
// reports.
Interval RegExpLookaround::CaptureRegisters() const {
  return this->body()->CaptureRegisters();
}
|
|
|
|
// Combines this capture's own start/end register pair with the registers
// reported by the captured body.
Interval RegExpCapture::CaptureRegisters() const {
  Interval own_registers(StartRegister(index()), EndRegister(index()));
  Interval nested_registers = body()->CaptureRegisters();
  return own_registers.Union(nested_registers);
}
|
|
|
|
// A quantifier adds no registers of its own; it reports whatever the
// quantified body reports.
Interval RegExpQuantifier::CaptureRegisters() const {
  return this->body()->CaptureRegisters();
}
|
|
|
|
bool RegExpAssertion::IsAnchoredAtStart() const {
|
|
return assertion_type() == RegExpAssertion::START_OF_INPUT;
|
|
}
|
|
|
|
bool RegExpAssertion::IsAnchoredAtEnd() const {
|
|
return assertion_type() == RegExpAssertion::END_OF_INPUT;
|
|
}
|
|
|
|
bool RegExpAlternative::IsAnchoredAtStart() const {
|
|
ZoneGrowableArray<RegExpTree*>* nodes = this->nodes();
|
|
for (intptr_t i = 0; i < nodes->length(); i++) {
|
|
RegExpTree* node = nodes->At(i);
|
|
if (node->IsAnchoredAtStart()) {
|
|
return true;
|
|
}
|
|
if (node->max_match() > 0) {
|
|
return false;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RegExpAlternative::IsAnchoredAtEnd() const {
|
|
ZoneGrowableArray<RegExpTree*>* nodes = this->nodes();
|
|
for (intptr_t i = nodes->length() - 1; i >= 0; i--) {
|
|
RegExpTree* node = nodes->At(i);
|
|
if (node->IsAnchoredAtEnd()) {
|
|
return true;
|
|
}
|
|
if (node->max_match() > 0) {
|
|
return false;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RegExpDisjunction::IsAnchoredAtStart() const {
|
|
ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
|
|
for (intptr_t i = 0; i < alternatives->length(); i++) {
|
|
if (!alternatives->At(i)->IsAnchoredAtStart()) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool RegExpDisjunction::IsAnchoredAtEnd() const {
|
|
ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
|
|
for (intptr_t i = 0; i < alternatives->length(); i++) {
|
|
if (!alternatives->At(i)->IsAnchoredAtEnd()) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool RegExpLookaround::IsAnchoredAtStart() const {
|
|
return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
|
|
}
|
|
|
|
bool RegExpCapture::IsAnchoredAtStart() const {
|
|
return body()->IsAnchoredAtStart();
|
|
}
|
|
|
|
bool RegExpCapture::IsAnchoredAtEnd() const {
|
|
return body()->IsAnchoredAtEnd();
|
|
}
|
|
|
|
// Convert regular expression trees to a simple sexp representation.
|
|
// This representation should be different from the input grammar
|
|
// in as many cases as possible, to make it more difficult for incorrect
|
|
// parses to look as correct ones which is likely if the input and
|
|
// output formats are alike.
|
|
class RegExpUnparser : public RegExpVisitor {
|
|
public:
|
|
void VisitCharacterRange(CharacterRange that);
|
|
#define MAKE_CASE(Name) virtual void* Visit##Name(RegExp##Name*, void* data);
|
|
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
|
|
#undef MAKE_CASE
|
|
};
|
|
|
|
// Prints a disjunction as "(|" followed by each alternative, every one
// preceded by a single space, and a closing ")". Always returns NULL.
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  OS::PrintErr("(|");
  ZoneGrowableArray<RegExpTree*>* choices = that->alternatives();
  intptr_t idx = 0;
  while (idx < choices->length()) {
    OS::PrintErr(" ");
    choices->At(idx)->Accept(this, data);
    ++idx;
  }
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a sequence as "(:" followed by each term, every one preceded by a
// single space, and a closing ")". Always returns NULL.
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  OS::PrintErr("(:");
  ZoneGrowableArray<RegExpTree*>* terms = that->nodes();
  intptr_t idx = 0;
  while (idx < terms->length()) {
    OS::PrintErr(" ");
    terms->At(idx)->Accept(this, data);
    ++idx;
  }
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a character range: just the first character for a singleton range,
// otherwise the first and last characters joined by "-".
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  PrintUtf16(that.from());
  if (that.IsSingleton()) {
    return;
  }
  OS::PrintErr("-");
  PrintUtf16(that.to());
}
|
|
|
|
// Prints a character class as its ranges, separated by single spaces, inside
// square brackets. A negated class gets a "^" in front of the opening
// bracket. Always returns NULL.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    OS::PrintErr("^");
  }
  OS::PrintErr("[");
  for (intptr_t idx = 0; idx < that->ranges()->length(); ++idx) {
    // Separator goes between ranges, never before the first one.
    if (idx != 0) {
      OS::PrintErr(" ");
    }
    VisitCharacterRange((*that->ranges())[idx]);
  }
  OS::PrintErr("]");
  return NULL;
}
|
|
|
|
// Prints the "@"-prefixed tag for the assertion's type. Nothing is printed
// for an assertion type that has no case below. Always returns NULL.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      OS::PrintErr("@^i");
      return NULL;
    case RegExpAssertion::END_OF_INPUT:
      OS::PrintErr("@$i");
      return NULL;
    case RegExpAssertion::START_OF_LINE:
      OS::PrintErr("@^l");
      return NULL;
    case RegExpAssertion::END_OF_LINE:
      OS::PrintErr("@$l");
      return NULL;
    case RegExpAssertion::BOUNDARY:
      OS::PrintErr("@b");
      return NULL;
    case RegExpAssertion::NON_BOUNDARY:
      OS::PrintErr("@B");
      return NULL;
  }
  return NULL;
}
|
|
|
|
// Prints an atom as its UTF-16 code units between single quotes.
// Always returns NULL.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  OS::PrintErr("'");
  ZoneGrowableArray<uint16_t>* code_units = that->data();
  intptr_t pos = 0;
  while (pos < code_units->length()) {
    PrintUtf16(code_units->At(pos));
    ++pos;
  }
  OS::PrintErr("'");
  return NULL;
}
|
|
|
|
// Prints a text node. A node with exactly one element is printed as that
// element's tree alone; any other element count is printed as "(!" followed
// by each element's tree, every one preceded by a single space, and a
// closing ")". Always returns NULL.
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  const intptr_t count = that->elements()->length();
  if (count == 1) {
    (*that->elements())[0].tree()->Accept(this, data);
    return NULL;
  }
  OS::PrintErr("(!");
  for (intptr_t idx = 0; idx < count; ++idx) {
    OS::PrintErr(" ");
    (*that->elements())[idx].tree()->Accept(this, data);
  }
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a quantifier as "(# min max mode body)". An unbounded maximum
// (RegExpTree::kInfinity) is shown as "-"; the mode is "g" for greedy, "p"
// for possessive and "n" otherwise. Always returns NULL.
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  OS::PrintErr("(# %" Pd " ", that->min());
  if (that->max() != RegExpTree::kInfinity) {
    OS::PrintErr("%" Pd " ", that->max());
  } else {
    OS::PrintErr("- ");
  }
  // Greediness is checked first; possessiveness is only consulted for
  // non-greedy quantifiers.
  if (that->is_greedy()) {
    OS::PrintErr("g ");
  } else if (that->is_possessive()) {
    OS::PrintErr("p ");
  } else {
    OS::PrintErr("n ");
  }
  that->body()->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a capture as "(^ body)". Always returns NULL.
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  OS::PrintErr("(^ ");
  // The captured subexpression is printed by recursing into it.
  that->body()->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a lookaround as "(dir sign body)", where dir is "->" for a lookahead
// and "<-" for any other lookaround type, and sign is "+" for a positive
// assertion and "-" for a negative one. Always returns NULL.
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  // Exactly one opening parenthesis is emitted (as part of the format string)
  // so that it pairs with the single closing parenthesis printed after the
  // body. Printing an additional "(" beforehand would leave the s-expression
  // unbalanced.
  OS::PrintErr("(%s %s",
               (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"),
               (that->is_positive() ? "+ " : "- "));
  that->body()->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
|
|
|
|
// Prints a back reference as "(<- N)", where N is the value returned by the
// node's index(). Always returns NULL.
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, void*) {
  OS::PrintErr("(<- %" Pd ")",
               that->index());
  return NULL;
}
|
|
|
|
// Prints the marker for an empty node. The format string "%%" is an escaped
// percent sign, so a single '%' is what gets written. Always returns NULL.
void* RegExpUnparser::VisitEmpty(RegExpEmpty*, void*) {
  OS::PrintErr("%%");
  return NULL;
}
|
|
|
|
void RegExpTree::Print() {
|
|
RegExpUnparser unparser;
|
|
Accept(&unparser, NULL);
|
|
}
|
|
|
|
// Builds a disjunction over |alternatives|, which must hold at least two
// entries. min_match_ becomes the smallest min_match() of any alternative and
// max_match_ the largest max_match() of any alternative.
RegExpDisjunction::RegExpDisjunction(
    ZoneGrowableArray<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  ASSERT(alternatives->length() > 1);
  // Seed both bounds from the first alternative, then fold in the rest.
  RegExpTree* head = alternatives->At(0);
  min_match_ = head->min_match();
  max_match_ = head->max_match();
  intptr_t idx = 1;
  while (idx < alternatives->length()) {
    RegExpTree* choice = alternatives->At(idx);
    min_match_ = Utils::Minimum(min_match_, choice->min_match());
    max_match_ = Utils::Maximum(max_match_, choice->max_match());
    ++idx;
  }
}
|
|
|
|
static intptr_t IncreaseBy(intptr_t previous, intptr_t increase) {
|
|
if (RegExpTree::kInfinity - previous < increase) {
|
|
return RegExpTree::kInfinity;
|
|
} else {
|
|
return previous + increase;
|
|
}
|
|
}
|
|
|
|
// Builds a sequence over |nodes|, which must hold at least two entries.
// min_match_ and max_match_ are the sums of the terms' min_match() and
// max_match() values, accumulated through IncreaseBy.
RegExpAlternative::RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes)
    : nodes_(nodes) {
  ASSERT(nodes->length() > 1);
  min_match_ = 0;
  max_match_ = 0;
  intptr_t pos = 0;
  while (pos < nodes->length()) {
    RegExpTree* term = nodes->At(pos);
    min_match_ = IncreaseBy(min_match_, term->min_match());
    max_match_ = IncreaseBy(max_match_, term->max_match());
    ++pos;
  }
}
|
|
|
|
} // namespace dart
|