dart-sdk/runtime/vm/regexp_ast.cc
Stevie Strickland f31b7928ae [VM] Adding regexp lookbehind assertion support.
See https://github.com/tc39/proposal-regexp-lookbehind
for a high-level description of the feature and examples.  This is one of the
features requested in https://github.com/dart-lang/sdk/issues/34935.

This work takes the feature as present in the v8 engine and appropriately
merges it into our irregexp fork. Notable changes to the irregexp codebase to
introduce this feature:

-----

We can no longer assume that all matching proceeds forwards, since lookbehind
matching proceeds backwards. Similarly, we cannot assume that we can only be
at the start of a string if we started matching from that point. The direction
of matching must also be taken into consideration when doing bounds checking,
which previously assumed the engine would never attempt to look before the
start of a string.

-----

We may now parse backreferences to captures before the capture they
reference, since we parse regular expressions left to right, but lookbehinds
perform captures as they evaluate the string from right to left.  Since
RegExpBackReference objects contain a pointer to their corresponding capture,
this means that we may need to create RegExpCapture objects prior to the
parsing of the corresponding captured subexpression.

Thus, RegExpCapture objects are now only initialized with their index, and the
body is set later when the subexpression is encountered and parsed. This means
any method that operates on the body of a RegExpCapture can no longer be const,
which also affects the rest of the RegExpTree class hierarchy. This also means
that we don't have a valid max_match length for backreferences based off the
capture body, and must assume they can end up being any length.

-----


Change-Id: Iffe0e71b17b1a0c6fea77235e8aee5c093005811
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/94540
Commit-Queue: Stevie Strickland <sstrickl@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
2019-03-14 14:26:47 +00:00

306 lines
8.9 KiB
C++

// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
#include "vm/regexp_ast.h"
#include "platform/utils.h"
#include "vm/os.h"
namespace dart {
// Defines RegExp<Name>::Accept for every node type that
// FOR_EACH_REG_EXP_TREE_TYPE enumerates (the list itself is defined outside
// this file). Each generated Accept forwards to the visitor's Visit<Name>
// method, passing the node and the caller-supplied `data` pointer, and returns
// whatever the visitor returns.
#define MAKE_ACCEPT(Name) \
void* RegExp##Name::Accept(RegExpVisitor* visitor, void* data) { \
return visitor->Visit##Name(this, data); \
}
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ACCEPT)
#undef MAKE_ACCEPT
// Base-class versions of the per-type cast/test helpers: for every node type
// enumerated by FOR_EACH_REG_EXP_TREE_TYPE, RegExpTree::As<Name>() returns
// NULL and RegExpTree::Is<Name>() returns false.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExpTree::As##Name() { return NULL; } \
bool RegExpTree::Is##Name() const { return false; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Concrete-class versions of the per-type cast/test helpers: for every node
// type enumerated by FOR_EACH_REG_EXP_TREE_TYPE, RegExp<Name>::As<Name>()
// returns `this` and RegExp<Name>::Is<Name>() returns true.
// NOTE(review): presumably these override virtual RegExpTree members; the
// declarations live in the header, not here - confirm.
#define MAKE_TYPE_CASE(Name) \
RegExp##Name* RegExp##Name::As##Name() { return this; } \
bool RegExp##Name::Is##Name() const { return true; }
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_TYPE_CASE)
#undef MAKE_TYPE_CASE
// Returns the union of the CaptureRegisters() intervals of every tree in
// |children|. The result is Interval::Empty() when |children| has no entries.
static Interval ListCaptureRegisters(ZoneGrowableArray<RegExpTree*>* children) {
  Interval combined = Interval::Empty();
  const intptr_t count = children->length();
  for (intptr_t pos = 0; pos < count; ++pos) {
    RegExpTree* child = children->At(pos);
    combined = combined.Union(child->CaptureRegisters());
  }
  return combined;
}
// Union of the capture-register intervals of all nodes in this alternative.
Interval RegExpAlternative::CaptureRegisters() const {
  ZoneGrowableArray<RegExpTree*>* sequence = this->nodes();
  return ListCaptureRegisters(sequence);
}
// Union of the capture-register intervals of all alternatives of this
// disjunction.
Interval RegExpDisjunction::CaptureRegisters() const {
  ZoneGrowableArray<RegExpTree*>* branches = this->alternatives();
  return ListCaptureRegisters(branches);
}
// A lookaround contributes exactly the capture registers reported by its body.
Interval RegExpLookaround::CaptureRegisters() const {
  return this->body()->CaptureRegisters();
}
// Returns the interval spanning this capture's own start and end registers,
// widened by whatever registers the captured body reports.
Interval RegExpCapture::CaptureRegisters() const {
  Interval own_registers(StartRegister(index()), EndRegister(index()));
  Interval nested_registers = body()->CaptureRegisters();
  return own_registers.Union(nested_registers);
}
// A quantifier contributes exactly the capture registers reported by its body.
Interval RegExpQuantifier::CaptureRegisters() const {
  return this->body()->CaptureRegisters();
}
bool RegExpAssertion::IsAnchoredAtStart() const {
return assertion_type() == RegExpAssertion::START_OF_INPUT;
}
bool RegExpAssertion::IsAnchoredAtEnd() const {
return assertion_type() == RegExpAssertion::END_OF_INPUT;
}
bool RegExpAlternative::IsAnchoredAtStart() const {
ZoneGrowableArray<RegExpTree*>* nodes = this->nodes();
for (intptr_t i = 0; i < nodes->length(); i++) {
RegExpTree* node = nodes->At(i);
if (node->IsAnchoredAtStart()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpAlternative::IsAnchoredAtEnd() const {
ZoneGrowableArray<RegExpTree*>* nodes = this->nodes();
for (intptr_t i = nodes->length() - 1; i >= 0; i--) {
RegExpTree* node = nodes->At(i);
if (node->IsAnchoredAtEnd()) {
return true;
}
if (node->max_match() > 0) {
return false;
}
}
return false;
}
bool RegExpDisjunction::IsAnchoredAtStart() const {
ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
for (intptr_t i = 0; i < alternatives->length(); i++) {
if (!alternatives->At(i)->IsAnchoredAtStart()) return false;
}
return true;
}
bool RegExpDisjunction::IsAnchoredAtEnd() const {
ZoneGrowableArray<RegExpTree*>* alternatives = this->alternatives();
for (intptr_t i = 0; i < alternatives->length(); i++) {
if (!alternatives->At(i)->IsAnchoredAtEnd()) return false;
}
return true;
}
// A lookaround is anchored at the start only when it is a positive lookahead
// whose body is itself anchored at the start.
bool RegExpLookaround::IsAnchoredAtStart() const {
  if (!is_positive()) return false;
  if (type() != LOOKAHEAD) return false;
  return body()->IsAnchoredAtStart();
}
bool RegExpCapture::IsAnchoredAtStart() const {
return body()->IsAnchoredAtStart();
}
bool RegExpCapture::IsAnchoredAtEnd() const {
return body()->IsAnchoredAtEnd();
}
// Convert regular expression trees to a simple sexp representation.
// This representation should be different from the input grammar
// in as many cases as possible, to make it more difficult for incorrect
// parses to look like correct ones, which is likely if the input and
// output formats are alike.
// RegExpVisitor implementation whose Visit methods (defined later in this
// file) write a textual form of the visited tree through OS::PrintErr.
class RegExpUnparser : public RegExpVisitor {
public:
// Prints one character range; takes the range by value rather than a tree
// node pointer, so it is declared by hand instead of by the macro below.
void VisitCharacterRange(CharacterRange that);
// Declares one virtual Visit<Name>(RegExp<Name>*, void* data) per node type
// enumerated by FOR_EACH_REG_EXP_TREE_TYPE.
#define MAKE_CASE(Name) virtual void* Visit##Name(RegExp##Name*, void* data);
FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
#undef MAKE_CASE
};
// Prints a disjunction as "(| <alt> <alt> ...)", visiting each alternative in
// order with the same |data|. Always returns NULL.
void* RegExpUnparser::VisitDisjunction(RegExpDisjunction* that, void* data) {
  OS::PrintErr("(|");
  ZoneGrowableArray<RegExpTree*>* branches = that->alternatives();
  for (intptr_t pos = 0; pos < branches->length(); ++pos) {
    OS::PrintErr(" ");
    RegExpTree* branch = branches->At(pos);
    branch->Accept(this, data);
  }
  OS::PrintErr(")");
  return NULL;
}
// Prints an alternative as "(: <node> <node> ...)", visiting each node in
// order with the same |data|. Always returns NULL.
void* RegExpUnparser::VisitAlternative(RegExpAlternative* that, void* data) {
  OS::PrintErr("(:");
  ZoneGrowableArray<RegExpTree*>* sequence = that->nodes();
  for (intptr_t pos = 0; pos < sequence->length(); ++pos) {
    OS::PrintErr(" ");
    RegExpTree* element = sequence->At(pos);
    element->Accept(this, data);
  }
  OS::PrintErr(")");
  return NULL;
}
// Prints a range as "<from>" when it is a singleton, otherwise as
// "<from>-<to>".
void RegExpUnparser::VisitCharacterRange(CharacterRange that) {
  PrintUtf16(that.from());
  if (that.IsSingleton()) {
    return;
  }
  OS::PrintErr("-");
  PrintUtf16(that.to());
}
// Prints a character class as "[r1 r2 ...]", with a leading "^" when the class
// is negated. Ranges are separated by single spaces. Always returns NULL.
void* RegExpUnparser::VisitCharacterClass(RegExpCharacterClass* that,
                                          void* data) {
  if (that->is_negated()) {
    OS::PrintErr("^");
  }
  OS::PrintErr("[");
  for (intptr_t pos = 0; pos < that->ranges()->length(); ++pos) {
    // Separator goes between ranges, never before the first one.
    if (pos != 0) {
      OS::PrintErr(" ");
    }
    VisitCharacterRange((*that->ranges())[pos]);
  }
  OS::PrintErr("]");
  return NULL;
}
// Prints the marker for an assertion: "@^i"/"@$i" for start/end of input,
// "@^l"/"@$l" for start/end of line, "@b"/"@B" for (non-)boundary. Prints
// nothing for an assertion type not listed in the switch. Always returns NULL.
void* RegExpUnparser::VisitAssertion(RegExpAssertion* that, void* data) {
  const char* marker = NULL;
  switch (that->assertion_type()) {
    case RegExpAssertion::START_OF_INPUT:
      marker = "@^i";
      break;
    case RegExpAssertion::END_OF_INPUT:
      marker = "@$i";
      break;
    case RegExpAssertion::START_OF_LINE:
      marker = "@^l";
      break;
    case RegExpAssertion::END_OF_LINE:
      marker = "@$l";
      break;
    case RegExpAssertion::BOUNDARY:
      marker = "@b";
      break;
    case RegExpAssertion::NON_BOUNDARY:
      marker = "@B";
      break;
  }
  if (marker != NULL) {
    OS::PrintErr("%s", marker);
  }
  return NULL;
}
// Prints an atom as its UTF-16 code units, in order, wrapped in single quotes.
// Always returns NULL.
void* RegExpUnparser::VisitAtom(RegExpAtom* that, void* data) {
  OS::PrintErr("'");
  ZoneGrowableArray<uint16_t>* code_units = that->data();
  const intptr_t count = code_units->length();
  for (intptr_t pos = 0; pos < count; ++pos) {
    PrintUtf16(code_units->At(pos));
  }
  OS::PrintErr("'");
  return NULL;
}
// Prints a text node. A single-element text is printed as just that element's
// tree; otherwise the output is "(! <elem> <elem> ...)". Always returns NULL.
void* RegExpUnparser::VisitText(RegExpText* that, void* data) {
  const intptr_t count = that->elements()->length();
  if (count == 1) {
    // No wrapper for the single-element case.
    (*that->elements())[0].tree()->Accept(this, data);
    return NULL;
  }
  OS::PrintErr("(!");
  for (intptr_t pos = 0; pos < count; ++pos) {
    OS::PrintErr(" ");
    (*that->elements())[pos].tree()->Accept(this, data);
  }
  OS::PrintErr(")");
  return NULL;
}
// Prints a quantifier as "(# <min> <max> <mode> <body>)". <max> is "-" when it
// equals RegExpTree::kInfinity. <mode> is "g" if greedy, else "p" if
// possessive, else "n". Always returns NULL.
void* RegExpUnparser::VisitQuantifier(RegExpQuantifier* that, void* data) {
  OS::PrintErr("(# %" Pd " ", that->min());
  if (that->max() != RegExpTree::kInfinity) {
    OS::PrintErr("%" Pd " ", that->max());
  } else {
    OS::PrintErr("- ");
  }
  const char* mode = "n ";
  if (that->is_greedy()) {
    mode = "g ";
  } else if (that->is_possessive()) {
    mode = "p ";
  }
  OS::PrintErr("%s", mode);
  RegExpTree* repeated = that->body();
  repeated->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
// Prints a capture as "(^ <body>)". Always returns NULL.
void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
  OS::PrintErr("(^ ");
  RegExpTree* captured = that->body();
  captured->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
// Prints a lookaround as "(<dir> <sign> <body>)", where <dir> is "->" for a
// lookahead and "<-" otherwise, and <sign> is "+" for a positive assertion and
// "-" for a negative one. Always returns NULL.
//
// The opening parenthesis is emitted exactly once (as part of the format
// string) so that it pairs with the single closing parenthesis printed after
// the body; printing an additional "(" beforehand left the output unbalanced.
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
  OS::PrintErr("(%s %s",
               (that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"),
               (that->is_positive() ? "+ " : "- "));
  that->body()->Accept(this, data);
  OS::PrintErr(")");
  return NULL;
}
// Prints a back reference as "(<- <index>)". Always returns NULL.
void* RegExpUnparser::VisitBackReference(RegExpBackReference* that, void*) {
  const intptr_t referenced_index = that->index();
  OS::PrintErr("(<- %" Pd ")", referenced_index);
  return NULL;
}
// Prints the empty node as a single percent sign ("%%" is the escaped form in
// the format string). Always returns NULL.
void* RegExpUnparser::VisitEmpty(RegExpEmpty*, void*) {
  OS::PrintErr("%%");
  return NULL;
}
void RegExpTree::Print() {
RegExpUnparser unparser;
Accept(&unparser, NULL);
}
// Builds a disjunction over |alternatives| (at least two are required by the
// ASSERT). min_match_ becomes the smallest min_match() of any alternative and
// max_match_ the largest max_match().
RegExpDisjunction::RegExpDisjunction(
    ZoneGrowableArray<RegExpTree*>* alternatives)
    : alternatives_(alternatives) {
  ASSERT(alternatives->length() > 1);
  // Seed the bounds from the first alternative, then fold in the rest.
  RegExpTree* head = alternatives->At(0);
  intptr_t shortest = head->min_match();
  intptr_t longest = head->max_match();
  const intptr_t count = alternatives->length();
  for (intptr_t pos = 1; pos < count; ++pos) {
    RegExpTree* branch = alternatives->At(pos);
    shortest = Utils::Minimum(shortest, branch->min_match());
    longest = Utils::Maximum(longest, branch->max_match());
  }
  min_match_ = shortest;
  max_match_ = longest;
}
// Adds |increase| to |previous|, saturating at RegExpTree::kInfinity: if the
// gap between |previous| and kInfinity is smaller than |increase|, kInfinity
// is returned instead of the sum.
static intptr_t IncreaseBy(intptr_t previous, intptr_t increase) {
  const intptr_t headroom = RegExpTree::kInfinity - previous;
  if (increase <= headroom) {
    return previous + increase;
  }
  return RegExpTree::kInfinity;
}
// Builds an alternative (sequence) over |nodes| (at least two are required by
// the ASSERT). min_match_ and max_match_ are the sums of the nodes' min_match()
// and max_match() values, accumulated with IncreaseBy.
RegExpAlternative::RegExpAlternative(ZoneGrowableArray<RegExpTree*>* nodes)
    : nodes_(nodes) {
  ASSERT(nodes->length() > 1);
  intptr_t total_min = 0;
  intptr_t total_max = 0;
  const intptr_t count = nodes->length();
  for (intptr_t pos = 0; pos < count; ++pos) {
    RegExpTree* part = nodes->At(pos);
    total_min = IncreaseBy(total_min, part->min_match());
    total_max = IncreaseBy(total_max, part->max_match());
  }
  min_match_ = total_min;
  max_match_ = total_max;
}
} // namespace dart