[VM] Adding regexp lookbehind assertion support.

See https://github.com/tc39/proposal-regexp-lookbehind
for a high-level description of the feature and examples.  This is one of the
features requested in https://github.com/dart-lang/sdk/issues/34935.

This work takes the feature as present in the v8 engine and appropriately
merges it into our irregexp fork. Notable changes to the irregexp codebase to
introduce this feature:

-----

We can no longer assume that all matching proceeds forwards, since lookbehind
matching proceeds backwards. Similarly, we cannot assume that we can only be
at the start of a string if we started matching from that point. The direction
of matching must also be taken into consideration when doing bounds checking,
which previously assumed the engine would never attempt to look before the
start of a string.

-----

We may now parse backreferences to captures before the capture they
reference, since we parse regular expressions left to right, but lookbehinds
perform captures as they evaluate the string from right to left.  Since
RegExpBackReference objects contain a pointer to their corresponding capture,
this means that we may need to create RegExpCapture objects prior to the
parsing of the corresponding captured subexpression.

Thus, RegExpCapture objects are now only initialized with their index, and the
body is set later when the subexpression is encountered and parsed. This means
any method that operates on the body of a RegExpCapture can no longer be const,
which also affects the rest of the RegExpTree class hierarchy. This also means
that we don't have a valid max_match length for backreferences based off the
capture body, and must assume they can end up being any length.

-----


Change-Id: Iffe0e71b17b1a0c6fea77235e8aee5c093005811
Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/94540
Commit-Queue: Stevie Strickland <sstrickl@google.com>
Reviewed-by: Martin Kustermann <kustermann@google.com>
This commit is contained in:
Stevie Strickland 2019-03-14 14:26:47 +00:00 committed by commit-bot@chromium.org
parent dfb6d250b1
commit f31b7928ae
14 changed files with 1013 additions and 264 deletions

View file

@ -317,6 +317,8 @@ class RegExpCompiler : public ValueObject {
inline bool ignore_case() { return ignore_case_; }
inline bool one_byte() const { return is_one_byte_; }
bool read_backward() { return read_backward_; }
void set_read_backward(bool value) { read_backward_ = value; }
FrequencyCollator* frequency_collator() { return &frequency_collator_; }
intptr_t current_expansion_factor() { return current_expansion_factor_; }
@ -337,6 +339,7 @@ class RegExpCompiler : public ValueObject {
bool ignore_case_;
bool is_one_byte_;
bool reg_exp_too_big_;
bool read_backward_;
intptr_t current_expansion_factor_;
FrequencyCollator frequency_collator_;
Zone* zone_;
@ -368,6 +371,7 @@ RegExpCompiler::RegExpCompiler(intptr_t capture_count,
ignore_case_(ignore_case),
is_one_byte_(is_one_byte),
reg_exp_too_big_(false),
read_backward_(false),
current_expansion_factor_(1),
zone_(Thread::Current()->zone()) {
accept_ = new (Z) EndNode(EndNode::ACCEPT, Z);
@ -520,7 +524,8 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
intptr_t value = 0;
bool absolute = false;
bool clear = false;
intptr_t store_position = -1;
static const intptr_t kNoStore = kMinInt32;
intptr_t store_position = kNoStore;
// This is a little tricky because we are scanning the actions in reverse
// historical order (newest first).
for (DeferredAction* action = actions_; action != NULL;
@ -540,7 +545,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
// can set undo_action to ACTION_IGNORE if we know there is no
// value to restore.
undo_action = ACTION_RESTORE;
ASSERT(store_position == -1);
ASSERT(store_position == kNoStore);
ASSERT(!clear);
break;
}
@ -548,14 +553,14 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
if (!absolute) {
value++;
}
ASSERT(store_position == -1);
ASSERT(store_position == kNoStore);
ASSERT(!clear);
undo_action = ACTION_RESTORE;
break;
case ActionNode::STORE_POSITION: {
Trace::DeferredCapture* pc =
static_cast<Trace::DeferredCapture*>(action);
if (!clear && store_position == -1) {
if (!clear && store_position == kNoStore) {
store_position = pc->cp_offset();
}
@ -579,7 +584,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
// Since we're scanning in reverse order, if we've already
// set the position we have to ignore historically earlier
// clearing operations.
if (store_position == -1) {
if (store_position == kNoStore) {
clear = true;
}
undo_action = ACTION_RESTORE;
@ -602,7 +607,7 @@ void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
}
// Perform the chronologically last action (or accumulated increment)
// for the register.
if (store_position != -1) {
if (store_position != kNoStore) {
assembler->WriteCurrentPositionToRegister(reg, store_position);
} else if (clear) {
assembler->ClearRegisters(reg, reg);
@ -1494,6 +1499,7 @@ void AssertionNode::FillInBMInfo(intptr_t offset,
intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
intptr_t budget,
bool not_at_start) {
if (read_backward()) return 0;
if (budget <= 0) return 0;
return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
}
@ -1501,6 +1507,7 @@ intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
intptr_t budget,
bool not_at_start) {
if (read_backward()) return 0;
intptr_t answer = Length();
if (answer >= still_to_find) return answer;
if (budget <= 0) return answer;
@ -1509,9 +1516,9 @@ intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
on_success()->EatsAtLeast(still_to_find - answer, budget - 1, true);
}
intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find,
intptr_t budget,
bool not_at_start) {
intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
intptr_t budget,
bool not_at_start) {
if (budget <= 0) return 0;
// Alternative 0 is the negative lookahead, alternative 1 is what comes
// afterwards.
@ -1519,7 +1526,7 @@ intptr_t NegativeLookaheadChoiceNode::EatsAtLeast(intptr_t still_to_find,
return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
}
void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
QuickCheckDetails* details,
RegExpCompiler* compiler,
intptr_t filled_in,
@ -1682,6 +1689,9 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
ASSERT(details->characters() == 1 ||
(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
#endif
// Do not collect any quick check details if the text node reads backward,
// since it reads in the opposite direction than we use for quick checks.
if (read_backward()) return;
ASSERT(characters_filled_in < details->characters());
intptr_t characters = details->characters();
intptr_t char_mask;
@ -1838,8 +1848,9 @@ void QuickCheckDetails::Clear() {
}
void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
ASSERT(by >= 0);
if (by >= characters_) {
if (by >= characters_ || by < 0) {
// check that by < 0 => characters_ == 0
ASSERT(by >= 0 || characters_ == 0);
Clear();
return;
}
@ -2060,8 +2071,8 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) {
return this;
}
RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(intptr_t depth,
bool ignore_case) {
RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth,
bool ignore_case) {
if (info()->replacement_calculated) return replacement();
if (depth < 0) return this;
if (info()->visited) return this;
@ -2294,9 +2305,9 @@ void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
return;
}
if (trace->at_start() == Trace::UNKNOWN) {
assembler->CheckNotAtStart(trace->backtrack());
assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
Trace at_start_trace = *trace;
at_start_trace.set_at_start(true);
at_start_trace.set_at_start(Trace::TRUE_VALUE);
on_success()->Emit(compiler, &at_start_trace);
return;
}
@ -2365,9 +2376,10 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
BlockLabel* backtrack = trace->backtrack();
QuickCheckDetails* quick_check = trace->quick_check_performed();
intptr_t element_count = elms_->length();
intptr_t backward_offset = read_backward() ? -Length() : 0;
for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
TextElement elm = elms_->At(i);
intptr_t cp_offset = trace->cp_offset() + elm.cp_offset();
intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
if (elm.text_type() == TextElement::ATOM) {
ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
@ -2395,9 +2407,11 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
break;
}
if (emit_function != NULL) {
bool bound_checked = emit_function(
Z, compiler, quarks->At(j), backtrack, cp_offset + j,
*checked_up_to < cp_offset + j, preloaded);
const bool bounds_check =
*checked_up_to < (cp_offset + j) || read_backward();
bool bound_checked =
emit_function(Z, compiler, quarks->At(j), backtrack,
cp_offset + j, bounds_check, preloaded);
if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
}
}
@ -2407,8 +2421,9 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler,
if (first_element_checked && i == 0) continue;
if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
RegExpCharacterClass* cc = elm.char_class();
bool bounds_check = *checked_up_to < cp_offset || read_backward();
EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
*checked_up_to < cp_offset, preloaded, Z);
bounds_check, preloaded, Z);
UpdateBoundsCheck(cp_offset, checked_up_to);
}
}
@ -2475,8 +2490,11 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
}
Trace successor_trace(*trace);
successor_trace.set_at_start(false);
successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
// If we advance backward, we may end up at the start.
successor_trace.AdvanceCurrentPositionInTrace(
read_backward() ? -Length() : Length(), compiler);
successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
: Trace::FALSE_VALUE);
RecursionCheck rc(compiler);
on_success()->Emit(compiler, &successor_trace);
}
@ -2487,7 +2505,6 @@ void Trace::InvalidateCurrentCharacter() {
void Trace::AdvanceCurrentPositionInTrace(intptr_t by,
RegExpCompiler* compiler) {
ASSERT(by > 0);
// We don't have an instruction for shifting the current character register
// down or for using a shifted value for anything so lets just forget that
// we preloaded any characters into it.
@ -2530,6 +2547,7 @@ intptr_t TextNode::GreedyLoopTextLength() {
RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
RegExpCompiler* compiler) {
if (read_backward()) return nullptr;
if (elms_->length() != 1) return NULL;
TextElement elm = elms_->At(0);
if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
@ -2574,7 +2592,7 @@ intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative(
SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
node = seq_node->on_success();
}
return length;
return read_backward() ? -length : length;
}
void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
@ -3002,7 +3020,7 @@ void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
GreedyLoopState::GreedyLoopState(bool not_at_start) {
counter_backtrack_trace_.set_backtrack(&label_);
if (not_at_start) counter_backtrack_trace_.set_at_start(false);
if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
}
void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
@ -3115,7 +3133,7 @@ Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
macro_assembler->PushCurrentPosition();
BlockLabel greedy_match_failed;
Trace greedy_match_trace;
if (not_at_start()) greedy_match_trace.set_at_start(false);
if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
greedy_match_trace.set_backtrack(&greedy_match_failed);
BlockLabel loop_label;
macro_assembler->BindBlock(&loop_label);
@ -3446,10 +3464,15 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
ASSERT(start_reg_ + 1 == end_reg_);
if (compiler->ignore_case()) {
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, trace->backtrack());
assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
trace->backtrack());
} else {
assembler->CheckNotBackReference(start_reg_, trace->backtrack());
assembler->CheckNotBackReference(start_reg_, read_backward(),
trace->backtrack());
}
// We are going to advance backward, so we may end up at the start.
if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
on_success()->Emit(compiler, trace);
}
@ -3694,7 +3717,7 @@ RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
ZoneGrowableArray<TextElement>* elms =
new (OZ) ZoneGrowableArray<TextElement>(1);
elms->Add(TextElement::Atom(this));
return new (OZ) TextNode(elms, on_success);
return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
}
RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
@ -3704,7 +3727,7 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
for (intptr_t i = 0; i < elements()->length(); i++) {
elms->Add(elements()->At(i));
}
return new (OZ) TextNode(elms, on_success);
return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
}
static bool CompareInverseRanges(ZoneGrowableArray<CharacterRange>* ranges,
@ -3795,7 +3818,7 @@ bool RegExpCharacterClass::is_standard() {
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return new (OZ) TextNode(this, on_success);
return new (OZ) TextNode(this, compiler->read_backward(), on_success);
}
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
@ -3931,7 +3954,9 @@ RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
GuardedAlternative(body->ToNode(compiler, answer)));
}
answer = alternation;
if (not_at_start) alternation->set_not_at_start();
if (not_at_start && !compiler->read_backward()) {
alternation->set_not_at_start();
}
}
return answer;
}
@ -3942,9 +3967,9 @@ RegExpNode* RegExpQuantifier::ToNode(intptr_t min,
bool needs_counter = has_min || has_max;
intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
: RegExpCompiler::kNoRegister;
LoopChoiceNode* center =
new (zone) LoopChoiceNode(body->min_match() == 0, zone);
if (not_at_start) center->set_not_at_start();
LoopChoiceNode* center = new (zone)
LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
RegExpNode* loop_return =
needs_counter ? static_cast<RegExpNode*>(
ActionNode::IncrementRegister(reg_ctr, center))
@ -4015,12 +4040,13 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
new ZoneGrowableArray<CharacterRange>(3);
CharacterRange::AddClassEscape('n', newline_ranges);
RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
TextNode* newline_matcher = new TextNode(
newline_atom, ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register,
0, // No captures inside.
-1, // Ignored if no captures.
on_success));
TextNode* newline_matcher =
new TextNode(newline_atom, /*read_backwards=*/false,
ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register,
0, // No captures inside.
-1, // Ignored if no captures.
on_success));
// Create an end-of-input matcher.
RegExpNode* end_of_line = ActionNode::BeginSubmatch(
stack_pointer_register, position_register, newline_matcher);
@ -4039,9 +4065,9 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
return new (OZ)
BackReferenceNode(RegExpCapture::StartRegister(index()),
RegExpCapture::EndRegister(index()), on_success);
return new (OZ) BackReferenceNode(RegExpCapture::StartRegister(index()),
RegExpCapture::EndRegister(index()),
compiler->read_backward(), on_success);
}
RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
@ -4049,8 +4075,47 @@ RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
return on_success;
}
RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
RegExpLookaround::Builder::Builder(bool is_positive,
RegExpNode* on_success,
intptr_t stack_pointer_register,
intptr_t position_register,
intptr_t capture_register_count,
intptr_t capture_register_start)
: is_positive_(is_positive),
on_success_(on_success),
stack_pointer_register_(stack_pointer_register),
position_register_(position_register) {
if (is_positive_) {
on_match_success_ = ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register, capture_register_count,
capture_register_start, on_success);
} else {
on_match_success_ = new (OZ) NegativeSubmatchSuccess(
stack_pointer_register, position_register, capture_register_count,
capture_register_start, OZ);
}
}
RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
if (is_positive_) {
return ActionNode::BeginSubmatch(stack_pointer_register_,
position_register_, match);
} else {
Zone* zone = on_success_->zone();
// We use a ChoiceNode to represent the negative lookaround. The first
// alternative is the negative match. On success, the end node backtracks.
// On failure, the second alternative is tried and leads to success.
// NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
// first exit when calculating quick checks.
ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
GuardedAlternative(match), GuardedAlternative(on_success_), zone);
return ActionNode::BeginSubmatch(stack_pointer_register_,
position_register_, choice_node);
}
}
RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
intptr_t stack_pointer_register = compiler->AllocateRegister();
intptr_t position_register = compiler->AllocateRegister();
@ -4060,36 +4125,15 @@ RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
intptr_t register_start =
register_of_first_capture + capture_from_ * registers_per_capture;
RegExpNode* success;
if (is_positive()) {
RegExpNode* node = ActionNode::BeginSubmatch(
stack_pointer_register, position_register,
body()->ToNode(compiler,
ActionNode::PositiveSubmatchSuccess(
stack_pointer_register, position_register,
register_count, register_start, on_success)));
return node;
} else {
// We use a ChoiceNode for a negative lookahead because it has most of
// the characteristics we need. It has the body of the lookahead as its
// first alternative and the expression after the lookahead of the second
// alternative. If the first alternative succeeds then the
// NegativeSubmatchSuccess will unwind the stack including everything the
// choice node set up and backtrack. If the first alternative fails then
// the second alternative is tried, which is exactly the desired result
// for a negative lookahead. The NegativeLookaheadChoiceNode is a special
// ChoiceNode that knows to ignore the first exit when calculating quick
// checks.
GuardedAlternative body_alt(
body()->ToNode(compiler, success = new (OZ) NegativeSubmatchSuccess(
stack_pointer_register, position_register,
register_count, register_start, OZ)));
ChoiceNode* choice_node = new (OZ) NegativeLookaheadChoiceNode(
body_alt, GuardedAlternative(on_success), OZ);
return ActionNode::BeginSubmatch(stack_pointer_register, position_register,
choice_node);
}
RegExpNode* result;
bool was_reading_backward = compiler->read_backward();
compiler->set_read_backward(type() == LOOKBEHIND);
Builder builder(is_positive(), on_success, stack_pointer_register,
position_register, register_count, register_start);
RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
result = builder.ForMatch(match);
compiler->set_read_backward(was_reading_backward);
return result;
}
RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
@ -4101,8 +4145,14 @@ RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
intptr_t index,
RegExpCompiler* compiler,
RegExpNode* on_success) {
ASSERT(body != nullptr);
intptr_t start_reg = RegExpCapture::StartRegister(index);
intptr_t end_reg = RegExpCapture::EndRegister(index);
if (compiler->read_backward()) {
intptr_t tmp = end_reg;
end_reg = start_reg;
start_reg = tmp;
}
RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
RegExpNode* body_node = body->ToNode(compiler, store_end);
return ActionNode::StorePosition(start_reg, true, body_node);
@ -4112,8 +4162,14 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneGrowableArray<RegExpTree*>* children = nodes();
RegExpNode* current = on_success;
for (intptr_t i = children->length() - 1; i >= 0; i--) {
current = children->At(i)->ToNode(compiler, current);
if (compiler->read_backward()) {
for (intptr_t i = 0; i < children->length(); i++) {
current = children->At(i)->ToNode(compiler, current);
}
} else {
for (intptr_t i = children->length() - 1; i >= 0; i--) {
current = children->At(i)->ToNode(compiler, current);
}
}
return current;
}
@ -4686,8 +4742,9 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR(
// at the start of input.
ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
new (zone) RegExpCharacterClass('*'), loop_node)));
first_step_node->AddAlternative(GuardedAlternative(
new (zone) TextNode(new (zone) RegExpCharacterClass('*'),
/*read_backwards=*/false, loop_node)));
node = first_step_node;
} else {
node = loop_node;
@ -4789,8 +4846,9 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode(
// at the start of input.
ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
new (zone) RegExpCharacterClass('*'), loop_node)));
first_step_node->AddAlternative(GuardedAlternative(
new (zone) TextNode(new (zone) RegExpCharacterClass('*'),
/*read_backwards=*/false, loop_node)));
node = first_step_node;
} else {
node = loop_node;

View file

@ -121,7 +121,7 @@ class OutSet : public ZoneAllocated {
VISIT(Atom) \
VISIT(Quantifier) \
VISIT(Capture) \
VISIT(Lookahead) \
VISIT(Lookaround) \
VISIT(BackReference) \
VISIT(Empty) \
VISIT(Text)
@ -549,11 +549,16 @@ class ActionNode : public SeqRegExpNode {
class TextNode : public SeqRegExpNode {
public:
TextNode(ZoneGrowableArray<TextElement>* elms, RegExpNode* on_success)
: SeqRegExpNode(on_success), elms_(elms) {}
TextNode(RegExpCharacterClass* that, RegExpNode* on_success)
TextNode(ZoneGrowableArray<TextElement>* elms,
bool read_backward,
RegExpNode* on_success)
: SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
TextNode(RegExpCharacterClass* that,
bool read_backward,
RegExpNode* on_success)
: SeqRegExpNode(on_success),
elms_(new (zone()) ZoneGrowableArray<TextElement>(1)) {
elms_(new (zone()) ZoneGrowableArray<TextElement>(1)),
read_backward_(read_backward) {
elms_->Add(TextElement::CharClass(that));
}
virtual void Accept(NodeVisitor* visitor);
@ -566,6 +571,7 @@ class TextNode : public SeqRegExpNode {
intptr_t characters_filled_in,
bool not_at_start);
ZoneGrowableArray<TextElement>* elements() { return elms_; }
bool read_backward() { return read_backward_; }
void MakeCaseIndependent(bool is_one_byte);
virtual intptr_t GreedyLoopTextLength();
virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
@ -596,6 +602,7 @@ class TextNode : public SeqRegExpNode {
intptr_t* checked_up_to);
intptr_t Length();
ZoneGrowableArray<TextElement>* elms_;
bool read_backward_;
};
class AssertionNode : public SeqRegExpNode {
@ -652,11 +659,16 @@ class BackReferenceNode : public SeqRegExpNode {
public:
BackReferenceNode(intptr_t start_reg,
intptr_t end_reg,
bool read_backward,
RegExpNode* on_success)
: SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg) {}
: SeqRegExpNode(on_success),
start_reg_(start_reg),
end_reg_(end_reg),
read_backward_(read_backward) {}
virtual void Accept(NodeVisitor* visitor);
intptr_t start_register() { return start_reg_; }
intptr_t end_register() { return end_reg_; }
bool read_backward() { return read_backward_; }
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
virtual intptr_t EatsAtLeast(intptr_t still_to_find,
intptr_t recursion_depth,
@ -675,6 +687,7 @@ class BackReferenceNode : public SeqRegExpNode {
private:
intptr_t start_reg_;
intptr_t end_reg_;
bool read_backward_;
};
class EndNode : public RegExpNode {
@ -799,6 +812,7 @@ class ChoiceNode : public RegExpNode {
return true;
}
virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case);
virtual bool read_backward() { return false; }
protected:
intptr_t GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
@ -840,11 +854,11 @@ class ChoiceNode : public RegExpNode {
bool being_calculated_;
};
class NegativeLookaheadChoiceNode : public ChoiceNode {
class NegativeLookaroundChoiceNode : public ChoiceNode {
public:
explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail,
GuardedAlternative then_do_this,
Zone* zone)
explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
GuardedAlternative then_do_this,
Zone* zone)
: ChoiceNode(2, zone) {
AddAlternative(this_must_fail);
AddAlternative(then_do_this);
@ -877,11 +891,14 @@ class NegativeLookaheadChoiceNode : public ChoiceNode {
class LoopChoiceNode : public ChoiceNode {
public:
explicit LoopChoiceNode(bool body_can_be_zero_length, Zone* zone)
explicit LoopChoiceNode(bool body_can_be_zero_length,
bool read_backward,
Zone* zone)
: ChoiceNode(2, zone),
loop_node_(NULL),
continue_node_(NULL),
body_can_be_zero_length_(body_can_be_zero_length) {}
body_can_be_zero_length_(body_can_be_zero_length),
read_backward_(read_backward) {}
void AddLoopAlternative(GuardedAlternative alt);
void AddContinueAlternative(GuardedAlternative alt);
virtual void Emit(RegExpCompiler* compiler, Trace* trace);
@ -899,6 +916,7 @@ class LoopChoiceNode : public ChoiceNode {
RegExpNode* loop_node() { return loop_node_; }
RegExpNode* continue_node() { return continue_node_; }
bool body_can_be_zero_length() { return body_can_be_zero_length_; }
virtual bool read_backward() { return read_backward_; }
virtual void Accept(NodeVisitor* visitor);
virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case);
@ -913,6 +931,7 @@ class LoopChoiceNode : public ChoiceNode {
RegExpNode* loop_node_;
RegExpNode* continue_node_;
bool body_can_be_zero_length_;
bool read_backward_;
};
// Improve the speed that we scan for an initial point where a non-anchored
@ -1161,9 +1180,7 @@ class Trace {
quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
}
TriBool at_start() { return at_start_; }
void set_at_start(bool at_start) {
at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
}
void set_at_start(TriBool at_start) { at_start_ = at_start; }
BlockLabel* backtrack() { return backtrack_; }
BlockLabel* loop_label() { return loop_label_; }
RegExpNode* stop_node() { return stop_node_; }

View file

@ -120,10 +120,13 @@ class RegExpMacroAssembler : public ZoneAllocated {
virtual void CheckCharacterGT(uint16_t limit, BlockLabel* on_greater) = 0;
virtual void CheckCharacterLT(uint16_t limit, BlockLabel* on_less) = 0;
virtual void CheckGreedyLoop(BlockLabel* on_tos_equals_current_position) = 0;
virtual void CheckNotAtStart(BlockLabel* on_not_at_start) = 0;
virtual void CheckNotAtStart(intptr_t cp_offset,
BlockLabel* on_not_at_start) = 0;
virtual void CheckNotBackReference(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match) = 0;
virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match) = 0;
// Check the current character for a match with a literal character. If we
// fail to match then goto the on_failure label. End of input always

View file

@ -246,8 +246,9 @@ void BytecodeRegExpMacroAssembler::CheckAtStart(BlockLabel* on_at_start) {
}
void BytecodeRegExpMacroAssembler::CheckNotAtStart(
intptr_t cp_offset,
BlockLabel* on_not_at_start) {
Emit(BC_CHECK_NOT_AT_START, 0);
Emit(BC_CHECK_NOT_AT_START, cp_offset);
EmitOrLink(on_not_at_start);
}
@ -336,19 +337,24 @@ void BytecodeRegExpMacroAssembler::CheckBitInTable(const TypedData& table,
void BytecodeRegExpMacroAssembler::CheckNotBackReference(
intptr_t start_reg,
bool read_backward,
BlockLabel* on_not_equal) {
ASSERT(start_reg >= 0);
ASSERT(start_reg <= kMaxRegister);
Emit(BC_CHECK_NOT_BACK_REF, start_reg);
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_BACKWARD : BC_CHECK_NOT_BACK_REF,
start_reg);
EmitOrLink(on_not_equal);
}
void BytecodeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(
intptr_t start_reg,
bool read_backward,
BlockLabel* on_not_equal) {
ASSERT(start_reg >= 0);
ASSERT(start_reg <= kMaxRegister);
Emit(BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg);
Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD
: BC_CHECK_NOT_BACK_REF_NO_CASE,
start_reg);
EmitOrLink(on_not_equal);
}

View file

@ -62,7 +62,7 @@ class BytecodeRegExpMacroAssembler : public RegExpMacroAssembler {
virtual void CheckCharacterLT(uint16_t limit, BlockLabel* on_less);
virtual void CheckGreedyLoop(BlockLabel* on_tos_equals_current_position);
virtual void CheckAtStart(BlockLabel* on_at_start);
virtual void CheckNotAtStart(BlockLabel* on_not_at_start);
virtual void CheckNotAtStart(intptr_t cp_offset, BlockLabel* on_not_at_start);
virtual void CheckNotCharacter(unsigned c, BlockLabel* on_not_equal);
virtual void CheckNotCharacterAfterAnd(unsigned c,
unsigned mask,
@ -79,8 +79,10 @@ class BytecodeRegExpMacroAssembler : public RegExpMacroAssembler {
BlockLabel* on_not_in_range);
virtual void CheckBitInTable(const TypedData& table, BlockLabel* on_bit_set);
virtual void CheckNotBackReference(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match);
virtual void IfRegisterLT(intptr_t register_index,
intptr_t comparand,

View file

@ -770,38 +770,27 @@ void IRRegExpMacroAssembler::CheckCharacterGT(uint16_t limit,
void IRRegExpMacroAssembler::CheckAtStart(BlockLabel* on_at_start) {
TAG();
BlockLabel not_at_start;
// Did we start the match at the start of the string at all?
BranchOrBacktrack(
Comparison(kNE, LoadLocal(start_index_param_), Uint64Constant(0)),
&not_at_start);
// If we did, are we still at the start of the input, i.e. is
// (offset == string_length * -1)?
// Are we at the start of the input, i.e. is (offset == string_length * -1)?
Definition* neg_len_def =
InstanceCall(InstanceCallDescriptor::FromToken(Token::kNEGATE),
PushLocal(string_param_length_));
Definition* offset_def = LoadLocal(current_position_);
BranchOrBacktrack(Comparison(kEQ, neg_len_def, offset_def), on_at_start);
BindBlock(&not_at_start);
}
void IRRegExpMacroAssembler::CheckNotAtStart(BlockLabel* on_not_at_start) {
// cp_offset => offset from the current (character) pointer
// This offset may be negative due to traversing backwards during lookbehind.
void IRRegExpMacroAssembler::CheckNotAtStart(intptr_t cp_offset,
BlockLabel* on_not_at_start) {
TAG();
// Did we start the match at the start of the string at all?
BranchOrBacktrack(
Comparison(kNE, LoadLocal(start_index_param_), Uint64Constant(0)),
on_not_at_start);
// If we did, are we still at the start of the input, i.e. is
// (offset == string_length * -1)?
Definition* neg_len_def =
InstanceCall(InstanceCallDescriptor::FromToken(Token::kNEGATE),
PushLocal(string_param_length_));
Definition* offset_def = LoadLocal(current_position_);
// Are we at the start of the input, i.e. is (offset == string_length * -1)?
auto offset_def =
PushArgument(Bind(Add(PushLocal(current_position_),
PushArgument(Bind(Int64Constant(cp_offset))))));
auto neg_len_def = PushArgument(
Bind(InstanceCall(InstanceCallDescriptor::FromToken(Token::kNEGATE),
PushLocal(string_param_length_))));
BranchOrBacktrack(Comparison(kNE, neg_len_def, offset_def), on_not_at_start);
}
@ -832,6 +821,7 @@ void IRRegExpMacroAssembler::CheckGreedyLoop(BlockLabel* on_equal) {
void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(
intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match) {
TAG();
ASSERT(start_reg + 1 <= registers_count_);
@ -857,20 +847,38 @@ void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(
Comparison(kEQ, LoadLocal(capture_length_), Uint64Constant(0)),
&fallthrough);
// Check that there are sufficient characters left in the input.
PushArgumentInstr* pos_push = PushLocal(current_position_);
PushArgumentInstr* len_push = PushLocal(capture_length_);
BranchOrBacktrack(
Comparison(kGT,
InstanceCall(InstanceCallDescriptor::FromToken(Token::kADD),
pos_push, len_push),
Uint64Constant(0)),
on_no_match);
PushArgumentInstr* pos_push = nullptr;
PushArgumentInstr* len_push = nullptr;
if (!read_backward) {
// Check that there are sufficient characters left in the input.
pos_push = PushLocal(current_position_);
len_push = PushLocal(capture_length_);
BranchOrBacktrack(
Comparison(kGT,
InstanceCall(InstanceCallDescriptor::FromToken(Token::kADD),
pos_push, len_push),
Uint64Constant(0)),
on_no_match);
}
pos_push = PushLocal(current_position_);
len_push = PushLocal(string_param_length_);
StoreLocal(match_start_index_, Bind(Add(pos_push, len_push)));
if (read_backward) {
// First check that there are enough characters before this point in
// the string that we can match the backreference.
BranchOrBacktrack(Comparison(kLT, LoadLocal(match_start_index_),
LoadLocal(capture_length_)),
on_no_match);
// The string to check is before the current position, not at it.
pos_push = PushLocal(match_start_index_);
len_push = PushLocal(capture_length_);
StoreLocal(match_start_index_, Bind(Sub(pos_push, len_push)));
}
pos_push = PushArgument(LoadRegister(start_reg));
len_push = PushLocal(string_param_length_);
StoreLocal(capture_start_index_, Bind(Add(pos_push, len_push)));
@ -970,15 +978,23 @@ void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase(
BindBlock(&success);
// Move current character position to position after match.
PushArgumentInstr* match_end_push = PushLocal(match_end_index_);
len_push = PushLocal(string_param_length_);
StoreLocal(current_position_, Bind(Sub(match_end_push, len_push)));
if (read_backward) {
// Move current character position to start of match.
pos_push = PushLocal(current_position_);
len_push = PushLocal(capture_length_);
StoreLocal(current_position_, Bind(Sub(pos_push, len_push)));
} else {
// Move current character position to position after match.
PushArgumentInstr* match_end_push = PushLocal(match_end_index_);
len_push = PushLocal(string_param_length_);
StoreLocal(current_position_, Bind(Sub(match_end_push, len_push)));
}
BindBlock(&fallthrough);
}
void IRRegExpMacroAssembler::CheckNotBackReference(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match) {
TAG();
ASSERT(start_reg + 1 <= registers_count_);
@ -1001,21 +1017,39 @@ void IRRegExpMacroAssembler::CheckNotBackReference(intptr_t start_reg,
Comparison(kEQ, LoadLocal(capture_length_), Uint64Constant(0)),
&fallthrough);
// Check that there are sufficient characters left in the input.
PushArgumentInstr* pos_push = PushLocal(current_position_);
PushArgumentInstr* len_push = PushLocal(capture_length_);
BranchOrBacktrack(
Comparison(kGT,
InstanceCall(InstanceCallDescriptor::FromToken(Token::kADD),
pos_push, len_push),
Uint64Constant(0)),
on_no_match);
PushArgumentInstr* pos_push = nullptr;
PushArgumentInstr* len_push = nullptr;
if (!read_backward) {
// Check that there are sufficient characters left in the input.
pos_push = PushLocal(current_position_);
len_push = PushLocal(capture_length_);
BranchOrBacktrack(
Comparison(kGT,
InstanceCall(InstanceCallDescriptor::FromToken(Token::kADD),
pos_push, len_push),
Uint64Constant(0)),
on_no_match);
}
// Compute pointers to match string and capture string.
pos_push = PushLocal(current_position_);
len_push = PushLocal(string_param_length_);
StoreLocal(match_start_index_, Bind(Add(pos_push, len_push)));
if (read_backward) {
// First check that there are enough characters before this point in
// the string that we can match the backreference.
BranchOrBacktrack(Comparison(kLT, LoadLocal(match_start_index_),
LoadLocal(capture_length_)),
on_no_match);
// The string to check is before the current position, not at it.
pos_push = PushLocal(match_start_index_);
len_push = PushLocal(capture_length_);
StoreLocal(match_start_index_, Bind(Sub(pos_push, len_push)));
}
pos_push = PushArgument(LoadRegister(start_reg));
len_push = PushLocal(string_param_length_);
StoreLocal(capture_start_index_, Bind(Add(pos_push, len_push)));
@ -1050,10 +1084,17 @@ void IRRegExpMacroAssembler::CheckNotBackReference(intptr_t start_reg,
BindBlock(&success);
// Move current character position to position after match.
PushArgumentInstr* match_end_push = PushLocal(match_end_index_);
len_push = PushLocal(string_param_length_);
StoreLocal(current_position_, Bind(Sub(match_end_push, len_push)));
if (read_backward) {
// Move current character position to start of match.
pos_push = PushLocal(current_position_);
len_push = PushLocal(capture_length_);
StoreLocal(current_position_, Bind(Sub(pos_push, len_push)));
} else {
// Move current character position to position after match.
PushArgumentInstr* match_end_push = PushLocal(match_end_index_);
len_push = PushLocal(string_param_length_);
StoreLocal(current_position_, Bind(Sub(match_end_push, len_push)));
}
BindBlock(&fallthrough);
}
@ -1361,10 +1402,13 @@ void IRRegExpMacroAssembler::LoadCurrentCharacter(intptr_t cp_offset,
bool check_bounds,
intptr_t characters) {
TAG();
ASSERT(cp_offset >= -1); // ^ and \b can look behind one character.
ASSERT(cp_offset < (1 << 30)); // Be sane! (And ensure negation works)
if (check_bounds) {
CheckPosition(cp_offset + characters - 1, on_end_of_input);
if (cp_offset >= 0) {
CheckPosition(cp_offset + characters - 1, on_end_of_input);
} else {
CheckPosition(cp_offset, on_end_of_input);
}
}
LoadCurrentCharacterUnchecked(cp_offset, characters);
}
@ -1594,13 +1638,24 @@ void IRRegExpMacroAssembler::WriteStackPointerToRegister(intptr_t reg) {
void IRRegExpMacroAssembler::CheckPosition(intptr_t cp_offset,
BlockLabel* on_outside_input) {
TAG();
Definition* curpos_def = LoadLocal(current_position_);
Definition* cp_off_def = Int64Constant(-cp_offset);
if (cp_offset >= 0) {
Definition* curpos_def = LoadLocal(current_position_);
Definition* cp_off_def = Int64Constant(-cp_offset);
// If (current_position_ < -cp_offset), we are in bounds.
// Remember, current_position_ is a negative offset from the string end.
// If (current_position_ < -cp_offset), we are in bounds.
// Remember, current_position_ is a negative offset from the string end.
BranchOrBacktrack(Comparison(kGTE, curpos_def, cp_off_def), on_outside_input);
BranchOrBacktrack(Comparison(kGTE, curpos_def, cp_off_def),
on_outside_input);
} else {
// We need to see if there's enough characters left in the string to go
// back cp_offset characters, so get the normalized position and then
// make sure that (normalized_position >= -cp_offset).
PushArgumentInstr* pos_push = PushLocal(current_position_);
PushArgumentInstr* len_push = PushLocal(string_param_length_);
BranchOrBacktrack(
Comparison(kLT, Add(pos_push, len_push), Uint64Constant(-cp_offset)),
on_outside_input);
}
}
void IRRegExpMacroAssembler::BranchOrBacktrack(ComparisonInstr* comparison,

View file

@ -61,10 +61,12 @@ class IRRegExpMacroAssembler : public RegExpMacroAssembler {
// A "greedy loop" is a loop that is both greedy and with a simple
// body. It has a particularly simple implementation.
virtual void CheckGreedyLoop(BlockLabel* on_tos_equals_current_position);
virtual void CheckNotAtStart(BlockLabel* on_not_at_start);
virtual void CheckNotAtStart(intptr_t cp_offset, BlockLabel* on_not_at_start);
virtual void CheckNotBackReference(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match);
virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg,
bool read_backward,
BlockLabel* on_no_match);
virtual void CheckNotCharacter(uint32_t c, BlockLabel* on_not_equal);
virtual void CheckNotCharacterAfterAnd(uint32_t c,

View file

@ -43,7 +43,7 @@ Interval RegExpDisjunction::CaptureRegisters() const {
return ListCaptureRegisters(alternatives());
}
Interval RegExpLookahead::CaptureRegisters() const {
Interval RegExpLookaround::CaptureRegisters() const {
return body()->CaptureRegisters();
}
@ -108,8 +108,8 @@ bool RegExpDisjunction::IsAnchoredAtEnd() const {
return true;
}
bool RegExpLookahead::IsAnchoredAtStart() const {
return is_positive() && body()->IsAnchoredAtStart();
bool RegExpLookaround::IsAnchoredAtStart() const {
return is_positive() && type() == LOOKAHEAD && body()->IsAnchoredAtStart();
}
bool RegExpCapture::IsAnchoredAtStart() const {
@ -241,8 +241,11 @@ void* RegExpUnparser::VisitCapture(RegExpCapture* that, void* data) {
return NULL;
}
void* RegExpUnparser::VisitLookahead(RegExpLookahead* that, void* data) {
OS::PrintErr("(-> %s", (that->is_positive() ? "+ " : "- "));
void* RegExpUnparser::VisitLookaround(RegExpLookaround* that, void* data) {
OS::PrintErr("(");
OS::PrintErr("(%s %s",
(that->type() == RegExpLookaround::LOOKAHEAD ? "->" : "<-"),
(that->is_positive() ? "+ " : "- "));
that->body()->Accept(this, data);
OS::PrintErr(")");
return NULL;

View file

@ -21,7 +21,7 @@ class RegExpCharacterClass;
class RegExpCompiler;
class RegExpDisjunction;
class RegExpEmpty;
class RegExpLookahead;
class RegExpLookaround;
class RegExpQuantifier;
class RegExpText;
@ -277,8 +277,7 @@ class RegExpQuantifier : public RegExpTree {
class RegExpCapture : public RegExpTree {
public:
explicit RegExpCapture(RegExpTree* body, intptr_t index)
: body_(body), index_(index) {}
explicit RegExpCapture(intptr_t index) : body_(nullptr), index_(index) {}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
static RegExpNode* ToNode(RegExpTree* body,
@ -293,6 +292,11 @@ class RegExpCapture : public RegExpTree {
virtual intptr_t min_match() const { return body_->min_match(); }
virtual intptr_t max_match() const { return body_->max_match(); }
RegExpTree* body() const { return body_; }
// When a backreference is parsed before the corresponding capture group,
// which can happen because of lookbehind, we create the capture object when
// we create the backreference, and fill in the body later when the actual
// capture group is parsed.
void set_body(RegExpTree* body) { body_ = body; }
intptr_t index() const { return index_; }
static intptr_t StartRegister(intptr_t index) { return index * 2; }
static intptr_t EndRegister(intptr_t index) { return index * 2 + 1; }
@ -302,22 +306,25 @@ class RegExpCapture : public RegExpTree {
intptr_t index_;
};
class RegExpLookahead : public RegExpTree {
class RegExpLookaround : public RegExpTree {
public:
RegExpLookahead(RegExpTree* body,
bool is_positive,
intptr_t capture_count,
intptr_t capture_from)
enum Type { LOOKAHEAD, LOOKBEHIND };
RegExpLookaround(RegExpTree* body,
bool is_positive,
intptr_t capture_count,
intptr_t capture_from,
Type type)
: body_(body),
is_positive_(is_positive),
capture_count_(capture_count),
capture_from_(capture_from) {}
capture_from_(capture_from),
type_(type) {}
virtual void* Accept(RegExpVisitor* visitor, void* data);
virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success);
virtual RegExpLookahead* AsLookahead();
virtual RegExpLookaround* AsLookaround();
virtual Interval CaptureRegisters() const;
virtual bool IsLookahead() const;
virtual bool IsLookaround() const;
virtual bool IsAnchoredAtStart() const;
virtual intptr_t min_match() const { return 0; }
virtual intptr_t max_match() const { return 0; }
@ -325,12 +332,36 @@ class RegExpLookahead : public RegExpTree {
bool is_positive() const { return is_positive_; }
intptr_t capture_count() const { return capture_count_; }
intptr_t capture_from() const { return capture_from_; }
Type type() const { return type_; }
// The RegExpLookaround::Builder class abstracts out the process of building
// the compiling a RegExpLookaround object by splitting it into two phases,
// represented by the provided methods.
class Builder : public ValueObject {
public:
Builder(bool is_positive,
RegExpNode* on_success,
intptr_t stack_pointer_register,
intptr_t position_register,
intptr_t capture_register_count = 0,
intptr_t capture_register_start = 0);
RegExpNode* on_match_success() { return on_match_success_; }
RegExpNode* ForMatch(RegExpNode* match);
private:
bool is_positive_;
RegExpNode* on_match_success_;
RegExpNode* on_success_;
intptr_t stack_pointer_register_;
intptr_t position_register_;
};
private:
RegExpTree* body_;
bool is_positive_;
intptr_t capture_count_;
intptr_t capture_from_;
Type type_;
};
class RegExpBackReference : public RegExpTree {
@ -341,7 +372,10 @@ class RegExpBackReference : public RegExpTree {
virtual RegExpBackReference* AsBackReference();
virtual bool IsBackReference() const;
virtual intptr_t min_match() const { return 0; }
virtual intptr_t max_match() const { return capture_->max_match(); }
// The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
// recursion, we give up and just assume arbitrary length, which matches v8's
// behavior.
virtual intptr_t max_match() const { return kInfinity; }
intptr_t index() const { return capture_->index(); }
RegExpCapture* capture() const { return capture_; }

View file

@ -55,15 +55,17 @@ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \
V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_REGS_EQUAL, 39, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 40, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 41, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 42, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_AT_START, 43, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 44, 8) /* bc8 pad24 addr32 */ \
V(CHECK_GREEDY, 45, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 46, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 47, 4) /* bc8 idx24 */
V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \
V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \
V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \
V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \
V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \
V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \
V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */
// clang-format on

View file

@ -268,7 +268,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base,
break;
BYTECODE(LOAD_CURRENT_CHAR) {
int pos = current + (insn >> BYTECODE_SHIFT);
if (pos >= subject_length) {
if (pos < 0 || pos >= subject_length) {
pc = code_base + Load32Aligned(pc + 4);
} else {
current_char = subject.CharAt(pos);
@ -534,6 +534,55 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base,
}
break;
}
BYTECODE(CHECK_NOT_BACK_REF_BACKWARD) {
const int from = registers[insn >> BYTECODE_SHIFT];
const int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from < 0 || len <= 0) {
pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
break;
}
if ((current - len) < 0) {
pc = code_base + Load32Aligned(pc + 4);
break;
} else {
// When looking behind, the string to match (if it is there) lies
// before the current position, so we will check the [len] characters
// before the current position, excluding the current position itself.
const int start = current - len;
int i;
for (i = 0; i < len; i++) {
if (subject.CharAt(from + i) != subject.CharAt(start + i)) {
pc = code_base + Load32Aligned(pc + 4);
break;
}
}
if (i < len) break;
current -= len;
}
pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH;
break;
}
BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) {
int from = registers[insn >> BYTECODE_SHIFT];
int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
if (from < 0 || len <= 0) {
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;
break;
}
if (current < len) {
pc = code_base + Load32Aligned(pc + 4);
break;
} else {
if (BackRefMatchesNoCase<Char>(&canonicalize, from, current - len,
len, subject)) {
current -= len;
pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH;
} else {
pc = code_base + Load32Aligned(pc + 4);
}
}
break;
}
BYTECODE(CHECK_AT_START)
if (current == 0) {
pc = code_base + Load32Aligned(pc + 4);
@ -541,13 +590,15 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base,
pc += BC_CHECK_AT_START_LENGTH;
}
break;
BYTECODE(CHECK_NOT_AT_START)
if (current == 0) {
pc += BC_CHECK_NOT_AT_START_LENGTH;
} else {
pc = code_base + Load32Aligned(pc + 4);
BYTECODE(CHECK_NOT_AT_START) {
const int32_t cp_offset = insn >> BYTECODE_SHIFT;
if (current + cp_offset == 0) {
pc += BC_CHECK_NOT_AT_START_LENGTH;
} else {
pc = code_base + Load32Aligned(pc + 4);
}
break;
}
break;
BYTECODE(SET_CURRENT_POSITION_FROM_END) {
int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
if (subject_length - current > by) {

View file

@ -129,13 +129,13 @@ RegExpTree* RegExpBuilder::ToRegExp() {
return new (Z) RegExpDisjunction(alternatives);
}
void RegExpBuilder::AddQuantifierToAtom(
bool RegExpBuilder::AddQuantifierToAtom(
intptr_t min,
intptr_t max,
RegExpQuantifier::QuantifierType quantifier_type) {
if (pending_empty_) {
pending_empty_ = false;
return;
return true;
}
RegExpTree* atom;
if (characters_ != NULL) {
@ -167,22 +167,28 @@ void RegExpBuilder::AddQuantifierToAtom(
} else if (terms_.length() > 0) {
DEBUG_ASSERT(last_added_ == ADD_ATOM);
atom = terms_.RemoveLast();
if (auto lookaround = atom->AsLookaround()) {
// Lookbehinds are not quantifiable.
if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
return false;
}
}
if (atom->max_match() == 0) {
// Guaranteed to only match an empty string.
LAST(ADD_TERM);
if (min == 0) {
return;
return true;
}
terms_.Add(atom);
return;
return true;
}
} else {
// Only call immediately after adding an atom or character!
UNREACHABLE();
return;
}
terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
LAST(ADD_TERM);
return true;
}
// ----------------------------------------------------------------------------
@ -194,6 +200,7 @@ RegExpParser::RegExpParser(const String& in, String* error, bool multiline)
in_(in),
current_(kEndMarker),
next_pos_(0),
captures_started_(0),
capture_count_(0),
has_more_(true),
multiline_(multiline),
@ -275,7 +282,8 @@ RegExpTree* RegExpParser::ParsePattern() {
// Atom Quantifier
RegExpTree* RegExpParser::ParseDisjunction() {
// Used to store current state while parsing subexpressions.
RegExpParserState initial_state(NULL, INITIAL, 0, Z);
RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
Z);
RegExpParserState* stored_state = &initial_state;
// Cache the builder in a local variable for quick access.
RegExpBuilder* builder = initial_state.builder();
@ -307,23 +315,24 @@ RegExpTree* RegExpParser::ParseDisjunction() {
intptr_t capture_index = stored_state->capture_index();
SubexpressionType group_type = stored_state->group_type();
// Build result of subexpression.
if (group_type == CAPTURE) {
RegExpCapture* capture = GetCapture(capture_index);
capture->set_body(body);
body = capture;
} else if (group_type != GROUPING) {
ASSERT(group_type == POSITIVE_LOOKAROUND ||
group_type == NEGATIVE_LOOKAROUND);
bool is_positive = (group_type == POSITIVE_LOOKAROUND);
body = new (Z) RegExpLookaround(
body, is_positive, end_capture_index - capture_index,
capture_index, stored_state->lookaround_type());
}
// Restore previous state.
stored_state = stored_state->previous_state();
builder = stored_state->builder();
// Build result of subexpression.
if (group_type == CAPTURE) {
RegExpCapture* capture = new (Z) RegExpCapture(body, capture_index);
(*captures_)[capture_index - 1] = capture;
body = capture;
} else if (group_type != GROUPING) {
ASSERT(group_type == POSITIVE_LOOKAHEAD ||
group_type == NEGATIVE_LOOKAHEAD);
bool is_positive = (group_type == POSITIVE_LOOKAHEAD);
body = new (Z)
RegExpLookahead(body, is_positive,
end_capture_index - capture_index, capture_index);
}
builder->AddAtom(body);
// For compatibility with JSC and ES3, we allow quantifiers after
// lookaheads, and break in all cases.
@ -370,37 +379,7 @@ RegExpTree* RegExpParser::ParseDisjunction() {
break;
}
case '(': {
SubexpressionType subexpr_type = CAPTURE;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
subexpr_type = GROUPING;
break;
case '=':
subexpr_type = POSITIVE_LOOKAHEAD;
break;
case '!':
subexpr_type = NEGATIVE_LOOKAHEAD;
break;
default:
ReportError("Invalid group");
UNREACHABLE();
}
Advance(2);
} else {
if (captures_ == NULL) {
captures_ = new ZoneGrowableArray<RegExpCapture*>(2);
}
if (captures_started() >= kMaxCaptures) {
ReportError("Too many captures");
UNREACHABLE();
}
captures_->Add(NULL);
}
// Store current state and begin new disjunction parsing.
stored_state = new RegExpParserState(stored_state, subexpr_type,
captures_started(), Z);
stored_state = ParseOpenParenthesis(stored_state);
builder = stored_state->builder();
continue;
}
@ -457,16 +436,18 @@ RegExpTree* RegExpParser::ParseDisjunction() {
case '9': {
intptr_t index = 0;
if (ParseBackReferenceIndex(&index)) {
RegExpCapture* capture = NULL;
if (captures_ != NULL && index <= captures_->length()) {
capture = captures_->At(index - 1);
}
if (capture == NULL) {
if (stored_state->IsInsideCaptureGroup(index)) {
// The back reference is inside the capture group it refers to.
// Nothing can possibly have been captured yet, so we use empty
// instead. This ensures that, when checking a back reference,
// the capture registers of the referenced capture are either
// both set or both cleared.
builder->AddEmpty();
break;
} else {
RegExpCapture* capture = GetCapture(index);
RegExpTree* atom = new RegExpBackReference(capture);
builder->AddAtom(atom);
}
RegExpTree* atom = new RegExpBackReference(capture);
builder->AddAtom(atom);
break;
}
uint32_t first_digit = Next();
@ -610,7 +591,10 @@ RegExpTree* RegExpParser::ParseDisjunction() {
quantifier_type = RegExpQuantifier::POSSESSIVE;
Advance();
}
builder->AddQuantifierToAtom(min, max, quantifier_type);
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
ReportError("invalid quantifier.");
UNREACHABLE();
}
}
}
@ -631,6 +615,57 @@ static bool IsSpecialClassEscape(uint32_t c) {
}
#endif
RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
RegExpParserState* state) {
RegExpLookaround::Type lookaround_type = state->lookaround_type();
SubexpressionType subexpr_type = CAPTURE;
Advance();
if (current() == '?') {
switch (Next()) {
case ':':
Advance(2);
subexpr_type = GROUPING;
break;
case '=':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = POSITIVE_LOOKAROUND;
break;
case '!':
Advance(2);
lookaround_type = RegExpLookaround::LOOKAHEAD;
subexpr_type = NEGATIVE_LOOKAROUND;
break;
case '<':
Advance();
if (Next() == '=') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = POSITIVE_LOOKAROUND;
} else if (Next() == '!') {
Advance(2);
lookaround_type = RegExpLookaround::LOOKBEHIND;
subexpr_type = NEGATIVE_LOOKAROUND;
}
break;
default:
ReportError("Invalid group");
UNREACHABLE();
}
}
if (subexpr_type == CAPTURE) {
if (captures_started_ >= kMaxCaptures) {
ReportError("Too many captures");
UNREACHABLE();
}
captures_started_++;
}
// Store current state and begin new disjunction parsing.
return new RegExpParserState(state, subexpr_type, lookaround_type,
captures_started_, Z);
}
// In order to know whether an escape is a backreference or not we have to scan
// the entire regexp and find the number of capturing parentheses. However we
// don't want to scan the regexp twice unless it is necessary. This mini-parser
@ -638,6 +673,8 @@ static bool IsSpecialClassEscape(uint32_t c) {
// noncapturing parentheses and can skip character classes and backslash-escaped
// characters.
void RegExpParser::ScanForCaptures() {
ASSERT(!is_scanned_for_captures_);
const intptr_t saved_position = position();
// Start with captures started previous to current position
intptr_t capture_count = captures_started();
// Add count of captures after this position.
@ -667,6 +704,7 @@ void RegExpParser::ScanForCaptures() {
}
capture_count_ = capture_count;
is_scanned_for_captures_ = true;
Reset(saved_position);
}
static inline bool IsDecimalDigit(int32_t c) {
@ -695,11 +733,7 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
}
}
if (value > captures_started()) {
if (!is_scanned_for_captures_) {
intptr_t saved_position = position();
ScanForCaptures();
Reset(saved_position);
}
if (!is_scanned_for_captures_) ScanForCaptures();
if (value > capture_count_) {
Reset(start);
return false;
@ -709,6 +743,32 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
return true;
}
RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
// The index for the capture groups are one-based. Its index in the list is
// zero-based.
const intptr_t know_captures =
is_scanned_for_captures_ ? capture_count_ : captures_started_;
ASSERT(index <= know_captures);
if (captures_ == nullptr) {
captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
}
while (captures_->length() < know_captures) {
captures_->Add(new (Z) RegExpCapture(captures_->length() + 1));
}
return captures_->At(index - 1);
}
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
if (s->group_type() != CAPTURE) continue;
// Return true if we found the matching capture index.
if (index == s->capture_index()) return true;
// Abort if index is larger than what has been parsed up till this state.
if (index > s->capture_index()) return false;
}
return false;
}
// QuantifierPrefix ::
// { DecimalDigits }
// { DecimalDigits , }

View file

@ -23,7 +23,10 @@ class RegExpBuilder : public ZoneAllocated {
void AddAtom(RegExpTree* tree);
void AddAssertion(RegExpTree* tree);
void NewAlternative(); // '|'
void AddQuantifierToAtom(intptr_t min,
// Attempt to add a quantifier to the last atom added. The return value
// denotes whether the attempt succeeded, since some atoms like lookbehind
// cannot be quantified.
bool AddQuantifierToAtom(intptr_t min,
intptr_t max,
RegExpQuantifier::QuantifierType type);
RegExpTree* ToRegExp();
@ -93,9 +96,7 @@ class RegExpParser : public ValueObject {
bool simple();
bool contains_anchor() { return contains_anchor_; }
void set_contains_anchor() { contains_anchor_ = true; }
intptr_t captures_started() {
return captures_ == NULL ? 0 : captures_->length();
}
intptr_t captures_started() { return captures_started_; }
intptr_t position() { return next_pos_ - 1; }
static const intptr_t kMaxCaptures = 1 << 16;
@ -105,8 +106,8 @@ class RegExpParser : public ValueObject {
enum SubexpressionType {
INITIAL,
CAPTURE, // All positive values represent captures.
POSITIVE_LOOKAHEAD,
NEGATIVE_LOOKAHEAD,
POSITIVE_LOOKAROUND,
NEGATIVE_LOOKAROUND,
GROUPING
};
@ -114,11 +115,13 @@ class RegExpParser : public ValueObject {
public:
RegExpParserState(RegExpParserState* previous_state,
SubexpressionType group_type,
RegExpLookaround::Type lookaround_type,
intptr_t disjunction_capture_index,
Zone* zone)
: previous_state_(previous_state),
builder_(new (zone) RegExpBuilder()),
group_type_(group_type),
lookaround_type_(lookaround_type),
disjunction_capture_index_(disjunction_capture_index) {}
// Parser state of containing expression, if any.
RegExpParserState* previous_state() { return previous_state_; }
@ -127,11 +130,16 @@ class RegExpParser : public ValueObject {
RegExpBuilder* builder() { return builder_; }
// Type of regexp being parsed (parenthesized group or entire regexp).
SubexpressionType group_type() { return group_type_; }
// Lookahead or lookbehind.
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
// Index in captures array of first capture in this sub-expression, if any.
// Also the capture index of this sub-expression itself, if group_type
// is CAPTURE.
intptr_t capture_index() { return disjunction_capture_index_; }
// Check whether the parser is inside a capture group with the given index.
bool IsInsideCaptureGroup(intptr_t index);
private:
// Linked list implementation of stack of states.
RegExpParserState* previous_state_;
@ -139,10 +147,17 @@ class RegExpParser : public ValueObject {
RegExpBuilder* builder_;
// Stored disjunction type (capture, look-ahead or grouping), if any.
SubexpressionType group_type_;
// Stored read direction.
const RegExpLookaround::Type lookaround_type_;
// Stored disjunction's capture index (if any).
intptr_t disjunction_capture_index_;
};
// Return the 1-indexed RegExpCapture object, allocate if necessary.
RegExpCapture* GetCapture(intptr_t index);
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
Zone* zone() { return zone_; }
uint32_t current() { return current_; }
@ -157,6 +172,7 @@ class RegExpParser : public ValueObject {
const String& in_;
uint32_t current_;
intptr_t next_pos_;
intptr_t captures_started_;
// The capture count is only valid after we have scanned for captures.
intptr_t capture_count_;
bool has_more_;

View file

@ -0,0 +1,440 @@
// Copyright (c) 2019, the Dart project authors. All rights reserved.
// Copyright 2015 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials provided
// with the distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import 'package:expect/expect.dart';
import 'v8_regexp_utils.dart';
void main() {
// Tests captures in positive and negative look-behind in regular expressions.
void testRE(RegExp re, String input, bool expectedResult) {
if (expectedResult) {
assertTrue(re.hasMatch(input));
} else {
assertFalse(re.hasMatch(input));
}
}
void execRE(RegExp re, String input, List<String> expectedResult) {
assertTrue(re.hasMatch(input));
shouldBe(re.firstMatch(input), expectedResult);
}
void multiRE(RegExp re, String input, List<List<String>> expectedResult) {
assertTrue(re.hasMatch(input));
final matches = re.allMatches(input);
assertEquals(matches.length, expectedResult.length);
for (var i = 0; i < matches.length; i++) {
shouldBe(matches.elementAt(i), expectedResult[i]);
}
}
// Simple fixed-length matches.
var re = new RegExp(r"^.(?<=a)");
execRE(re, "a", ["a"]);
testRE(re, "b", false);
re = new RegExp(r"^f..(?<=.oo)");
execRE(re, "foo1", ["foo"]);
re = new RegExp(r"^f\w\w(?<=\woo)");
execRE(re, "foo2", ["foo"]);
testRE(re, "boo", false);
testRE(re, "fao", false);
testRE(re, "foa", false);
re = new RegExp(r"(?<=abc)\w\w\w");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=a.c)\w\w\w");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=a\wc)\w\w\w");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=a[a-z])\w\w\w");
execRE(re, "abcdef", ["cde"]);
re = new RegExp(r"(?<=a[a-z][a-z])\w\w\w");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=a[a-z]{2})\w\w\w");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=a{1})\w\w\w");
execRE(re, "abcdef", ["bcd"]);
re = new RegExp(r"(?<=a{1}b{1})\w\w\w");
execRE(re, "abcdef", ["cde"]);
re = new RegExp(r"(?<=a{1}[a-z]{2})\w\w\w");
execRE(re, "abcdef", ["def"]);
// Variable-length matches.
re = new RegExp(r"(?<=[a|b|c]*)[^a|b|c]{3}");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=\w*)[^a|b|c]{3}");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=b|c)\w");
multiRE(re, "abcdef", [
["c"],
["d"]
]);
re = new RegExp(r"(?<=[b-e])\w{2}");
multiRE(re, "abcdef", [
["cd"],
["ef"]
]);
// Start of line matches.
re = new RegExp(r"(?<=^abc)def");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=^[a-c]{3})def");
execRE(re, "abcdef", ["def"]);
re = new RegExp(r"(?<=^[a-c]{3})def", multiLine: true);
execRE(re, "xyz\nabcdef", ["def"]);
re = new RegExp(r"(?<=^)\w+", multiLine: true);
multiRE(re, "ab\ncd\nefg", [
["ab"],
["cd"],
["efg"]
]);
re = new RegExp(r"\w+(?<=$)", multiLine: true);
multiRE(re, "ab\ncd\nefg", [
["ab"],
["cd"],
["efg"]
]);
re = new RegExp(r"(?<=^)\w+(?<=$)", multiLine: true);
multiRE(re, "ab\ncd\nefg", [
["ab"],
["cd"],
["efg"]
]);
re = new RegExp(r"(?<=^[^a-c]{3})def");
testRE(re, "abcdef", false);
re = new RegExp(r"^foooo(?<=^o+)$");
testRE(re, "foooo", false);
re = new RegExp(r"^foooo(?<=^o*)$");
testRE(re, "foooo", false);
re = new RegExp(r"^foo(?<=^fo+)$");
execRE(re, "foo", ["foo"]);
re = new RegExp(r"^foooo(?<=^fo*)");
execRE(re, "foooo", ["foooo"]);
re = new RegExp(r"^(f)oo(?<=^\1o+)$");
testRE(re, "foo", true);
execRE(re, "foo", ["foo", "f"]);
re = new RegExp(r"^(f)oo(?<=^\1o+)$", caseSensitive: false);
execRE(re, "foo", ["foo", "f"]);
re = new RegExp(r"^(f)oo(?<=^\1o+).$", caseSensitive: false);
execRE(re, "foo\u1234", ["foo\u1234", "f"]);
re = new RegExp(r"(?<=^\w+)def");
execRE(re, "abcdefdef", ["def"]);
multiRE(re, "abcdefdef", [
["def"],
["def"]
]);
// Word boundary matches.
re = new RegExp(r"(?<=\b)[d-f]{3}");
execRE(re, "abc def", ["def"]);
re = new RegExp(r"(?<=\B)\w{3}");
execRE(re, "ab cdef", ["def"]);
re = new RegExp(r"(?<=\B)(?<=c(?<=\w))\w{3}");
execRE(re, "ab cdef", ["def"]);
re = new RegExp(r"(?<=\b)[d-f]{3}");
testRE(re, "abcdef", false);
// Negative lookbehind.
re = new RegExp(r"(?<!abc)\w\w\w");
execRE(re, "abcdef", ["abc"]);
re = new RegExp(r"(?<!a.c)\w\w\w");
execRE(re, "abcdef", ["abc"]);
re = new RegExp(r"(?<!a\wc)\w\w\w");
execRE(re, "abcdef", ["abc"]);
re = new RegExp(r"(?<!a[a-z])\w\w\w");
execRE(re, "abcdef", ["abc"]);
re = new RegExp(r"(?<!a[a-z]{2})\w\w\w");
execRE(re, "abcdef", ["abc"]);
re = new RegExp(r"(?<!abc)def");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a.c)def");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a\wc)def");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a[a-z][a-z])def");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a[a-z]{2})def");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a{1}b{1})cde");
testRE(re, "abcdef", false);
re = new RegExp(r"(?<!a{1}[a-z]{2})def");
testRE(re, "abcdef", false);
// Capturing matches.
re = new RegExp(r"(?<=(c))def");
execRE(re, "abcdef", ["def", "c"]);
re = new RegExp(r"(?<=(\w{2}))def");
execRE(re, "abcdef", ["def", "bc"]);
re = new RegExp(r"(?<=(\w(\w)))def");
execRE(re, "abcdef", ["def", "bc", "c"]);
re = new RegExp(r"(?<=(\w){3})def");
execRE(re, "abcdef", ["def", "a"]);
re = new RegExp(r"(?<=(bc)|(cd)).");
execRE(re, "abcdef", ["d", "bc", null]);
re = new RegExp(r"(?<=([ab]{1,2})\D|(abc))\w");
execRE(re, "abcdef", ["c", "a", null]);
re = new RegExp(r"\D(?<=([ab]+))(\w)");
execRE(re, "abcdef", ["ab", "a", "b"]);
// Captures inside negative lookbehind. (They never capture.)
re = new RegExp(r"(?<!(^|[ab]))\w{2}");
execRE(re, "abcdef", ["de", null]);
// Nested lookaround.
re = new RegExp(r"(?<=ab(?=c)\wd)\w\w");
execRE(re, "abcdef", ["ef"]);
re = new RegExp(r"(?<=a(?=([^a]{2})d)\w{3})\w\w");
execRE(re, "abcdef", ["ef", "bc"]);
re = new RegExp(r"(?<=a(?=([bc]{2}(?<!a{2}))d)\w{3})\w\w");
execRE(re, "abcdef", ["ef", "bc"]);
re = new RegExp(r"(?<=a(?=([bc]{2}(?<!a*))d)\w{3})\w\w/");
testRE(re, "abcdef", false);
re = new RegExp(r"^faaao?(?<=^f[oa]+(?=o))");
execRE(re, "faaao", ["faaa"]);
// Back references.
re = new RegExp(r"(.)(?<=(\1\1))");
execRE(re, "abb", ["b", "b", "bb"]);
re = new RegExp(r"(.)(?<=(\1\1))", caseSensitive: false);
execRE(re, "abB", ["B", "B", "bB"]);
re = new RegExp(r"((\w)\w)(?<=\1\2\1)", caseSensitive: false);
execRE(re, "aabAaBa", ["aB", "aB", "a"]);
re = new RegExp(r"(\w(\w))(?<=\1\2\1)", caseSensitive: false);
execRE(re, "aabAaBa", ["Ba", "Ba", "a"]);
re = new RegExp(r"(?=(\w))(?<=(\1)).", caseSensitive: false);
execRE(re, "abaBbAa", ["b", "b", "B"]);
re = new RegExp(r"(?<=(.))(\w+)(?=\1)");
execRE(re, " 'foo' ", ["foo", "'", "foo"]);
execRE(re, " \"foo\" ", ["foo", "\"", "foo"]);
testRE(re, " .foo\" ", false);
re = new RegExp(r"(.)(?<=\1\1\1)");
testRE(re, "ab", false);
testRE(re, "abb", false);
execRE(re, "abbb", ["b", "b"]);
re = new RegExp(r"(..)(?<=\1\1\1)");
testRE(re, "ab", false);
testRE(re, "abb", false);
testRE(re, "aabb", false);
testRE(re, "abab", false);
testRE(re, "fabxbab", false);
testRE(re, "faxabab", false);
execRE(re, "fababab", ["ab", "ab"]);
// Back references to captures inside the lookbehind.
re = new RegExp(r"(?<=\1(\w))d", caseSensitive: false);
execRE(re, "abcCd", ["d", "C"]);
re = new RegExp(r"(?<=\1([abx]))d");
execRE(re, "abxxd", ["d", "x"]);
re = new RegExp(r"(?<=\1(\w+))c");
execRE(re, "ababc", ["c", "ab"]);
execRE(re, "ababbc", ["c", "b"]);
testRE(re, "ababdc", false);
re = new RegExp(r"(?<=(\w+)\1)c");
execRE(re, "ababc", ["c", "abab"]);
// Alternations are tried left to right,
// and we do not backtrack into a lookbehind.
re = new RegExp(r".*(?<=(..|...|....))(.*)");
execRE(re, "xabcd", ["xabcd", "cd", ""]);
re = new RegExp(r".*(?<=(xx|...|....))(.*)");
execRE(re, "xabcd", ["xabcd", "bcd", ""]);
re = new RegExp(r".*(?<=(xx|...))(.*)");
execRE(re, "xxabcd", ["xxabcd", "bcd", ""]);
re = new RegExp(r".*(?<=(xx|xxx))(.*)");
execRE(re, "xxabcd", ["xxabcd", "xx", "abcd"]);
// We do not backtrack into a lookbehind.
// The lookbehind captures "abc" so that \1 does not match. We do not backtrack
// to capture only "bc" in the lookbehind.
re = new RegExp(r"(?<=([abc]+)).\1");
testRE(re, "abcdbc", false);
// Greedy loop.
re = new RegExp(r"(?<=(b+))c");
execRE(re, "abbbbbbc", ["c", "bbbbbb"]);
re = new RegExp(r"(?<=(b\d+))c");
execRE(re, "ab1234c", ["c", "b1234"]);
re = new RegExp(r"(?<=((?:b\d{2})+))c");
execRE(re, "ab12b23b34c", ["c", "b12b23b34"]);
// Sticky
re = new RegExp(r"(?<=^(\w+))def");
multiRE(re, "abcdefdef", [
["def", "abc"],
["def", "abcdef"]
]);
re = new RegExp(r"\Bdef");
multiRE(re, "abcdefdef", [
["def"],
["def"]
]);
// Misc
re = new RegExp(r"(?<=$abc)def");
testRE(re, "abcdef", false);
re = new RegExp(r"^foo(?<=foo)$");
execRE(re, "foo", ["foo"]);
re = new RegExp(r"^f.o(?<=foo)$");
execRE(re, "foo", ["foo"]);
re = new RegExp(r"^f.o(?<=foo)$");
testRE(re, "fno", false);
re = new RegExp(r"^foo(?<!foo)$");
testRE(re, "foo", false);
re = new RegExp(r"^f.o(?<!foo)$");
testRE(re, "foo", false);
execRE(re, "fno", ["fno"]);
re = new RegExp(r"^foooo(?<=fo+)$");
execRE(re, "foooo", ["foooo"]);
re = new RegExp(r"^foooo(?<=fo*)$");
execRE(re, "foooo", ["foooo"]);
re = new RegExp(r"(abc\1)");
execRE(re, "abc", ["abc", "abc"]);
execRE(re, "abc\u1234", ["abc", "abc"]);
re = new RegExp(r"(abc\1)", caseSensitive: false);
execRE(re, "abc", ["abc", "abc"]);
execRE(re, "abc\u1234", ["abc", "abc"]);
final oob_subject = "abcdefghijklmnabcdefghijklmn".substring(14);
re = new RegExp(r"(?=(abcdefghijklmn))(?<=\1)a", caseSensitive: false);
testRE(re, oob_subject, false);
re = new RegExp(r"(?=(abcdefghijklmn))(?<=\1)a");
testRE(re, oob_subject, false);
re = new RegExp(r"(?=(abcdefg))(?<=\1)");
testRE(re, "abcdefgabcdefg".substring(1), false);
// Mutual recursive capture/back references
re = new RegExp(r"(?<=a(.\2)b(\1)).{4}");
execRE(re, "aabcacbc", ["cacb", "a", ""]);
re = new RegExp(r"(?<=a(\2)b(..\1))b");
execRE(re, "aacbacb", ["b", "ac", "ac"]);
re = new RegExp(r"(?<=(?:\1b)(aa)).");
execRE(re, "aabaax", ["x", "aa"]);
re = new RegExp(r"(?<=(?:\1|b)(aa)).");
execRE(re, "aaaax", ["x", "aa"]);
// Restricted syntax in Annex B 1.4.
// The check for quantifiers on lookbehinds was added later than the
// original feature in v8, so we may need to approve failures here
// separately from the rest of the file.
assertThrows(() => new RegExp(r"(?<=.)*")); //# 01: ok
assertThrows(() => new RegExp(r"(?<=.)?")); //# 01: ok
assertThrows(() => new RegExp(r"(?<=.)+")); //# 01: ok
// No unicode flag (yet), so can't test these.
// See https://github.com/dart-lang/sdk/issues/36170.
// assertThrows("/(?<=.)*/u", SyntaxError);
// assertThrows("/(?<=.){1,2}/u", SyntaxError);
}