mirror of
https://github.com/dart-lang/sdk
synced 2024-09-19 15:11:35 +00:00
f84cf9413f
TEST=new test added Fixed: 52691 Change-Id: Ibc8865f8e9b1e908be80d4fbe8b8931eda448df8 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/312348 Reviewed-by: Alexander Aprelev <aam@google.com> Commit-Queue: Siva Annamalai <asiva@google.com>
1990 lines
60 KiB
C++
1990 lines
60 KiB
C++
// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
|
|
// for details. All rights reserved. Use of this source code is governed by a
|
|
// BSD-style license that can be found in the LICENSE file.
|
|
|
|
#include "vm/regexp_parser.h"
|
|
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uniset.h"
|
|
|
|
#include "platform/unicode.h"
|
|
|
|
#include "vm/longjump.h"
|
|
#include "vm/object_store.h"
|
|
#include "vm/symbols.h"
|
|
|
|
namespace dart {
|
|
|
|
#define Z zone()
|
|
|
|
// Enables possessive quantifier syntax for testing.
|
|
static constexpr bool FLAG_regexp_possessive_quantifier = false;
|
|
|
|
RegExpBuilder::RegExpBuilder(RegExpFlags flags)
|
|
: zone_(Thread::Current()->zone()),
|
|
pending_empty_(false),
|
|
flags_(flags),
|
|
characters_(nullptr),
|
|
pending_surrogate_(kNoPendingSurrogate),
|
|
terms_(),
|
|
text_(),
|
|
alternatives_()
|
|
#ifdef DEBUG
|
|
,
|
|
last_added_(ADD_NONE)
|
|
#endif
|
|
{
|
|
}
|
|
|
|
void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) {
|
|
ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
|
|
FlushPendingSurrogate();
|
|
// Hold onto the lead surrogate, waiting for a trail surrogate to follow.
|
|
pending_surrogate_ = lead_surrogate;
|
|
}
|
|
|
|
void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) {
|
|
ASSERT(Utf16::IsTrailSurrogate(trail_surrogate));
|
|
if (pending_surrogate_ != kNoPendingSurrogate) {
|
|
uint16_t lead_surrogate = pending_surrogate_;
|
|
pending_surrogate_ = kNoPendingSurrogate;
|
|
ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
|
|
uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate);
|
|
if (NeedsDesugaringForIgnoreCase(combined)) {
|
|
AddCharacterClassForDesugaring(combined);
|
|
} else {
|
|
auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(2);
|
|
surrogate_pair->Add(lead_surrogate);
|
|
surrogate_pair->Add(trail_surrogate);
|
|
RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_);
|
|
AddAtom(atom);
|
|
}
|
|
} else {
|
|
pending_surrogate_ = trail_surrogate;
|
|
FlushPendingSurrogate();
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::FlushPendingSurrogate() {
|
|
if (pending_surrogate_ != kNoPendingSurrogate) {
|
|
ASSERT(is_unicode());
|
|
uint32_t c = pending_surrogate_;
|
|
pending_surrogate_ = kNoPendingSurrogate;
|
|
AddCharacterClassForDesugaring(c);
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::FlushCharacters() {
|
|
FlushPendingSurrogate();
|
|
pending_empty_ = false;
|
|
if (characters_ != nullptr) {
|
|
RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_);
|
|
characters_ = nullptr;
|
|
text_.Add(atom);
|
|
LAST(ADD_ATOM);
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::FlushText() {
|
|
FlushCharacters();
|
|
intptr_t num_text = text_.length();
|
|
if (num_text == 0) {
|
|
return;
|
|
} else if (num_text == 1) {
|
|
terms_.Add(text_.Last());
|
|
} else {
|
|
RegExpText* text = new (Z) RegExpText();
|
|
for (intptr_t i = 0; i < num_text; i++)
|
|
text_[i]->AppendToText(text);
|
|
terms_.Add(text);
|
|
}
|
|
text_.Clear();
|
|
}
|
|
|
|
void RegExpBuilder::AddCharacter(uint16_t c) {
|
|
FlushPendingSurrogate();
|
|
pending_empty_ = false;
|
|
if (NeedsDesugaringForIgnoreCase(c)) {
|
|
AddCharacterClassForDesugaring(c);
|
|
} else {
|
|
if (characters_ == nullptr) {
|
|
characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);
|
|
}
|
|
characters_->Add(c);
|
|
LAST(ADD_CHAR);
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::AddUnicodeCharacter(uint32_t c) {
|
|
if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) {
|
|
ASSERT(is_unicode());
|
|
uint16_t surrogates[2];
|
|
Utf16::Encode(c, surrogates);
|
|
AddLeadSurrogate(surrogates[0]);
|
|
AddTrailSurrogate(surrogates[1]);
|
|
} else if (is_unicode() && Utf16::IsLeadSurrogate(c)) {
|
|
AddLeadSurrogate(c);
|
|
} else if (is_unicode() && Utf16::IsTrailSurrogate(c)) {
|
|
AddTrailSurrogate(c);
|
|
} else {
|
|
AddCharacter(static_cast<uint16_t>(c));
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) {
|
|
// A lead or trail surrogate parsed via escape sequence will not
|
|
// pair up with any preceding lead or following trail surrogate.
|
|
FlushPendingSurrogate();
|
|
AddUnicodeCharacter(character);
|
|
FlushPendingSurrogate();
|
|
}
|
|
|
|
void RegExpBuilder::AddEmpty() {
|
|
pending_empty_ = true;
|
|
}
|
|
|
|
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
|
|
if (NeedsDesugaringForUnicode(cc)) {
|
|
// With /u, character class needs to be desugared, so it
|
|
// must be a standalone term instead of being part of a RegExpText.
|
|
AddTerm(cc);
|
|
} else {
|
|
AddAtom(cc);
|
|
}
|
|
}
|
|
|
|
void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) {
|
|
auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c));
|
|
AddTerm(new (Z) RegExpCharacterClass(ranges, flags_));
|
|
}
|
|
|
|
void RegExpBuilder::AddAtom(RegExpTree* term) {
|
|
if (term->IsEmpty()) {
|
|
AddEmpty();
|
|
return;
|
|
}
|
|
if (term->IsTextElement()) {
|
|
FlushCharacters();
|
|
text_.Add(term);
|
|
} else {
|
|
FlushText();
|
|
terms_.Add(term);
|
|
}
|
|
LAST(ADD_ATOM);
|
|
}
|
|
|
|
void RegExpBuilder::AddTerm(RegExpTree* term) {
|
|
FlushText();
|
|
terms_.Add(term);
|
|
LAST(ADD_ATOM);
|
|
}
|
|
|
|
void RegExpBuilder::AddAssertion(RegExpTree* assert) {
|
|
FlushText();
|
|
terms_.Add(assert);
|
|
LAST(ADD_ASSERT);
|
|
}
|
|
|
|
void RegExpBuilder::NewAlternative() {
|
|
FlushTerms();
|
|
}
|
|
|
|
void RegExpBuilder::FlushTerms() {
|
|
FlushText();
|
|
intptr_t num_terms = terms_.length();
|
|
RegExpTree* alternative;
|
|
if (num_terms == 0) {
|
|
alternative = RegExpEmpty::GetInstance();
|
|
} else if (num_terms == 1) {
|
|
alternative = terms_.Last();
|
|
} else {
|
|
ZoneGrowableArray<RegExpTree*>* terms =
|
|
new (Z) ZoneGrowableArray<RegExpTree*>();
|
|
for (intptr_t i = 0; i < terms_.length(); i++) {
|
|
terms->Add(terms_[i]);
|
|
}
|
|
alternative = new (Z) RegExpAlternative(terms);
|
|
}
|
|
alternatives_.Add(alternative);
|
|
terms_.Clear();
|
|
LAST(ADD_NONE);
|
|
}
|
|
|
|
bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
|
|
if (!is_unicode()) return false;
|
|
// TODO(yangguo): we could be smarter than this. Case-insensitivity does not
|
|
// necessarily mean that we need to desugar. It's probably nicer to have a
|
|
// separate pass to figure out unicode desugarings.
|
|
if (ignore_case()) return true;
|
|
ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
|
|
CharacterRange::Canonicalize(ranges);
|
|
|
|
if (cc->is_negated()) {
|
|
auto negated_ranges =
|
|
new (Z) ZoneGrowableArray<CharacterRange>(ranges->length());
|
|
CharacterRange::Negate(ranges, negated_ranges);
|
|
ranges = negated_ranges;
|
|
}
|
|
|
|
for (int i = ranges->length() - 1; i >= 0; i--) {
|
|
uint32_t from = ranges->At(i).from();
|
|
uint32_t to = ranges->At(i).to();
|
|
// Check for non-BMP characters.
|
|
if (to >= Utf16::kMaxCodeUnit) return true;
|
|
// Check for lone surrogates.
|
|
if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) {
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) {
|
|
if (is_unicode() && ignore_case()) {
|
|
icu::UnicodeSet set(c, c);
|
|
set.closeOver(USET_CASE_INSENSITIVE);
|
|
set.removeAllStrings();
|
|
return set.size() > 1;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
RegExpTree* RegExpBuilder::ToRegExp() {
|
|
FlushTerms();
|
|
intptr_t num_alternatives = alternatives_.length();
|
|
if (num_alternatives == 0) {
|
|
return RegExpEmpty::GetInstance();
|
|
}
|
|
if (num_alternatives == 1) {
|
|
return alternatives_.Last();
|
|
}
|
|
ZoneGrowableArray<RegExpTree*>* alternatives =
|
|
new (Z) ZoneGrowableArray<RegExpTree*>();
|
|
for (intptr_t i = 0; i < alternatives_.length(); i++) {
|
|
alternatives->Add(alternatives_[i]);
|
|
}
|
|
return new (Z) RegExpDisjunction(alternatives);
|
|
}
|
|
|
|
bool RegExpBuilder::AddQuantifierToAtom(
|
|
intptr_t min,
|
|
intptr_t max,
|
|
RegExpQuantifier::QuantifierType quantifier_type) {
|
|
if (pending_empty_) {
|
|
pending_empty_ = false;
|
|
return true;
|
|
}
|
|
RegExpTree* atom;
|
|
if (characters_ != nullptr) {
|
|
DEBUG_ASSERT(last_added_ == ADD_CHAR);
|
|
// Last atom was character.
|
|
|
|
ZoneGrowableArray<uint16_t>* char_vector =
|
|
new (Z) ZoneGrowableArray<uint16_t>();
|
|
char_vector->AddArray(*characters_);
|
|
intptr_t num_chars = char_vector->length();
|
|
if (num_chars > 1) {
|
|
ZoneGrowableArray<uint16_t>* prefix =
|
|
new (Z) ZoneGrowableArray<uint16_t>();
|
|
for (intptr_t i = 0; i < num_chars - 1; i++) {
|
|
prefix->Add(char_vector->At(i));
|
|
}
|
|
text_.Add(new (Z) RegExpAtom(prefix, flags_));
|
|
ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();
|
|
tail->Add(char_vector->At(num_chars - 1));
|
|
char_vector = tail;
|
|
}
|
|
characters_ = nullptr;
|
|
atom = new (Z) RegExpAtom(char_vector, flags_);
|
|
FlushText();
|
|
} else if (text_.length() > 0) {
|
|
DEBUG_ASSERT(last_added_ == ADD_ATOM);
|
|
atom = text_.RemoveLast();
|
|
FlushText();
|
|
} else if (terms_.length() > 0) {
|
|
DEBUG_ASSERT(last_added_ == ADD_ATOM);
|
|
atom = terms_.RemoveLast();
|
|
if (auto lookaround = atom->AsLookaround()) {
|
|
// With /u, lookarounds are not quantifiable.
|
|
if (is_unicode()) return false;
|
|
// Lookbehinds are not quantifiable.
|
|
if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
|
|
return false;
|
|
}
|
|
}
|
|
if (atom->max_match() == 0) {
|
|
// Guaranteed to only match an empty string.
|
|
LAST(ADD_TERM);
|
|
if (min == 0) {
|
|
return true;
|
|
}
|
|
terms_.Add(atom);
|
|
return true;
|
|
}
|
|
} else {
|
|
// Only call immediately after adding an atom or character!
|
|
UNREACHABLE();
|
|
}
|
|
terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
|
|
LAST(ADD_TERM);
|
|
return true;
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// Implementation of Parser
|
|
|
|
RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags)
|
|
: zone_(Thread::Current()->zone()),
|
|
captures_(nullptr),
|
|
named_captures_(nullptr),
|
|
named_back_references_(nullptr),
|
|
in_(in),
|
|
current_(kEndMarker),
|
|
next_pos_(0),
|
|
captures_started_(0),
|
|
capture_count_(0),
|
|
has_more_(true),
|
|
top_level_flags_(flags),
|
|
simple_(false),
|
|
contains_anchor_(false),
|
|
is_scanned_for_captures_(false),
|
|
has_named_captures_(false) {
|
|
Advance();
|
|
}
|
|
|
|
inline uint32_t RegExpParser::ReadNext(bool update_position) {
|
|
intptr_t position = next_pos_;
|
|
const uint16_t c0 = in().CharAt(position);
|
|
uint32_t c = c0;
|
|
position++;
|
|
if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) {
|
|
const uint16_t c1 = in().CharAt(position);
|
|
if (Utf16::IsTrailSurrogate(c1)) {
|
|
c = Utf16::Decode(c0, c1);
|
|
position++;
|
|
}
|
|
}
|
|
if (update_position) next_pos_ = position;
|
|
return c;
|
|
}
|
|
|
|
uint32_t RegExpParser::Next() {
|
|
if (has_next()) {
|
|
return ReadNext(false);
|
|
} else {
|
|
return kEndMarker;
|
|
}
|
|
}
|
|
|
|
void RegExpParser::Advance() {
|
|
if (has_next()) {
|
|
current_ = ReadNext(true);
|
|
} else {
|
|
current_ = kEndMarker;
|
|
// Advance so that position() points to 1 after the last character. This is
|
|
// important so that Reset() to this position works correctly.
|
|
next_pos_ = in().Length() + 1;
|
|
has_more_ = false;
|
|
}
|
|
}
|
|
|
|
void RegExpParser::Reset(intptr_t pos) {
|
|
next_pos_ = pos;
|
|
has_more_ = (pos < in().Length());
|
|
Advance();
|
|
}
|
|
|
|
void RegExpParser::Advance(intptr_t dist) {
|
|
next_pos_ += dist - 1;
|
|
Advance();
|
|
}
|
|
|
|
bool RegExpParser::simple() {
|
|
return simple_;
|
|
}
|
|
|
|
bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) {
|
|
switch (c) {
|
|
case '^':
|
|
case '$':
|
|
case '\\':
|
|
case '.':
|
|
case '*':
|
|
case '+':
|
|
case '?':
|
|
case '(':
|
|
case ')':
|
|
case '[':
|
|
case ']':
|
|
case '{':
|
|
case '}':
|
|
case '|':
|
|
case '/':
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void RegExpParser::ReportError(const char* message) {
|
|
// Zip to the end to make sure the no more input is read.
|
|
current_ = kEndMarker;
|
|
next_pos_ = in().Length();
|
|
|
|
// Throw a FormatException on parsing failures.
|
|
Array& args = Array::Handle();
|
|
String& str = String::Handle();
|
|
args ^= Array::New(3);
|
|
str ^= String::New(message);
|
|
args.SetAt(0, str);
|
|
args.SetAt(1, Symbols::Blank());
|
|
args.SetAt(2, in());
|
|
str ^= String::ConcatAll(args);
|
|
args ^= Array::New(1);
|
|
args.SetAt(0, str);
|
|
Exceptions::ThrowByType(Exceptions::kFormat, args);
|
|
UNREACHABLE();
|
|
}
|
|
|
|
// Pattern ::
|
|
// Disjunction
|
|
RegExpTree* RegExpParser::ParsePattern() {
|
|
RegExpTree* result = ParseDisjunction();
|
|
PatchNamedBackReferences();
|
|
ASSERT(!has_more());
|
|
// If the result of parsing is a literal string atom, and it has the
|
|
// same length as the input, then the atom is identical to the input.
|
|
if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
|
|
simple_ = true;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Used for error messages where we would have fallen back on treating an
|
|
// escape as the identity escape, but we are in Unicode mode.
|
|
static const char* kUnicodeIdentity =
|
|
"Invalid identity escape in Unicode pattern";
|
|
|
|
// Disjunction ::
|
|
// Alternative
|
|
// Alternative | Disjunction
|
|
// Alternative ::
|
|
// [empty]
|
|
// Term Alternative
|
|
// Term ::
|
|
// Assertion
|
|
// Atom
|
|
// Atom Quantifier
|
|
RegExpTree* RegExpParser::ParseDisjunction() {
|
|
// Used to store current state while parsing subexpressions.
|
|
RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
|
|
0, nullptr, top_level_flags_, Z);
|
|
RegExpParserState* stored_state = &initial_state;
|
|
// Cache the builder in a local variable for quick access.
|
|
RegExpBuilder* builder = initial_state.builder();
|
|
while (true) {
|
|
switch (current()) {
|
|
case kEndMarker:
|
|
if (stored_state->IsSubexpression()) {
|
|
// Inside a parenthesized group when hitting end of input.
|
|
ReportError("Unterminated group");
|
|
UNREACHABLE();
|
|
}
|
|
ASSERT(INITIAL == stored_state->group_type());
|
|
// Parsing completed successfully.
|
|
return builder->ToRegExp();
|
|
case ')': {
|
|
if (!stored_state->IsSubexpression()) {
|
|
ReportError("Unmatched ')'");
|
|
UNREACHABLE();
|
|
}
|
|
ASSERT(INITIAL != stored_state->group_type());
|
|
|
|
Advance();
|
|
// End disjunction parsing and convert builder content to new single
|
|
// regexp atom.
|
|
RegExpTree* body = builder->ToRegExp();
|
|
|
|
intptr_t end_capture_index = captures_started();
|
|
|
|
intptr_t capture_index = stored_state->capture_index();
|
|
SubexpressionType group_type = stored_state->group_type();
|
|
|
|
// Build result of subexpression.
|
|
if (group_type == CAPTURE) {
|
|
if (stored_state->IsNamedCapture()) {
|
|
CreateNamedCaptureAtIndex(stored_state->capture_name(),
|
|
capture_index);
|
|
}
|
|
RegExpCapture* capture = GetCapture(capture_index);
|
|
capture->set_body(body);
|
|
body = capture;
|
|
} else if (group_type != GROUPING) {
|
|
ASSERT(group_type == POSITIVE_LOOKAROUND ||
|
|
group_type == NEGATIVE_LOOKAROUND);
|
|
bool is_positive = (group_type == POSITIVE_LOOKAROUND);
|
|
body = new (Z) RegExpLookaround(
|
|
body, is_positive, end_capture_index - capture_index,
|
|
capture_index, stored_state->lookaround_type());
|
|
}
|
|
|
|
// Restore previous state.
|
|
stored_state = stored_state->previous_state();
|
|
builder = stored_state->builder();
|
|
|
|
builder->AddAtom(body);
|
|
// For compatibility with JSC and ES3, we allow quantifiers after
|
|
// lookaheads, and break in all cases.
|
|
break;
|
|
}
|
|
case '|': {
|
|
Advance();
|
|
builder->NewAlternative();
|
|
continue;
|
|
}
|
|
case '*':
|
|
case '+':
|
|
case '?':
|
|
ReportError("Nothing to repeat");
|
|
UNREACHABLE();
|
|
case '^': {
|
|
Advance();
|
|
if (builder->is_multi_line()) {
|
|
builder->AddAssertion(new (Z) RegExpAssertion(
|
|
RegExpAssertion::START_OF_LINE, builder->flags()));
|
|
} else {
|
|
builder->AddAssertion(new (Z) RegExpAssertion(
|
|
RegExpAssertion::START_OF_INPUT, builder->flags()));
|
|
set_contains_anchor();
|
|
}
|
|
continue;
|
|
}
|
|
case '$': {
|
|
Advance();
|
|
RegExpAssertion::AssertionType assertion_type =
|
|
builder->is_multi_line() ? RegExpAssertion::END_OF_LINE
|
|
: RegExpAssertion::END_OF_INPUT;
|
|
builder->AddAssertion(
|
|
new (Z) RegExpAssertion(assertion_type, builder->flags()));
|
|
continue;
|
|
}
|
|
case '.': {
|
|
Advance();
|
|
auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
|
|
if (builder->is_dot_all()) {
|
|
// Everything.
|
|
CharacterRange::AddClassEscape(
|
|
'*', ranges,
|
|
/*add_unicode_case_equivalents=*/false);
|
|
} else {
|
|
// everything except \x0a, \x0d, \u2028 and \u2029
|
|
CharacterRange::AddClassEscape(
|
|
'.', ranges,
|
|
/*add_unicode_case_equivalents=*/false);
|
|
}
|
|
RegExpCharacterClass* cc =
|
|
new (Z) RegExpCharacterClass(ranges, builder->flags());
|
|
builder->AddCharacterClass(cc);
|
|
break;
|
|
}
|
|
case '(': {
|
|
stored_state = ParseOpenParenthesis(stored_state);
|
|
builder = stored_state->builder();
|
|
continue;
|
|
}
|
|
case '[': {
|
|
RegExpTree* atom = ParseCharacterClass(builder);
|
|
builder->AddCharacterClass(atom->AsCharacterClass());
|
|
break;
|
|
}
|
|
// Atom ::
|
|
// \ AtomEscape
|
|
case '\\':
|
|
switch (Next()) {
|
|
case kEndMarker:
|
|
ReportError("\\ at end of pattern");
|
|
UNREACHABLE();
|
|
case 'b':
|
|
Advance(2);
|
|
builder->AddAssertion(new (Z) RegExpAssertion(
|
|
RegExpAssertion::BOUNDARY, builder->flags()));
|
|
continue;
|
|
case 'B':
|
|
Advance(2);
|
|
builder->AddAssertion(new (Z) RegExpAssertion(
|
|
RegExpAssertion::NON_BOUNDARY, builder->flags()));
|
|
continue;
|
|
// AtomEscape ::
|
|
// CharacterClassEscape
|
|
//
|
|
// CharacterClassEscape :: one of
|
|
// d D s S w W
|
|
case 'd':
|
|
case 'D':
|
|
case 's':
|
|
case 'S':
|
|
case 'w':
|
|
case 'W': {
|
|
uint32_t c = Next();
|
|
Advance(2);
|
|
auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
|
|
CharacterRange::AddClassEscape(
|
|
c, ranges, is_unicode() && builder->ignore_case());
|
|
RegExpCharacterClass* cc =
|
|
new (Z) RegExpCharacterClass(ranges, builder->flags());
|
|
builder->AddCharacterClass(cc);
|
|
break;
|
|
}
|
|
case 'p':
|
|
case 'P': {
|
|
uint32_t p = Next();
|
|
Advance(2);
|
|
|
|
if (is_unicode()) {
|
|
auto name_1 = new (Z) ZoneGrowableArray<char>();
|
|
auto name_2 = new (Z) ZoneGrowableArray<char>();
|
|
auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
|
|
if (ParsePropertyClassName(name_1, name_2)) {
|
|
if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
|
|
RegExpCharacterClass* cc =
|
|
new (Z) RegExpCharacterClass(ranges, builder->flags());
|
|
builder->AddCharacterClass(cc);
|
|
break;
|
|
}
|
|
}
|
|
ReportError("Invalid property name");
|
|
UNREACHABLE();
|
|
} else {
|
|
builder->AddCharacter(p);
|
|
}
|
|
break;
|
|
}
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
case '8':
|
|
case '9': {
|
|
intptr_t index = 0;
|
|
if (ParseBackReferenceIndex(&index)) {
|
|
if (stored_state->IsInsideCaptureGroup(index)) {
|
|
// The back reference is inside the capture group it refers to.
|
|
// Nothing can possibly have been captured yet, so we use empty
|
|
// instead. This ensures that, when checking a back reference,
|
|
// the capture registers of the referenced capture are either
|
|
// both set or both cleared.
|
|
builder->AddEmpty();
|
|
} else {
|
|
RegExpCapture* capture = GetCapture(index);
|
|
RegExpTree* atom =
|
|
new (Z) RegExpBackReference(capture, builder->flags());
|
|
builder->AddAtom(atom);
|
|
}
|
|
break;
|
|
}
|
|
// With /u, no identity escapes except for syntax characters are
|
|
// allowed. Otherwise, all identity escapes are allowed.
|
|
if (is_unicode()) {
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
uint32_t first_digit = Next();
|
|
if (first_digit == '8' || first_digit == '9') {
|
|
builder->AddCharacter(first_digit);
|
|
Advance(2);
|
|
break;
|
|
}
|
|
}
|
|
FALL_THROUGH;
|
|
case '0': {
|
|
Advance();
|
|
if (is_unicode() && Next() >= '0' && Next() <= '9') {
|
|
// With /u, decimal escape with leading 0 are not parsed as octal.
|
|
ReportError("Invalid decimal escape");
|
|
UNREACHABLE();
|
|
}
|
|
uint32_t octal = ParseOctalLiteral();
|
|
builder->AddCharacter(octal);
|
|
break;
|
|
}
|
|
// ControlEscape :: one of
|
|
// f n r t v
|
|
case 'f':
|
|
Advance(2);
|
|
builder->AddCharacter('\f');
|
|
break;
|
|
case 'n':
|
|
Advance(2);
|
|
builder->AddCharacter('\n');
|
|
break;
|
|
case 'r':
|
|
Advance(2);
|
|
builder->AddCharacter('\r');
|
|
break;
|
|
case 't':
|
|
Advance(2);
|
|
builder->AddCharacter('\t');
|
|
break;
|
|
case 'v':
|
|
Advance(2);
|
|
builder->AddCharacter('\v');
|
|
break;
|
|
case 'c': {
|
|
Advance();
|
|
uint32_t controlLetter = Next();
|
|
// Special case if it is an ASCII letter.
|
|
// Convert lower case letters to uppercase.
|
|
uint32_t letter = controlLetter & ~('a' ^ 'A');
|
|
if (letter < 'A' || 'Z' < letter) {
|
|
// controlLetter is not in range 'A'-'Z' or 'a'-'z'.
|
|
// This is outside the specification. We match JSC in
|
|
// reading the backslash as a literal character instead
|
|
// of as starting an escape.
|
|
if (is_unicode()) {
|
|
// With /u, invalid escapes are not treated as identity escapes.
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
builder->AddCharacter('\\');
|
|
} else {
|
|
Advance(2);
|
|
builder->AddCharacter(controlLetter & 0x1f);
|
|
}
|
|
break;
|
|
}
|
|
case 'x': {
|
|
Advance(2);
|
|
uint32_t value;
|
|
if (ParseHexEscape(2, &value)) {
|
|
builder->AddCharacter(value);
|
|
} else if (!is_unicode()) {
|
|
builder->AddCharacter('x');
|
|
} else {
|
|
// With /u, invalid escapes are not treated as identity escapes.
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
break;
|
|
}
|
|
case 'u': {
|
|
Advance(2);
|
|
uint32_t value;
|
|
if (ParseUnicodeEscape(&value)) {
|
|
builder->AddEscapedUnicodeCharacter(value);
|
|
} else if (!is_unicode()) {
|
|
builder->AddCharacter('u');
|
|
} else {
|
|
// With /u, invalid escapes are not treated as identity escapes.
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
break;
|
|
}
|
|
case 'k':
|
|
// Either an identity escape or a named back-reference. The two
|
|
// interpretations are mutually exclusive: '\k' is interpreted as
|
|
// an identity escape for non-Unicode patterns without named
|
|
// capture groups, and as the beginning of a named back-reference
|
|
// in all other cases.
|
|
if (is_unicode() || HasNamedCaptures()) {
|
|
Advance(2);
|
|
ParseNamedBackReference(builder, stored_state);
|
|
break;
|
|
}
|
|
FALL_THROUGH;
|
|
default:
|
|
Advance();
|
|
// With the unicode flag, no identity escapes except for syntax
|
|
// characters are allowed. Otherwise, all identity escapes are
|
|
// allowed.
|
|
if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) {
|
|
builder->AddCharacter(current());
|
|
Advance();
|
|
} else {
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
break;
|
|
}
|
|
break;
|
|
case '{': {
|
|
intptr_t dummy;
|
|
if (ParseIntervalQuantifier(&dummy, &dummy)) {
|
|
ReportError("Nothing to repeat");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
FALL_THROUGH;
|
|
case '}':
|
|
case ']':
|
|
if (is_unicode()) {
|
|
ReportError("Lone quantifier brackets");
|
|
UNREACHABLE();
|
|
}
|
|
FALL_THROUGH;
|
|
default:
|
|
builder->AddUnicodeCharacter(current());
|
|
Advance();
|
|
break;
|
|
} // end switch(current())
|
|
|
|
intptr_t min;
|
|
intptr_t max;
|
|
switch (current()) {
|
|
// QuantifierPrefix ::
|
|
// *
|
|
// +
|
|
// ?
|
|
// {
|
|
case '*':
|
|
min = 0;
|
|
max = RegExpTree::kInfinity;
|
|
Advance();
|
|
break;
|
|
case '+':
|
|
min = 1;
|
|
max = RegExpTree::kInfinity;
|
|
Advance();
|
|
break;
|
|
case '?':
|
|
min = 0;
|
|
max = 1;
|
|
Advance();
|
|
break;
|
|
case '{':
|
|
if (ParseIntervalQuantifier(&min, &max)) {
|
|
if (max < min) {
|
|
ReportError("numbers out of order in {} quantifier.");
|
|
UNREACHABLE();
|
|
}
|
|
break;
|
|
} else {
|
|
continue;
|
|
}
|
|
default:
|
|
continue;
|
|
}
|
|
RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
|
|
if (current() == '?') {
|
|
quantifier_type = RegExpQuantifier::NON_GREEDY;
|
|
Advance();
|
|
} else if (FLAG_regexp_possessive_quantifier && current() == '+') {
|
|
// FLAG_regexp_possessive_quantifier is a debug-only flag.
|
|
quantifier_type = RegExpQuantifier::POSSESSIVE;
|
|
Advance();
|
|
}
|
|
if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
|
|
ReportError("invalid quantifier.");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
// Currently only used in an ASSERT.
|
|
static bool IsSpecialClassEscape(uint32_t c) {
|
|
switch (c) {
|
|
case 'd':
|
|
case 'D':
|
|
case 's':
|
|
case 'S':
|
|
case 'w':
|
|
case 'W':
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
|
|
RegExpParserState* state) {
|
|
RegExpLookaround::Type lookaround_type = state->lookaround_type();
|
|
bool is_named_capture = false;
|
|
const RegExpCaptureName* capture_name = nullptr;
|
|
SubexpressionType subexpr_type = CAPTURE;
|
|
Advance();
|
|
if (current() == '?') {
|
|
switch (Next()) {
|
|
case ':':
|
|
Advance(2);
|
|
subexpr_type = GROUPING;
|
|
break;
|
|
case '=':
|
|
Advance(2);
|
|
lookaround_type = RegExpLookaround::LOOKAHEAD;
|
|
subexpr_type = POSITIVE_LOOKAROUND;
|
|
break;
|
|
case '!':
|
|
Advance(2);
|
|
lookaround_type = RegExpLookaround::LOOKAHEAD;
|
|
subexpr_type = NEGATIVE_LOOKAROUND;
|
|
break;
|
|
case '<':
|
|
Advance();
|
|
if (Next() == '=') {
|
|
Advance(2);
|
|
lookaround_type = RegExpLookaround::LOOKBEHIND;
|
|
subexpr_type = POSITIVE_LOOKAROUND;
|
|
break;
|
|
} else if (Next() == '!') {
|
|
Advance(2);
|
|
lookaround_type = RegExpLookaround::LOOKBEHIND;
|
|
subexpr_type = NEGATIVE_LOOKAROUND;
|
|
break;
|
|
}
|
|
is_named_capture = true;
|
|
has_named_captures_ = true;
|
|
Advance();
|
|
break;
|
|
default:
|
|
ReportError("Invalid group");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
|
|
if (subexpr_type == CAPTURE) {
|
|
if (captures_started_ >= kMaxCaptures) {
|
|
ReportError("Too many captures");
|
|
UNREACHABLE();
|
|
}
|
|
captures_started_++;
|
|
|
|
if (is_named_capture) {
|
|
capture_name = ParseCaptureGroupName();
|
|
}
|
|
}
|
|
// Store current state and begin new disjunction parsing.
|
|
return new (Z)
|
|
RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
|
|
capture_name, state->builder()->flags(), Z);
|
|
}
|
|
|
|
// In order to know whether an escape is a backreference or not we have to scan
|
|
// the entire regexp and find the number of capturing parentheses. However we
|
|
// don't want to scan the regexp twice unless it is necessary. This mini-parser
|
|
// is called when needed. It can see the difference between capturing and
|
|
// noncapturing parentheses and can skip character classes and backslash-escaped
|
|
// characters.
|
|
void RegExpParser::ScanForCaptures() {
|
|
ASSERT(!is_scanned_for_captures_);
|
|
const intptr_t saved_position = position();
|
|
// Start with captures started previous to current position
|
|
intptr_t capture_count = captures_started();
|
|
// Add count of captures after this position.
|
|
uintptr_t n;
|
|
while ((n = current()) != kEndMarker) {
|
|
Advance();
|
|
switch (n) {
|
|
case '\\':
|
|
Advance();
|
|
break;
|
|
case '[': {
|
|
uintptr_t c;
|
|
while ((c = current()) != kEndMarker) {
|
|
Advance();
|
|
if (c == '\\') {
|
|
Advance();
|
|
} else {
|
|
if (c == ']') break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case '(':
|
|
// At this point we could be in
|
|
// * a non-capturing group '(:',
|
|
// * a lookbehind assertion '(?<=' '(?<!'
|
|
// * or a named capture '(?<'.
|
|
//
|
|
// Of these, only named captures are capturing groups.
|
|
if (current() == '?') {
|
|
Advance();
|
|
if (current() != '<') break;
|
|
|
|
Advance();
|
|
if (current() == '=' || current() == '!') break;
|
|
|
|
// Found a possible named capture. It could turn out to be a syntax
|
|
// error (e.g. an unterminated or invalid name), but that distinction
|
|
// does not matter for our purposes.
|
|
has_named_captures_ = true;
|
|
}
|
|
capture_count++;
|
|
break;
|
|
}
|
|
}
|
|
capture_count_ = capture_count;
|
|
is_scanned_for_captures_ = true;
|
|
Reset(saved_position);
|
|
}
|
|
|
|
bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
|
|
ASSERT('\\' == current());
|
|
ASSERT('1' <= Next() && Next() <= '9');
|
|
// Try to parse a decimal literal that is no greater than the total number
|
|
// of left capturing parentheses in the input.
|
|
intptr_t start = position();
|
|
intptr_t value = Next() - '0';
|
|
Advance(2);
|
|
while (true) {
|
|
uint32_t c = current();
|
|
if (Utils::IsDecimalDigit(c)) {
|
|
value = 10 * value + (c - '0');
|
|
if (value > kMaxCaptures) {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
Advance();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
if (value > captures_started()) {
|
|
if (!is_scanned_for_captures_) ScanForCaptures();
|
|
if (value > capture_count_) {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
}
|
|
*index_out = value;
|
|
return true;
|
|
}
|
|
|
|
namespace {
|
|
|
|
static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) {
|
|
return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$';
|
|
}
|
|
|
|
// ES#sec-names-and-keywords Names and Keywords
|
|
// UnicodeIDStart, '$', '_' and '\'
|
|
static bool IsIdentifierStartSlow(uint32_t c) {
|
|
// cannot use u_isIDStart because it does not work for
|
|
// Other_ID_Start characters.
|
|
return u_hasBinaryProperty(c, UCHAR_ID_START) ||
|
|
(c < 0x60 && (c == '$' || c == '\\' || c == '_'));
|
|
}
|
|
|
|
// ES#sec-names-and-keywords Names and Keywords
|
|
// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
|
|
static bool IsIdentifierPartSlow(uint32_t c) {
|
|
const uint32_t kZeroWidthNonJoiner = 0x200C;
|
|
const uint32_t kZeroWidthJoiner = 0x200D;
|
|
// Can't use u_isIDPart because it does not work for
|
|
// Other_ID_Continue characters.
|
|
return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) ||
|
|
(c < 0x60 && (c == '$' || c == '\\' || c == '_')) ||
|
|
c == kZeroWidthNonJoiner || c == kZeroWidthJoiner;
|
|
}
|
|
|
|
static inline bool IsIdentifierStart(uint32_t c) {
|
|
if (c > 127) return IsIdentifierStartSlow(c);
|
|
return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c);
|
|
}
|
|
|
|
static inline bool IsIdentifierPart(uint32_t c) {
|
|
if (c > 127) return IsIdentifierPartSlow(c);
|
|
return IsAsciiIdentifierPart(c);
|
|
}
|
|
|
|
static bool IsSameName(const RegExpCaptureName* name1,
|
|
const RegExpCaptureName* name2) {
|
|
if (name1->length() != name2->length()) return false;
|
|
for (intptr_t i = 0; i < name1->length(); i++) {
|
|
if (name1->At(i) != name2->At(i)) return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // end namespace
|
|
|
|
static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) {
|
|
if (code_unit <= Utf16::kMaxCodeUnit) {
|
|
v->Add(code_unit);
|
|
} else {
|
|
uint16_t units[2];
|
|
Utf16::Encode(code_unit, units);
|
|
v->Add(units[0]);
|
|
v->Add(units[1]);
|
|
}
|
|
}
|
|
|
|
const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() {
|
|
auto name = new (Z) RegExpCaptureName();
|
|
|
|
bool at_start = true;
|
|
while (true) {
|
|
uint32_t c = current();
|
|
Advance();
|
|
|
|
// Convert unicode escapes.
|
|
if (c == '\\' && current() == 'u') {
|
|
Advance();
|
|
if (!ParseUnicodeEscape(&c)) {
|
|
ReportError("Invalid Unicode escape sequence");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
|
|
// The backslash char is misclassified as both ID_Start and ID_Continue.
|
|
if (c == '\\') {
|
|
ReportError("Invalid capture group name");
|
|
UNREACHABLE();
|
|
}
|
|
|
|
if (at_start) {
|
|
if (!IsIdentifierStart(c)) {
|
|
ReportError("Invalid capture group name");
|
|
UNREACHABLE();
|
|
}
|
|
PushCodeUnit(name, c);
|
|
at_start = false;
|
|
} else {
|
|
if (c == '>') {
|
|
break;
|
|
} else if (IsIdentifierPart(c)) {
|
|
PushCodeUnit(name, c);
|
|
} else {
|
|
ReportError("Invalid capture group name");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
}
|
|
|
|
return name;
|
|
}
|
|
|
|
intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) {
|
|
for (const auto& capture : *named_captures_) {
|
|
if (IsSameName(name, capture->name())) return capture->index();
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name,
|
|
intptr_t index) {
|
|
ASSERT(0 < index && index <= captures_started_);
|
|
ASSERT(name != nullptr);
|
|
|
|
if (named_captures_ == nullptr) {
|
|
named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(1);
|
|
} else {
|
|
// Check for duplicates and bail if we find any. Currently O(n^2).
|
|
if (GetNamedCaptureIndex(name) >= 0) {
|
|
ReportError("Duplicate capture group name");
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
|
|
RegExpCapture* capture = GetCapture(index);
|
|
ASSERT(capture->name() == nullptr);
|
|
|
|
capture->set_name(name);
|
|
named_captures_->Add(capture);
|
|
}
|
|
|
|
bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
|
|
RegExpParserState* state) {
|
|
// The parser is assumed to be on the '<' in \k<name>.
|
|
if (current() != '<') {
|
|
ReportError("Invalid named reference");
|
|
UNREACHABLE();
|
|
}
|
|
|
|
Advance();
|
|
const RegExpCaptureName* name = ParseCaptureGroupName();
|
|
if (name == nullptr) {
|
|
return false;
|
|
}
|
|
|
|
if (state->IsInsideCaptureGroup(name)) {
|
|
builder->AddEmpty();
|
|
} else {
|
|
RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags());
|
|
atom->set_name(name);
|
|
|
|
builder->AddAtom(atom);
|
|
|
|
if (named_back_references_ == nullptr) {
|
|
named_back_references_ =
|
|
new (Z) ZoneGrowableArray<RegExpBackReference*>(1);
|
|
}
|
|
named_back_references_->Add(atom);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void RegExpParser::PatchNamedBackReferences() {
|
|
if (named_back_references_ == nullptr) return;
|
|
|
|
if (named_captures_ == nullptr) {
|
|
ReportError("Invalid named capture referenced");
|
|
return;
|
|
}
|
|
|
|
// Look up and patch the actual capture for each named back reference.
|
|
// Currently O(n^2), optimize if necessary.
|
|
for (intptr_t i = 0; i < named_back_references_->length(); i++) {
|
|
RegExpBackReference* ref = named_back_references_->At(i);
|
|
intptr_t index = GetNamedCaptureIndex(ref->name());
|
|
|
|
if (index < 0) {
|
|
ReportError("Invalid named capture referenced");
|
|
UNREACHABLE();
|
|
}
|
|
ref->set_capture(GetCapture(index));
|
|
}
|
|
}
|
|
|
|
RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
|
|
// The index for the capture groups are one-based. Its index in the list is
|
|
// zero-based.
|
|
const intptr_t know_captures =
|
|
is_scanned_for_captures_ ? capture_count_ : captures_started_;
|
|
ASSERT(index <= know_captures);
|
|
if (captures_ == nullptr) {
|
|
captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
|
|
}
|
|
while (captures_->length() < know_captures) {
|
|
captures_->Add(new (Z) RegExpCapture(captures_->length() + 1));
|
|
}
|
|
return captures_->At(index - 1);
|
|
}
|
|
|
|
ArrayPtr RegExpParser::CreateCaptureNameMap() {
|
|
if (named_captures_ == nullptr || named_captures_->is_empty()) {
|
|
return Array::null();
|
|
}
|
|
|
|
const intptr_t len = named_captures_->length() * 2;
|
|
|
|
const Array& array = Array::Handle(Array::New(len));
|
|
|
|
auto& name = String::Handle();
|
|
auto& smi = Smi::Handle();
|
|
for (intptr_t i = 0; i < named_captures_->length(); i++) {
|
|
RegExpCapture* capture = named_captures_->At(i);
|
|
name =
|
|
String::FromUTF16(capture->name()->data(), capture->name()->length());
|
|
smi = Smi::New(capture->index());
|
|
array.SetAt(i * 2, name);
|
|
array.SetAt(i * 2 + 1, smi);
|
|
}
|
|
|
|
return array.ptr();
|
|
}
|
|
|
|
bool RegExpParser::HasNamedCaptures() {
|
|
if (has_named_captures_ || is_scanned_for_captures_) {
|
|
return has_named_captures_;
|
|
}
|
|
|
|
ScanForCaptures();
|
|
ASSERT(is_scanned_for_captures_);
|
|
return has_named_captures_;
|
|
}
|
|
|
|
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
|
|
for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
|
|
if (s->group_type() != CAPTURE) continue;
|
|
// Return true if we found the matching capture index.
|
|
if (index == s->capture_index()) return true;
|
|
// Abort if index is larger than what has been parsed up till this state.
|
|
if (index > s->capture_index()) return false;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
|
|
const RegExpCaptureName* name) {
|
|
ASSERT(name != nullptr);
|
|
for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
|
|
if (s->capture_name() == nullptr) continue;
|
|
if (IsSameName(s->capture_name(), name)) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// QuantifierPrefix ::
|
|
// { DecimalDigits }
|
|
// { DecimalDigits , }
|
|
// { DecimalDigits , DecimalDigits }
|
|
//
|
|
// Returns true if parsing succeeds, and set the min_out and max_out
|
|
// values. Values are truncated to RegExpTree::kInfinity if they overflow.
|
|
bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,
|
|
intptr_t* max_out) {
|
|
ASSERT(current() == '{');
|
|
intptr_t start = position();
|
|
Advance();
|
|
intptr_t min = 0;
|
|
if (!Utils::IsDecimalDigit(current())) {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
while (Utils::IsDecimalDigit(current())) {
|
|
intptr_t next = current() - '0';
|
|
if (min > (RegExpTree::kInfinity - next) / 10) {
|
|
// Overflow. Skip past remaining decimal digits and return -1.
|
|
do {
|
|
Advance();
|
|
} while (Utils::IsDecimalDigit(current()));
|
|
min = RegExpTree::kInfinity;
|
|
break;
|
|
}
|
|
min = 10 * min + next;
|
|
Advance();
|
|
}
|
|
intptr_t max = 0;
|
|
if (current() == '}') {
|
|
max = min;
|
|
Advance();
|
|
} else if (current() == ',') {
|
|
Advance();
|
|
if (current() == '}') {
|
|
max = RegExpTree::kInfinity;
|
|
Advance();
|
|
} else {
|
|
while (Utils::IsDecimalDigit(current())) {
|
|
intptr_t next = current() - '0';
|
|
if (max > (RegExpTree::kInfinity - next) / 10) {
|
|
do {
|
|
Advance();
|
|
} while (Utils::IsDecimalDigit(current()));
|
|
max = RegExpTree::kInfinity;
|
|
break;
|
|
}
|
|
max = 10 * max + next;
|
|
Advance();
|
|
}
|
|
if (current() != '}') {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
Advance();
|
|
}
|
|
} else {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
*min_out = min;
|
|
*max_out = max;
|
|
return true;
|
|
}
|
|
|
|
uint32_t RegExpParser::ParseOctalLiteral() {
|
|
ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
|
|
// For compatibility with some other browsers (not all), we parse
|
|
// up to three octal digits with a value below 256.
|
|
uint32_t value = current() - '0';
|
|
Advance();
|
|
if ('0' <= current() && current() <= '7') {
|
|
value = value * 8 + current() - '0';
|
|
Advance();
|
|
if (value < 32 && '0' <= current() && current() <= '7') {
|
|
value = value * 8 + current() - '0';
|
|
Advance();
|
|
}
|
|
}
|
|
return value;
|
|
}
|
|
|
|
// Returns the value (0 .. 15) of a hexadecimal character c.
|
|
// If c is not a legal hexadecimal character, returns a value < 0.
|
|
static inline intptr_t HexValue(uint32_t c) {
|
|
c -= '0';
|
|
if (static_cast<unsigned>(c) <= 9) return c;
|
|
c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
|
|
if (static_cast<unsigned>(c) <= 5) return c + 10;
|
|
return -1;
|
|
}
|
|
|
|
bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
|
|
intptr_t start = position();
|
|
uint32_t val = 0;
|
|
bool done = false;
|
|
for (intptr_t i = 0; !done; i++) {
|
|
uint32_t c = current();
|
|
intptr_t d = HexValue(c);
|
|
if (d < 0) {
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
val = val * 16 + d;
|
|
Advance();
|
|
if (i == length - 1) {
|
|
done = true;
|
|
}
|
|
}
|
|
*value = val;
|
|
return true;
|
|
}
|
|
|
|
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
|
|
bool RegExpParser::ParseUnicodeEscape(uint32_t* value) {
|
|
// Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
|
|
// allowed). In the latter case, the number of hex digits between { } is
|
|
// arbitrary. \ and u have already been read.
|
|
if (current() == '{' && is_unicode()) {
|
|
int start = position();
|
|
Advance();
|
|
if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) {
|
|
if (current() == '}') {
|
|
Advance();
|
|
return true;
|
|
}
|
|
}
|
|
Reset(start);
|
|
return false;
|
|
}
|
|
// \u but no {, or \u{...} escapes not allowed.
|
|
bool result = ParseHexEscape(4, value);
|
|
if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) &&
|
|
current() == '\\') {
|
|
// Attempt to read trail surrogate.
|
|
int start = position();
|
|
if (Next() == 'u') {
|
|
Advance(2);
|
|
uint32_t trail;
|
|
if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) {
|
|
*value = Utf16::Decode(static_cast<uint16_t>(*value),
|
|
static_cast<uint16_t>(trail));
|
|
return true;
|
|
}
|
|
}
|
|
Reset(start);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
namespace {
|
|
|
|
bool IsExactPropertyAlias(const char* property_name, UProperty property) {
|
|
const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
|
|
if (short_name != nullptr && strcmp(property_name, short_name) == 0) {
|
|
return true;
|
|
}
|
|
for (int i = 0;; i++) {
|
|
const char* long_name = u_getPropertyName(
|
|
property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
|
|
if (long_name == nullptr) break;
|
|
if (strcmp(property_name, long_name) == 0) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool IsExactPropertyValueAlias(const char* property_value_name,
|
|
UProperty property,
|
|
int32_t property_value) {
|
|
const char* short_name =
|
|
u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
|
|
if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
|
|
return true;
|
|
}
|
|
for (int i = 0;; i++) {
|
|
const char* long_name = u_getPropertyValueName(
|
|
property, property_value,
|
|
static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
|
|
if (long_name == nullptr) break;
|
|
if (strcmp(property_value_name, long_name) == 0) return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool LookupPropertyValueName(UProperty property,
|
|
const char* property_value_name,
|
|
bool negate,
|
|
ZoneGrowableArray<CharacterRange>* result) {
|
|
UProperty property_for_lookup = property;
|
|
if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
|
|
// For the property Script_Extensions, we have to do the property value
|
|
// name lookup as if the property is Script.
|
|
property_for_lookup = UCHAR_SCRIPT;
|
|
}
|
|
int32_t property_value =
|
|
u_getPropertyValueEnum(property_for_lookup, property_value_name);
|
|
if (property_value == UCHAR_INVALID_CODE) return false;
|
|
|
|
// We require the property name to match exactly to one of the property value
|
|
// aliases. However, u_getPropertyValueEnum uses loose matching.
|
|
if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
|
|
property_value)) {
|
|
return false;
|
|
}
|
|
|
|
UErrorCode ec = U_ZERO_ERROR;
|
|
icu::UnicodeSet set;
|
|
set.applyIntPropertyValue(property, property_value, ec);
|
|
bool success = ec == U_ZERO_ERROR && (set.isEmpty() == 0);
|
|
|
|
if (success) {
|
|
set.removeAllStrings();
|
|
if (negate) set.complement();
|
|
for (int i = 0; i < set.getRangeCount(); i++) {
|
|
result->Add(
|
|
CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
|
|
}
|
|
}
|
|
return success;
|
|
}
|
|
|
|
template <size_t N>
|
|
inline bool NameEquals(const char* name, const char (&literal)[N]) {
|
|
return strncmp(name, literal, N + 1) == 0;
|
|
}
|
|
|
|
bool LookupSpecialPropertyValueName(const char* name,
|
|
ZoneGrowableArray<CharacterRange>* result,
|
|
bool negate) {
|
|
if (NameEquals(name, "Any")) {
|
|
if (negate) {
|
|
// Leave the list of character ranges empty, since the negation of 'Any'
|
|
// is the empty set.
|
|
} else {
|
|
result->Add(CharacterRange::Everything());
|
|
}
|
|
} else if (NameEquals(name, "ASCII")) {
|
|
result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint)
|
|
: CharacterRange::Range(0x0, 0x7F));
|
|
} else if (NameEquals(name, "Assigned")) {
|
|
return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
|
|
!negate, result);
|
|
} else {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Explicitly list supported binary properties. The spec forbids supporting
|
|
// properties outside of this set to ensure interoperability.
|
|
bool IsSupportedBinaryProperty(UProperty property) {
|
|
switch (property) {
|
|
case UCHAR_ALPHABETIC:
|
|
// 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
|
|
// 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
|
|
case UCHAR_ASCII_HEX_DIGIT:
|
|
// 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
|
|
case UCHAR_BIDI_CONTROL:
|
|
case UCHAR_BIDI_MIRRORED:
|
|
case UCHAR_CASE_IGNORABLE:
|
|
case UCHAR_CASED:
|
|
case UCHAR_CHANGES_WHEN_CASEFOLDED:
|
|
case UCHAR_CHANGES_WHEN_CASEMAPPED:
|
|
case UCHAR_CHANGES_WHEN_LOWERCASED:
|
|
case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
|
|
case UCHAR_CHANGES_WHEN_TITLECASED:
|
|
case UCHAR_CHANGES_WHEN_UPPERCASED:
|
|
case UCHAR_DASH:
|
|
case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
|
|
case UCHAR_DEPRECATED:
|
|
case UCHAR_DIACRITIC:
|
|
case UCHAR_EMOJI:
|
|
case UCHAR_EMOJI_COMPONENT:
|
|
case UCHAR_EMOJI_MODIFIER_BASE:
|
|
case UCHAR_EMOJI_MODIFIER:
|
|
case UCHAR_EMOJI_PRESENTATION:
|
|
case UCHAR_EXTENDED_PICTOGRAPHIC:
|
|
case UCHAR_EXTENDER:
|
|
case UCHAR_GRAPHEME_BASE:
|
|
case UCHAR_GRAPHEME_EXTEND:
|
|
case UCHAR_HEX_DIGIT:
|
|
case UCHAR_ID_CONTINUE:
|
|
case UCHAR_ID_START:
|
|
case UCHAR_IDEOGRAPHIC:
|
|
case UCHAR_IDS_BINARY_OPERATOR:
|
|
case UCHAR_IDS_TRINARY_OPERATOR:
|
|
case UCHAR_JOIN_CONTROL:
|
|
case UCHAR_LOGICAL_ORDER_EXCEPTION:
|
|
case UCHAR_LOWERCASE:
|
|
case UCHAR_MATH:
|
|
case UCHAR_NONCHARACTER_CODE_POINT:
|
|
case UCHAR_PATTERN_SYNTAX:
|
|
case UCHAR_PATTERN_WHITE_SPACE:
|
|
case UCHAR_QUOTATION_MARK:
|
|
case UCHAR_RADICAL:
|
|
case UCHAR_REGIONAL_INDICATOR:
|
|
case UCHAR_S_TERM:
|
|
case UCHAR_SOFT_DOTTED:
|
|
case UCHAR_TERMINAL_PUNCTUATION:
|
|
case UCHAR_UNIFIED_IDEOGRAPH:
|
|
case UCHAR_UPPERCASE:
|
|
case UCHAR_VARIATION_SELECTOR:
|
|
case UCHAR_WHITE_SPACE:
|
|
case UCHAR_XID_CONTINUE:
|
|
case UCHAR_XID_START:
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool IsUnicodePropertyValueCharacter(char c) {
|
|
// https://tc39.github.io/proposal-regexp-unicode-property-escapes/
|
|
//
|
|
// Note that using this to validate each parsed char is quite conservative.
|
|
// A possible alternative solution would be to only ensure the parsed
|
|
// property name/value candidate string does not contain '\0' characters and
|
|
// let ICU lookups trigger the final failure.
|
|
if (Utils::IsAlphaNumeric(c)) return true;
|
|
return (c == '_');
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
|
|
ZoneGrowableArray<char>* name_2) {
|
|
ASSERT(name_1->is_empty());
|
|
ASSERT(name_2->is_empty());
|
|
// Parse the property class as follows:
|
|
// - In \p{name}, 'name' is interpreted
|
|
// - either as a general category property value name.
|
|
// - or as a binary property name.
|
|
// - In \p{name=value}, 'name' is interpreted as an enumerated property name,
|
|
// and 'value' is interpreted as one of the available property value names.
|
|
// - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
|
|
// - Loose matching is not applied.
|
|
if (current() == '{') {
|
|
// Parse \p{[PropertyName=]PropertyNameValue}
|
|
for (Advance(); current() != '}' && current() != '='; Advance()) {
|
|
if (!IsUnicodePropertyValueCharacter(current())) return false;
|
|
if (!has_next()) return false;
|
|
name_1->Add(static_cast<char>(current()));
|
|
}
|
|
if (current() == '=') {
|
|
for (Advance(); current() != '}'; Advance()) {
|
|
if (!IsUnicodePropertyValueCharacter(current())) return false;
|
|
if (!has_next()) return false;
|
|
name_2->Add(static_cast<char>(current()));
|
|
}
|
|
name_2->Add(0); // null-terminate string.
|
|
}
|
|
} else {
|
|
return false;
|
|
}
|
|
Advance();
|
|
name_1->Add(0); // null-terminate string.
|
|
|
|
ASSERT(static_cast<size_t>(name_1->length() - 1) == strlen(name_1->data()));
|
|
ASSERT(name_2->is_empty() ||
|
|
static_cast<size_t>(name_2->length() - 1) == strlen(name_2->data()));
|
|
return true;
|
|
}
|
|
|
|
bool RegExpParser::AddPropertyClassRange(
|
|
ZoneGrowableArray<CharacterRange>* add_to,
|
|
bool negate,
|
|
ZoneGrowableArray<char>* name_1,
|
|
ZoneGrowableArray<char>* name_2) {
|
|
ASSERT(name_1->At(name_1->length() - 1) == '\0');
|
|
ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0');
|
|
if (name_2->is_empty()) {
|
|
// First attempt to interpret as general category property value name.
|
|
const char* name = name_1->data();
|
|
if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
|
|
add_to)) {
|
|
return true;
|
|
}
|
|
// Interpret "Any", "ASCII", and "Assigned".
|
|
if (LookupSpecialPropertyValueName(name, add_to, negate)) {
|
|
return true;
|
|
}
|
|
// Then attempt to interpret as binary property name with value name 'Y'.
|
|
UProperty property = u_getPropertyEnum(name);
|
|
if (!IsSupportedBinaryProperty(property)) return false;
|
|
if (!IsExactPropertyAlias(name, property)) return false;
|
|
return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to);
|
|
} else {
|
|
// Both property name and value name are specified. Attempt to interpret
|
|
// the property name as enumerated property.
|
|
const char* property_name = name_1->data();
|
|
const char* value_name = name_2->data();
|
|
UProperty property = u_getPropertyEnum(property_name);
|
|
if (!IsExactPropertyAlias(property_name, property)) return false;
|
|
if (property == UCHAR_GENERAL_CATEGORY) {
|
|
// We want to allow aggregate value names such as "Letter".
|
|
property = UCHAR_GENERAL_CATEGORY_MASK;
|
|
} else if (property != UCHAR_SCRIPT &&
|
|
property != UCHAR_SCRIPT_EXTENSIONS) {
|
|
return false;
|
|
}
|
|
return LookupPropertyValueName(property, value_name, negate, add_to);
|
|
}
|
|
}
|
|
|
|
bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value,
|
|
uint32_t* value) {
|
|
uint32_t x = 0;
|
|
int d = HexValue(current());
|
|
if (d < 0) {
|
|
return false;
|
|
}
|
|
while (d >= 0) {
|
|
x = x * 16 + d;
|
|
if (x > max_value) {
|
|
return false;
|
|
}
|
|
Advance();
|
|
d = HexValue(current());
|
|
}
|
|
*value = x;
|
|
return true;
|
|
}
|
|
|
|
uint32_t RegExpParser::ParseClassCharacterEscape() {
|
|
ASSERT(current() == '\\');
|
|
DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
|
|
Advance();
|
|
switch (current()) {
|
|
case 'b':
|
|
Advance();
|
|
return '\b';
|
|
// ControlEscape :: one of
|
|
// f n r t v
|
|
case 'f':
|
|
Advance();
|
|
return '\f';
|
|
case 'n':
|
|
Advance();
|
|
return '\n';
|
|
case 'r':
|
|
Advance();
|
|
return '\r';
|
|
case 't':
|
|
Advance();
|
|
return '\t';
|
|
case 'v':
|
|
Advance();
|
|
return '\v';
|
|
case 'c': {
|
|
uint32_t controlLetter = Next();
|
|
uint32_t letter = controlLetter & ~('A' ^ 'a');
|
|
// For compatibility with JSC, inside a character class
|
|
// we also accept digits and underscore as control characters.
|
|
if (letter >= 'A' && letter <= 'Z') {
|
|
Advance(2);
|
|
// Control letters mapped to ASCII control characters in the range
|
|
// 0x00-0x1f.
|
|
return controlLetter & 0x1f;
|
|
}
|
|
if (is_unicode()) {
|
|
// With /u, \c# or \c_ are invalid.
|
|
ReportError("Invalid class escape");
|
|
UNREACHABLE();
|
|
}
|
|
if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') {
|
|
Advance(2);
|
|
return controlLetter & 0x1f;
|
|
}
|
|
// We match JSC in reading the backslash as a literal
|
|
// character instead of as starting an escape.
|
|
return '\\';
|
|
}
|
|
case '0':
|
|
// With /u, \0 is interpreted as NUL if not followed by another digit.
|
|
if (is_unicode() && !(Next() >= '0' && Next() <= '9')) {
|
|
Advance();
|
|
return 0;
|
|
}
|
|
FALL_THROUGH;
|
|
case '1':
|
|
case '2':
|
|
case '3':
|
|
case '4':
|
|
case '5':
|
|
case '6':
|
|
case '7':
|
|
// For compatibility, we interpret a decimal escape that isn't
|
|
// a back reference (and therefore either \0 or not valid according
|
|
// to the specification) as a 1..3 digit octal character code.
|
|
if (is_unicode()) {
|
|
// With \u, decimal escape is not interpreted as octal character code.
|
|
ReportError("Invalid class escape");
|
|
UNREACHABLE();
|
|
}
|
|
return ParseOctalLiteral();
|
|
case 'x': {
|
|
Advance();
|
|
uint32_t value;
|
|
if (ParseHexEscape(2, &value)) {
|
|
return value;
|
|
}
|
|
if (is_unicode()) {
|
|
// With \u, invalid escapes are not treated as identity escapes.
|
|
ReportError("Invalid escape");
|
|
UNREACHABLE();
|
|
}
|
|
// If \x is not followed by a two-digit hexadecimal, treat it
|
|
// as an identity escape.
|
|
return 'x';
|
|
}
|
|
case 'u': {
|
|
Advance();
|
|
uint32_t value;
|
|
if (ParseUnicodeEscape(&value)) {
|
|
return value;
|
|
}
|
|
if (is_unicode()) {
|
|
// With \u, invalid escapes are not treated as identity escapes.
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
// If \u is not followed by a four-digit hexadecimal, treat it
|
|
// as an identity escape.
|
|
return 'u';
|
|
}
|
|
default: {
|
|
// Extended identity escape. We accept any character that hasn't
|
|
// been matched by a more specific case, not just the subset required
|
|
// by the ECMAScript specification.
|
|
uint32_t result = current();
|
|
if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
|
|
Advance();
|
|
return result;
|
|
}
|
|
ReportError(kUnicodeIdentity);
|
|
UNREACHABLE();
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
bool RegExpParser::ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
|
|
bool add_unicode_case_equivalents,
|
|
uint32_t* char_out) {
|
|
uint32_t first = current();
|
|
if (first == '\\') {
|
|
switch (Next()) {
|
|
case 'w':
|
|
case 'W':
|
|
case 'd':
|
|
case 'D':
|
|
case 's':
|
|
case 'S': {
|
|
CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges,
|
|
add_unicode_case_equivalents);
|
|
Advance(2);
|
|
return true;
|
|
}
|
|
case 'p':
|
|
case 'P': {
|
|
if (!is_unicode()) break;
|
|
bool negate = Next() == 'P';
|
|
Advance(2);
|
|
auto name_1 = new (Z) ZoneGrowableArray<char>();
|
|
auto name_2 = new (Z) ZoneGrowableArray<char>();
|
|
if (!ParsePropertyClassName(name_1, name_2) ||
|
|
!AddPropertyClassRange(ranges, negate, name_1, name_2)) {
|
|
ReportError("Invalid property name in character class");
|
|
UNREACHABLE();
|
|
}
|
|
return true;
|
|
}
|
|
case kEndMarker:
|
|
ReportError("\\ at end of pattern");
|
|
UNREACHABLE();
|
|
default:
|
|
break;
|
|
}
|
|
*char_out = ParseClassCharacterEscape();
|
|
return false;
|
|
}
|
|
Advance();
|
|
*char_out = first;
|
|
return false;
|
|
}
|
|
|
|
RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
|
|
static const char* kUnterminated = "Unterminated character class";
|
|
static const char* kRangeInvalid = "Invalid character class";
|
|
static const char* kRangeOutOfOrder = "Range out of order in character class";
|
|
|
|
ASSERT(current() == '[');
|
|
Advance();
|
|
bool is_negated = false;
|
|
if (current() == '^') {
|
|
is_negated = true;
|
|
Advance();
|
|
}
|
|
ZoneGrowableArray<CharacterRange>* ranges =
|
|
new (Z) ZoneGrowableArray<CharacterRange>(2);
|
|
bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case();
|
|
while (has_more() && current() != ']') {
|
|
uint32_t char_1 = 0;
|
|
bool is_class_1 =
|
|
ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1);
|
|
if (current() == '-') {
|
|
Advance();
|
|
if (current() == kEndMarker) {
|
|
// If we reach the end we break out of the loop and let the
|
|
// following code report an error.
|
|
break;
|
|
} else if (current() == ']') {
|
|
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
|
|
ranges->Add(CharacterRange::Singleton('-'));
|
|
break;
|
|
}
|
|
uint32_t char_2 = 0;
|
|
bool is_class_2 =
|
|
ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2);
|
|
if (is_class_1 || is_class_2) {
|
|
// Either end is an escaped character class. Treat the '-' verbatim.
|
|
if (is_unicode()) {
|
|
// ES2015 21.2.2.15.1 step 1.
|
|
ReportError(kRangeInvalid);
|
|
UNREACHABLE();
|
|
}
|
|
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
|
|
ranges->Add(CharacterRange::Singleton('-'));
|
|
if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2));
|
|
continue;
|
|
}
|
|
if (char_1 > char_2) {
|
|
ReportError(kRangeOutOfOrder);
|
|
UNREACHABLE();
|
|
}
|
|
ranges->Add(CharacterRange::Range(char_1, char_2));
|
|
} else {
|
|
if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
|
|
}
|
|
}
|
|
if (!has_more()) {
|
|
ReportError(kUnterminated);
|
|
UNREACHABLE();
|
|
}
|
|
Advance();
|
|
RegExpCharacterClass::CharacterClassFlags character_class_flags =
|
|
RegExpCharacterClass::DefaultFlags();
|
|
if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED;
|
|
return new (Z)
|
|
RegExpCharacterClass(ranges, builder->flags(), character_class_flags);
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// The Parser interface.
|
|
|
|
void RegExpParser::ParseRegExp(const String& input,
|
|
RegExpFlags flags,
|
|
RegExpCompileData* result) {
|
|
ASSERT(result != nullptr);
|
|
RegExpParser parser(input, &result->error, flags);
|
|
// Throws an exception if 'input' is not valid.
|
|
RegExpTree* tree = parser.ParsePattern();
|
|
ASSERT(tree != nullptr);
|
|
ASSERT(result->error.IsNull());
|
|
result->tree = tree;
|
|
intptr_t capture_count = parser.captures_started();
|
|
result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
|
|
result->contains_anchor = parser.contains_anchor();
|
|
result->capture_name_map = parser.CreateCaptureNameMap();
|
|
result->capture_count = capture_count;
|
|
}
|
|
|
|
} // namespace dart
|