mirror of
https://github.com/dart-lang/sdk
synced 2024-10-14 15:57:17 +00:00
2bfecc160b
TEST=build Change-Id: I2dd8ae69764af27f480a19995b491e98f52476ae Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/293902 Reviewed-by: Liam Appelbe <liama@google.com> Commit-Queue: Ryan Macnak <rmacnak@google.com>
268 lines
10 KiB
C++
268 lines
10 KiB
C++
// Copyright (c) 2014, the Dart project authors.  Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.
|
|
|
|
#ifndef RUNTIME_VM_REGEXP_PARSER_H_
|
|
#define RUNTIME_VM_REGEXP_PARSER_H_
|
|
|
|
#include "vm/allocation.h"
|
|
#include "vm/growable_array.h"
|
|
#include "vm/regexp_ast.h"
|
|
|
|
namespace dart {
|
|
|
|
// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
|
|
class RegExpBuilder : public ZoneAllocated {
|
|
public:
|
|
explicit RegExpBuilder(RegExpFlags flags);
|
|
|
|
void AddCharacter(uint16_t character);
|
|
void AddUnicodeCharacter(uint32_t character);
|
|
void AddEscapedUnicodeCharacter(uint32_t character);
|
|
// "Adds" an empty expression. Does nothing except consume a
|
|
// following quantifier
|
|
void AddEmpty();
|
|
void AddCharacterClass(RegExpCharacterClass* cc);
|
|
void AddCharacterClassForDesugaring(uint32_t c);
|
|
void AddAtom(RegExpTree* tree);
|
|
void AddTerm(RegExpTree* tree);
|
|
void AddAssertion(RegExpTree* tree);
|
|
void NewAlternative(); // '|'
|
|
// Attempt to add a quantifier to the last atom added. The return value
|
|
// denotes whether the attempt succeeded, since some atoms like lookbehind
|
|
// cannot be quantified.
|
|
bool AddQuantifierToAtom(intptr_t min,
|
|
intptr_t max,
|
|
RegExpQuantifier::QuantifierType type);
|
|
RegExpTree* ToRegExp();
|
|
RegExpFlags flags() const { return flags_; }
|
|
bool ignore_case() const { return flags_.IgnoreCase(); }
|
|
bool is_multi_line() const { return flags_.IsMultiLine(); }
|
|
bool is_dot_all() const { return flags_.IsDotAll(); }
|
|
|
|
private:
|
|
static constexpr uint16_t kNoPendingSurrogate = 0;
|
|
void AddLeadSurrogate(uint16_t lead_surrogate);
|
|
void AddTrailSurrogate(uint16_t trail_surrogate);
|
|
void FlushPendingSurrogate();
|
|
void FlushCharacters();
|
|
void FlushText();
|
|
void FlushTerms();
|
|
bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
|
|
bool NeedsDesugaringForIgnoreCase(uint32_t c);
|
|
|
|
Zone* zone() const { return zone_; }
|
|
bool is_unicode() const { return flags_.IsUnicode(); }
|
|
|
|
Zone* zone_;
|
|
bool pending_empty_;
|
|
RegExpFlags flags_;
|
|
ZoneGrowableArray<uint16_t>* characters_;
|
|
uint16_t pending_surrogate_;
|
|
GrowableArray<RegExpTree*> terms_;
|
|
GrowableArray<RegExpTree*> text_;
|
|
GrowableArray<RegExpTree*> alternatives_;
|
|
#ifdef DEBUG
|
|
enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_;
|
|
#define LAST(x) last_added_ = x;
|
|
#else
|
|
#define LAST(x)
|
|
#endif
|
|
};
|
|
|
|
// Name of a named capture group, held as a zone-allocated growable array of
// 16-bit units.
// NOTE(review): presumably UTF-16 code units of the group name - confirm.
using RegExpCaptureName = ZoneGrowableArray<uint16_t>;
|
|
|
|
class RegExpParser : public ValueObject {
|
|
public:
|
|
RegExpParser(const String& in, String* error, RegExpFlags regexp_flags);
|
|
|
|
static void ParseRegExp(const String& input,
|
|
RegExpFlags regexp_flags,
|
|
RegExpCompileData* result);
|
|
|
|
RegExpTree* ParsePattern();
|
|
RegExpTree* ParseDisjunction();
|
|
RegExpTree* ParseGroup();
|
|
|
|
// Parses a {...,...} quantifier and stores the range in the given
|
|
// out parameters.
|
|
bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out);
|
|
|
|
// Parses and returns a single escaped character. The character
|
|
// must not be 'b' or 'B' since they are usually handle specially.
|
|
uint32_t ParseClassCharacterEscape();
|
|
|
|
// Checks whether the following is a length-digit hexadecimal number,
|
|
// and sets the value if it is.
|
|
bool ParseHexEscape(intptr_t length, uint32_t* value);
|
|
bool ParseUnicodeEscape(uint32_t* value);
|
|
bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value);
|
|
|
|
// Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or
|
|
// the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the
|
|
// result in the given out parameters. If the shorthand is used,
|
|
// nothing will be added to name_2.
|
|
bool ParsePropertyClassName(ZoneGrowableArray<char>* name_1,
|
|
ZoneGrowableArray<char>* name_2);
|
|
// Adds the specified unicode property to the provided character range.
|
|
bool AddPropertyClassRange(ZoneGrowableArray<CharacterRange>* add_to,
|
|
bool negate,
|
|
ZoneGrowableArray<char>* name_1,
|
|
ZoneGrowableArray<char>* name_2);
|
|
// Returns a regexp node that corresponds to one of these unicode
|
|
// property sequences: "Any", "ASCII", "Assigned".
|
|
RegExpTree* GetPropertySequence(ZoneGrowableArray<char>* name_1);
|
|
RegExpTree* ParseCharacterClass(const RegExpBuilder* builder);
|
|
|
|
uint32_t ParseOctalLiteral();
|
|
|
|
// Tries to parse the input as a back reference. If successful it
|
|
// stores the result in the output parameter and returns true. If
|
|
// it fails it will push back the characters read so the same characters
|
|
// can be reparsed.
|
|
bool ParseBackReferenceIndex(intptr_t* index_out);
|
|
|
|
// Attempts to parse a possible escape within a character class.
|
|
bool ParseClassEscape(ZoneGrowableArray<CharacterRange>* ranges,
|
|
bool add_unicode_case_equivalents,
|
|
uint32_t* char_out);
|
|
void ReportError(const char* message);
|
|
void Advance();
|
|
void Advance(intptr_t dist);
|
|
void Reset(intptr_t pos);
|
|
|
|
// Reports whether the pattern might be used as a literal search string.
|
|
// Only use if the result of the parse is a single atom node.
|
|
bool simple();
|
|
bool contains_anchor() { return contains_anchor_; }
|
|
void set_contains_anchor() { contains_anchor_ = true; }
|
|
intptr_t captures_started() { return captures_started_; }
|
|
intptr_t position() { return next_pos_ - 1; }
|
|
bool is_unicode() const { return top_level_flags_.IsUnicode(); }
|
|
|
|
static bool IsSyntaxCharacterOrSlash(uint32_t c);
|
|
|
|
static constexpr intptr_t kMaxCaptures = 1 << 16;
|
|
static constexpr uint32_t kEndMarker = (1 << 21);
|
|
|
|
private:
|
|
enum SubexpressionType {
|
|
INITIAL,
|
|
CAPTURE, // All positive values represent captures.
|
|
POSITIVE_LOOKAROUND,
|
|
NEGATIVE_LOOKAROUND,
|
|
GROUPING
|
|
};
|
|
|
|
class RegExpParserState : public ZoneAllocated {
|
|
public:
|
|
RegExpParserState(RegExpParserState* previous_state,
|
|
SubexpressionType group_type,
|
|
RegExpLookaround::Type lookaround_type,
|
|
intptr_t disjunction_capture_index,
|
|
const RegExpCaptureName* capture_name,
|
|
RegExpFlags flags,
|
|
Zone* zone)
|
|
: previous_state_(previous_state),
|
|
builder_(new (zone) RegExpBuilder(flags)),
|
|
group_type_(group_type),
|
|
lookaround_type_(lookaround_type),
|
|
disjunction_capture_index_(disjunction_capture_index),
|
|
capture_name_(capture_name) {}
|
|
// Parser state of containing expression, if any.
|
|
RegExpParserState* previous_state() { return previous_state_; }
|
|
bool IsSubexpression() { return previous_state_ != nullptr; }
|
|
// RegExpBuilder building this regexp's AST.
|
|
RegExpBuilder* builder() { return builder_; }
|
|
// Type of regexp being parsed (parenthesized group or entire regexp).
|
|
SubexpressionType group_type() { return group_type_; }
|
|
// Lookahead or lookbehind.
|
|
RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
|
|
// Index in captures array of first capture in this sub-expression, if any.
|
|
// Also the capture index of this sub-expression itself, if group_type
|
|
// is CAPTURE.
|
|
intptr_t capture_index() { return disjunction_capture_index_; }
|
|
const RegExpCaptureName* capture_name() const { return capture_name_; }
|
|
|
|
bool IsNamedCapture() const { return capture_name_ != nullptr; }
|
|
|
|
// Check whether the parser is inside a capture group with the given index.
|
|
bool IsInsideCaptureGroup(intptr_t index);
|
|
// Check whether the parser is inside a capture group with the given name.
|
|
bool IsInsideCaptureGroup(const RegExpCaptureName* name);
|
|
|
|
private:
|
|
// Linked list implementation of stack of states.
|
|
RegExpParserState* previous_state_;
|
|
// Builder for the stored disjunction.
|
|
RegExpBuilder* builder_;
|
|
// Stored disjunction type (capture, look-ahead or grouping), if any.
|
|
SubexpressionType group_type_;
|
|
// Stored read direction.
|
|
const RegExpLookaround::Type lookaround_type_;
|
|
// Stored disjunction's capture index (if any).
|
|
intptr_t disjunction_capture_index_;
|
|
// Stored capture name (if any).
|
|
const RegExpCaptureName* const capture_name_;
|
|
};
|
|
|
|
// Return the 1-indexed RegExpCapture object, allocate if necessary.
|
|
RegExpCapture* GetCapture(intptr_t index);
|
|
|
|
// Creates a new named capture at the specified index. Must be called exactly
|
|
// once for each named capture. Fails if a capture with the same name is
|
|
// encountered.
|
|
void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index);
|
|
|
|
// Parses the name of a capture group (?<name>pattern). The name must adhere
|
|
// to IdentifierName in the ECMAScript standard.
|
|
const RegExpCaptureName* ParseCaptureGroupName();
|
|
|
|
bool ParseNamedBackReference(RegExpBuilder* builder,
|
|
RegExpParserState* state);
|
|
RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
|
|
intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name);
|
|
|
|
// After the initial parsing pass, patch corresponding RegExpCapture objects
|
|
// into all RegExpBackReferences. This is done after initial parsing in order
|
|
// to avoid complicating cases in which references come before the capture.
|
|
void PatchNamedBackReferences();
|
|
|
|
ArrayPtr CreateCaptureNameMap();
|
|
|
|
// Returns true iff the pattern contains named captures. May call
|
|
// ScanForCaptures to look ahead at the remaining pattern.
|
|
bool HasNamedCaptures();
|
|
|
|
Zone* zone() { return zone_; }
|
|
|
|
uint32_t current() { return current_; }
|
|
bool has_more() { return has_more_; }
|
|
bool has_next() { return next_pos_ < in().Length(); }
|
|
uint32_t Next();
|
|
uint32_t ReadNext(bool update_position);
|
|
const String& in() { return in_; }
|
|
void ScanForCaptures();
|
|
|
|
Zone* zone_;
|
|
ZoneGrowableArray<RegExpCapture*>* captures_;
|
|
ZoneGrowableArray<RegExpCapture*>* named_captures_;
|
|
ZoneGrowableArray<RegExpBackReference*>* named_back_references_;
|
|
const String& in_;
|
|
uint32_t current_;
|
|
intptr_t next_pos_;
|
|
intptr_t captures_started_;
|
|
// The capture count is only valid after we have scanned for captures.
|
|
intptr_t capture_count_;
|
|
bool has_more_;
|
|
RegExpFlags top_level_flags_;
|
|
bool simple_;
|
|
bool contains_anchor_;
|
|
bool is_scanned_for_captures_;
|
|
bool has_named_captures_;
|
|
};
|
|
|
|
} // namespace dart
|
|
|
|
#endif // RUNTIME_VM_REGEXP_PARSER_H_
|