// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file // for details. All rights reserved. Use of this source code is governed by a // BSD-style license that can be found in the LICENSE file. #ifndef RUNTIME_VM_REGEXP_PARSER_H_ #define RUNTIME_VM_REGEXP_PARSER_H_ #include "vm/allocation.h" #include "vm/growable_array.h" #include "vm/regexp_ast.h" namespace dart { // Accumulates RegExp atoms and assertions into lists of terms and alternatives. class RegExpBuilder : public ZoneAllocated { public: explicit RegExpBuilder(RegExpFlags flags); void AddCharacter(uint16_t character); void AddUnicodeCharacter(uint32_t character); void AddEscapedUnicodeCharacter(uint32_t character); // "Adds" an empty expression. Does nothing except consume a // following quantifier void AddEmpty(); void AddCharacterClass(RegExpCharacterClass* cc); void AddCharacterClassForDesugaring(uint32_t c); void AddAtom(RegExpTree* tree); void AddTerm(RegExpTree* tree); void AddAssertion(RegExpTree* tree); void NewAlternative(); // '|' // Attempt to add a quantifier to the last atom added. The return value // denotes whether the attempt succeeded, since some atoms like lookbehind // cannot be quantified. bool AddQuantifierToAtom(intptr_t min, intptr_t max, RegExpQuantifier::QuantifierType type); RegExpTree* ToRegExp(); RegExpFlags flags() const { return flags_; } bool ignore_case() const { return flags_.IgnoreCase(); } bool is_multi_line() const { return flags_.IsMultiLine(); } bool is_dot_all() const { return flags_.IsDotAll(); } private: static const uint16_t kNoPendingSurrogate = 0; void AddLeadSurrogate(uint16_t lead_surrogate); void AddTrailSurrogate(uint16_t trail_surrogate); void FlushPendingSurrogate(); void FlushCharacters(); void FlushText(); void FlushTerms(); bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); bool NeedsDesugaringForIgnoreCase(uint32_t c); Zone* zone() const { return zone_; } bool is_unicode() const { return flags_.IsUnicode(); } Zone* zone_; bool pending_empty_; RegExpFlags flags_; ZoneGrowableArray* characters_; uint16_t pending_surrogate_; GrowableArray terms_; GrowableArray text_; GrowableArray alternatives_; #ifdef DEBUG enum {ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM} last_added_; #define LAST(x) last_added_ = x; #else #define LAST(x) #endif }; using RegExpCaptureName = ZoneGrowableArray; class RegExpParser : public ValueObject { public: RegExpParser(const String& in, String* error, RegExpFlags regexp_flags); static void ParseRegExp(const String& input, RegExpFlags regexp_flags, RegExpCompileData* result); RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); RegExpTree* ParseGroup(); // Parses a {...,...} quantifier and stores the range in the given // out parameters. bool ParseIntervalQuantifier(intptr_t* min_out, intptr_t* max_out); // Parses and returns a single escaped character. The character // must not be 'b' or 'B' since they are usually handle specially. uint32_t ParseClassCharacterEscape(); // Checks whether the following is a length-digit hexadecimal number, // and sets the value if it is. bool ParseHexEscape(intptr_t length, uint32_t* value); bool ParseUnicodeEscape(uint32_t* value); bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value); // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the // result in the given out parameters. If the shorthand is used, // nothing will be added to name_2. bool ParsePropertyClassName(ZoneGrowableArray* name_1, ZoneGrowableArray* name_2); // Adds the specified unicode property to the provided character range. bool AddPropertyClassRange(ZoneGrowableArray* add_to, bool negate, ZoneGrowableArray* name_1, ZoneGrowableArray* name_2); // Returns a regexp node that corresponds to one of these unicode // property sequences: "Any", "ASCII", "Assigned". RegExpTree* GetPropertySequence(ZoneGrowableArray* name_1); RegExpTree* ParseCharacterClass(const RegExpBuilder* builder); uint32_t ParseOctalLiteral(); // Tries to parse the input as a back reference. If successful it // stores the result in the output parameter and returns true. If // it fails it will push back the characters read so the same characters // can be reparsed. bool ParseBackReferenceIndex(intptr_t* index_out); // Attempts to parse a possible escape within a character class. bool ParseClassEscape(ZoneGrowableArray* ranges, bool add_unicode_case_equivalents, uint32_t* char_out); void ReportError(const char* message); void Advance(); void Advance(intptr_t dist); void Reset(intptr_t pos); // Reports whether the pattern might be used as a literal search string. // Only use if the result of the parse is a single atom node. bool simple(); bool contains_anchor() { return contains_anchor_; } void set_contains_anchor() { contains_anchor_ = true; } intptr_t captures_started() { return captures_started_; } intptr_t position() { return next_pos_ - 1; } bool is_unicode() const { return top_level_flags_.IsUnicode(); } static bool IsSyntaxCharacterOrSlash(uint32_t c); static const intptr_t kMaxCaptures = 1 << 16; static const uint32_t kEndMarker = (1 << 21); private: enum SubexpressionType { INITIAL, CAPTURE, // All positive values represent captures. POSITIVE_LOOKAROUND, NEGATIVE_LOOKAROUND, GROUPING }; class RegExpParserState : public ZoneAllocated { public: RegExpParserState(RegExpParserState* previous_state, SubexpressionType group_type, RegExpLookaround::Type lookaround_type, intptr_t disjunction_capture_index, const RegExpCaptureName* capture_name, RegExpFlags flags, Zone* zone) : previous_state_(previous_state), builder_(new (zone) RegExpBuilder(flags)), group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), capture_name_(capture_name) {} // Parser state of containing expression, if any. RegExpParserState* previous_state() { return previous_state_; } bool IsSubexpression() { return previous_state_ != NULL; } // RegExpBuilder building this regexp's AST. RegExpBuilder* builder() { return builder_; } // Type of regexp being parsed (parenthesized group or entire regexp). SubexpressionType group_type() { return group_type_; } // Lookahead or lookbehind. RegExpLookaround::Type lookaround_type() { return lookaround_type_; } // Index in captures array of first capture in this sub-expression, if any. // Also the capture index of this sub-expression itself, if group_type // is CAPTURE. intptr_t capture_index() { return disjunction_capture_index_; } const RegExpCaptureName* capture_name() const { return capture_name_; } bool IsNamedCapture() const { return capture_name_ != nullptr; } // Check whether the parser is inside a capture group with the given index. bool IsInsideCaptureGroup(intptr_t index); // Check whether the parser is inside a capture group with the given name. bool IsInsideCaptureGroup(const RegExpCaptureName* name); private: // Linked list implementation of stack of states. RegExpParserState* previous_state_; // Builder for the stored disjunction. RegExpBuilder* builder_; // Stored disjunction type (capture, look-ahead or grouping), if any. SubexpressionType group_type_; // Stored read direction. const RegExpLookaround::Type lookaround_type_; // Stored disjunction's capture index (if any). intptr_t disjunction_capture_index_; // Stored capture name (if any). const RegExpCaptureName* const capture_name_; }; // Return the 1-indexed RegExpCapture object, allocate if necessary. RegExpCapture* GetCapture(intptr_t index); // Creates a new named capture at the specified index. Must be called exactly // once for each named capture. Fails if a capture with the same name is // encountered. void CreateNamedCaptureAtIndex(const RegExpCaptureName* name, intptr_t index); // Parses the name of a capture group (?pattern). The name must adhere // to IdentifierName in the ECMAScript standard. const RegExpCaptureName* ParseCaptureGroupName(); bool ParseNamedBackReference(RegExpBuilder* builder, RegExpParserState* state); RegExpParserState* ParseOpenParenthesis(RegExpParserState* state); intptr_t GetNamedCaptureIndex(const RegExpCaptureName* name); // After the initial parsing pass, patch corresponding RegExpCapture objects // into all RegExpBackReferences. This is done after initial parsing in order // to avoid complicating cases in which references come before the capture. void PatchNamedBackReferences(); RawArray* CreateCaptureNameMap(); // Returns true iff the pattern contains named captures. May call // ScanForCaptures to look ahead at the remaining pattern. bool HasNamedCaptures(); Zone* zone() { return zone_; } uint32_t current() { return current_; } bool has_more() { return has_more_; } bool has_next() { return next_pos_ < in().Length(); } uint32_t Next(); uint32_t ReadNext(bool update_position); const String& in() { return in_; } void ScanForCaptures(); Zone* zone_; ZoneGrowableArray* captures_; ZoneGrowableArray* named_captures_; ZoneGrowableArray* named_back_references_; const String& in_; uint32_t current_; intptr_t next_pos_; intptr_t captures_started_; // The capture count is only valid after we have scanned for captures. intptr_t capture_count_; bool has_more_; RegExpFlags top_level_flags_; bool simple_; bool contains_anchor_; bool is_scanned_for_captures_; bool has_named_captures_; }; } // namespace dart #endif // RUNTIME_VM_REGEXP_PARSER_H_