From 4028fec3b56703752dbab6b5d5647fb9ac204774 Mon Sep 17 00:00:00 2001 From: Stevie Strickland Date: Mon, 29 Apr 2019 09:11:48 +0000 Subject: [PATCH] Reland "[vm] Finish adding support for ECMAScript 2018 features." This work pulls in v8 support for these features with appropriate changes for Dart and closes https://github.com/dart-lang/sdk/issues/34935. This adds support for the following features: * Interpreting patterns as Unicode patterns instead of BMP patterns * the dotAll flag (`/s`) for changing the behavior of '.' to also match line terminators * Escapes for character classes described by Unicode property groups (e.g., \p{Greek} to match all Greek characters, or \P{Greek} for all non-Greek characters). The following TC39 proposals describe some of the added features: * https://github.com/tc39/proposal-regexp-dotall-flag * https://github.com/tc39/proposal-regexp-unicode-property-escapes These additional changes are included: * Extends named capture group names to include the full range of identifier characters supported by ECMAScript, not just ASCII. * Changing the RegExp interface to return RegExpMatch objects, not Match objects, so that downcasting is not necessary to use named capture groups from Dart **Note**: The changes to the RegExp interface are a breaking change for implementers of the RegExp interface. Current users of the RegExp interface (i.e., code using Dart RegExp objects) will not be affected. 
Change-Id: Ie62e6082a0e2fedc1680ef2576ce0c6db80fc19a Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/100641 Reviewed-by: Martin Kustermann Commit-Queue: Stevie Strickland --- CHANGELOG.md | 29 + DEPS | 5 + build/config/android/config.gni | 3 + build/config/android/rules.gni | 10 + build/config/host_byteorder.gni | 30 + .../tool/input_sdk/patch/core_patch.dart | 10 +- .../tool/input_sdk/private/regexp_helper.dart | 72 +- runtime/lib/regexp.cc | 35 +- runtime/lib/regexp_patch.dart | 65 +- runtime/platform/splay-tree-inl.h | 273 +++++ runtime/platform/splay-tree.h | 172 +++ runtime/platform/unicode.h | 34 +- runtime/platform/utils.h | 9 +- runtime/vm/BUILD.gn | 3 +- runtime/vm/bootstrap_natives.h | 4 +- runtime/vm/clustered_snapshot.cc | 6 +- .../compiler/backend/constant_propagator.cc | 4 +- runtime/vm/compiler/backend/il.cc | 4 - runtime/vm/compiler/backend/il.h | 32 +- runtime/vm/compiler/backend/il_arm.cc | 5 +- runtime/vm/compiler/backend/il_arm64.cc | 5 +- runtime/vm/compiler/backend/il_dbc.cc | 2 +- runtime/vm/compiler/backend/il_ia32.cc | 7 +- runtime/vm/compiler/backend/il_x64.cc | 5 +- .../vm/compiler/backend/type_propagator.cc | 2 +- runtime/vm/compiler/jit/compiler.cc | 3 +- runtime/vm/object.cc | 42 +- runtime/vm/object.h | 117 +- runtime/vm/object_service.cc | 4 +- runtime/vm/raw_object.h | 10 +- runtime/vm/raw_object_snapshot.cc | 7 +- runtime/vm/regexp.cc | 1026 +++++++++++++---- runtime/vm/regexp.h | 184 ++- runtime/vm/regexp_assembler.cc | 68 +- runtime/vm/regexp_assembler.h | 31 +- runtime/vm/regexp_assembler_bytecode.cc | 20 +- runtime/vm/regexp_assembler_bytecode.h | 1 + runtime/vm/regexp_assembler_ir.cc | 15 +- runtime/vm/regexp_assembler_ir.h | 1 + runtime/vm/regexp_ast.h | 61 +- runtime/vm/regexp_bytecodes.h | 24 +- runtime/vm/regexp_interpreter.cc | 42 +- runtime/vm/regexp_parser.cc | 898 +++++++++++++-- runtime/vm/regexp_parser.h | 57 +- runtime/vm/regexp_test.cc | 2 +- runtime/vm/runtime_entry_list.h | 4 +- runtime/vm/splay-tree.h 
| 32 + runtime/vm/stub_code_arm64_test.cc | 2 +- runtime/vm/stub_code_arm_test.cc | 2 +- runtime/vm/stub_code_ia32_test.cc | 2 +- runtime/vm/stub_code_x64_test.cc | 2 +- .../_internal/js_runtime/lib/core_patch.dart | 10 +- .../js_runtime/lib/regexp_helper.dart | 74 +- sdk/lib/core/regexp.dart | 46 +- tests/corelib_2/regexp/dot-all_test.dart | 117 ++ tests/corelib_2/regexp/lookbehind_test.dart | 6 +- .../corelib_2/regexp/named-captures_test.dart | 255 +++- .../regexp/unicode-character-ranges_test.dart | 160 +++ .../unicode-escapes-in-regexps_test.dart | 307 +++++ .../regexp/unicode-property-binary_test.dart | 201 ++++ .../unicode-property-char-class_test.dart | 56 + .../unicode-property-enumerated_test.dart | 55 + .../unicode-property-exact-match_test.dart | 71 ++ ...nicode-property-general-category_test.dart | 114 ++ .../regexp/unicode-property-invalid_test.dart | 68 ++ .../regexp/unicode-property-scripts_test.dart | 77 ++ .../regexp/unicode-property-special_test.dart | 110 ++ .../regexp/unicode-regexp-backrefs_test.dart | 81 ++ .../unicode-regexp-ignore-case_test.dart | 135 +++ .../unicode-regexp-match-index_test.dart | 155 +++ ...unicode-regexp-restricted-syntax_test.dart | 74 ++ ...nicode-regexp-unanchored-advance_test.dart | 39 + .../unicode-regexp-zero-length_test.dart | 93 ++ tests/corelib_2/regexp/v8_regexp_utils.dart | 4 + 74 files changed, 5156 insertions(+), 635 deletions(-) create mode 100644 build/config/android/rules.gni create mode 100644 build/config/host_byteorder.gni create mode 100644 runtime/platform/splay-tree-inl.h create mode 100644 runtime/platform/splay-tree.h create mode 100644 runtime/vm/splay-tree.h create mode 100644 tests/corelib_2/regexp/dot-all_test.dart create mode 100644 tests/corelib_2/regexp/unicode-character-ranges_test.dart create mode 100644 tests/corelib_2/regexp/unicode-escapes-in-regexps_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-binary_test.dart create mode 100644 
tests/corelib_2/regexp/unicode-property-char-class_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-enumerated_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-exact-match_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-general-category_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-invalid_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-scripts_test.dart create mode 100644 tests/corelib_2/regexp/unicode-property-special_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-match-index_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart create mode 100644 tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart diff --git a/CHANGELOG.md b/CHANGELOG.md index 21265c16a34..9d28400389f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,32 @@ +## 2.3.0-dev.XX.0 +(Add new changes here, and they will be copied to the change section for the + next dev version) + +### Core library changes + +#### `dart:core` + +* **Breaking change**: The `RegExp` interface has been extended with two new + constructor named parameters: + + * `unicode:` (`bool`, default: `false`), for Unicode patterns, and + * `dotAll:` (`bool`, default: `false`), to change the matching behavior of + '.' to also match line terminating characters. + + Appropriate properties for these named parameters have also been added so + their use can be detected after construction. 
+ + In addition, `RegExp` methods that originally returned `Match` objects + now return a more specific subtype, `RegExpMatch`, which adds two features: + + * `Iterable<String> groupNames`, a property that contains the names of all + named capture groups, and + * `String namedGroup(String name)`, a method that retrieves the match for + the given named capture group. + + This change only affects implementers of the `RegExp` interface; current + code using Dart regular expressions will not be affected. + ## 2.3.0 The focus in this release is on the new "UI-as-code" language features which diff --git a/DEPS b/DEPS index 055ed823f35..adc6e97ba4b 100644 --- a/DEPS +++ b/DEPS @@ -91,6 +91,7 @@ vars = { "http_retry_tag": "0.1.1", "http_tag" : "0.12.0+2", "http_throttle_tag" : "1.0.2", + "icu_rev" : "c56c671998902fcc4fc9ace88c83daa99f980793", "idl_parser_rev": "5fb1ebf49d235b5a70c9f49047e83b0654031eb7", "intl_tag": "0.15.7", "jinja2_rev": "2222b31554f03e62600cd7e383376a7c187967a1", @@ -210,6 +211,10 @@ deps = { Var("chromium_git") + "/chromium/src/third_party/ply.git" + "@" + Var("ply_rev"), + Var("dart_root") + "/third_party/icu": + Var("chromium_git") + "/chromium/deps/icu.git" + + "@" + Var("icu_rev"), + Var("dart_root") + "/tools/idl_parser": Var("chromium_git") + "/chromium/src/tools/idl_parser.git" + "@" + Var("idl_parser_rev"), diff --git a/build/config/android/config.gni b/build/config/android/config.gni index 38e3f8e218f..c61d41acad6 100644 --- a/build/config/android/config.gni +++ b/build/config/android/config.gni @@ -15,6 +15,9 @@ if (is_android) { android_sdk_root = default_android_sdk_root android_sdk_version = default_android_sdk_version android_sdk_build_tools_version = default_android_sdk_build_tools_version + + # Unused by Dart. Required for GN files in the third_party package ICU. 
+ enable_java_templates = false } # Host stuff ----------------------------------------------------------------- diff --git a/build/config/android/rules.gni b/build/config/android/rules.gni new file mode 100644 index 00000000000..f22ba846b2e --- /dev/null +++ b/build/config/android/rules.gni @@ -0,0 +1,10 @@ +# Copyright (c) 2019, the Dart project authors. Please see the AUTHORS file +# for details. All rights reserved. Use of this source code is governed by a +# BSD-style license that can be found in the LICENSE file. + +# A trivial rules file that allows for the Chromium third_party ICU to +# be built successfully. + +import("//build/config/android/config.gni") + +assert(is_android) diff --git a/build/config/host_byteorder.gni b/build/config/host_byteorder.gni new file mode 100644 index 00000000000..50ea94bca41 --- /dev/null +++ b/build/config/host_byteorder.gni @@ -0,0 +1,30 @@ +# Copyright (c) 2017 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# Originally from v8, included in our repository as the ICU third party +# import depends on it for building. + +# This header file defines the "host_byteorder" variable. +# Note that this is currently used only for building v8. +# The chromium code generally assumes little-endianness. 
+declare_args() { + host_byteorder = "undefined" +} + +# Detect host byteorder +# ppc64 can be either BE or LE +if (host_cpu == "ppc64") { + if (current_os == "aix") { + host_byteorder = "big" + } else { + # Only use the script when absolutely necessary + host_byteorder = + exec_script("//build/config/get_host_byteorder.py", [], "trim string") + } +} else if (host_cpu == "ppc" || host_cpu == "s390" || host_cpu == "s390x" || + host_cpu == "mips" || host_cpu == "mips64") { + host_byteorder = "big" +} else { + host_byteorder = "little" +} diff --git a/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart b/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart index 192b5f68d6e..da07a3c0d31 100644 --- a/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart +++ b/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart @@ -547,9 +547,15 @@ class bool { class RegExp { @patch factory RegExp(String source, - {bool multiLine = false, bool caseSensitive = true}) => + {bool multiLine = false, + bool caseSensitive = true, + bool unicode = false, + bool dotAll = false}) => JSSyntaxRegExp(source, - multiLine: multiLine, caseSensitive: caseSensitive); + multiLine: multiLine, + caseSensitive: caseSensitive, + unicode: unicode, + dotAll: dotAll); @patch static String escape(String text) => quoteStringForRegExp(text); diff --git a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart index 92066850dd8..744effc1bee 100644 --- a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart +++ b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart @@ -47,18 +47,22 @@ class JSSyntaxRegExp implements RegExp { var _nativeGlobalRegExp; var _nativeAnchoredRegExp; - String toString() => "RegExp/$pattern/"; + String toString() => + 'RegExp/$pattern/' + JS('String', '#.flags', _nativeRegExp); JSSyntaxRegExp(String source, - {bool multiLine = false, bool caseSensitive = true}) + {bool multiLine = false, + bool caseSensitive = 
true, + bool unicode = false, + bool dotAll = false}) : this.pattern = source, - this._nativeRegExp = - makeNative(source, multiLine, caseSensitive, false); + this._nativeRegExp = makeNative( + source, multiLine, caseSensitive, unicode, dotAll, false); get _nativeGlobalVersion { if (_nativeGlobalRegExp != null) return _nativeGlobalRegExp; - return _nativeGlobalRegExp = - makeNative(pattern, _isMultiLine, _isCaseSensitive, true); + return _nativeGlobalRegExp = makeNative( + pattern, _isMultiLine, _isCaseSensitive, _isUnicode, _isDotAll, true); } get _nativeAnchoredVersion { @@ -68,17 +72,21 @@ class JSSyntaxRegExp implements RegExp { // that it tries, and you can see if the original regexp matched, or it // was the added zero-width match that matched, by looking at the last // capture. If it is a String, the match participated, otherwise it didn't. - return _nativeAnchoredRegExp = - makeNative("$pattern|()", _isMultiLine, _isCaseSensitive, true); + return _nativeAnchoredRegExp = makeNative("$pattern|()", _isMultiLine, + _isCaseSensitive, _isUnicode, _isDotAll, true); } bool get _isMultiLine => JS("bool", "#.multiline", _nativeRegExp); bool get _isCaseSensitive => JS("bool", "!#.ignoreCase", _nativeRegExp); + bool get _isUnicode => JS("bool", "#.unicode", _nativeRegExp); + bool get _isDotAll => JS("bool", "#.dotAll", _nativeRegExp); static makeNative(@nullCheck String source, bool multiLine, - bool caseSensitive, bool global) { + bool caseSensitive, bool unicode, bool dotAll, bool global) { String m = multiLine ? 'm' : ''; String i = caseSensitive ? '' : 'i'; + String u = unicode ? 'u' : ''; + String s = dotAll ? 's' : ''; String g = global ? 
'g' : ''; // We're using the JavaScript's try catch instead of the Dart one // to avoid dragging in Dart runtime support just because of using @@ -87,7 +95,7 @@ class JSSyntaxRegExp implements RegExp { '', '(function() {' 'try {' - 'return new RegExp(#, # + # + #);' + 'return new RegExp(#, # + # + # + # + #);' '} catch (e) {' 'return e;' '}' @@ -95,6 +103,8 @@ class JSSyntaxRegExp implements RegExp { source, m, i, + u, + s, g); if (JS('bool', '# instanceof RegExp', regexp)) return regexp; // The returned value is the JavaScript exception. Turn it into a @@ -103,7 +113,7 @@ class JSSyntaxRegExp implements RegExp { throw FormatException("Illegal RegExp pattern: $source, $errorMessage"); } - Match firstMatch(@nullCheck String string) { + RegExpMatch firstMatch(@nullCheck String string) { List m = JS('JSExtendableArray|Null', r'#.exec(#)', _nativeRegExp, string); if (m == null) return null; return _MatchImplementation(this, JSArray.of(m)); @@ -120,7 +130,7 @@ class JSSyntaxRegExp implements RegExp { return null; } - Iterable allMatches(@nullCheck String string, + Iterable allMatches(@nullCheck String string, [@nullCheck int start = 0]) { if (start < 0 || start > string.length) { throw RangeError.range(start, 0, string.length); @@ -128,7 +138,7 @@ class JSSyntaxRegExp implements RegExp { return _AllMatchesIterable(this, string, start); } - Match _execGlobal(String string, int start) { + RegExpMatch _execGlobal(String string, int start) { Object regexp = _nativeGlobalVersion; JS("void", "#.lastIndex = #", regexp, start); List match = JS("JSExtendableArray|Null", "#.exec(#)", regexp, string); @@ -136,7 +146,7 @@ class JSSyntaxRegExp implements RegExp { return _MatchImplementation(this, JSArray.of(match)); } - Match _execAnchored(String string, int start) { + RegExpMatch _execAnchored(String string, int start) { Object regexp = _nativeAnchoredVersion; JS("void", "#.lastIndex = #", regexp, start); List match = JS("JSExtendableArray|Null", "#.exec(#)", regexp, string); @@ 
-148,7 +158,7 @@ class JSSyntaxRegExp implements RegExp { return _MatchImplementation(this, JSArray.of(match)); } - Match matchAsPrefix(String string, [int start = 0]) { + RegExpMatch matchAsPrefix(String string, [int start = 0]) { if (start < 0 || start > string.length) { throw RangeError.range(start, 0, string.length); } @@ -157,6 +167,8 @@ class JSSyntaxRegExp implements RegExp { bool get isMultiLine => _isMultiLine; bool get isCaseSensitive => _isCaseSensitive; + bool get isUnicode => _isUnicode; + bool get isDotAll => _isDotAll; } class _MatchImplementation implements RegExpMatch { @@ -207,25 +219,34 @@ class _MatchImplementation implements RegExpMatch { } } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final JSSyntaxRegExp _re; final String _string; final int _start; _AllMatchesIterable(this._re, this._string, this._start); - Iterator get iterator => _AllMatchesIterator(_re, _string, _start); + Iterator get iterator => + _AllMatchesIterator(_re, _string, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final JSSyntaxRegExp _regExp; String _string; int _nextIndex; - Match _current; + RegExpMatch _current; _AllMatchesIterator(this._regExp, this._string, this._nextIndex); - Match get current => _current; + RegExpMatch get current => _current; + + static bool _isLeadSurrogate(int c) { + return c >= 0xd800 && c <= 0xdbff; + } + + static bool _isTrailSurrogate(int c) { + return c >= 0xdc00 && c <= 0xdfff; + } bool moveNext() { if (_string == null) return false; @@ -235,6 +256,15 @@ class _AllMatchesIterator implements Iterator { _current = match; int nextIndex = match.end; if (match.start == nextIndex) { + // Zero-width match. Advance by one more, unless the regexp + // is in unicode mode and it would put us within a surrogate + // pair. In that case, advance past the code point as a whole. 
+ if (_regExp.isUnicode && + _nextIndex + 1 < _string.length && + _isLeadSurrogate(_string.codeUnitAt(_nextIndex)) && + _isTrailSurrogate(_string.codeUnitAt(_nextIndex + 1))) { + nextIndex++; + } nextIndex++; } _nextIndex = nextIndex; @@ -248,6 +278,6 @@ class _AllMatchesIterator implements Iterator { } /** Find the first match of [regExp] in [string] at or after [start]. */ -Match firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { +RegExpMatch firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { return regExp._execGlobal(string, start); } diff --git a/runtime/lib/regexp.cc b/runtime/lib/regexp.cc index 4dba76e6eb7..eb4237c5954 100644 --- a/runtime/lib/regexp.cc +++ b/runtime/lib/regexp.cc @@ -14,7 +14,7 @@ namespace dart { -DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 4) { +DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 6) { ASSERT( TypeArguments::CheckedHandle(zone, arguments->NativeArgAt(0)).IsNull()); GET_NON_NULL_NATIVE_ARGUMENT(String, pattern, arguments->NativeArgAt(1)); @@ -22,17 +22,30 @@ DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 4) { arguments->NativeArgAt(2)); GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_case_sensitive, arguments->NativeArgAt(3)); + GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_unicode, + arguments->NativeArgAt(4)); + GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_dot_all, + arguments->NativeArgAt(5)); bool ignore_case = handle_case_sensitive.raw() != Bool::True().raw(); bool multi_line = handle_multi_line.raw() == Bool::True().raw(); + bool unicode = handle_unicode.raw() == Bool::True().raw(); + bool dot_all = handle_dot_all.raw() == Bool::True().raw(); + + RegExpFlags flags; + + if (ignore_case) flags.SetIgnoreCase(); + if (multi_line) flags.SetMultiLine(); + if (unicode) flags.SetUnicode(); + if (dot_all) flags.SetDotAll(); // Parse the pattern once in order to throw any format exceptions within // the factory constructor. It is parsed again upon compilation. 
RegExpCompileData compileData; // Throws an exception on parsing failure. - RegExpParser::ParseRegExp(pattern, multi_line, &compileData); + RegExpParser::ParseRegExp(pattern, flags, &compileData); // Create a RegExp object containing only the initial parameters. - return RegExpEngine::CreateRegExp(thread, pattern, multi_line, ignore_case); + return RegExpEngine::CreateRegExp(thread, pattern, flags); } DEFINE_NATIVE_ENTRY(RegExp_getPattern, 0, 1) { @@ -44,13 +57,25 @@ DEFINE_NATIVE_ENTRY(RegExp_getPattern, 0, 1) { DEFINE_NATIVE_ENTRY(RegExp_getIsMultiLine, 0, 1) { const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); ASSERT(!regexp.IsNull()); - return Bool::Get(regexp.is_multi_line()).raw(); + return Bool::Get(regexp.flags().IsMultiLine()).raw(); +} + +DEFINE_NATIVE_ENTRY(RegExp_getIsUnicode, 0, 1) { + const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); + ASSERT(!regexp.IsNull()); + return Bool::Get(regexp.flags().IsUnicode()).raw(); +} + +DEFINE_NATIVE_ENTRY(RegExp_getIsDotAll, 0, 1) { + const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); + ASSERT(!regexp.IsNull()); + return Bool::Get(regexp.flags().IsDotAll()).raw(); } DEFINE_NATIVE_ENTRY(RegExp_getIsCaseSensitive, 0, 1) { const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); ASSERT(!regexp.IsNull()); - return Bool::Get(!regexp.is_ignore_case()).raw(); + return Bool::Get(!regexp.flags().IgnoreCase()).raw(); } DEFINE_NATIVE_ENTRY(RegExp_getGroupCount, 0, 1) { diff --git a/runtime/lib/regexp_patch.dart b/runtime/lib/regexp_patch.dart index 4eac4461187..35a13b1a937 100644 --- a/runtime/lib/regexp_patch.dart +++ b/runtime/lib/regexp_patch.dart @@ -8,8 +8,12 @@ class RegExp { @patch factory RegExp(String source, - {bool multiLine: false, bool caseSensitive: true}) { - _RegExpHashKey key = new _RegExpHashKey(source, multiLine, caseSensitive); + {bool multiLine: false, + bool caseSensitive: true, + bool unicode: false, 
+ bool dotAll: false}) { + _RegExpHashKey key = + new _RegExpHashKey(source, multiLine, caseSensitive, unicode, dotAll); _RegExpHashValue value = _cache[key]; if (value == null) { @@ -21,7 +25,10 @@ class RegExp { value = new _RegExpHashValue( new _RegExp(source, - multiLine: multiLine, caseSensitive: caseSensitive), + multiLine: multiLine, + caseSensitive: caseSensitive, + unicode: unicode, + dotAll: dotAll), key); _cache[key] = value; } else { @@ -114,15 +121,20 @@ class _RegExpHashKey extends LinkedListEntry<_RegExpHashKey> { final String pattern; final bool multiLine; final bool caseSensitive; + final bool unicode; + final bool dotAll; - _RegExpHashKey(this.pattern, this.multiLine, this.caseSensitive); + _RegExpHashKey(this.pattern, this.multiLine, this.caseSensitive, this.unicode, + this.dotAll); int get hashCode => pattern.hashCode; bool operator ==(that) { return (that is _RegExpHashKey) && (this.pattern == that.pattern) && (this.multiLine == that.multiLine) && - (this.caseSensitive == that.caseSensitive); + (this.caseSensitive == that.caseSensitive) && + (this.unicode == that.unicode) && + (this.dotAll == that.dotAll); } } @@ -200,9 +212,11 @@ class _RegExpMatch implements RegExpMatch { class _RegExp implements RegExp { factory _RegExp(String pattern, {bool multiLine: false, - bool caseSensitive: true}) native "RegExp_factory"; + bool caseSensitive: true, + bool unicode: false, + bool dotAll: false}) native "RegExp_factory"; - Match firstMatch(String str) { + RegExpMatch firstMatch(String str) { if (str is! String) throw new ArgumentError(str); List match = _ExecuteMatch(str, 0); if (match == null) { @@ -211,7 +225,7 @@ class _RegExp implements RegExp { return new _RegExpMatch(this, str, match); } - Iterable allMatches(String string, [int start = 0]) { + Iterable allMatches(String string, [int start = 0]) { if (string is! String) throw new ArgumentError(string); if (start is! 
int) throw new ArgumentError(start); if (0 > start || start > string.length) { @@ -220,7 +234,7 @@ class _RegExp implements RegExp { return new _AllMatchesIterable(this, string, start); } - Match matchAsPrefix(String string, [int start = 0]) { + RegExpMatch matchAsPrefix(String string, [int start = 0]) { if (string is! String) throw new ArgumentError(string); if (start is! int) throw new ArgumentError(start); if (start < 0 || start > string.length) { @@ -252,6 +266,10 @@ class _RegExp implements RegExp { bool get isCaseSensitive native "RegExp_getIsCaseSensitive"; + bool get isUnicode native "RegExp_getIsUnicode"; + + bool get isDotAll native "RegExp_getIsDotAll"; + int get _groupCount native "RegExp_getGroupCount"; // Returns a List [String, int, String, int, ...] where each @@ -327,25 +345,34 @@ class _RegExp implements RegExp { native "RegExp_ExecuteMatchSticky"; } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final _RegExp _re; final String _str; final int _start; _AllMatchesIterable(this._re, this._str, this._start); - Iterator get iterator => new _AllMatchesIterator(_re, _str, _start); + Iterator get iterator => + new _AllMatchesIterator(_re, _str, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final String _str; int _nextIndex; _RegExp _re; - Match _current; + RegExpMatch _current; _AllMatchesIterator(this._re, this._str, this._nextIndex); - Match get current => _current; + RegExpMatch get current => _current; + + static bool _isLeadSurrogate(int c) { + return c >= 0xd800 && c <= 0xdbff; + } + + static bool _isTrailSurrogate(int c) { + return c >= 0xdc00 && c <= 0xdfff; + } bool moveNext() { if (_re == null) return false; // Cleared after a failed match. 
@@ -355,7 +382,15 @@ class _AllMatchesIterator implements Iterator { _current = new _RegExpMatch(_re, _str, match); _nextIndex = _current.end; if (_nextIndex == _current.start) { - // Zero-width match. Advance by one more. + // Zero-width match. Advance by one more, unless the regexp + // is in unicode mode and it would put us within a surrogate + // pair. In that case, advance past the code point as a whole. + if (_re.isUnicode && + _nextIndex + 1 < _str.length && + _isLeadSurrogate(_str.codeUnitAt(_nextIndex)) && + _isTrailSurrogate(_str.codeUnitAt(_nextIndex + 1))) { + _nextIndex++; + } _nextIndex++; } return true; diff --git a/runtime/platform/splay-tree-inl.h b/runtime/platform/splay-tree-inl.h new file mode 100644 index 00000000000..7322a56ce4c --- /dev/null +++ b/runtime/platform/splay-tree-inl.h @@ -0,0 +1,273 @@ +// Copyright (c) 2019, the Dart project authors. +// Copyright 2010 the V8 project authors. +// Please see the AUTHORS file for details. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// The original file can be found at: +// https://github.com/v8/v8/blob/master/src/splay-tree-inl.h + +#ifndef RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ +#define RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ + +#include + +#include "platform/splay-tree.h" + +namespace dart { + +template +SplayTree::~SplayTree() { + NodeDeleter deleter; + ForEachNode(&deleter); +} + +template +bool SplayTree::Insert(const Key& key, Locator* locator) { + if (is_empty()) { + // If the tree is empty, insert the new node. + root_ = new (allocator_) Node(key, Config::NoValue()); + } else { + // Splay on the key to move the last node on the search path + // for the key to the root of the tree. + Splay(key); + // Ignore repeated insertions with the same key. + int cmp = Config::Compare(key, root_->key_); + if (cmp == 0) { + locator->bind(root_); + return false; + } + // Insert the new node. 
+ Node* node = new (allocator_) Node(key, Config::NoValue()); + InsertInternal(cmp, node); + } + locator->bind(root_); + return true; +} + +template +void SplayTree::InsertInternal(int cmp, Node* node) { + if (cmp > 0) { + node->left_ = root_; + node->right_ = root_->right_; + root_->right_ = nullptr; + } else { + node->right_ = root_; + node->left_ = root_->left_; + root_->left_ = nullptr; + } + root_ = node; +} + +template +bool SplayTree::FindInternal(const Key& key) { + if (is_empty()) return false; + Splay(key); + return Config::Compare(key, root_->key_) == 0; +} + +template +bool SplayTree::Contains(const Key& key) { + return FindInternal(key); +} + +template +bool SplayTree::Find(const Key& key, Locator* locator) { + if (FindInternal(key)) { + locator->bind(root_); + return true; + } else { + return false; + } +} + +template +bool SplayTree::FindGreatestLessThan(const Key& key, + Locator* locator) { + if (is_empty()) return false; + // Splay on the key to move the node with the given key or the last + // node on the search path to the top of the tree. + Splay(key); + // Now the result is either the root node or the greatest node in + // the left subtree. + int cmp = Config::Compare(root_->key_, key); + if (cmp <= 0) { + locator->bind(root_); + return true; + } else { + Node* temp = root_; + root_ = root_->left_; + bool result = FindGreatest(locator); + root_ = temp; + return result; + } +} + +template +bool SplayTree::FindLeastGreaterThan(const Key& key, + Locator* locator) { + if (is_empty()) return false; + // Splay on the key to move the node with the given key or the last + // node on the search path to the top of the tree. + Splay(key); + // Now the result is either the root node or the least node in + // the right subtree. 
+ int cmp = Config::Compare(root_->key_, key); + if (cmp >= 0) { + locator->bind(root_); + return true; + } else { + Node* temp = root_; + root_ = root_->right_; + bool result = FindLeast(locator); + root_ = temp; + return result; + } +} + +template +bool SplayTree::FindGreatest(Locator* locator) { + if (is_empty()) return false; + Node* current = root_; + while (current->right_ != nullptr) + current = current->right_; + locator->bind(current); + return true; +} + +template +bool SplayTree::FindLeast(Locator* locator) { + if (is_empty()) return false; + Node* current = root_; + while (current->left_ != nullptr) + current = current->left_; + locator->bind(current); + return true; +} + +template +bool SplayTree::Move(const Key& old_key, + const Key& new_key) { + if (!FindInternal(old_key)) return false; + Node* node_to_move = root_; + RemoveRootNode(old_key); + Splay(new_key); + int cmp = Config::Compare(new_key, root_->key_); + if (cmp == 0) { + // A node with the target key already exists. + delete node_to_move; + return false; + } + node_to_move->key_ = new_key; + InsertInternal(cmp, node_to_move); + return true; +} + +template +bool SplayTree::Remove(const Key& key) { + if (!FindInternal(key)) return false; + Node* node_to_remove = root_; + RemoveRootNode(key); + delete node_to_remove; + return true; +} + +template +void SplayTree::RemoveRootNode(const Key& key) { + if (root_->left_ == nullptr) { + // No left child, so the new tree is just the right child. + root_ = root_->right_; + } else { + // Left child exists. + Node* right = root_->right_; + // Make the original left child the new root. + root_ = root_->left_; + // Splay to make sure that the new root has an empty right child. + Splay(key); + // Insert the original right child as the right child of the new + // root. + root_->right_ = right; + } +} + +template +void SplayTree::Splay(const Key& key) { + if (is_empty()) return; + Node dummy_node(Config::kNoKey, Config::NoValue()); + // Create a dummy node. 
The use of the dummy node is a bit + // counter-intuitive: The right child of the dummy node will hold + // the L tree of the algorithm. The left child of the dummy node + // will hold the R tree of the algorithm. Using a dummy node, left + // and right will always be nodes and we avoid special cases. + Node* dummy = &dummy_node; + Node* left = dummy; + Node* right = dummy; + Node* current = root_; + while (true) { + int cmp = Config::Compare(key, current->key_); + if (cmp < 0) { + if (current->left_ == nullptr) break; + if (Config::Compare(key, current->left_->key_) < 0) { + // Rotate right. + Node* temp = current->left_; + current->left_ = temp->right_; + temp->right_ = current; + current = temp; + if (current->left_ == nullptr) break; + } + // Link right. + right->left_ = current; + right = current; + current = current->left_; + } else if (cmp > 0) { + if (current->right_ == nullptr) break; + if (Config::Compare(key, current->right_->key_) > 0) { + // Rotate left. + Node* temp = current->right_; + current->right_ = temp->left_; + temp->left_ = current; + current = temp; + if (current->right_ == nullptr) break; + } + // Link left. + left->right_ = current; + left = current; + current = current->right_; + } else { + break; + } + } + // Assemble. + left->right_ = current->left_; + right->left_ = current->right_; + current->left_ = dummy->right_; + current->right_ = dummy->left_; + root_ = current; +} + +template +template +void SplayTree::ForEach(Callback* callback) { + NodeToPairAdaptor callback_adaptor(callback); + ForEachNode(&callback_adaptor); +} + +template +template +void SplayTree::ForEachNode(Callback* callback) { + if (root_ == nullptr) return; + // Pre-allocate some space for tiny trees. 
+ std::vector nodes_to_visit; + nodes_to_visit.push_back(root_); + size_t pos = 0; + while (pos < nodes_to_visit.size()) { + Node* node = nodes_to_visit[pos++]; + if (node->left() != nullptr) nodes_to_visit.push_back(node->left()); + if (node->right() != nullptr) nodes_to_visit.push_back(node->right()); + callback->Call(node); + } +} + +} // namespace dart + +#endif // RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ diff --git a/runtime/platform/splay-tree.h b/runtime/platform/splay-tree.h new file mode 100644 index 00000000000..a85488b154a --- /dev/null +++ b/runtime/platform/splay-tree.h @@ -0,0 +1,172 @@ +// Copyright (c) 2019, the Dart project authors. +// Copyright 2010 the V8 project authors. +// Please see the AUTHORS file for details. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// The original file can be found at: +// https://github.com/v8/v8/blob/master/src/splay-tree.h + +#ifndef RUNTIME_PLATFORM_SPLAY_TREE_H_ +#define RUNTIME_PLATFORM_SPLAY_TREE_H_ + +#include "platform/allocation.h" + +namespace dart { + +// A splay tree. The config type parameter encapsulates the different +// configurations of a concrete splay tree: +// +// typedef Key: the key type +// typedef Value: the value type +// static const Key kNoKey: the dummy key used when no key is set +// static Value NoValue(): the dummy value used to initialize nodes +// static int (Compare)(Key& a, Key& b) -> {-1, 0, 1}: comparison function +// +// The tree is also parameterized by an allocation policy +// (Allocator). The policy is used for allocating lists in the C free +// store or the zone; see zone.h.
+ +template +class SplayTree : public B { + public: + typedef typename Config::Key Key; + typedef typename Config::Value Value; + + class Locator; + + explicit SplayTree(Allocator* allocator) + : root_(nullptr), allocator_(allocator) {} + ~SplayTree(); + + Allocator* allocator() { return allocator_; } + + // Checks if there is a mapping for the key. + bool Contains(const Key& key); + + // Inserts the given key in this tree with the given value. Returns + // true if a node was inserted, otherwise false. If found the locator + // is enabled and provides access to the mapping for the key. + bool Insert(const Key& key, Locator* locator); + + // Looks up the key in this tree and returns true if it was found, + // otherwise false. If the node is found the locator is enabled and + // provides access to the mapping for the key. + bool Find(const Key& key, Locator* locator); + + // Finds the mapping with the greatest key less than or equal to the + // given key. + bool FindGreatestLessThan(const Key& key, Locator* locator); + + // Find the mapping with the greatest key in this tree. + bool FindGreatest(Locator* locator); + + // Finds the mapping with the least key greater than or equal to the + // given key. + bool FindLeastGreaterThan(const Key& key, Locator* locator); + + // Find the mapping with the least key in this tree. + bool FindLeast(Locator* locator); + + // Move the node from one key to another. + bool Move(const Key& old_key, const Key& new_key); + + // Remove the node with the given key from the tree. + bool Remove(const Key& key); + + // Remove all keys from the tree. + void Clear() { ResetRoot(); } + + bool is_empty() { return root_ == nullptr; } + + // Perform the splay operation for the given key. Moves the node with + // the given key to the top of the tree. If no node has the given + // key, the last node on the search path is moved to the top of the + // tree. 
+ void Splay(const Key& key); + + class Node : public B { + public: + Node(const Key& key, const Value& value) + : key_(key), value_(value), left_(nullptr), right_(nullptr) {} + + Key key() { return key_; } + Value value() { return value_; } + Node* left() { return left_; } + Node* right() { return right_; } + + private: + friend class SplayTree; + friend class Locator; + Key key_; + Value value_; + Node* left_; + Node* right_; + }; + + // A locator provides access to a node in the tree without actually + // exposing the node. + class Locator : public B { + public: + explicit Locator(Node* node) : node_(node) {} + Locator() : node_(nullptr) {} + const Key& key() { return node_->key_; } + Value& value() { return node_->value_; } + void set_value(const Value& value) { node_->value_ = value; } + inline void bind(Node* node) { node_ = node; } + + private: + Node* node_; + }; + + template + void ForEach(Callback* callback); + + protected: + // Resets tree root. Existing nodes become unreachable. + void ResetRoot() { root_ = nullptr; } + + private: + // Search for a node with a given key. If found, root_ points + // to the node. + bool FindInternal(const Key& key); + + // Inserts a node assuming that root_ is already set up. + void InsertInternal(int cmp, Node* node); + + // Removes root_ node. 
+ void RemoveRootNode(const Key& key); + + template + class NodeToPairAdaptor : public B { + public: + explicit NodeToPairAdaptor(Callback* callback) : callback_(callback) {} + void Call(Node* node) { callback_->Call(node->key(), node->value()); } + + private: + Callback* callback_; + + DISALLOW_COPY_AND_ASSIGN(NodeToPairAdaptor); + }; + + class NodeDeleter : public B { + public: + NodeDeleter() = default; + void Call(Node* node) { delete node; } + + private: + DISALLOW_COPY_AND_ASSIGN(NodeDeleter); + }; + + template + void ForEachNode(Callback* callback); + + Node* root_; + Allocator* allocator_; + + DISALLOW_COPY_AND_ASSIGN(SplayTree); +}; + +} // namespace dart + +#endif // RUNTIME_PLATFORM_SPLAY_TREE_H_ diff --git a/runtime/platform/unicode.h b/runtime/platform/unicode.h index f3c0778f7e4..b784823e528 100644 --- a/runtime/platform/unicode.h +++ b/runtime/platform/unicode.h @@ -15,6 +15,7 @@ class String; class Utf : AllStatic { public: static const int32_t kMaxCodePoint = 0x10FFFF; + static const int32_t kInvalidChar = 0xFFFFFFFF; static bool IsLatin1(int32_t code_point) { return (code_point >= 0) && (code_point <= 0xFF); @@ -29,7 +30,7 @@ class Utf : AllStatic { } // Returns true if the code point value is above Plane 17. 
- static bool IsOutOfRange(intptr_t code_point) { + static bool IsOutOfRange(int32_t code_point) { return (code_point < 0) || (code_point > kMaxCodePoint); } }; @@ -56,11 +57,11 @@ class Utf8 : AllStatic { static intptr_t Length(const String& str); static intptr_t Encode(int32_t ch, char* dst); + static intptr_t Encode(const String& src, char* dst, intptr_t len); static intptr_t Decode(const uint8_t* utf8_array, intptr_t array_len, int32_t* ch); - static intptr_t Encode(const String& src, char* dst, intptr_t len); static bool DecodeToLatin1(const uint8_t* utf8_array, intptr_t array_len, @@ -152,6 +153,10 @@ class Utf16 : AllStatic { static void Encode(int32_t codepoint, uint16_t* dst); static const int32_t kMaxCodeUnit = 0xFFFF; + static const int32_t kLeadSurrogateStart = 0xD800; + static const int32_t kLeadSurrogateEnd = 0xDBFF; + static const int32_t kTrailSurrogateStart = 0xDC00; + static const int32_t kTrailSurrogateEnd = 0xDFFF; private: static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10)); @@ -187,11 +192,11 @@ class CaseMapping : AllStatic { // The size of the stage 1 index. // TODO(cshapiro): improve indexing so this value is unnecessary. - static const int kStage1Size = 261; + static const intptr_t kStage1Size = 261; // The size of a stage 2 block in bytes. - static const int kBlockSizeLog2 = 8; - static const int kBlockSize = 1 << kBlockSizeLog2; + static const intptr_t kBlockSizeLog2 = 8; + static const intptr_t kBlockSize = 1 << kBlockSizeLog2; static int32_t Convert(int32_t ch, int32_t mapping) { if (Utf::IsLatin1(ch)) { @@ -222,6 +227,25 @@ class CaseMapping : AllStatic { static const int32_t stage2_exception_[][2]; }; +class Latin1 { + public: + static const int32_t kMaxChar = 0xff; + // Convert the character to Latin-1 case equivalent if possible. + static inline uint16_t TryConvertToLatin1(uint16_t c) { + switch (c) { + // These are equivalent characters in Unicode.
+ case 0x39c: + case 0x3bc: + return 0xb5; + // This is an uppercase of a Latin-1 character + // outside of Latin-1. + case 0x178: + return 0xff; + } + return c; + } +}; + } // namespace dart #endif // RUNTIME_PLATFORM_UNICODE_H_ diff --git a/runtime/platform/utils.h b/runtime/platform/utils.h index 9f72205dd6e..993ff26be8d 100644 --- a/runtime/platform/utils.h +++ b/runtime/platform/utils.h @@ -203,7 +203,14 @@ class Utils { return (static_cast(high) << 32) | (low & 0x0ffffffffLL); } - static bool IsDecimalDigit(char c) { return ('0' <= c) && (c <= '9'); } + static inline constexpr bool IsAlphaNumeric(uint32_t c) { + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || + IsDecimalDigit(c); + } + + static inline constexpr bool IsDecimalDigit(uint32_t c) { + return ('0' <= c) && (c <= '9'); + } static bool IsHexDigit(char c) { return IsDecimalDigit(c) || (('A' <= c) && (c <= 'F')) || diff --git a/runtime/vm/BUILD.gn b/runtime/vm/BUILD.gn index 680ab8411cd..f2737c98f68 100644 --- a/runtime/vm/BUILD.gn +++ b/runtime/vm/BUILD.gn @@ -60,8 +60,9 @@ config("libdart_vm_config") { library_for_all_configs("libdart_vm") { target_type = "source_set" + extra_deps = [ "//third_party/icu" ] if (is_fuchsia) { - extra_deps = [ + extra_deps += [ # TODO(US-399): Remove time_service specific code when it is no longer # necessary. 
"//sdk/lib/sys/cpp", diff --git a/runtime/vm/bootstrap_natives.h b/runtime/vm/bootstrap_natives.h index da1dc36e8c4..34cfbc43044 100644 --- a/runtime/vm/bootstrap_natives.h +++ b/runtime/vm/bootstrap_natives.h @@ -98,10 +98,12 @@ namespace dart { V(Double_toStringAsExponential, 2) \ V(Double_toStringAsPrecision, 2) \ V(Double_flipSignBit, 1) \ - V(RegExp_factory, 4) \ + V(RegExp_factory, 6) \ V(RegExp_getPattern, 1) \ V(RegExp_getIsMultiLine, 1) \ V(RegExp_getIsCaseSensitive, 1) \ + V(RegExp_getIsUnicode, 1) \ + V(RegExp_getIsDotAll, 1) \ V(RegExp_getGroupCount, 1) \ V(RegExp_getGroupNameMap, 1) \ V(RegExp_ExecuteMatch, 3) \ diff --git a/runtime/vm/clustered_snapshot.cc b/runtime/vm/clustered_snapshot.cc index 4f6602719f0..033ffbc63e3 100644 --- a/runtime/vm/clustered_snapshot.cc +++ b/runtime/vm/clustered_snapshot.cc @@ -3630,7 +3630,8 @@ class RegExpSerializationCluster : public SerializationCluster { RawRegExp* regexp = objects_[i]; AutoTraceObject(regexp); WriteFromTo(regexp); - s->Write(regexp->ptr()->num_registers_); + s->Write(regexp->ptr()->num_one_byte_registers_); + s->Write(regexp->ptr()->num_two_byte_registers_); s->Write(regexp->ptr()->type_flags_); } } @@ -3661,7 +3662,8 @@ class RegExpDeserializationCluster : public DeserializationCluster { Deserializer::InitializeHeader(regexp, kRegExpCid, RegExp::InstanceSize()); ReadFromTo(regexp); - regexp->ptr()->num_registers_ = d->Read(); + regexp->ptr()->num_one_byte_registers_ = d->Read(); + regexp->ptr()->num_two_byte_registers_ = d->Read(); regexp->ptr()->type_flags_ = d->Read(); } } diff --git a/runtime/vm/compiler/backend/constant_propagator.cc b/runtime/vm/compiler/backend/constant_propagator.cc index d27b6e21a87..acf4f9c484d 100644 --- a/runtime/vm/compiler/backend/constant_propagator.cc +++ b/runtime/vm/compiler/backend/constant_propagator.cc @@ -1272,8 +1272,8 @@ void ConstantPropagator::VisitMathMinMax(MathMinMaxInstr* instr) { } } -void ConstantPropagator::VisitCaseInsensitiveCompareUC16( - 
CaseInsensitiveCompareUC16Instr* instr) { +void ConstantPropagator::VisitCaseInsensitiveCompare( + CaseInsensitiveCompareInstr* instr) { SetValue(instr, non_constant_); } diff --git a/runtime/vm/compiler/backend/il.cc b/runtime/vm/compiler/backend/il.cc index 4ee0e27619f..dda6e01bdf0 100644 --- a/runtime/vm/compiler/backend/il.cc +++ b/runtime/vm/compiler/backend/il.cc @@ -5161,10 +5161,6 @@ const char* MathUnaryInstr::KindToCString(MathUnaryKind kind) { return ""; } -const RuntimeEntry& CaseInsensitiveCompareUC16Instr::TargetFunction() const { - return kCaseInsensitiveCompareUC16RuntimeEntry; -} - TruncDivModInstr::TruncDivModInstr(Value* lhs, Value* rhs, intptr_t deopt_id) : TemplateDefinition(deopt_id) { SetInputAt(0, lhs); diff --git a/runtime/vm/compiler/backend/il.h b/runtime/vm/compiler/backend/il.h index 54af80a71d1..a26a7012c1e 100644 --- a/runtime/vm/compiler/backend/il.h +++ b/runtime/vm/compiler/backend/il.h @@ -18,6 +18,7 @@ #include "vm/native_entry.h" #include "vm/object.h" #include "vm/parser.h" +#include "vm/runtime_entry.h" #include "vm/static_type_exactness_state.h" #include "vm/token_position.h" @@ -422,7 +423,7 @@ struct InstrAttrs { M(Unbox, kNoGC) \ M(BoxInt64, _) \ M(UnboxInt64, kNoGC) \ - M(CaseInsensitiveCompareUC16, _) \ + M(CaseInsensitiveCompare, _) \ M(BinaryInt64Op, kNoGC) \ M(ShiftInt64Op, kNoGC) \ M(SpeculativeShiftInt64Op, kNoGC) \ @@ -5956,18 +5957,18 @@ class MathUnaryInstr : public TemplateDefinition<1, NoThrow, Pure> { // Calls into the runtime and performs a case-insensitive comparison of the // UTF16 strings (i.e. TwoByteString or ExternalTwoByteString) located at // str[lhs_index:lhs_index + length] and str[rhs_index:rhs_index + length]. -// -// TODO(zerny): Remove this once (if) functions inherited from unibrow -// are moved to dart code. 
-class CaseInsensitiveCompareUC16Instr +// Depending on the runtime entry passed, we will treat the strings as either +// UCS2 (no surrogate handling) or UTF16 (surrogates handled appropriately). +class CaseInsensitiveCompareInstr : public TemplateDefinition<4, NoThrow, Pure> { public: - CaseInsensitiveCompareUC16Instr(Value* str, - Value* lhs_index, - Value* rhs_index, - Value* length, - intptr_t cid) - : cid_(cid) { + CaseInsensitiveCompareInstr(Value* str, + Value* lhs_index, + Value* rhs_index, + Value* length, + const RuntimeEntry& entry, + intptr_t cid) + : entry_(entry), cid_(cid) { ASSERT(cid == kTwoByteStringCid || cid == kExternalTwoByteStringCid); ASSERT(index_scale() == 2); SetInputAt(0, str); @@ -5981,7 +5982,7 @@ class CaseInsensitiveCompareUC16Instr Value* rhs_index() const { return inputs_[2]; } Value* length() const { return inputs_[3]; } - const RuntimeEntry& TargetFunction() const; + const RuntimeEntry& TargetFunction() const { return entry_; } bool IsExternal() const { return cid_ == kExternalTwoByteStringCid; } intptr_t class_id() const { return cid_; } intptr_t index_scale() const { return Instance::ElementSizeFor(cid_); } @@ -5990,17 +5991,18 @@ class CaseInsensitiveCompareUC16Instr virtual Representation representation() const { return kTagged; } - DECLARE_INSTRUCTION(CaseInsensitiveCompareUC16) + DECLARE_INSTRUCTION(CaseInsensitiveCompare) virtual CompileType ComputeType() const; virtual bool AttributesEqual(Instruction* other) const { - return other->AsCaseInsensitiveCompareUC16()->cid_ == cid_; + return other->AsCaseInsensitiveCompare()->cid_ == cid_; } private: + const RuntimeEntry& entry_; const intptr_t cid_; - DISALLOW_COPY_AND_ASSIGN(CaseInsensitiveCompareUC16Instr); + DISALLOW_COPY_AND_ASSIGN(CaseInsensitiveCompareInstr); }; // Represents Math's static min and max functions. 
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc index 874b0d32aac..15fb9a67a98 100644 --- a/runtime/vm/compiler/backend/il_arm.cc +++ b/runtime/vm/compiler/backend/il_arm.cc @@ -5133,7 +5133,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -5147,8 +5147,7 @@ LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareUC16Instr::EmitNativeCode( - FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { // Call the function. __ CallRuntime(TargetFunction(), TargetFunction().argument_count()); } diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc index b2f42bd9326..427003054b7 100644 --- a/runtime/vm/compiler/backend/il_arm64.cc +++ b/runtime/vm/compiler/backend/il_arm64.cc @@ -4350,7 +4350,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4364,8 +4364,7 @@ LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareUC16Instr::EmitNativeCode( - FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { // Call the function. 
__ CallRuntime(TargetFunction(), TargetFunction().argument_count()); } diff --git a/runtime/vm/compiler/backend/il_dbc.cc b/runtime/vm/compiler/backend/il_dbc.cc index 656651319cc..107a3c8d4f9 100644 --- a/runtime/vm/compiler/backend/il_dbc.cc +++ b/runtime/vm/compiler/backend/il_dbc.cc @@ -54,7 +54,7 @@ DECLARE_FLAG(int, optimization_counter_threshold); // - Optimized RegExps, // - Precompilation. #define FOR_EACH_UNREACHABLE_INSTRUCTION(M) \ - M(CaseInsensitiveCompareUC16) \ + M(CaseInsensitiveCompare) \ M(GenericCheckBound) \ M(IndirectGoto) \ M(Int64ToDouble) \ diff --git a/runtime/vm/compiler/backend/il_ia32.cc b/runtime/vm/compiler/backend/il_ia32.cc index f0b7b2d98a4..03ab961db8b 100644 --- a/runtime/vm/compiler/backend/il_ia32.cc +++ b/runtime/vm/compiler/backend/il_ia32.cc @@ -4454,7 +4454,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4468,8 +4468,7 @@ LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareUC16Instr::EmitNativeCode( - FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { // Save ESP. EDI is chosen because it is callee saved so we do not need to // back it up before calling into the runtime. static const Register kSavedSPReg = EDI; @@ -4484,7 +4483,7 @@ void CaseInsensitiveCompareUC16Instr::EmitNativeCode( // Call the function. __ CallRuntime(TargetFunction(), TargetFunction().argument_count()); - // Restore ESP. + // Restore ESP and pop the old value off the stack. 
__ movl(ESP, kSavedSPReg); } diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc index a10e83a59df..780cf7aa879 100644 --- a/runtime/vm/compiler/backend/il_x64.cc +++ b/runtime/vm/compiler/backend/il_x64.cc @@ -4494,7 +4494,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4508,8 +4508,7 @@ LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareUC16Instr::EmitNativeCode( - FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { // Save RSP. R13 is chosen because it is callee saved so we do not need to // back it up before calling into the runtime. static const Register kSavedSPReg = R13; diff --git a/runtime/vm/compiler/backend/type_propagator.cc b/runtime/vm/compiler/backend/type_propagator.cc index 0dec0c96690..0352767a6e2 100644 --- a/runtime/vm/compiler/backend/type_propagator.cc +++ b/runtime/vm/compiler/backend/type_propagator.cc @@ -1515,7 +1515,7 @@ CompileType MathMinMaxInstr::ComputeType() const { return CompileType::FromCid(result_cid_); } -CompileType CaseInsensitiveCompareUC16Instr::ComputeType() const { +CompileType CaseInsensitiveCompareInstr::ComputeType() const { return CompileType::FromCid(kBoolCid); } diff --git a/runtime/vm/compiler/jit/compiler.cc b/runtime/vm/compiler/jit/compiler.cc index db5a703d752..d3343307d0d 100644 --- a/runtime/vm/compiler/jit/compiler.cc +++ b/runtime/vm/compiler/jit/compiler.cc @@ -166,11 +166,10 @@ void IrregexpCompilationPipeline::ParseFunction( RegExp& regexp = RegExp::Handle(parsed_function->function().regexp()); const String& pattern = String::Handle(regexp.pattern()); - const bool multiline = regexp.is_multi_line(); 
RegExpCompileData* compile_data = new (zone) RegExpCompileData(); // Parsing failures are handled in the RegExp factory constructor. - RegExpParser::ParseRegExp(pattern, multiline, compile_data); + RegExpParser::ParseRegExp(pattern, regexp.flags(), compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); regexp.set_capture_name_map(compile_data->capture_name_map); diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc index 9804f6935c7..e25bf06d4d1 100644 --- a/runtime/vm/object.cc +++ b/runtime/vm/object.cc @@ -21751,23 +21751,45 @@ RawRegExp* RegExp::New(Heap::Space space) { NoSafepointScope no_safepoint; result ^= raw; result.set_type(kUninitialized); - result.set_flags(0); - result.set_num_registers(-1); + result.set_flags(RegExpFlags()); + result.set_num_registers(/*is_one_byte=*/false, -1); + result.set_num_registers(/*is_one_byte=*/true, -1); } return result.raw(); } -const char* RegExp::Flags() const { - switch (flags()) { - case kGlobal | kIgnoreCase | kMultiLine: +const char* RegExpFlags::ToCString() const { + switch (value_ & ~kGlobal) { + case kIgnoreCase | kMultiLine | kDotAll | kUnicode: + return "imsu"; + case kIgnoreCase | kMultiLine | kDotAll: + return "ims"; + case kIgnoreCase | kMultiLine | kUnicode: + return "imu"; + case kIgnoreCase | kUnicode | kDotAll: + return "ius"; + case kMultiLine | kDotAll | kUnicode: + return "msu"; case kIgnoreCase | kMultiLine: return "im"; - case kGlobal | kIgnoreCase: + case kIgnoreCase | kDotAll: + return "is"; + case kIgnoreCase | kUnicode: + return "iu"; + case kMultiLine | kDotAll: + return "ms"; + case kMultiLine | kUnicode: + return "mu"; + case kDotAll | kUnicode: + return "su"; case kIgnoreCase: return "i"; - case kGlobal | kMultiLine: case kMultiLine: return "m"; + case kDotAll: + return "s"; + case kUnicode: + return "u"; default: break; } @@ -21789,9 +21811,7 @@ bool RegExp::CanonicalizeEquals(const Instance& other) const { return false; } // Match the flags. 
- if ((is_global() != other_js.is_global()) || - (is_ignore_case() != other_js.is_ignore_case()) || - (is_multi_line() != other_js.is_multi_line())) { + if (flags() != other_js.flags()) { return false; } return true; @@ -21800,7 +21820,7 @@ bool RegExp::CanonicalizeEquals(const Instance& other) const { const char* RegExp::ToCString() const { const String& str = String::Handle(pattern()); return OS::SCreate(Thread::Current()->zone(), "RegExp: pattern=%s flags=%s", - str.ToCString(), Flags()); + str.ToCString(), flags().ToCString()); } RawWeakProperty* WeakProperty::New(Heap::Space space) { diff --git a/runtime/vm/object.h b/runtime/vm/object.h index 79ab05095a3..4b5bd5092cb 100644 --- a/runtime/vm/object.h +++ b/runtime/vm/object.h @@ -9122,6 +9122,55 @@ class StackTrace : public Instance { friend class Debugger; }; +class RegExpFlags { + public: + // Flags are passed to a regex object as follows: + // 'i': ignore case, 'g': do global matches, 'm': pattern is multi line, + // 'u': pattern is full Unicode, not just BMP, 's': '.' in pattern matches + // all characters including line terminators. + enum Flags { + kNone = 0, + kGlobal = 1, + kIgnoreCase = 2, + kMultiLine = 4, + kUnicode = 8, + kDotAll = 16, + }; + + static const int kDefaultFlags = 0; + + RegExpFlags() : value_(kDefaultFlags) {} + explicit RegExpFlags(int value) : value_(value) {} + + inline bool IsGlobal() const { return (value_ & kGlobal) != 0; } + inline bool IgnoreCase() const { return (value_ & kIgnoreCase) != 0; } + inline bool IsMultiLine() const { return (value_ & kMultiLine) != 0; } + inline bool IsUnicode() const { return (value_ & kUnicode) != 0; } + inline bool IsDotAll() const { return (value_ & kDotAll) != 0; } + + inline bool NeedsUnicodeCaseEquivalents() { + // Both unicode and ignore_case flags are set. We need to use ICU to find + // the closure over case equivalents. 
+ return IsUnicode() && IgnoreCase(); + } + + void SetGlobal() { value_ |= kGlobal; } + void SetIgnoreCase() { value_ |= kIgnoreCase; } + void SetMultiLine() { value_ |= kMultiLine; } + void SetUnicode() { value_ |= kUnicode; } + void SetDotAll() { value_ |= kDotAll; } + + const char* ToCString() const; + + int value() const { return value_; } + + bool operator==(const RegExpFlags& other) { return value_ == other.value_; } + bool operator!=(const RegExpFlags& other) { return value_ != other.value_; } + + private: + int value_; +}; + // Internal JavaScript regular expression object. class RegExp : public Instance { public: @@ -9135,20 +9184,11 @@ class RegExp : public Instance { kComplex = 2, }; - // Flags are passed to a regex object as follows: - // 'i': ignore case, 'g': do global matches, 'm': pattern is multi line. - enum Flags { - kNone = 0, - kGlobal = 1, - kIgnoreCase = 2, - kMultiLine = 4, - }; - enum { kTypePos = 0, kTypeSize = 2, kFlagsPos = 2, - kFlagsSize = 4, + kFlagsSize = 5, }; class TypeBits : public BitField {}; @@ -9158,11 +9198,10 @@ class RegExp : public Instance { bool is_simple() const { return (type() == kSimple); } bool is_complex() const { return (type() == kComplex); } - bool is_global() const { return (flags() & kGlobal); } - bool is_ignore_case() const { return (flags() & kIgnoreCase); } - bool is_multi_line() const { return (flags() & kMultiLine); } - - intptr_t num_registers() const { return raw_ptr()->num_registers_; } + intptr_t num_registers(bool is_one_byte) const { + return is_one_byte ? 
raw_ptr()->num_one_byte_registers_ + : raw_ptr()->num_two_byte_registers_; + } RawString* pattern() const { return raw_ptr()->pattern_; } RawSmi* num_bracket_expressions() const { @@ -9226,15 +9265,48 @@ class RegExp : public Instance { void set_num_bracket_expressions(intptr_t value) const; void set_capture_name_map(const Array& array) const; - void set_is_global() const { set_flags(flags() | kGlobal); } - void set_is_ignore_case() const { set_flags(flags() | kIgnoreCase); } - void set_is_multi_line() const { set_flags(flags() | kMultiLine); } + void set_is_global() const { + RegExpFlags f = flags(); + f.SetGlobal(); + set_flags(f); + } + void set_is_ignore_case() const { + RegExpFlags f = flags(); + f.SetIgnoreCase(); + set_flags(f); + } + void set_is_multi_line() const { + RegExpFlags f = flags(); + f.SetMultiLine(); + set_flags(f); + } + void set_is_unicode() const { + RegExpFlags f = flags(); + f.SetUnicode(); + set_flags(f); + } + void set_is_dot_all() const { + RegExpFlags f = flags(); + f.SetDotAll(); + set_flags(f); + } void set_is_simple() const { set_type(kSimple); } void set_is_complex() const { set_type(kComplex); } - void set_num_registers(intptr_t value) const { - StoreNonPointer(&raw_ptr()->num_registers_, value); + void set_num_registers(bool is_one_byte, intptr_t value) const { + if (is_one_byte) { + StoreNonPointer(&raw_ptr()->num_one_byte_registers_, value); + } else { + StoreNonPointer(&raw_ptr()->num_two_byte_registers_, value); + } } + RegExpFlags flags() const { + return RegExpFlags(FlagsBits::decode(raw_ptr()->type_flags_)); + } + void set_flags(RegExpFlags flags) const { + StoreNonPointer(&raw_ptr()->type_flags_, + FlagsBits::update(flags.value(), raw_ptr()->type_flags_)); + } const char* Flags() const; virtual bool CanonicalizeEquals(const Instance& other) const; @@ -9250,13 +9322,8 @@ class RegExp : public Instance { StoreNonPointer(&raw_ptr()->type_flags_, TypeBits::update(type, raw_ptr()->type_flags_)); } - void set_flags(intptr_t 
value) const { - StoreNonPointer(&raw_ptr()->type_flags_, - FlagsBits::update(value, raw_ptr()->type_flags_)); - } RegExType type() const { return TypeBits::decode(raw_ptr()->type_flags_); } - intptr_t flags() const { return FlagsBits::decode(raw_ptr()->type_flags_); } FINAL_HEAP_OBJECT_IMPLEMENTATION(RegExp, Instance); friend class Class; diff --git a/runtime/vm/object_service.cc b/runtime/vm/object_service.cc index 0397f6e3181..9895c421f4c 100644 --- a/runtime/vm/object_service.cc +++ b/runtime/vm/object_service.cc @@ -1487,8 +1487,8 @@ void RegExp::PrintJSONImpl(JSONStream* stream, bool ref) const { return; } - jsobj.AddProperty("isCaseSensitive", !is_ignore_case()); - jsobj.AddProperty("isMultiLine", is_multi_line()); + jsobj.AddProperty("isCaseSensitive", !flags().IgnoreCase()); + jsobj.AddProperty("isMultiLine", flags().IsMultiLine()); if (!FLAG_interpret_irregexp) { Function& func = Function::Handle(); diff --git a/runtime/vm/raw_object.h b/runtime/vm/raw_object.h index 05d60ddf6dc..75e7a92d4db 100644 --- a/runtime/vm/raw_object.h +++ b/runtime/vm/raw_object.h @@ -2428,11 +2428,17 @@ class RawRegExp : public RawInstance { VISIT_TO(RawObject*, external_two_byte_sticky_function_) RawObject** to_snapshot(Snapshot::Kind kind) { return to(); } - intptr_t num_registers_; + // The same pattern may use different amount of registers if compiled + // for a one-byte target than a two-byte target. For example, we do not + // need to allocate registers to check whether the current position is within + // a surrogate pair when matching a Unicode pattern against a one-byte string. + intptr_t num_one_byte_registers_; + intptr_t num_two_byte_registers_; // A bitfield with two fields: // type: Uninitialized, simple or complex. - // flags: Represents global/local, case insensitive, multiline. + // flags: Represents global/local, case insensitive, multiline, unicode, + // dotAll. 
int8_t type_flags_; }; diff --git a/runtime/vm/raw_object_snapshot.cc b/runtime/vm/raw_object_snapshot.cc index 20e39835386..72ef288d6f7 100644 --- a/runtime/vm/raw_object_snapshot.cc +++ b/runtime/vm/raw_object_snapshot.cc @@ -2172,7 +2172,9 @@ RawRegExp* RegExp::ReadFrom(SnapshotReader* reader, *reader->StringHandle() ^= reader->ReadObjectImpl(kAsInlinedObject); regex.set_pattern(*reader->StringHandle()); - regex.StoreNonPointer(&regex.raw_ptr()->num_registers_, + regex.StoreNonPointer(&regex.raw_ptr()->num_one_byte_registers_, + reader->Read()); + regex.StoreNonPointer(&regex.raw_ptr()->num_two_byte_registers_, reader->Read()); regex.StoreNonPointer(&regex.raw_ptr()->type_flags_, reader->Read()); @@ -2202,7 +2204,8 @@ void RawRegExp::WriteTo(SnapshotWriter* writer, // Write out all the other fields. writer->Write(ptr()->num_bracket_expressions_); writer->WriteObjectImpl(ptr()->pattern_, kAsInlinedObject); - writer->Write(ptr()->num_registers_); + writer->Write(ptr()->num_one_byte_registers_); + writer->Write(ptr()->num_two_byte_registers_); writer->Write(ptr()->type_flags_); } diff --git a/runtime/vm/regexp.cc b/runtime/vm/regexp.cc index dd559bd9ee1..23eff8f9daa 100644 --- a/runtime/vm/regexp.cc +++ b/runtime/vm/regexp.cc @@ -4,7 +4,11 @@ #include "vm/regexp.h" +#include "platform/splay-tree-inl.h" #include "platform/unicode.h" + +#include "unicode/uniset.h" + #include "vm/dart_entry.h" #include "vm/regexp_assembler.h" #include "vm/regexp_assembler_bytecode.h" @@ -25,14 +29,14 @@ static const bool kRegexpOptimization = true; static const intptr_t kMaxLookaheadForBoyerMoore = 8; ContainedInLattice AddRange(ContainedInLattice containment, - const intptr_t* ranges, + const int32_t* ranges, intptr_t ranges_length, Interval new_range) { ASSERT((ranges_length & 1) == 1); - ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1); + ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1); if (containment == kLatticeUnknown) return containment; bool inside = false; -
intptr_t last = 0; + int32_t last = 0; for (intptr_t i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { // Consider the range from last to ranges[i]. @@ -282,10 +286,26 @@ class FrequencyCollator : public ValueObject { class RegExpCompiler : public ValueObject { public: - RegExpCompiler(intptr_t capture_count, bool ignore_case, bool is_one_byte); + RegExpCompiler(intptr_t capture_count, bool is_one_byte); intptr_t AllocateRegister() { return next_register_++; } + // Lookarounds to match lone surrogates for unicode character class matches + // are never nested. We can therefore reuse registers. + intptr_t UnicodeLookaroundStackRegister() { + if (unicode_lookaround_stack_register_ == kNoRegister) { + unicode_lookaround_stack_register_ = AllocateRegister(); + } + return unicode_lookaround_stack_register_; + } + + intptr_t UnicodeLookaroundPositionRegister() { + if (unicode_lookaround_position_register_ == kNoRegister) { + unicode_lookaround_position_register_ = AllocateRegister(); + } + return unicode_lookaround_position_register_; + } + #if !defined(DART_PRECOMPILED_RUNTIME) RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler, RegExpNode* start, @@ -315,7 +335,6 @@ class RegExpCompiler : public ValueObject { void SetRegExpTooBig() { reg_exp_too_big_ = true; } - inline bool ignore_case() { return ignore_case_; } inline bool one_byte() const { return is_one_byte_; } bool read_backward() { return read_backward_; } void set_read_backward(bool value) { read_backward_ = value; } @@ -333,10 +352,11 @@ class RegExpCompiler : public ValueObject { private: EndNode* accept_; intptr_t next_register_; + intptr_t unicode_lookaround_stack_register_; + intptr_t unicode_lookaround_position_register_; ZoneGrowableArray* work_list_; intptr_t recursion_depth_; RegExpMacroAssembler* macro_assembler_; - bool ignore_case_; bool is_one_byte_; bool reg_exp_too_big_; bool read_backward_; @@ -362,13 +382,12 @@ static RegExpEngine::CompilationResult 
IrregexpRegExpTooBig() { // Attempts to compile the regexp using an Irregexp code generator. Returns // a fixed array or a null handle depending on whether it succeeded. -RegExpCompiler::RegExpCompiler(intptr_t capture_count, - bool ignore_case, - bool is_one_byte) +RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte) : next_register_(2 * (capture_count + 1)), + unicode_lookaround_stack_register_(kNoRegister), + unicode_lookaround_position_register_(kNoRegister), work_list_(NULL), recursion_depth_(0), - ignore_case_(ignore_case), is_one_byte_(is_one_byte), reg_exp_too_big_(false), read_backward_(false), @@ -999,7 +1018,7 @@ static inline bool EmitAtomLetter(Zone* zone, } static void EmitBoundaryTest(RegExpMacroAssembler* masm, - intptr_t border, + uint16_t border, BlockLabel* fall_through, BlockLabel* above_or_equal, BlockLabel* below) { @@ -1012,8 +1031,8 @@ static void EmitBoundaryTest(RegExpMacroAssembler* masm, } static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, - intptr_t first, - intptr_t last, + uint16_t first, + uint16_t last, BlockLabel* fall_through, BlockLabel* in_range, BlockLabel* out_of_range) { @@ -1036,10 +1055,10 @@ static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. 
static void EmitUseLookupTable(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, - intptr_t min_char, + uint16_t min_char, BlockLabel* fall_through, BlockLabel* even_label, BlockLabel* odd_label) { @@ -1093,7 +1112,7 @@ static void EmitUseLookupTable(RegExpMacroAssembler* masm, } static void CutOutRange(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, intptr_t cut_index, @@ -1119,17 +1138,17 @@ static void CutOutRange(RegExpMacroAssembler* masm, // Unicode case. Split the search space into kSize spaces that are handled // with recursion. -static void SplitSearchSpace(ZoneGrowableArray* ranges, +static void SplitSearchSpace(ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, intptr_t* new_start_index, intptr_t* new_end_index, - intptr_t* border) { + uint16_t* border) { static const intptr_t kSize = RegExpMacroAssembler::kTableSize; static const intptr_t kMask = RegExpMacroAssembler::kTableMask; - intptr_t first = ranges->At(start_index); - intptr_t last = ranges->At(end_index) - 1; + uint16_t first = ranges->At(start_index); + uint16_t last = ranges->At(end_index) - 1; *new_start_index = start_index; *border = (ranges->At(start_index) & ~kMask) + kSize; @@ -1189,7 +1208,7 @@ static void SplitSearchSpace(ZoneGrowableArray* ranges, // Either label can be NULL indicating backtracking. Either label can also be // equal to the fall_through label. 
static void GenerateBranches(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, @@ -1197,8 +1216,8 @@ static void GenerateBranches(RegExpMacroAssembler* masm, BlockLabel* fall_through, BlockLabel* even_label, BlockLabel* odd_label) { - intptr_t first = ranges->At(start_index); - intptr_t last = ranges->At(end_index) - 1; + uint16_t first = ranges->At(start_index); + uint16_t last = ranges->At(end_index) - 1; ASSERT(min_char < first); @@ -1258,7 +1277,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, intptr_t new_start_index = 0; intptr_t new_end_index = 0; - intptr_t border = 0; + uint16_t border = 0; SplitSearchSpace(ranges, start_index, end_index, &new_start_index, &new_end_index, &border); @@ -1315,7 +1334,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, CharacterRange::Canonicalize(ranges); } - intptr_t max_char; + uint16_t max_char; if (one_byte) { max_char = Symbols::kMaxOneCharCodeSymbol; } else { @@ -1326,7 +1345,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, intptr_t last_valid_range = range_count - 1; while (last_valid_range >= 0) { - CharacterRange& range = (*ranges)[last_valid_range]; + const CharacterRange& range = ranges->At(last_valid_range); if (range.from() <= max_char) { break; } @@ -1354,14 +1373,6 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, } return; } - if (last_valid_range == 0 && !cc->is_negated() && - ranges->At(0).IsEverything(max_char)) { - // This is a common case hit by non-anchored expressions. 
- if (check_offset) { - macro_assembler->CheckPosition(cp_offset, on_failure); - } - return; - } if (!preloaded) { macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); @@ -1378,25 +1389,24 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, // entry at zero which goes to the failure label, but if there // was already one there we fall through for success on that entry. // Subsequent entries have alternating meaning (success/failure). - ZoneGrowableArray* range_boundaries = - new (zone) ZoneGrowableArray(last_valid_range); + ZoneGrowableArray* range_boundaries = + new (zone) ZoneGrowableArray(last_valid_range); bool zeroth_entry_is_failure = !cc->is_negated(); for (intptr_t i = 0; i <= last_valid_range; i++) { - CharacterRange& range = (*ranges)[i]; + const CharacterRange& range = ranges->At(i); if (range.from() == 0) { ASSERT(i == 0); zeroth_entry_is_failure = !zeroth_entry_is_failure; } else { range_boundaries->Add(range.from()); } - range_boundaries->Add(range.to() + 1); + if (range.to() + 1 <= max_char) { + range_boundaries->Add(range.to() + 1); + } } intptr_t end_index = range_boundaries->length() - 1; - if (range_boundaries->At(end_index) > max_char) { - end_index--; - } BlockLabel fall_through; GenerateBranches(macro_assembler, range_boundaries, @@ -1694,7 +1704,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, if (read_backward()) return; ASSERT(characters_filled_in < details->characters()); intptr_t characters = details->characters(); - intptr_t char_mask; + int32_t char_mask; if (compiler->one_byte()) { char_mask = Symbols::kMaxOneCharCodeSymbol; } else { @@ -1719,7 +1729,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, pos->determines_perfectly = false; return; } - if (compiler->ignore_case()) { + if (elm.atom()->ignore_case()) { int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; intptr_t length = GetCaseIndependentLetters(c, compiler->one_byte(), chars); @@ -1769,6 
+1779,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, details->positions(characters_filled_in); RegExpCharacterClass* tree = elm.char_class(); ZoneGrowableArray* ranges = tree->ranges(); + ASSERT(!ranges->is_empty()); if (tree->is_negated()) { // A quick check uses multi-character mask and compare. There is no // useful way to incorporate a negative char class into this scheme @@ -1907,16 +1918,16 @@ class VisitMarker : public ValueObject { NodeInfo* info_; }; -RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth, bool ignore_case) { +RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; ASSERT(!info()->visited); VisitMarker marker(info()); - return FilterSuccessor(depth - 1, ignore_case); + return FilterSuccessor(depth - 1); } -RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); +RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth) { + RegExpNode* next = on_success_->FilterOneByte(depth - 1); if (next == NULL) return set_replacement(NULL); on_success_ = next; return set_replacement(this); @@ -1953,7 +1964,7 @@ static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) { return 0; } -RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) { +RegExpNode* TextNode::FilterOneByte(intptr_t depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; ASSERT(!info()->visited); @@ -1966,7 +1977,7 @@ RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) { for (intptr_t j = 0; j < quarks->length(); j++) { uint16_t c = quarks->At(j); if (c <= Symbols::kMaxOneCharCodeSymbol) continue; - if (!ignore_case) return set_replacement(NULL); + if (!elm.atom()->ignore_case()) return set_replacement(NULL); // Here, we need to check for characters whose upper and lower cases // are outside the Latin-1 range. 
uint16_t converted = ConvertNonLatin1ToLatin1(c); @@ -1988,40 +1999,44 @@ RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) { if (range_count != 0 && ranges->At(0).from() == 0 && ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) { // This will be handled in a later filter. - if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; + if (cc->flags().IgnoreCase() && + RangesContainLatin1Equivalents(ranges)) { + continue; + } return set_replacement(NULL); } } else { if (range_count == 0 || ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) { // This will be handled in a later filter. - if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; + if (cc->flags().IgnoreCase() && + RangesContainLatin1Equivalents(ranges)) + continue; return set_replacement(NULL); } } } } - return FilterSuccessor(depth - 1, ignore_case); + return FilterSuccessor(depth - 1); } -RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { +RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; { VisitMarker marker(info()); - RegExpNode* continue_replacement = - continue_node_->FilterOneByte(depth - 1, ignore_case); + RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); // If we can't continue after the loop then there is no sense in doing the // loop. 
if (continue_replacement == NULL) return set_replacement(NULL); } - return ChoiceNode::FilterOneByte(depth - 1, ignore_case); + return ChoiceNode::FilterOneByte(depth - 1); } -RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { +RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2040,8 +2055,7 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { RegExpNode* survivor = NULL; for (intptr_t i = 0; i < choice_count; i++) { GuardedAlternative alternative = alternatives_->At(i); - RegExpNode* replacement = - alternative.node()->FilterOneByte(depth - 1, ignore_case); + RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. if (replacement != NULL) { (*alternatives_)[i].set_node(replacement); @@ -2061,7 +2075,7 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { new (Z) ZoneGrowableArray(surviving); for (intptr_t i = 0; i < choice_count; i++) { RegExpNode* replacement = - (*alternatives_)[i].node()->FilterOneByte(depth - 1, ignore_case); + (*alternatives_)[i].node()->FilterOneByte(depth - 1); if (replacement != NULL) { (*alternatives_)[i].set_node(replacement); new_alternatives->Add((*alternatives_)[i]); @@ -2071,8 +2085,7 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { return this; } -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth, - bool ignore_case) { +RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2080,12 +2093,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth, // Alternative 0 is the negative lookahead, alternative 1 is what comes // afterwards. 
RegExpNode* node = (*alternatives_)[1].node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); + RegExpNode* replacement = node->FilterOneByte(depth - 1); if (replacement == NULL) return set_replacement(NULL); (*alternatives_)[1].set_node(replacement); RegExpNode* neg_node = (*alternatives_)[0].node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); // If the negative lookahead is always going to fail then // we don't need to check it. if (neg_replacement == NULL) return set_replacement(replacement); @@ -2383,13 +2396,21 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, if (elm.text_type() == TextElement::ATOM) { ZoneGrowableArray* quarks = elm.atom()->data(); for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) { + if (SkipPass(pass, elm.atom()->ignore_case())) continue; if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; EmitCharacterFunction* emit_function = NULL; + uint16_t quark = quarks->At(j); + if (elm.atom()->ignore_case()) { + // Everywhere else we assume that a non-Latin-1 character cannot match + // a Latin-1 character. Avoid the cases where this assumption is + // invalid by using the Latin1 equivalent instead. 
+ quark = Latin1::TryConvertToLatin1(quark); + } switch (pass) { case NON_LATIN1_MATCH: ASSERT(one_byte); - if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) { + if (quark > Symbols::kMaxOneCharCodeSymbol) { assembler->GoTo(backtrack); return; } @@ -2445,6 +2466,34 @@ bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) { } } +TextNode* TextNode::CreateForCharacterRanges( + ZoneGrowableArray* ranges, + bool read_backward, + RegExpNode* on_success, + RegExpFlags flags) { + ASSERT(ranges != nullptr); + ZoneGrowableArray* elms = new ZoneGrowableArray(1); + elms->Add(TextElement::CharClass(new RegExpCharacterClass(ranges, flags))); + return new TextNode(elms, read_backward, on_success); +} + +TextNode* TextNode::CreateForSurrogatePair(CharacterRange lead, + CharacterRange trail, + bool read_backward, + RegExpNode* on_success, + RegExpFlags flags) { + auto lead_ranges = CharacterRange::List(on_success->zone(), lead); + auto trail_ranges = CharacterRange::List(on_success->zone(), trail); + auto elms = new ZoneGrowableArray(2); + + elms->Add( + TextElement::CharClass(new RegExpCharacterClass(lead_ranges, flags))); + elms->Add( + TextElement::CharClass(new RegExpCharacterClass(trail_ranges, flags))); + + return new TextNode(elms, read_backward, on_success); +} + // This generates the code to match a text node. A text node can contain // straight character sequences (possibly to be matched in a case-independent // way) and character classes. For efficiency we do not do this in a single @@ -2474,19 +2523,15 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { // check that now. 
if (trace->characters_preloaded() == 1) { for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { - if (!SkipPass(pass, compiler->ignore_case())) { - TextEmitPass(compiler, static_cast(pass), true, trace, - false, &bound_checked_to); - } + TextEmitPass(compiler, static_cast(pass), true, trace, + false, &bound_checked_to); } first_elt_done = true; } for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { - if (!SkipPass(pass, compiler->ignore_case())) { - TextEmitPass(compiler, static_cast(pass), false, trace, - first_elt_done, &bound_checked_to); - } + TextEmitPass(compiler, static_cast(pass), false, trace, + first_elt_done, &bound_checked_to); } Trace successor_trace(*trace); @@ -2528,13 +2573,13 @@ void TextNode::MakeCaseIndependent(bool is_one_byte) { TextElement elm = elms_->At(i); if (elm.text_type() == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.char_class(); - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard()) continue; - ZoneGrowableArray* ranges = cc->ranges(); - intptr_t range_count = ranges->length(); - for (intptr_t j = 0; j < range_count; j++) { - (*ranges)[j].AddCaseEquivalents(ranges, is_one_byte, Z); + bool case_equivalents_already_added = + cc->flags().NeedsUnicodeCaseEquivalents(); + if (cc->flags().IgnoreCase() && !case_equivalents_already_added) { + // None of the standard character classes is different in the case + // independent case and it slows us down if we don't know that. + if (cc->is_standard()) continue; + CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z); } } } @@ -2574,7 +2619,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( // length nodes or other complications in the way then return a sentinel // value indicating that a greedy loop cannot be constructed. 
intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative( - GuardedAlternative* alternative) { + const GuardedAlternative* alternative) { intptr_t length = 0; RegExpNode* node = alternative->node(); // Later we will generate code for all these text nodes using recursion @@ -2612,7 +2657,7 @@ void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { if (trace->stop_node() == this) { // Back edge of greedy optimized loop node graph. intptr_t text_length = - GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); + GreedyLoopTextLengthForAlternative(&alternatives_->At(0)); ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); // Update the counter-based backtracking info on the stack. This is an // optimization for greedy loops (see below). @@ -2693,25 +2738,27 @@ class AlternativeGenerationList { DISALLOW_ALLOCATION(); }; +static const int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1; + // The '2' variant is inclusive from and exclusive to. // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, // which include WhiteSpace (7.2) or LineTerminator (7.3) values. // 0x180E has been removed from Unicode's Zs category and thus // from ECMAScript's WhiteSpace category as of Unicode 6.3. 
-static const intptr_t kSpaceRanges[] = { +static const int32_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, - 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000}; + 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); -static const intptr_t kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', - '_' + 1, 'a', 'z' + 1, 0x10000}; +static const int32_t kWordRanges[] = { + '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges); -static const intptr_t kDigitRanges[] = {'0', '9' + 1, 0x10000}; +static const int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges); -static const intptr_t kSurrogateRanges[] = {0xd800, 0xe000, 0x10000}; +static const int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker}; static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); -static const intptr_t kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, - 0x2028, 0x202A, 0x10000}; +static const int32_t kLineTerminatorRanges[] = { + 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; static const intptr_t kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); @@ -3057,6 +3104,11 @@ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { intptr_t choice_count = alternatives_->length(); + if (choice_count == 1 && alternatives_->At(0).guards() == nullptr) { + alternatives_->At(0).node()->Emit(compiler, trace); + return; + } + AssertGuardsMentionRegisters(trace); LimitResult limit_result = LimitVersions(compiler, trace); @@ -3077,7 +3129,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { GreedyLoopState greedy_loop_state(not_at_start()); intptr_t 
text_length = - GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); + GreedyLoopTextLengthForAlternative(&alternatives_->At(0)); AlternativeGenerationList alt_gens(choice_count); if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { @@ -3463,9 +3515,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); ASSERT(start_reg_ + 1 == end_reg_); - if (compiler->ignore_case()) { - assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), - trace->backtrack()); + if (flags_.IgnoreCase()) { + assembler->CheckNotBackReferenceIgnoreCase( + start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3473,6 +3525,11 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { // We are going to advance backward, so we may end up at the start. if (read_backward()) trace->set_at_start(Trace::UNKNOWN); + // Check that the back reference does not end inside a surrogate pair. + if (flags_.IsUnicode() && !compiler->one_byte()) { + assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); + } + on_success()->Emit(compiler, trace); } @@ -3731,10 +3788,10 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, } static bool CompareInverseRanges(ZoneGrowableArray* ranges, - const intptr_t* special_class, + const int32_t* special_class, intptr_t length) { - length--; // Remove final 0x10000. - ASSERT(special_class[length] == 0x10000); + length--; // Remove final kRangeEndMarker. 
+ ASSERT(special_class[length] == kRangeEndMarker); ASSERT(ranges->length() != 0); ASSERT(length != 0); ASSERT(special_class[0] != 0); @@ -3754,17 +3811,17 @@ static bool CompareInverseRanges(ZoneGrowableArray* ranges, return false; } } - if (range.to() != 0xffff) { + if (range.to() != Utf::kMaxCodePoint) { return false; } return true; } static bool CompareRanges(ZoneGrowableArray* ranges, - const intptr_t* special_class, + const int32_t* special_class, intptr_t length) { - length--; // Remove final 0x10000. - ASSERT(special_class[length] == 0x10000); + length--; // Remove final kRangeEndMarker. + ASSERT(special_class[length] == kRangeEndMarker); if (ranges->length() * 2 != length) { return false; } @@ -3781,7 +3838,7 @@ static bool CompareRanges(ZoneGrowableArray* ranges, bool RegExpCharacterClass::is_standard() { // TODO(lrn): Remove need for this function, by not throwing away information // along the way. - if (is_negated_) { + if (is_negated()) { return false; } if (set_.is_standard()) { @@ -3816,8 +3873,307 @@ bool RegExpCharacterClass::is_standard() { return false; } +UnicodeRangeSplitter::UnicodeRangeSplitter( + Zone* zone, + ZoneGrowableArray* base) + : zone_(zone), + table_(zone), + bmp_(nullptr), + lead_surrogates_(nullptr), + trail_surrogates_(nullptr), + non_bmp_(nullptr) { + // The unicode range splitter categorizes given character ranges into: + // - Code points from the BMP representable by one code unit. + // - Code points outside the BMP that need to be split into surrogate pairs. + // - Lone lead surrogates. + // - Lone trail surrogates. + // Lone surrogates are valid code points, even though no actual characters. + // They require special matching to make sure we do not split surrogate pairs. + // We use the dispatch table to accomplish this. The base range is split up + // by the table by the overlay ranges, and the Call callback is used to + // filter and collect ranges for each category. 
+ for (intptr_t i = 0; i < base->length(); i++) { + table_.AddRange(base->At(i), kBase, zone_); + } + // Add overlay ranges. + table_.AddRange(CharacterRange::Range(0, Utf16::kLeadSurrogateStart - 1), + kBmpCodePoints, zone_); + table_.AddRange(CharacterRange::Range(Utf16::kLeadSurrogateStart, + Utf16::kLeadSurrogateEnd), + kLeadSurrogates, zone_); + table_.AddRange(CharacterRange::Range(Utf16::kTrailSurrogateStart, + Utf16::kTrailSurrogateEnd), + kTrailSurrogates, zone_); + table_.AddRange( + CharacterRange::Range(Utf16::kTrailSurrogateEnd + 1, Utf16::kMaxCodeUnit), + kBmpCodePoints, zone_); + table_.AddRange( + CharacterRange::Range(Utf16::kMaxCodeUnit + 1, Utf::kMaxCodePoint), + kNonBmpCodePoints, zone_); + table_.ForEach(this); +} + +void UnicodeRangeSplitter::Call(uint32_t from, DispatchTable::Entry entry) { + OutSet* outset = entry.out_set(); + if (!outset->Get(kBase)) return; + ZoneGrowableArray** target = nullptr; + if (outset->Get(kBmpCodePoints)) { + target = &bmp_; + } else if (outset->Get(kLeadSurrogates)) { + target = &lead_surrogates_; + } else if (outset->Get(kTrailSurrogates)) { + target = &trail_surrogates_; + } else { + ASSERT(outset->Get(kNonBmpCodePoints)); + target = &non_bmp_; + } + if (*target == nullptr) { + *target = new (zone_) ZoneGrowableArray(2); + } + (*target)->Add(CharacterRange::Range(entry.from(), entry.to())); +} + +void AddBmpCharacters(RegExpCompiler* compiler, + ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + ZoneGrowableArray* bmp = splitter->bmp(); + if (bmp == nullptr) return; + result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( + bmp, compiler->read_backward(), on_success, RegExpFlags()))); +} + +void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, + ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + ZoneGrowableArray* non_bmp = splitter->non_bmp(); + if (non_bmp == nullptr) return; + ASSERT(!compiler->one_byte()); + 
CharacterRange::Canonicalize(non_bmp); + for (int i = 0; i < non_bmp->length(); i++) { + // Match surrogate pair. + // E.g. [\u10005-\u11005] becomes + // \ud800[\udc05-\udfff]| + // [\ud801-\ud803][\udc00-\udfff]| + // \ud804[\udc00-\udc05] + uint32_t from = non_bmp->At(i).from(); + uint32_t to = non_bmp->At(i).to(); + uint16_t from_points[2]; + Utf16::Encode(from, from_points); + uint16_t to_points[2]; + Utf16::Encode(to, to_points); + if (from_points[0] == to_points[0]) { + // The lead surrogate is the same. + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + CharacterRange::Singleton(from_points[0]), + CharacterRange::Range(from_points[1], to_points[1]), + compiler->read_backward(), on_success, RegExpFlags()))); + } else { + if (from_points[1] != Utf16::kTrailSurrogateStart) { + // Add [from_l][from_t-\udfff] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + CharacterRange::Singleton(from_points[0]), + CharacterRange::Range(from_points[1], + Utf16::kTrailSurrogateEnd), + compiler->read_backward(), on_success, RegExpFlags()))); + from_points[0]++; + } + if (to_points[1] != Utf16::kTrailSurrogateEnd) { + // Add [to_l][\udc00-to_t] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + CharacterRange::Singleton(to_points[0]), + CharacterRange::Range(Utf16::kTrailSurrogateStart, + to_points[1]), + compiler->read_backward(), on_success, RegExpFlags()))); + to_points[0]--; + } + if (from_points[0] <= to_points[0]) { + // Add [from_l-to_l][\udc00-\udfff] + result->AddAlternative( + GuardedAlternative(TextNode::CreateForSurrogatePair( + CharacterRange::Range(from_points[0], to_points[0]), + CharacterRange::Range(Utf16::kTrailSurrogateStart, + Utf16::kTrailSurrogateEnd), + compiler->read_backward(), on_success, RegExpFlags()))); + } + } + } +} + +RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( + RegExpCompiler* compiler, + ZoneGrowableArray* lookbehind, + 
ZoneGrowableArray* match, + RegExpNode* on_success, + bool read_backward, + RegExpFlags flags) { + RegExpNode* match_node = TextNode::CreateForCharacterRanges( + match, read_backward, on_success, flags); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpLookaround::Builder lookaround(false, match_node, stack_register, + position_register); + RegExpNode* negative_match = TextNode::CreateForCharacterRanges( + lookbehind, !read_backward, lookaround.on_match_success(), flags); + return lookaround.ForMatch(negative_match); +} + +RegExpNode* MatchAndNegativeLookaroundInReadDirection( + RegExpCompiler* compiler, + ZoneGrowableArray* match, + ZoneGrowableArray* lookahead, + RegExpNode* on_success, + bool read_backward, + RegExpFlags flags) { + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpLookaround::Builder lookaround(false, on_success, stack_register, + position_register); + RegExpNode* negative_match = TextNode::CreateForCharacterRanges( + lookahead, read_backward, lookaround.on_match_success(), flags); + return TextNode::CreateForCharacterRanges( + match, read_backward, lookaround.ForMatch(negative_match), flags); +} + +void AddLoneLeadSurrogates(RegExpCompiler* compiler, + ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + auto lead_surrogates = splitter->lead_surrogates(); + if (lead_surrogates == nullptr) return; + // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). + auto trail_surrogates = CharacterRange::List( + on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart, + Utf16::kTrailSurrogateEnd)); + + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Assert that reading forward, there is no trail + // surrogate, and then backward match the lead surrogate. 
+ match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, trail_surrogates, lead_surrogates, on_success, true, + RegExpFlags()); + } else { + // Reading forward. Forward match the lead surrogate and assert that + // no trail surrogate follows. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, lead_surrogates, trail_surrogates, on_success, false, + RegExpFlags()); + } + result->AddAlternative(GuardedAlternative(match)); +} + +void AddLoneTrailSurrogates(RegExpCompiler* compiler, + ChoiceNode* result, + RegExpNode* on_success, + UnicodeRangeSplitter* splitter) { + auto trail_surrogates = splitter->trail_surrogates(); + if (trail_surrogates == nullptr) return; + // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01. + auto lead_surrogates = CharacterRange::List( + on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart, + Utf16::kLeadSurrogateEnd)); + + RegExpNode* match; + if (compiler->read_backward()) { + // Reading backward. Backward match the trail surrogate and assert that no + // lead surrogate precedes it. + match = MatchAndNegativeLookaroundInReadDirection( + compiler, trail_surrogates, lead_surrogates, on_success, true, + RegExpFlags()); + } else { + // Reading forward. Assert that reading backward, there is no lead + // surrogate, and then forward match the trail surrogate. + match = NegativeLookaroundAgainstReadDirectionAndMatch( + compiler, lead_surrogates, trail_surrogates, on_success, false, + RegExpFlags()); + } + result->AddAlternative(GuardedAlternative(match)); +} + +RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, + RegExpNode* on_success) { + // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. + ASSERT(!compiler->read_backward()); + // Advance any character. If the character happens to be a lead surrogate and + // we advanced into the middle of a surrogate pair, it will work out, as + // nothing will match from there. We will have to advance again, consuming + // the associated trail surrogate. 
+ auto range = CharacterRange::List( + on_success->zone(), CharacterRange::Range(0, Utf16::kMaxCodeUnit)); + return TextNode::CreateForCharacterRanges(range, false, on_success, + RegExpFlags()); +} + +void AddUnicodeCaseEquivalents(ZoneGrowableArray* ranges) { + ASSERT(CharacterRange::IsCanonical(ranges)); + + // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. + // See also https://crbug.com/v8/6727. + // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF} + // range, which we use frequently internally. But large ranges can also easily + // be created by the user. We might want to have a more general caching + // mechanism for such ranges. + if (ranges->length() == 1 && ranges->At(0).IsEverything(Utf::kMaxCodePoint)) { + return; + } + + icu::UnicodeSet set; + for (int i = 0; i < ranges->length(); i++) { + set.add(ranges->At(i).from(), ranges->At(i).to()); + } + ranges->Clear(); + set.closeOver(USET_CASE_INSENSITIVE); + // Full case mappings map single characters to multiple characters. + // Those are represented as strings in the set. Remove them so that + // we end up with only simple and common case mappings. + set.removeAllStrings(); + for (int i = 0; i < set.getRangeCount(); i++) { + ranges->Add( + CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i))); + } + // No errors and everything we collected have been ranges. 
+ CharacterRange::Canonicalize(ranges); +} + RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { + set_.Canonicalize(); + ZoneGrowableArray* ranges = this->ranges(); + if (flags_.NeedsUnicodeCaseEquivalents()) { + AddUnicodeCaseEquivalents(ranges); + } + if (flags_.IsUnicode() && !compiler->one_byte() && + !contains_split_surrogate()) { + if (is_negated()) { + ZoneGrowableArray* negated = + new ZoneGrowableArray(2); + CharacterRange::Negate(ranges, negated); + ranges = negated; + } + if (ranges->length() == 0) { + RegExpCharacterClass* fail = + new RegExpCharacterClass(ranges, RegExpFlags()); + return new TextNode(fail, compiler->read_backward(), on_success); + } + if (standard_type() == '*') { + return UnanchoredAdvance(compiler, on_success); + } else { + ChoiceNode* result = new (OZ) ChoiceNode(2, OZ); + UnicodeRangeSplitter splitter(OZ, ranges); + AddBmpCharacters(compiler, result, on_success, &splitter); + AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); + AddLoneLeadSurrogates(compiler, result, on_success, &splitter); + AddLoneTrailSurrogates(compiler, result, on_success, &splitter); + return result; + } + } else { + return new TextNode(this, compiler->read_backward(), on_success); + } return new (OZ) TextNode(this, compiler->read_backward(), on_success); } @@ -4014,6 +4370,43 @@ RegExpNode* RegExpQuantifier::ToNode(intptr_t min, } } +namespace { +// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and +// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) +RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpAssertion::AssertionType type, + RegExpFlags flags) { + ASSERT(flags.NeedsUnicodeCaseEquivalents()); + ZoneGrowableArray* word_range = + new ZoneGrowableArray(2); + CharacterRange::AddClassEscape('w', word_range, true); + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + ChoiceNode* 
result = new (OZ) ChoiceNode(2, OZ); + // Add two choices. The (non-)boundary could start with a word or + // a non-word-character. + for (int i = 0; i < 2; i++) { + bool lookbehind_for_word = i == 0; + bool lookahead_for_word = + (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; + // Look to the left. + RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, + stack_register, position_register); + RegExpNode* backward = TextNode::CreateForCharacterRanges( + word_range, true, lookbehind.on_match_success(), flags); + // Look to the right. + RegExpLookaround::Builder lookahead(lookahead_for_word, + lookbehind.ForMatch(backward), + stack_register, position_register); + RegExpNode* forward = TextNode::CreateForCharacterRanges( + word_range, false, lookahead.on_match_success(), flags); + result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); + } + return result; +} +} // anonymous namespace + RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { switch (assertion_type()) { @@ -4022,9 +4415,15 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, case START_OF_INPUT: return AssertionNode::AtStart(on_success); case BOUNDARY: - return AssertionNode::AtBoundary(on_success); + return flags_.NeedsUnicodeCaseEquivalents() + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, + flags_) + : AssertionNode::AtBoundary(on_success); case NON_BOUNDARY: - return AssertionNode::AtNonBoundary(on_success); + return flags_.NeedsUnicodeCaseEquivalents() + ? 
BoundaryAssertionAsLookaround(compiler, on_success, + NON_BOUNDARY, flags_) + : AssertionNode::AtNonBoundary(on_success); case END_OF_INPUT: return AssertionNode::AtEnd(on_success); case END_OF_LINE: { @@ -4039,7 +4438,8 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, ZoneGrowableArray* newline_ranges = new ZoneGrowableArray(3); CharacterRange::AddClassEscape('n', newline_ranges); - RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); + RegExpCharacterClass* newline_atom = + new RegExpCharacterClass('n', RegExpFlags()); TextNode* newline_matcher = new TextNode(newline_atom, /*read_backwards=*/false, ActionNode::PositiveSubmatchSuccess( @@ -4066,7 +4466,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return new (OZ) BackReferenceNode(RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), + RegExpCapture::EndRegister(index()), flags_, compiler->read_backward(), on_success); } @@ -4174,24 +4574,24 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, return current; } -static void AddClass(const intptr_t* elmv, +static void AddClass(const int32_t* elmv, intptr_t elmc, ZoneGrowableArray* ranges) { elmc--; - ASSERT(elmv[elmc] == 0x10000); + ASSERT(elmv[elmc] == kRangeEndMarker); for (intptr_t i = 0; i < elmc; i += 2) { ASSERT(elmv[i] < elmv[i + 1]); ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); } } -static void AddClassNegated(const intptr_t* elmv, +static void AddClassNegated(const int32_t* elmv, intptr_t elmc, ZoneGrowableArray* ranges) { elmc--; - ASSERT(elmv[elmc] == 0x10000); + ASSERT(elmv[elmc] == kRangeEndMarker); ASSERT(elmv[0] != 0x0000); - ASSERT(elmv[elmc - 1] != Utf16::kMaxCodeUnit); + ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint); uint16_t last = 0x0000; for (intptr_t i = 0; i < elmc; i += 2) { ASSERT(last <= elmv[i] - 1); @@ -4199,7 +4599,30 @@ static void 
AddClassNegated(const intptr_t* elmv, ranges->Add(CharacterRange(last, elmv[i] - 1)); last = elmv[i + 1]; } - ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit)); + ranges->Add(CharacterRange(last, Utf::kMaxCodePoint)); +} + +void CharacterRange::AddClassEscape(uint16_t type, + ZoneGrowableArray* ranges, + bool add_unicode_case_equivalents) { + if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { + // See #sec-runtime-semantics-wordcharacters-abstract-operation + // In case of unicode and ignore_case, we need to create the closure over + // case equivalent characters before negating. + ZoneGrowableArray* new_ranges = + new ZoneGrowableArray(2); + AddClass(kWordRanges, kWordRangeCount, new_ranges); + AddUnicodeCaseEquivalents(new_ranges); + if (type == 'W') { + ZoneGrowableArray* negated = + new ZoneGrowableArray(2); + CharacterRange::Negate(new_ranges, negated); + new_ranges = negated; + } + ranges->AddArray(*new_ranges); + return; + } + AddClassEscape(type, ranges); } void CharacterRange::AddClassEscape(uint16_t type, @@ -4246,69 +4669,79 @@ void CharacterRange::AddCaseEquivalents( ZoneGrowableArray* ranges, bool is_one_byte, Zone* zone) { - uint16_t bottom = from(); - uint16_t top = to(); - if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { - if (bottom > Symbols::kMaxOneCharCodeSymbol) return; - if (top > Symbols::kMaxOneCharCodeSymbol) { - top = Symbols::kMaxOneCharCodeSymbol; + CharacterRange::Canonicalize(ranges); + int range_count = ranges->length(); + for (intptr_t i = 0; i < range_count; i++) { + CharacterRange range = ranges->At(i); + int32_t bottom = range.from(); + if (bottom > Utf16::kMaxCodeUnit) continue; + int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit); + // Nothing to be done for surrogates + if (bottom >= Utf16::kLeadSurrogateStart && + top <= Utf16::kTrailSurrogateEnd) { + continue; + } + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { + if (bottom > Symbols::kMaxOneCharCodeSymbol) continue; 
+ if (top > Symbols::kMaxOneCharCodeSymbol) { + top = Symbols::kMaxOneCharCodeSymbol; + } } - } - unibrow::Mapping jsregexp_uncanonicalize; - unibrow::Mapping jsregexp_canonrange; - int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - if (top == bottom) { - // If this is a singleton we just expand the one character. - intptr_t length = - jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLINT - for (intptr_t i = 0; i < length; i++) { - uint32_t chr = chars[i]; - if (chr != bottom) { - ranges->Add(CharacterRange::Singleton(chars[i])); - } - } - } else { - // If this is a range we expand the characters block by block, - // expanding contiguous subranges (blocks) one at a time. - // The approach is as follows. For a given start character we - // look up the remainder of the block that contains it (represented - // by the end point), for instance we find 'z' if the character - // is 'c'. A block is characterized by the property - // that all characters uncanonicalize in the same way, except that - // each entry in the result is incremented by the distance from the first - // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and - // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. - // Once we've found the end point we look up its uncanonicalization - // and produce a range for each element. For instance for [c-f] - // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only - // add a range if it is not already contained in the input, so [c-f] - // will be skipped but [C-F] will be added. If this range is not - // completely contained in a block we do this for all the blocks - // covered by the range (handling characters that is not in a block - // as a "singleton block"). 
- int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - intptr_t pos = bottom; - while (pos <= top) { - intptr_t length = jsregexp_canonrange.get(pos, '\0', range); - uint16_t block_end; - if (length == 0) { - block_end = pos; - } else { - ASSERT(length == 1); - block_end = range[0]; - } - intptr_t end = (block_end > top) ? top : block_end; - length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT + unibrow::Mapping jsregexp_uncanonicalize; + unibrow::Mapping jsregexp_canonrange; + int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + if (top == bottom) { + // If this is a singleton we just expand the one character. + intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); for (intptr_t i = 0; i < length; i++) { - uint32_t c = range[i]; - uint16_t range_from = c - (block_end - pos); - uint16_t range_to = c - (block_end - end); - if (!(bottom <= range_from && range_to <= top)) { - ranges->Add(CharacterRange(range_from, range_to)); + int32_t chr = chars[i]; + if (chr != bottom) { + ranges->Add(CharacterRange::Singleton(chars[i])); } } - pos = end + 1; + } else { + // If this is a range we expand the characters block by block, + // expanding contiguous subranges (blocks) one at a time. + // The approach is as follows. For a given start character we + // look up the remainder of the block that contains it (represented + // by the end point), for instance we find 'z' if the character + // is 'c'. A block is characterized by the property + // that all characters uncanonicalize in the same way, except that + // each entry in the result is incremented by the distance from the first + // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] + // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. + // Once we've found the end point we look up its uncanonicalization + // and produce a range for each element. For instance for [c-f] + // we look up ['z', 'Z'] and produce [c-f] and [C-F]. 
We then only + // add a range if it is not already contained in the input, so [c-f] + // will be skipped but [C-F] will be added. If this range is not + // completely contained in a block we do this for all the blocks + // covered by the range (handling characters that is not in a block + // as a "singleton block"). + int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + intptr_t pos = bottom; + while (pos <= top) { + intptr_t length = jsregexp_canonrange.get(pos, '\0', range); + int32_t block_end; + if (length == 0) { + block_end = pos; + } else { + ASSERT(length == 1); + block_end = range[0]; + } + intptr_t end = (block_end > top) ? top : block_end; + length = jsregexp_uncanonicalize.get(block_end, '\0', range); + for (intptr_t i = 0; i < length; i++) { + int32_t c = range[i]; + int32_t range_from = c - (block_end - pos); + int32_t range_to = c - (block_end - end); + if (!(bottom <= range_from && range_to <= top)) { + ranges->Add(CharacterRange(range_from, range_to)); + } + } + pos = end + 1; + } } } } @@ -4361,8 +4794,8 @@ static intptr_t InsertRangeInCanonicalList( // list[0..count] for the result. Returns the number of resulting // canonicalized ranges. Inserting a range may collapse existing ranges into // fewer ranges, so the return value can be anything in the range 1..count+1. 
- uint16_t from = insert.from(); - uint16_t to = insert.to(); + int32_t from = insert.from(); + int32_t to = insert.to(); intptr_t start_pos = 0; intptr_t end_pos = count; for (intptr_t i = count - 1; i >= 0; i--) { @@ -4458,7 +4891,7 @@ void CharacterRange::Negate(ZoneGrowableArray* ranges, ASSERT(CharacterRange::IsCanonical(ranges)); ASSERT(negated_ranges->length() == 0); intptr_t range_count = ranges->length(); - uint16_t from = 0; + uint32_t from = 0; intptr_t i = 0; if (range_count > 0 && ranges->At(0).from() == 0) { from = ranges->At(0).to(); @@ -4470,8 +4903,8 @@ void CharacterRange::Negate(ZoneGrowableArray* ranges, from = range.to(); i++; } - if (from < Utf16::kMaxCodeUnit) { - negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit)); + if (from < Utf::kMaxCodePoint) { + negated_ranges->Add(CharacterRange(from + 1, Utf::kMaxCodePoint)); } } @@ -4488,6 +4921,22 @@ static bool ArrayContains(ZoneGrowableArray* array, unsigned value) { return false; } +OutSet* OutSet::Extend(unsigned value, Zone* zone) { + if (Get(value)) return this; + if (successors() != nullptr) { + for (int i = 0; i < successors()->length(); i++) { + OutSet* successor = successors()->At(i); + if (successor->Get(value)) return successor; + } + } else { + successors_ = new (zone) ZoneGrowableArray(2); + } + OutSet* result = new (zone) OutSet(first_, remaining_); + result->Set(value, zone); + successors()->Add(result); + return result; +} + void OutSet::Set(unsigned value, Zone* zone) { if (value < kFirstLimit) { first_ |= (1 << value); @@ -4512,6 +4961,108 @@ bool OutSet::Get(unsigned value) const { } } +const int32_t DispatchTable::Config::kNoKey = Utf::kInvalidChar; + +void DispatchTable::AddRange(CharacterRange full_range, + int32_t value, + Zone* zone) { + CharacterRange current = full_range; + if (tree()->is_empty()) { + // If this is the first range we just insert into the table. 
+ ZoneSplayTree::Locator loc; + bool inserted = tree()->Insert(current.from(), &loc); + ASSERT(inserted); + USE(inserted); + loc.set_value( + Entry(current.from(), current.to(), empty()->Extend(value, zone))); + return; + } + // First see if there is a range to the left of this one that + // overlaps. + ZoneSplayTree::Locator loc; + if (tree()->FindGreatestLessThan(current.from(), &loc)) { + Entry* entry = &loc.value(); + // If we've found a range that overlaps with this one, and it + // starts strictly to the left of this one, we have to fix it + // because the following code only handles ranges that start on + // or after the start point of the range we're adding. + if (entry->from() < current.from() && entry->to() >= current.from()) { + // Snap the overlapping range in half around the start point of + // the range we're adding. + CharacterRange left = + CharacterRange::Range(entry->from(), current.from() - 1); + CharacterRange right = CharacterRange::Range(current.from(), entry->to()); + // The left part of the overlapping range doesn't overlap. + // Truncate the whole entry to be just the left part. + entry->set_to(left.to()); + // The right part is the one that overlaps. We add this part + // to the map and let the next step deal with merging it with + // the range we're adding. + ZoneSplayTree::Locator loc; + bool inserted = tree()->Insert(right.from(), &loc); + ASSERT(inserted); + USE(inserted); + loc.set_value(Entry(right.from(), right.to(), entry->out_set())); + } + } + while (current.is_valid()) { + if (tree()->FindLeastGreaterThan(current.from(), &loc) && + (loc.value().from() <= current.to()) && + (loc.value().to() >= current.from())) { + Entry* entry = &loc.value(); + // We have overlap. If there is space between the start point of + // the range we're adding and where the overlapping range starts + // then we have to add a range covering just that space. 
+ if (current.from() < entry->from()) { + ZoneSplayTree::Locator ins; + bool inserted = tree()->Insert(current.from(), &ins); + ASSERT(inserted); + USE(inserted); + ins.set_value(Entry(current.from(), entry->from() - 1, + empty()->Extend(value, zone))); + current.set_from(entry->from()); + } + ASSERT(current.from() == entry->from()); + // If the overlapping range extends beyond the one we want to add + // we have to snap the right part off and add it separately. + if (entry->to() > current.to()) { + ZoneSplayTree::Locator ins; + bool inserted = tree()->Insert(current.to() + 1, &ins); + ASSERT(inserted); + USE(inserted); + ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set())); + entry->set_to(current.to()); + } + ASSERT(entry->to() <= current.to()); + // The overlapping range is now completely contained by the range + // we're adding so we can just update it and move the start point + // of the range we're adding just past it. + entry->AddValue(value, zone); + ASSERT(entry->to() + 1 > current.from()); + current.set_from(entry->to() + 1); + } else { + // There is no overlap so we can just add the range + ZoneSplayTree::Locator ins; + bool inserted = tree()->Insert(current.from(), &ins); + ASSERT(inserted); + USE(inserted); + ins.set_value( + Entry(current.from(), current.to(), empty()->Extend(value, zone))); + break; + } + } +} + +OutSet* DispatchTable::Get(int32_t value) { + ZoneSplayTree::Locator loc; + if (!tree()->FindGreatestLessThan(value, &loc)) return empty(); + Entry* entry = &loc.value(); + if (value <= entry->to()) + return entry->out_set(); + else + return empty(); +} + // ------------------------------------------------------------------- // Analysis @@ -4540,9 +5091,7 @@ void TextNode::CalculateOffsets() { } void Analysis::VisitText(TextNode* that) { - if (ignore_case_) { - that->MakeCaseIndependent(is_one_byte_); - } + that->MakeCaseIndependent(is_one_byte_); EnsureAnalyzed(that->on_success()); if (!has_failed()) { 
that->CalculateOffsets(); @@ -4649,7 +5198,7 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, return; } uint16_t character = atom->data()->At(j); - if (bm->compiler()->ignore_case()) { + if (atom->flags().IgnoreCase()) { int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; intptr_t length = GetCaseIndependentLetters( character, bm->max_char() == Symbols::kMaxOneCharCodeSymbol, @@ -4669,7 +5218,7 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, bm->SetAll(offset); } else { for (intptr_t k = 0; k < ranges->length(); k++) { - CharacterRange& range = (*ranges)[k]; + const CharacterRange& range = ranges->At(k); if (range.from() > max_char) continue; intptr_t to = Utils::Minimum(max_char, static_cast(range.to())); @@ -4688,6 +5237,40 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, if (initial_offset == 0) set_bm_info(not_at_start, bm); } +RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, + RegExpNode* on_success, + RegExpFlags flags) { + // If the regexp matching starts within a surrogate pair, step back + // to the lead surrogate and start matching from there. 
+ ASSERT(!compiler->read_backward()); + Zone* zone = compiler->zone(); + + auto lead_surrogates = CharacterRange::List( + on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart, + Utf16::kLeadSurrogateEnd)); + auto trail_surrogates = CharacterRange::List( + on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart, + Utf16::kTrailSurrogateEnd)); + + ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); + + int stack_register = compiler->UnicodeLookaroundStackRegister(); + int position_register = compiler->UnicodeLookaroundPositionRegister(); + RegExpNode* step_back = TextNode::CreateForCharacterRanges( + lead_surrogates, /*read_backward=*/true, on_success, flags); + RegExpLookaround::Builder builder(/*is_positive=*/true, step_back, + stack_register, position_register); + RegExpNode* match_trail = TextNode::CreateForCharacterRanges( + trail_surrogates, /*read_backward=*/false, builder.on_match_success(), + flags); + + optional_step_back->AddAlternative( + GuardedAlternative(builder.ForMatch(match_trail))); + optional_step_back->AddAlternative(GuardedAlternative(on_success)); + + return optional_step_back; +} + #if !defined(DART_PRECOMPILED_RUNTIME) RegExpEngine::CompilationResult RegExpEngine::CompileIR( RegExpCompileData* data, @@ -4708,10 +5291,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( ASSERT(!regexp.IsNull()); ASSERT(!pattern.IsNull()); - const bool ignore_case = regexp.is_ignore_case(); - const bool is_global = regexp.is_global(); + const bool is_global = regexp.flags().IsGlobal(); + const bool is_unicode = regexp.flags().IsUnicode(); - RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte); + RegExpCompiler compiler(data->capture_count, is_one_byte); // TODO(zerny): Frequency sampling is currently disabled because of several // issues. 
We do not want to store subject strings in the regexp object since @@ -4734,34 +5317,37 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or is sticky. RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), - &compiler, captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, + new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler, + captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start // at the start of input. ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); - first_step_node->AddAlternative(GuardedAlternative( - new (zone) TextNode(new (zone) RegExpCharacterClass('*'), - /*read_backwards=*/false, loop_node))); + first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( + new (zone) RegExpCharacterClass('*', RegExpFlags()), + /*read_backwards=*/false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. 
if (node != NULL) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); } + } else if (is_unicode && (is_global || is_sticky)) { + node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags()); } if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(ignore_case, is_one_byte); + Analysis analysis(is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); @@ -4783,10 +5369,13 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( } if (is_global) { - macro_assembler->set_global_mode( - (data->tree->min_match() > 0) - ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK - : RegExpMacroAssembler::GLOBAL); + RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; + if (data->tree->min_match() > 0) { + mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; + } else if (is_unicode) { + mode = RegExpMacroAssembler::GLOBAL_UNICODE; + } + macro_assembler->set_global_mode(mode); } RegExpEngine::CompilationResult result = @@ -4812,10 +5401,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( ASSERT(!regexp.IsNull()); ASSERT(!pattern.IsNull()); - const bool ignore_case = regexp.is_ignore_case(); - const bool is_global = regexp.is_global(); + const bool is_global = regexp.flags().IsGlobal(); + const bool is_unicode = regexp.flags().IsUnicode(); - RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte); + RegExpCompiler compiler(data->capture_count, is_one_byte); // TODO(zerny): Frequency sampling is currently disabled because of several // issues. We do not want to store subject strings in the regexp object since @@ -4838,34 +5427,37 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning. 
RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), - &compiler, captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, + new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler, + captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start // at the start of input. ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); - first_step_node->AddAlternative(GuardedAlternative( - new (zone) TextNode(new (zone) RegExpCharacterClass('*'), - /*read_backwards=*/false, loop_node))); + first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( + new (zone) RegExpCharacterClass('*', RegExpFlags()), + /*read_backwards=*/false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. if (node != NULL) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); } + } else if (is_unicode && (is_global || is_sticky)) { + node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags()); } if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(ignore_case, is_one_byte); + Analysis analysis(is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); @@ -4887,10 +5479,13 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( } if (is_global) { - macro_assembler->set_global_mode( - (data->tree->min_match() > 0) - ? 
RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK - : RegExpMacroAssembler::GLOBAL); + RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; + if (data->tree->min_match() > 0) { + mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; + } else if (is_unicode) { + mode = RegExpMacroAssembler::GLOBAL_UNICODE; + } + macro_assembler->set_global_mode(mode); } RegExpEngine::CompilationResult result = @@ -4952,19 +5547,12 @@ static void CreateSpecializedFunction(Thread* thread, RawRegExp* RegExpEngine::CreateRegExp(Thread* thread, const String& pattern, - bool multi_line, - bool ignore_case) { + RegExpFlags flags) { Zone* zone = thread->zone(); const RegExp& regexp = RegExp::Handle(RegExp::New()); regexp.set_pattern(pattern); - - if (multi_line) { - regexp.set_is_multi_line(); - } - if (ignore_case) { - regexp.set_is_ignore_case(); - } + regexp.set_flags(flags); // TODO(zerny): We might want to use normal string searching algorithms // for simple patterns. diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h index 092a06ad4bb..febcb91bec3 100644 --- a/runtime/vm/regexp.h +++ b/runtime/vm/regexp.h @@ -5,11 +5,14 @@ #ifndef RUNTIME_VM_REGEXP_H_ #define RUNTIME_VM_REGEXP_H_ +#include "platform/unicode.h" + #include "vm/compiler/assembler/assembler.h" #include "vm/compiler/backend/flow_graph_compiler.h" #include "vm/compiler/backend/il.h" #include "vm/object.h" #include "vm/regexp_assembler.h" +#include "vm/splay-tree.h" namespace dart { @@ -25,32 +28,42 @@ class BoyerMooreLookahead; class CharacterRange { public: CharacterRange() : from_(0), to_(0) {} - CharacterRange(uint16_t from, uint16_t to) : from_(from), to_(to) {} + CharacterRange(int32_t from, int32_t to) : from_(from), to_(to) {} static void AddClassEscape(uint16_t type, ZoneGrowableArray* ranges); + // Add class escapes with case equivalent closure for \w and \W if necessary. 
+ static void AddClassEscape(uint16_t type, + ZoneGrowableArray* ranges, + bool add_unicode_case_equivalents); static GrowableArray GetWordBounds(); - static inline CharacterRange Singleton(uint16_t value) { + static inline CharacterRange Singleton(int32_t value) { return CharacterRange(value, value); } - static inline CharacterRange Range(uint16_t from, uint16_t to) { + static inline CharacterRange Range(int32_t from, int32_t to) { ASSERT(from <= to); return CharacterRange(from, to); } static inline CharacterRange Everything() { - return CharacterRange(0, 0xFFFF); + return CharacterRange(0, Utf::kMaxCodePoint); } - bool Contains(uint16_t i) const { return from_ <= i && i <= to_; } - uint16_t from() const { return from_; } - void set_from(uint16_t value) { from_ = value; } - uint16_t to() const { return to_; } - void set_to(uint16_t value) { to_ = value; } + static inline ZoneGrowableArray* List(Zone* zone, + CharacterRange range) { + auto list = new (zone) ZoneGrowableArray(1); + list->Add(range); + return list; + } + bool Contains(int32_t i) const { return from_ <= i && i <= to_; } + int32_t from() const { return from_; } + void set_from(int32_t value) { from_ = value; } + int32_t to() const { return to_; } + void set_to(int32_t value) { to_ = value; } bool is_valid() const { return from_ <= to_; } - bool IsEverything(uint16_t max) const { return from_ == 0 && to_ >= max; } + bool IsEverything(int32_t max) const { return from_ == 0 && to_ >= max; } bool IsSingleton() const { return (from_ == to_); } - void AddCaseEquivalents(ZoneGrowableArray* ranges, - bool is_one_byte, - Zone* zone); + static void AddCaseEquivalents(ZoneGrowableArray* ranges, + bool is_one_byte, + Zone* zone); static void Split(ZoneGrowableArray* base, GrowableArray overlay, ZoneGrowableArray** included, @@ -71,8 +84,8 @@ class CharacterRange { static const intptr_t kPayloadMask = (1 << 24) - 1; private: - uint16_t from_; - uint16_t to_; + int32_t from_; + int32_t to_; DISALLOW_ALLOCATION(); }; 
@@ -105,6 +118,98 @@ class OutSet : public ZoneAllocated { friend class Trace; }; +// A mapping from integers, specified as ranges, to a set of integers. +// Used for mapping character ranges to choices. +class DispatchTable : public ValueObject { + public: + explicit DispatchTable(Zone* zone) : tree_(zone) {} + + class Entry { + public: + Entry() : from_(0), to_(0), out_set_(nullptr) {} + Entry(int32_t from, int32_t to, OutSet* out_set) + : from_(from), to_(to), out_set_(out_set) { + ASSERT(from <= to); + } + int32_t from() { return from_; } + int32_t to() { return to_; } + void set_to(int32_t value) { to_ = value; } + void AddValue(int value, Zone* zone) { + out_set_ = out_set_->Extend(value, zone); + } + OutSet* out_set() { return out_set_; } + + private: + int32_t from_; + int32_t to_; + OutSet* out_set_; + }; + + class Config { + public: + typedef int32_t Key; + typedef Entry Value; + static const int32_t kNoKey; + static const Entry NoValue() { return Value(); } + static inline int Compare(int32_t a, int32_t b) { + if (a == b) + return 0; + else if (a < b) + return -1; + else + return 1; + } + }; + + void AddRange(CharacterRange range, int32_t value, Zone* zone); + OutSet* Get(int32_t value); + void Dump(); + + template + void ForEach(Callback* callback) { + return tree()->ForEach(callback); + } + + private: + // There can't be a static empty set since it allocates its + // successors in a zone and caches them. + OutSet* empty() { return &empty_; } + OutSet empty_; + ZoneSplayTree* tree() { return &tree_; } + ZoneSplayTree tree_; +}; + +// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. 
+class UnicodeRangeSplitter : public ValueObject { + public: + UnicodeRangeSplitter(Zone* zone, ZoneGrowableArray* base); + void Call(uint32_t from, DispatchTable::Entry entry); + + ZoneGrowableArray* bmp() { return bmp_; } + ZoneGrowableArray* lead_surrogates() { + return lead_surrogates_; + } + ZoneGrowableArray* trail_surrogates() { + return trail_surrogates_; + } + ZoneGrowableArray* non_bmp() const { return non_bmp_; } + + private: + static const int kBase = 0; + // Separate ranges into + static const int kBmpCodePoints = 1; + static const int kLeadSurrogates = 2; + static const int kTrailSurrogates = 3; + static const int kNonBmpCodePoints = 4; + + Zone* zone_; + DispatchTable table_; + ZoneGrowableArray* bmp_; + ZoneGrowableArray* lead_surrogates_; + ZoneGrowableArray* trail_surrogates_; + ZoneGrowableArray* non_bmp_; +}; + #define FOR_EACH_NODE_TYPE(VISIT) \ VISIT(End) \ VISIT(Action) \ @@ -344,9 +449,7 @@ class RegExpNode : public ZoneAllocated { // If we know that the input is one-byte then there are some nodes that can // never match. This method returns a node that can be substituted for // itself, or NULL if the node can never match. - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case) { - return this; - } + virtual RegExpNode* FilterOneByte(intptr_t depth) { return this; } // Helper for FilterOneByte. 
RegExpNode* replacement() { ASSERT(info()->replacement_calculated); @@ -443,7 +546,7 @@ class SeqRegExpNode : public RegExpNode { : RegExpNode(on_success->zone()), on_success_(on_success) {} RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(intptr_t depth); virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead* bm, @@ -453,7 +556,7 @@ class SeqRegExpNode : public RegExpNode { } protected: - RegExpNode* FilterSuccessor(intptr_t depth, bool ignore_case); + RegExpNode* FilterSuccessor(intptr_t depth); private: RegExpNode* on_success_; @@ -561,6 +664,19 @@ class TextNode : public SeqRegExpNode { read_backward_(read_backward) { elms_->Add(TextElement::CharClass(that)); } + // Create TextNode for a single character class for the given ranges. + static TextNode* CreateForCharacterRanges( + ZoneGrowableArray* ranges, + bool read_backward, + RegExpNode* on_success, + RegExpFlags flags); + // Create TextNode for a surrogate pair with a range given for the + // lead and the trail surrogate each. 
+ static TextNode* CreateForSurrogatePair(CharacterRange lead, + CharacterRange trail, + bool read_backward, + RegExpNode* on_success, + RegExpFlags flags); virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual intptr_t EatsAtLeast(intptr_t still_to_find, @@ -581,7 +697,7 @@ class TextNode : public SeqRegExpNode { BoyerMooreLookahead* bm, bool not_at_start); void CalculateOffsets(); - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(intptr_t depth); private: enum TextEmitPassType { @@ -659,11 +775,13 @@ class BackReferenceNode : public SeqRegExpNode { public: BackReferenceNode(intptr_t start_reg, intptr_t end_reg, + RegExpFlags flags, bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg), + flags_(flags), read_backward_(read_backward) {} virtual void Accept(NodeVisitor* visitor); intptr_t start_register() { return start_reg_; } @@ -687,6 +805,7 @@ class BackReferenceNode : public SeqRegExpNode { private: intptr_t start_reg_; intptr_t end_reg_; + RegExpFlags flags_; bool read_backward_; }; @@ -761,9 +880,9 @@ class GuardedAlternative { public: explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) {} void AddGuard(Guard* guard, Zone* zone); - RegExpNode* node() { return node_; } + RegExpNode* node() const { return node_; } void set_node(RegExpNode* node) { node_ = node; } - ZoneGrowableArray* guards() { return guards_; } + ZoneGrowableArray* guards() const { return guards_; } private: RegExpNode* node_; @@ -811,11 +930,12 @@ class ChoiceNode : public RegExpNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return true; } - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(intptr_t depth); virtual bool read_backward() { return false; } protected: - intptr_t 
GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); + intptr_t GreedyLoopTextLengthForAlternative( + const GuardedAlternative* alternative); ZoneGrowableArray* alternatives_; private: @@ -886,7 +1006,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return !is_first; } - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(intptr_t depth); }; class LoopChoiceNode : public ChoiceNode { @@ -918,7 +1038,7 @@ class LoopChoiceNode : public ChoiceNode { bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual bool read_backward() { return read_backward_; } virtual void Accept(NodeVisitor* visitor); - virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); + virtual RegExpNode* FilterOneByte(intptr_t depth); private: // AddAlternative is made private for loop nodes because alternatives @@ -1286,10 +1406,8 @@ class NodeVisitor : public ValueObject { // +-------+ +------------+ class Analysis : public NodeVisitor { public: - Analysis(bool ignore_case, bool is_one_byte) - : ignore_case_(ignore_case), - is_one_byte_(is_one_byte), - error_message_(NULL) {} + explicit Analysis(bool is_one_byte) + : is_one_byte_(is_one_byte), error_message_(NULL) {} void EnsureAnalyzed(RegExpNode* node); #define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that); @@ -1305,7 +1423,6 @@ class Analysis : public NodeVisitor { void fail(const char* error_message) { error_message_ = error_message; } private: - bool ignore_case_; bool is_one_byte_; const char* error_message_; @@ -1398,8 +1515,7 @@ class RegExpEngine : public AllStatic { static RawRegExp* CreateRegExp(Thread* thread, const String& pattern, - bool multi_line, - bool ignore_case); + RegExpFlags flags); static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); }; diff --git a/runtime/vm/regexp_assembler.cc 
b/runtime/vm/regexp_assembler.cc index fb7dc746fc8..ab242c4bd54 100644 --- a/runtime/vm/regexp_assembler.cc +++ b/runtime/vm/regexp_assembler.cc @@ -4,6 +4,10 @@ #include "vm/regexp_assembler.h" +#include "unicode/uchar.h" + +#include "platform/unicode.h" + #include "vm/flags.h" #include "vm/regexp.h" #include "vm/unibrow-inl.h" @@ -16,11 +20,10 @@ void PrintUtf16(uint16_t c) { OS::PrintErr(format, c); } - -static RawBool* CaseInsensitiveCompareUC16(RawString* str_raw, - RawSmi* lhs_index_raw, - RawSmi* rhs_index_raw, - RawSmi* length_raw) { +RawBool* CaseInsensitiveCompareUCS2(RawString* str_raw, + RawSmi* lhs_index_raw, + RawSmi* rhs_index_raw, + RawSmi* length_raw) { const String& str = String::Handle(str_raw); const Smi& lhs_index = Smi::Handle(lhs_index_raw); const Smi& rhs_index = Smi::Handle(rhs_index_raw); @@ -48,13 +51,50 @@ static RawBool* CaseInsensitiveCompareUC16(RawString* str_raw, return Bool::True().raw(); } +RawBool* CaseInsensitiveCompareUTF16(RawString* str_raw, + RawSmi* lhs_index_raw, + RawSmi* rhs_index_raw, + RawSmi* length_raw) { + const String& str = String::Handle(str_raw); + const Smi& lhs_index = Smi::Handle(lhs_index_raw); + const Smi& rhs_index = Smi::Handle(rhs_index_raw); + const Smi& length = Smi::Handle(length_raw); + + for (intptr_t i = 0; i < length.Value(); i++) { + int32_t c1 = str.CharAt(lhs_index.Value() + i); + int32_t c2 = str.CharAt(rhs_index.Value() + i); + if (Utf16::IsLeadSurrogate(c1)) { + // Non-BMP characters do not have case-equivalents in the BMP. + // Both have to be non-BMP for them to be able to match. 
+ if (!Utf16::IsLeadSurrogate(c2)) return Bool::False().raw(); + if (i + 1 < length.Value()) { + uint16_t c1t = str.CharAt(lhs_index.Value() + i + 1); + uint16_t c2t = str.CharAt(rhs_index.Value() + i + 1); + if (Utf16::IsTrailSurrogate(c1t) && Utf16::IsTrailSurrogate(c2t)) { + c1 = Utf16::Decode(c1, c1t); + c2 = Utf16::Decode(c2, c2t); + i++; + } + } + } + c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT); + c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT); + if (c1 != c2) return Bool::False().raw(); + } + return Bool::True().raw(); +} DEFINE_RAW_LEAF_RUNTIME_ENTRY( - CaseInsensitiveCompareUC16, + CaseInsensitiveCompareUCS2, 4, false /* is_float */, - reinterpret_cast(&CaseInsensitiveCompareUC16)); + reinterpret_cast(&CaseInsensitiveCompareUCS2)); +DEFINE_RAW_LEAF_RUNTIME_ENTRY( + CaseInsensitiveCompareUTF16, + 4, + false /* is_float */, + reinterpret_cast(&CaseInsensitiveCompareUTF16)); BlockLabel::BlockLabel() : block_(NULL), is_bound_(false), is_linked_(false), pos_(-1) { @@ -72,4 +112,18 @@ RegExpMacroAssembler::RegExpMacroAssembler(Zone* zone) RegExpMacroAssembler::~RegExpMacroAssembler() {} +void RegExpMacroAssembler::CheckNotInSurrogatePair(intptr_t cp_offset, + BlockLabel* on_failure) { + BlockLabel ok; + // Check that current character is not a trail surrogate. + LoadCurrentCharacter(cp_offset, &ok); + CheckCharacterNotInRange(Utf16::kTrailSurrogateStart, + Utf16::kTrailSurrogateEnd, &ok); + // Check that previous character is not a lead surrogate. + LoadCurrentCharacter(cp_offset - 1, &ok); + CheckCharacterInRange(Utf16::kLeadSurrogateStart, Utf16::kLeadSurrogateEnd, + on_failure); + BindBlock(&ok); +} + } // namespace dart diff --git a/runtime/vm/regexp_assembler.h b/runtime/vm/regexp_assembler.h index a7b087e23ff..818f5730c50 100644 --- a/runtime/vm/regexp_assembler.h +++ b/runtime/vm/regexp_assembler.h @@ -13,6 +13,19 @@ namespace dart { // Utility function for the DotPrinter void PrintUtf16(uint16_t c); +// Compares two-byte strings case insensitively as UCS2. 
+// Called from generated RegExp code. +RawBool* CaseInsensitiveCompareUCS2(RawString* str_raw, + RawSmi* lhs_index_raw, + RawSmi* rhs_index_raw, + RawSmi* length_raw); + +// Compares two-byte strings case insensitively as UTF16. +// Called from generated RegExp code. +RawBool* CaseInsensitiveCompareUTF16(RawString* str_raw, + RawSmi* lhs_index_raw, + RawSmi* rhs_index_raw, + RawSmi* length_raw); /// Convenience wrapper around a BlockEntryInstr pointer. class BlockLabel : public ValueObject { @@ -127,6 +140,7 @@ class RegExpMacroAssembler : public ZoneAllocated { BlockLabel* on_no_match) = 0; virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, + bool unicode, BlockLabel* on_no_match) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. End of input always @@ -215,22 +229,33 @@ class RegExpMacroAssembler : public ZoneAllocated { virtual void ClearRegisters(intptr_t reg_from, intptr_t reg_to) = 0; virtual void WriteStackPointerToRegister(intptr_t reg) = 0; + // Check that we are not in the middle of a surrogate pair. + void CheckNotInSurrogatePair(intptr_t cp_offset, BlockLabel* on_failure); + // Controls the generation of large inlined constants in the code. void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } bool slow_safe() { return slow_safe_compiler_; } - enum GlobalMode { NOT_GLOBAL, GLOBAL, GLOBAL_NO_ZERO_LENGTH_CHECK }; + enum GlobalMode { + NOT_GLOBAL, + GLOBAL, + GLOBAL_NO_ZERO_LENGTH_CHECK, + GLOBAL_UNICODE + }; // Set whether the regular expression has the global flag. Exiting due to // a failure in a global regexp may still mean success overall. 
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } inline bool global() { return global_mode_ != NOT_GLOBAL; } - inline bool global_with_zero_length_check() { return global_mode_ == GLOBAL; } + inline bool global_with_zero_length_check() { + return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; + } + inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; } Zone* zone() const { return zone_; } private: bool slow_safe_compiler_; - bool global_mode_; + GlobalMode global_mode_; Zone* zone_; }; diff --git a/runtime/vm/regexp_assembler_bytecode.cc b/runtime/vm/regexp_assembler_bytecode.cc index 7a555731738..3b248090012 100644 --- a/runtime/vm/regexp_assembler_bytecode.cc +++ b/runtime/vm/regexp_assembler_bytecode.cc @@ -349,11 +349,14 @@ void BytecodeRegExpMacroAssembler::CheckNotBackReference( void BytecodeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( intptr_t start_reg, bool read_backward, + bool unicode, BlockLabel* on_not_equal) { ASSERT(start_reg >= 0); ASSERT(start_reg <= kMaxRegister); - Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE, + Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) + : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE + : BC_CHECK_NOT_BACK_REF_NO_CASE), start_reg); EmitOrLink(on_not_equal); } @@ -434,11 +437,10 @@ static intptr_t Prepare(const RegExp& regexp, } #endif // !defined(PRODUCT) - const bool multiline = regexp.is_multi_line(); RegExpCompileData* compile_data = new (zone) RegExpCompileData(); // Parsing failures are handled in the RegExp factory constructor. 
- RegExpParser::ParseRegExp(pattern, multiline, compile_data); + RegExpParser::ParseRegExp(pattern, regexp.flags(), compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); regexp.set_capture_name_map(compile_data->capture_name_map); @@ -451,15 +453,15 @@ static intptr_t Prepare(const RegExp& regexp, RegExpEngine::CompilationResult result = RegExpEngine::CompileBytecode( compile_data, regexp, is_one_byte, sticky, zone); ASSERT(result.bytecode != NULL); - ASSERT((regexp.num_registers() == -1) || - (regexp.num_registers() == result.num_registers)); - regexp.set_num_registers(result.num_registers); + ASSERT(regexp.num_registers(is_one_byte) == -1 || + regexp.num_registers(is_one_byte) == result.num_registers); + regexp.set_num_registers(is_one_byte, result.num_registers); regexp.set_bytecode(is_one_byte, sticky, *(result.bytecode)); } - ASSERT(regexp.num_registers() != -1); + ASSERT(regexp.num_registers(is_one_byte) != -1); - return regexp.num_registers() + + return regexp.num_registers(is_one_byte) + (Smi::Value(regexp.num_bracket_expressions()) + 1) * 2; } diff --git a/runtime/vm/regexp_assembler_bytecode.h b/runtime/vm/regexp_assembler_bytecode.h index 3e17d493238..59cf68f4575 100644 --- a/runtime/vm/regexp_assembler_bytecode.h +++ b/runtime/vm/regexp_assembler_bytecode.h @@ -83,6 +83,7 @@ class BytecodeRegExpMacroAssembler : public RegExpMacroAssembler { BlockLabel* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, + bool unicode, BlockLabel* on_no_match); virtual void IfRegisterLT(intptr_t register_index, intptr_t comparand, diff --git a/runtime/vm/regexp_assembler_ir.cc b/runtime/vm/regexp_assembler_ir.cc index b1efebec92b..f7f18ac1380 100644 --- a/runtime/vm/regexp_assembler_ir.cc +++ b/runtime/vm/regexp_assembler_ir.cc @@ -821,6 +821,7 @@ void IRRegExpMacroAssembler::CheckGreedyLoop(BlockLabel* on_equal) { void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( intptr_t start_reg, 
bool read_backward, + bool unicode, BlockLabel* on_no_match) { TAG(); ASSERT(start_reg + 1 <= registers_count_); @@ -967,9 +968,17 @@ void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( Value* rhs_index_value = Bind(LoadLocal(capture_start_index_)); Value* length_value = Bind(LoadLocal(capture_length_)); - Definition* is_match_def = new (Z) CaseInsensitiveCompareUC16Instr( - string_value, lhs_index_value, rhs_index_value, length_value, - specialization_cid_); + Definition* is_match_def; + + if (unicode) { + is_match_def = new (Z) CaseInsensitiveCompareInstr( + string_value, lhs_index_value, rhs_index_value, length_value, + kCaseInsensitiveCompareUTF16RuntimeEntry, specialization_cid_); + } else { + is_match_def = new (Z) CaseInsensitiveCompareInstr( + string_value, lhs_index_value, rhs_index_value, length_value, + kCaseInsensitiveCompareUCS2RuntimeEntry, specialization_cid_); + } BranchOrBacktrack(Comparison(kNE, is_match_def, BoolConstant(true)), on_no_match); diff --git a/runtime/vm/regexp_assembler_ir.h b/runtime/vm/regexp_assembler_ir.h index c4f6e1fd2b7..d8c222a568f 100644 --- a/runtime/vm/regexp_assembler_ir.h +++ b/runtime/vm/regexp_assembler_ir.h @@ -67,6 +67,7 @@ class IRRegExpMacroAssembler : public RegExpMacroAssembler { BlockLabel* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, + bool unicode, BlockLabel* on_no_match); virtual void CheckNotCharacter(uint32_t c, BlockLabel* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/runtime/vm/regexp_ast.h b/runtime/vm/regexp_ast.h index 17a175e3d2d..4fb64a3ae20 100644 --- a/runtime/vm/regexp_ast.h +++ b/runtime/vm/regexp_ast.h @@ -108,7 +108,8 @@ class RegExpAssertion : public RegExpTree { BOUNDARY, NON_BOUNDARY }; - explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} + RegExpAssertion(AssertionType type, RegExpFlags flags) + : assertion_type_(type), flags_(flags) {} virtual void* Accept(RegExpVisitor* 
visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpAssertion* AsAssertion(); @@ -121,6 +122,7 @@ class RegExpAssertion : public RegExpTree { private: AssertionType assertion_type_; + RegExpFlags flags_; }; class CharacterSet : public ValueObject { @@ -150,18 +152,40 @@ class CharacterSet : public ValueObject { class RegExpCharacterClass : public RegExpTree { public: - RegExpCharacterClass(ZoneGrowableArray* ranges, - bool is_negated) - : set_(ranges), is_negated_(is_negated) {} - explicit RegExpCharacterClass(uint16_t type) - : set_(type), is_negated_(false) {} + enum Flag { + // The character class is negated and should match everything but the + // specified ranges. + NEGATED = 1 << 0, + // The character class contains part of a split surrogate and should not + // be unicode-desugared. + CONTAINS_SPLIT_SURROGATE = 1 << 1, + }; + using CharacterClassFlags = intptr_t; + static inline CharacterClassFlags DefaultFlags() { return 0; } + + RegExpCharacterClass( + ZoneGrowableArray* ranges, + RegExpFlags flags, + CharacterClassFlags character_class_flags = DefaultFlags()) + : set_(ranges), + flags_(flags), + character_class_flags_(character_class_flags) { + // Convert the empty set of ranges to the negated Everything() range. + if (ranges->is_empty()) { + ranges->Add(CharacterRange::Everything()); + character_class_flags_ ^= NEGATED; + } + } + RegExpCharacterClass(uint16_t type, RegExpFlags flags) + : set_(type), flags_(flags), character_class_flags_(0) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpCharacterClass* AsCharacterClass(); virtual bool IsCharacterClass() const; virtual bool IsTextElement() const { return true; } virtual intptr_t min_match() const { return 1; } - virtual intptr_t max_match() const { return 1; } + // The character class may match two code units for unicode regexps. 
+ virtual intptr_t max_match() const { return 2; } virtual void AppendToText(RegExpText* text); CharacterSet character_set() const { return set_; } // TODO(lrn): Remove need for complex version if is_standard that @@ -180,16 +204,22 @@ class RegExpCharacterClass : public RegExpTree { // * : All characters uint16_t standard_type() const { return set_.standard_set_type(); } ZoneGrowableArray* ranges() { return set_.ranges(); } - bool is_negated() const { return is_negated_; } + bool is_negated() const { return character_class_flags_ & NEGATED; } + RegExpFlags flags() const { return flags_; } + bool contains_split_surrogate() const { + return character_class_flags_ & CONTAINS_SPLIT_SURROGATE; + } private: CharacterSet set_; - bool is_negated_; + RegExpFlags flags_; + CharacterClassFlags character_class_flags_; }; class RegExpAtom : public RegExpTree { public: - explicit RegExpAtom(ZoneGrowableArray* data) : data_(data) {} + RegExpAtom(ZoneGrowableArray* data, RegExpFlags flags) + : data_(data), flags_(flags) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpAtom* AsAtom(); @@ -200,9 +230,12 @@ class RegExpAtom : public RegExpTree { virtual void AppendToText(RegExpText* text); ZoneGrowableArray* data() const { return data_; } intptr_t length() const { return data_->length(); } + RegExpFlags flags() const { return flags_; } + bool ignore_case() const { return flags_.IgnoreCase(); } private: ZoneGrowableArray* data_; + const RegExpFlags flags_; }; class RegExpText : public RegExpTree { @@ -370,9 +403,10 @@ class RegExpLookaround : public RegExpTree { class RegExpBackReference : public RegExpTree { public: - RegExpBackReference() : capture_(nullptr), name_(nullptr) {} - explicit RegExpBackReference(RegExpCapture* capture) - : capture_(capture), name_(nullptr) {} + explicit RegExpBackReference(RegExpFlags flags) + : capture_(nullptr), name_(nullptr), flags_(flags) {} + 
RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) + : capture_(capture), name_(nullptr), flags_(flags) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpBackReference* AsBackReference(); @@ -391,6 +425,7 @@ class RegExpBackReference : public RegExpTree { private: RegExpCapture* capture_; const ZoneGrowableArray* name_; + RegExpFlags flags_; }; class RegExpEmpty : public RegExpTree { diff --git a/runtime/vm/regexp_bytecodes.h b/runtime/vm/regexp_bytecodes.h index 858eef5ee23..178c9500318 100644 --- a/runtime/vm/regexp_bytecodes.h +++ b/runtime/vm/regexp_bytecodes.h @@ -55,17 +55,19 @@ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_REGS_EQUAL, 41, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ -V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ -V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \ -V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \ -V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \ -V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */ +V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /*bc8 reg_idx24 addr32*/ \ +V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ 
+V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \ +V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \ +V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \ +V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \ +V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ // clang-format on diff --git a/runtime/vm/regexp_interpreter.cc b/runtime/vm/regexp_interpreter.cc index e3443cdc100..86bc67711de 100644 --- a/runtime/vm/regexp_interpreter.cc +++ b/runtime/vm/regexp_interpreter.cc @@ -24,27 +24,25 @@ static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& subject); + const String& subject, + bool unicode); template <> bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& subject) { - for (int i = 0; i < len; i++) { - int32_t old_char = subject.CharAt(from++); - int32_t new_char = subject.CharAt(current++); - if (old_char == new_char) continue; - int32_t old_string[1] = {old_char}; - int32_t new_string[1] = {new_char}; - interp_canonicalize->get(old_char, '\0', old_string); - interp_canonicalize->get(new_char, '\0', new_string); - if (old_string[0] != new_string[0]) { - return false; - } + const String& subject, + bool unicode) { + Bool& ret = Bool::Handle(); + if (unicode) { + ret = CaseInsensitiveCompareUTF16(subject.raw(), Smi::New(from), + Smi::New(current), Smi::New(len)); + } else { + ret = CaseInsensitiveCompareUCS2(subject.raw(), Smi::New(from), + Smi::New(current), Smi::New(len)); } - return true; + return ret.value(); } template <> @@ -52,7 +50,9 @@ bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& subject) { + const String& subject, + bool 
unicode) { + // For Latin1 characters the unicode flag makes no difference. for (int i = 0; i < len; i++) { unsigned int old_char = subject.CharAt(from++); unsigned int new_char = subject.CharAt(current++); @@ -513,7 +513,11 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, pc += BC_CHECK_NOT_BACK_REF_LENGTH; break; } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) + FALL_THROUGH; BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { + const bool unicode = + (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE; int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { @@ -525,7 +529,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, break; } else { if (BackRefMatchesNoCase(&canonicalize, from, current, len, - subject)) { + subject, unicode)) { current += len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; } else { @@ -562,7 +566,11 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; break; } + BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) + FALL_THROUGH; BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { + bool unicode = (insn & BYTECODE_MASK) == + BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD; int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { @@ -574,7 +582,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, break; } else { if (BackRefMatchesNoCase(&canonicalize, from, current - len, - len, subject)) { + len, subject, unicode)) { current -= len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; } else { diff --git a/runtime/vm/regexp_parser.cc b/runtime/vm/regexp_parser.cc index b70b945ad0d..31aff3fb052 100644 --- a/runtime/vm/regexp_parser.cc +++ b/runtime/vm/regexp_parser.cc @@ -3,6 +3,12 @@ // BSD-style license that can be found in the 
LICENSE file. #include "vm/regexp_parser.h" + +#include "unicode/uchar.h" +#include "unicode/uniset.h" + +#include "platform/unicode.h" + #include "vm/longjump.h" #include "vm/object_store.h" @@ -13,10 +19,12 @@ namespace dart { // Enables possessive quantifier syntax for testing. static const bool FLAG_regexp_possessive_quantifier = false; -RegExpBuilder::RegExpBuilder() +RegExpBuilder::RegExpBuilder(RegExpFlags flags) : zone_(Thread::Current()->zone()), pending_empty_(false), + flags_(flags), characters_(NULL), + pending_surrogate_(kNoPendingSurrogate), terms_(), text_(), alternatives_() @@ -27,10 +35,49 @@ RegExpBuilder::RegExpBuilder() { } +void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) { + ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); + FlushPendingSurrogate(); + // Hold onto the lead surrogate, waiting for a trail surrogate to follow. + pending_surrogate_ = lead_surrogate; +} + +void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) { + ASSERT(Utf16::IsTrailSurrogate(trail_surrogate)); + if (pending_surrogate_ != kNoPendingSurrogate) { + uint16_t lead_surrogate = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); + uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate); + if (NeedsDesugaringForIgnoreCase(combined)) { + AddCharacterClassForDesugaring(combined); + } else { + auto surrogate_pair = new (Z) ZoneGrowableArray(2); + surrogate_pair->Add(lead_surrogate); + surrogate_pair->Add(trail_surrogate); + RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_); + AddAtom(atom); + } + } else { + pending_surrogate_ = trail_surrogate; + FlushPendingSurrogate(); + } +} + +void RegExpBuilder::FlushPendingSurrogate() { + if (pending_surrogate_ != kNoPendingSurrogate) { + ASSERT(is_unicode()); + uint32_t c = pending_surrogate_; + pending_surrogate_ = kNoPendingSurrogate; + AddCharacterClassForDesugaring(c); + } +} + void RegExpBuilder::FlushCharacters() { + 
FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != NULL) { - RegExpTree* atom = new (Z) RegExpAtom(characters_); + RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_); characters_ = NULL; text_.Add(atom); LAST(ADD_ATOM); @@ -54,18 +101,62 @@ void RegExpBuilder::FlushText() { } void RegExpBuilder::AddCharacter(uint16_t c) { + FlushPendingSurrogate(); pending_empty_ = false; - if (characters_ == NULL) { - characters_ = new (Z) ZoneGrowableArray(4); + if (NeedsDesugaringForIgnoreCase(c)) { + AddCharacterClassForDesugaring(c); + } else { + if (characters_ == NULL) { + characters_ = new (Z) ZoneGrowableArray(4); + } + characters_->Add(c); + LAST(ADD_CHAR); } - characters_->Add(c); - LAST(ADD_CHAR); +} + +void RegExpBuilder::AddUnicodeCharacter(uint32_t c) { + if (c > static_cast(Utf16::kMaxCodeUnit)) { + ASSERT(is_unicode()); + uint16_t surrogates[2]; + Utf16::Encode(c, surrogates); + AddLeadSurrogate(surrogates[0]); + AddTrailSurrogate(surrogates[1]); + } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) { + AddLeadSurrogate(c); + } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) { + AddTrailSurrogate(c); + } else { + AddCharacter(static_cast(c)); + } +} + +void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) { + // A lead or trail surrogate parsed via escape sequence will not + // pair up with any preceding lead or following trail surrogate. + FlushPendingSurrogate(); + AddUnicodeCharacter(character); + FlushPendingSurrogate(); } void RegExpBuilder::AddEmpty() { pending_empty_ = true; } +void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { + if (NeedsDesugaringForUnicode(cc)) { + // With /u, character class needs to be desugared, so it + // must be a standalone term instead of being part of a RegExpText. 
+ AddTerm(cc); + } else { + AddAtom(cc); + } +} + +void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) { + auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c)); + AddTerm(new (Z) RegExpCharacterClass(ranges, flags_)); +} + void RegExpBuilder::AddAtom(RegExpTree* term) { if (term->IsEmpty()) { AddEmpty(); @@ -81,6 +172,12 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { LAST(ADD_ATOM); } +void RegExpBuilder::AddTerm(RegExpTree* term) { + FlushText(); + terms_.Add(term); + LAST(ADD_ATOM); +} + void RegExpBuilder::AddAssertion(RegExpTree* assert) { FlushText(); terms_.Add(assert); @@ -112,6 +209,37 @@ void RegExpBuilder::FlushTerms() { LAST(ADD_NONE); } +bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { + if (!is_unicode()) return false; + // TODO(yangguo): we could be smarter than this. Case-insensitivity does not + // necessarily mean that we need to desugar. It's probably nicer to have a + // separate pass to figure out unicode desugarings. + if (ignore_case()) return true; + ZoneGrowableArray* ranges = cc->ranges(); + CharacterRange::Canonicalize(ranges); + for (int i = ranges->length() - 1; i >= 0; i--) { + uint32_t from = ranges->At(i).from(); + uint32_t to = ranges->At(i).to(); + // Check for non-BMP characters. + if (to >= Utf16::kMaxCodeUnit) return true; + // Check for lone surrogates. 
+ if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) { + return true; + } + } + return false; +} + +bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) { + if (is_unicode() && ignore_case()) { + icu::UnicodeSet set(c, c); + set.closeOver(USET_CASE_INSENSITIVE); + set.removeAllStrings(); + return set.size() > 1; + } + return false; +} + RegExpTree* RegExpBuilder::ToRegExp() { FlushTerms(); intptr_t num_alternatives = alternatives_.length(); @@ -152,13 +280,13 @@ bool RegExpBuilder::AddQuantifierToAtom( for (intptr_t i = 0; i < num_chars - 1; i++) { prefix->Add(char_vector->At(i)); } - text_.Add(new (Z) RegExpAtom(prefix)); + text_.Add(new (Z) RegExpAtom(prefix, flags_)); ZoneGrowableArray* tail = new (Z) ZoneGrowableArray(); tail->Add(char_vector->At(num_chars - 1)); char_vector = tail; } characters_ = NULL; - atom = new (Z) RegExpAtom(char_vector); + atom = new (Z) RegExpAtom(char_vector, flags_); FlushText(); } else if (text_.length() > 0) { DEBUG_ASSERT(last_added_ == ADD_ATOM); @@ -168,6 +296,8 @@ bool RegExpBuilder::AddQuantifierToAtom( DEBUG_ASSERT(last_added_ == ADD_ATOM); atom = terms_.RemoveLast(); if (auto lookaround = atom->AsLookaround()) { + // With /u, lookarounds are not quantifiable. + if (is_unicode()) return false; // Lookbehinds are not quantifiable. 
if (lookaround->type() == RegExpLookaround::LOOKBEHIND) { return false; @@ -194,7 +324,7 @@ bool RegExpBuilder::AddQuantifierToAtom( // ---------------------------------------------------------------------------- // Implementation of Parser -RegExpParser::RegExpParser(const String& in, String* error, bool multiline) +RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags) : zone_(Thread::Current()->zone()), captures_(nullptr), named_captures_(nullptr), @@ -205,7 +335,7 @@ RegExpParser::RegExpParser(const String& in, String* error, bool multiline) captures_started_(0), capture_count_(0), has_more_(true), - multiline_(multiline), + top_level_flags_(flags), simple_(false), contains_anchor_(false), is_scanned_for_captures_(false), @@ -213,20 +343,38 @@ RegExpParser::RegExpParser(const String& in, String* error, bool multiline) Advance(); } +inline uint32_t RegExpParser::ReadNext(bool update_position) { + intptr_t position = next_pos_; + const uint16_t c0 = in().CharAt(position); + uint32_t c = c0; + position++; + if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) { + const uint16_t c1 = in().CharAt(position); + if (Utf16::IsTrailSurrogate(c1)) { + c = Utf16::Decode(c0, c1); + position++; + } + } + if (update_position) next_pos_ = position; + return c; +} + uint32_t RegExpParser::Next() { if (has_next()) { - return in().CharAt(next_pos_); + return ReadNext(false); } else { return kEndMarker; } } void RegExpParser::Advance() { - if (next_pos_ < in().Length()) { - current_ = in().CharAt(next_pos_); - next_pos_++; + if (has_next()) { + current_ = ReadNext(true); } else { current_ = kEndMarker; + // Advance so that position() points to 1 after the last character. This is + // important so that Reset() to this position works correctly. 
+ next_pos_ = in().Length() + 1; has_more_ = false; } } @@ -246,6 +394,30 @@ bool RegExpParser::simple() { return simple_; } +bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) { + switch (c) { + case '^': + case '$': + case '\\': + case '.': + case '*': + case '+': + case '?': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '|': + case '/': + return true; + default: + break; + } + return false; +} + void RegExpParser::ReportError(const char* message) { // Zip to the end to make sure the no more input is read. current_ = kEndMarker; @@ -274,6 +446,11 @@ RegExpTree* RegExpParser::ParsePattern() { return result; } +// Used for error messages where we would have fallen back on treating an +// escape as the identity escape, but we are in Unicode mode. +static const char* kUnicodeIdentity = + "Invalid identity escape in Unicode pattern"; + // Disjunction :: // Alternative // Alternative | Disjunction @@ -287,7 +464,7 @@ RegExpTree* RegExpParser::ParsePattern() { RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, Z); + 0, nullptr, top_level_flags_, Z); RegExpParserState* stored_state = &initial_state; // Cache the builder in a local variable for quick access. 
RegExpBuilder* builder = initial_state.builder(); @@ -358,12 +535,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { UNREACHABLE(); case '^': { Advance(); - if (multiline_) { - builder->AddAssertion( - new (Z) RegExpAssertion(RegExpAssertion::START_OF_LINE)); + if (builder->is_multi_line()) { + builder->AddAssertion(new (Z) RegExpAssertion( + RegExpAssertion::START_OF_LINE, builder->flags())); } else { - builder->AddAssertion( - new (Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); + builder->AddAssertion(new (Z) RegExpAssertion( + RegExpAssertion::START_OF_INPUT, builder->flags())); set_contains_anchor(); } continue; @@ -371,19 +548,29 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = - multiline_ ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion(new RegExpAssertion(assertion_type)); + builder->is_multi_line() ? RegExpAssertion::END_OF_LINE + : RegExpAssertion::END_OF_INPUT; + builder->AddAssertion( + new (Z) RegExpAssertion(assertion_type, builder->flags())); continue; } case '.': { Advance(); - // everything except \x0a, \x0d, \u2028 and \u2029 - ZoneGrowableArray* ranges = - new ZoneGrowableArray(2); - CharacterRange::AddClassEscape('.', ranges); - RegExpTree* atom = new RegExpCharacterClass(ranges, false); - builder->AddAtom(atom); + auto ranges = new (Z) ZoneGrowableArray(2); + if (builder->is_dot_all()) { + // Everything. 
+ CharacterRange::AddClassEscape( + '*', ranges, + /*add_unicode_case_equivalents=*/false); + } else { + // everything except \x0a, \x0d, \u2028 and \u2029 + CharacterRange::AddClassEscape( + '.', ranges, + /*add_unicode_case_equivalents=*/false); + } + RegExpCharacterClass* cc = + new (Z) RegExpCharacterClass(ranges, builder->flags()); + builder->AddCharacterClass(cc); break; } case '(': { @@ -392,8 +579,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { continue; } case '[': { - RegExpTree* atom = ParseCharacterClass(); - builder->AddAtom(atom); + RegExpTree* atom = ParseCharacterClass(builder); + builder->AddCharacterClass(atom->AsCharacterClass()); break; } // Atom :: @@ -405,13 +592,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { UNREACHABLE(); case 'b': Advance(2); - builder->AddAssertion( - new RegExpAssertion(RegExpAssertion::BOUNDARY)); + builder->AddAssertion(new (Z) RegExpAssertion( + RegExpAssertion::BOUNDARY, builder->flags())); continue; case 'B': Advance(2); - builder->AddAssertion( - new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); + builder->AddAssertion(new (Z) RegExpAssertion( + RegExpAssertion::NON_BOUNDARY, builder->flags())); continue; // AtomEscape :: // CharacterClassEscape @@ -426,11 +613,36 @@ RegExpTree* RegExpParser::ParseDisjunction() { case 'W': { uint32_t c = Next(); Advance(2); - ZoneGrowableArray* ranges = - new ZoneGrowableArray(2); - CharacterRange::AddClassEscape(c, ranges); - RegExpTree* atom = new RegExpCharacterClass(ranges, false); - builder->AddAtom(atom); + auto ranges = new (Z) ZoneGrowableArray(2); + CharacterRange::AddClassEscape( + c, ranges, is_unicode() && builder->ignore_case()); + RegExpCharacterClass* cc = + new (Z) RegExpCharacterClass(ranges, builder->flags()); + builder->AddCharacterClass(cc); + break; + } + case 'p': + case 'P': { + uint32_t p = Next(); + Advance(2); + + if (is_unicode()) { + auto name_1 = new (Z) ZoneGrowableArray(); + auto name_2 = new (Z) ZoneGrowableArray(); + auto ranges = new 
(Z) ZoneGrowableArray(2); + if (ParsePropertyClassName(name_1, name_2)) { + if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { + RegExpCharacterClass* cc = + new (Z) RegExpCharacterClass(ranges, builder->flags()); + builder->AddCharacterClass(cc); + break; + } + } + ReportError("Invalid property name"); + UNREACHABLE(); + } else { + builder->AddCharacter(p); + } break; } case '1': @@ -453,14 +665,20 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = new RegExpBackReference(capture); + RegExpTree* atom = + new (Z) RegExpBackReference(capture, builder->flags()); builder->AddAtom(atom); } break; } + // With /u, no identity escapes except for syntax characters are + // allowed. Otherwise, all identity escapes are allowed. + if (is_unicode()) { + ReportError(kUnicodeIdentity); + UNREACHABLE(); + } uint32_t first_digit = Next(); if (first_digit == '8' || first_digit == '9') { - // Treat as identity escape builder->AddCharacter(first_digit); Advance(2); break; @@ -469,6 +687,11 @@ RegExpTree* RegExpParser::ParseDisjunction() { } case '0': { Advance(); + if (is_unicode() && Next() >= '0' && Next() <= '9') { + // With /u, decimal escape with leading 0 are not parsed as octal. + ReportError("Invalid decimal escape"); + UNREACHABLE(); + } uint32_t octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; @@ -506,6 +729,11 @@ RegExpTree* RegExpParser::ParseDisjunction() { // This is outside the specification. We match JSC in // reading the backslash as a literal character instead // of as starting an escape. + if (is_unicode()) { + // With /u, invalid escapes are not treated as identity escapes. 
+ ReportError(kUnicodeIdentity); + UNREACHABLE(); + } builder->AddCharacter('\\'); } else { Advance(2); @@ -518,18 +746,26 @@ RegExpTree* RegExpParser::ParseDisjunction() { uint32_t value; if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); - } else { + } else if (!is_unicode()) { builder->AddCharacter('x'); + } else { + // With /u, invalid escapes are not treated as identity escapes. + ReportError(kUnicodeIdentity); + UNREACHABLE(); } break; } case 'u': { Advance(2); uint32_t value; - if (ParseHexEscape(4, &value)) { - builder->AddCharacter(value); - } else { + if (ParseUnicodeEscape(&value)) { + builder->AddEscapedUnicodeCharacter(value); + } else if (!is_unicode()) { builder->AddCharacter('u'); + } else { + // With /u, invalid escapes are not treated as identity escapes. + ReportError(kUnicodeIdentity); + UNREACHABLE(); } break; } @@ -539,16 +775,24 @@ RegExpTree* RegExpParser::ParseDisjunction() { // an identity escape for non-Unicode patterns without named // capture groups, and as the beginning of a named back-reference // in all other cases. - if (HasNamedCaptures()) { + if (is_unicode() || HasNamedCaptures()) { Advance(2); ParseNamedBackReference(builder, stored_state); break; } FALL_THROUGH; default: - // Identity escape. - builder->AddCharacter(Next()); - Advance(2); + Advance(); + // With the unicode flag, no identity escapes except for syntax + // characters are allowed. Otherwise, all identity escapes are + // allowed. 
+ if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) { + builder->AddCharacter(current()); + Advance(); + } else { + ReportError(kUnicodeIdentity); + UNREACHABLE(); + } break; } break; @@ -560,8 +804,15 @@ RegExpTree* RegExpParser::ParseDisjunction() { } FALL_THROUGH; } + case '}': + case ']': + if (is_unicode()) { + ReportError("Lone quantifier brackets"); + UNREACHABLE(); + } + FALL_THROUGH; default: - builder->AddCharacter(current()); + builder->AddUnicodeCharacter(current()); Advance(); break; } // end switch(current()) @@ -693,8 +944,9 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( } } // Store current state and begin new disjunction parsing. - return new RegExpParserState(state, subexpr_type, lookaround_type, - captures_started_, capture_name, Z); + return new (Z) + RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, + capture_name, state->builder()->flags(), Z); } // In order to know whether an escape is a backreference or not we have to scan @@ -756,10 +1008,6 @@ void RegExpParser::ScanForCaptures() { Reset(saved_position); } -static inline bool IsDecimalDigit(int32_t c) { - return '0' <= c && c <= '9'; -} - bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { ASSERT('\\' == current()); ASSERT('1' <= Next() && Next() <= '9'); @@ -770,7 +1018,7 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { Advance(2); while (true) { uint32_t c = current(); - if (IsDecimalDigit(c)) { + if (Utils::IsDecimalDigit(c)) { value = 10 * value + (c - '0'); if (value > kMaxCaptures) { Reset(start); @@ -794,17 +1042,43 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { namespace { -inline constexpr bool IsIdentifierStart(uint16_t ch) { - return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || - ch == '$'; +static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) { + return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$'; } -inline constexpr bool 
IsIdentifierPart(uint16_t ch) { - return IsIdentifierStart(ch) || (ch >= '0' && ch <= '9'); +// ES#sec-names-and-keywords Names and Keywords +// UnicodeIDStart, '$', '_' and '\' +static bool IsIdentifierStartSlow(uint32_t c) { + // cannot use u_isIDStart because it does not work for + // Other_ID_Start characters. + return u_hasBinaryProperty(c, UCHAR_ID_START) || + (c < 0x60 && (c == '$' || c == '\\' || c == '_')); } -bool IsSameName(const RegExpCaptureName* name1, - const RegExpCaptureName* name2) { +// ES#sec-names-and-keywords Names and Keywords +// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ +static bool IsIdentifierPartSlow(uint32_t c) { + const uint32_t kZeroWidthNonJoiner = 0x200C; + const uint32_t kZeroWidthJoiner = 0x200D; + // Can't use u_isIDPart because it does not work for + // Other_ID_Continue characters. + return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || + (c < 0x60 && (c == '$' || c == '\\' || c == '_')) || + c == kZeroWidthNonJoiner || c == kZeroWidthJoiner; +} + +static inline bool IsIdentifierStart(uint32_t c) { + if (c > 127) return IsIdentifierStartSlow(c); + return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c); +} + +static inline bool IsIdentifierPart(uint32_t c) { + if (c > 127) return IsIdentifierPartSlow(c); + return IsAsciiIdentifierPart(c); +} + +static bool IsSameName(const RegExpCaptureName* name1, + const RegExpCaptureName* name2) { if (name1->length() != name2->length()) return false; for (intptr_t i = 0; i < name1->length(); i++) { if (name1->At(i) != name2->At(i)) return false; @@ -814,14 +1088,34 @@ bool IsSameName(const RegExpCaptureName* name1, } // end namespace +static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) { + if (code_unit <= Utf16::kMaxCodeUnit) { + v->Add(code_unit); + } else { + uint16_t units[2]; + Utf16::Encode(code_unit, units); + v->Add(units[0]); + v->Add(units[1]); + } +} + const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { auto name = new (Z) RegExpCaptureName(); 
bool at_start = true; while (true) { - const uint16_t c = current(); + uint32_t c = current(); Advance(); + // Convert unicode escapes. + if (c == '\\' && current() == 'u') { + Advance(); + if (!ParseUnicodeEscape(&c)) { + ReportError("Invalid Unicode escape sequence"); + UNREACHABLE(); + } + } + // The backslash char is misclassified as both ID_Start and ID_Continue. if (c == '\\') { ReportError("Invalid capture group name"); @@ -833,13 +1127,13 @@ const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { ReportError("Invalid capture group name"); UNREACHABLE(); } - name->Add(c); + PushCodeUnit(name, c); at_start = false; } else { if (c == '>') { break; } else if (IsIdentifierPart(c)) { - name->Add(c); + PushCodeUnit(name, c); } else { ReportError("Invalid capture group name"); UNREACHABLE(); @@ -896,7 +1190,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, if (state->IsInsideCaptureGroup(name)) { builder->AddEmpty(); } else { - RegExpBackReference* atom = new (Z) RegExpBackReference(); + RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags()); atom->set_name(name); builder->AddAtom(atom); @@ -1015,17 +1309,17 @@ bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out, intptr_t start = position(); Advance(); intptr_t min = 0; - if (!IsDecimalDigit(current())) { + if (!Utils::IsDecimalDigit(current())) { Reset(start); return false; } - while (IsDecimalDigit(current())) { + while (Utils::IsDecimalDigit(current())) { intptr_t next = current() - '0'; if (min > (RegExpTree::kInfinity - next) / 10) { // Overflow. Skip past remaining decimal digits and return -1. 
do { Advance(); - } while (IsDecimalDigit(current())); + } while (Utils::IsDecimalDigit(current())); min = RegExpTree::kInfinity; break; } @@ -1042,12 +1336,12 @@ bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out, max = RegExpTree::kInfinity; Advance(); } else { - while (IsDecimalDigit(current())) { + while (Utils::IsDecimalDigit(current())) { intptr_t next = current() - '0'; if (max > (RegExpTree::kInfinity - next) / 10) { do { Advance(); - } while (IsDecimalDigit(current())); + } while (Utils::IsDecimalDigit(current())); max = RegExpTree::kInfinity; break; } @@ -1117,6 +1411,317 @@ bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) { return true; } +// This parses RegExpUnicodeEscapeSequence as described in ECMA262. +bool RegExpParser::ParseUnicodeEscape(uint32_t* value) { + // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are + // allowed). In the latter case, the number of hex digits between { } is + // arbitrary. \ and u have already been read. + if (current() == '{' && is_unicode()) { + int start = position(); + Advance(); + if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) { + if (current() == '}') { + Advance(); + return true; + } + } + Reset(start); + return false; + } + // \u but no {, or \u{...} escapes not allowed. + bool result = ParseHexEscape(4, value); + if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) && + current() == '\\') { + // Attempt to read trail surrogate. 
+ int start = position(); + if (Next() == 'u') { + Advance(2); + uint32_t trail; + if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) { + *value = Utf16::Decode(static_cast(*value), + static_cast(trail)); + return true; + } + } + Reset(start); + } + return result; +} + +namespace { + +bool IsExactPropertyAlias(const char* property_name, UProperty property) { + const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); + if (short_name != nullptr && strcmp(property_name, short_name) == 0) { + return true; + } + for (int i = 0;; i++) { + const char* long_name = u_getPropertyName( + property, static_cast(U_LONG_PROPERTY_NAME + i)); + if (long_name == nullptr) break; + if (strcmp(property_name, long_name) == 0) return true; + } + return false; +} + +bool IsExactPropertyValueAlias(const char* property_value_name, + UProperty property, + int32_t property_value) { + const char* short_name = + u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); + if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { + return true; + } + for (int i = 0;; i++) { + const char* long_name = u_getPropertyValueName( + property, property_value, + static_cast(U_LONG_PROPERTY_NAME + i)); + if (long_name == nullptr) break; + if (strcmp(property_value_name, long_name) == 0) return true; + } + return false; +} + +bool LookupPropertyValueName(UProperty property, + const char* property_value_name, + bool negate, + ZoneGrowableArray* result) { + UProperty property_for_lookup = property; + if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { + // For the property Script_Extensions, we have to do the property value + // name lookup as if the property is Script. 
+ property_for_lookup = UCHAR_SCRIPT; + } + int32_t property_value = + u_getPropertyValueEnum(property_for_lookup, property_value_name); + if (property_value == UCHAR_INVALID_CODE) return false; + + // We require the property name to match exactly to one of the property value + // aliases. However, u_getPropertyValueEnum uses loose matching. + if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, + property_value)) { + return false; + } + + UErrorCode ec = U_ZERO_ERROR; + icu::UnicodeSet set; + set.applyIntPropertyValue(property, property_value, ec); + bool success = ec == U_ZERO_ERROR && !set.isEmpty(); + + if (success) { + set.removeAllStrings(); + if (negate) set.complement(); + for (int i = 0; i < set.getRangeCount(); i++) { + result->Add( + CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i))); + } + } + return success; +} + +template +inline bool NameEquals(const char* name, const char (&literal)[N]) { + return strncmp(name, literal, N + 1) == 0; +} + +bool LookupSpecialPropertyValueName(const char* name, + ZoneGrowableArray* result, + bool negate) { + if (NameEquals(name, "Any")) { + if (negate) { + // Leave the list of character ranges empty, since the negation of 'Any' + // is the empty set. + } else { + result->Add(CharacterRange::Everything()); + } + } else if (NameEquals(name, "ASCII")) { + result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint) + : CharacterRange::Range(0x0, 0x7F)); + } else if (NameEquals(name, "Assigned")) { + return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", + !negate, result); + } else { + return false; + } + return true; +} + +// Explicitly whitelist supported binary properties. The spec forbids supporting +// properties outside of this set to ensure interoperability. +bool IsSupportedBinaryProperty(UProperty property) { + switch (property) { + case UCHAR_ALPHABETIC: + // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. 
+ // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. + case UCHAR_ASCII_HEX_DIGIT: + // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. + case UCHAR_BIDI_CONTROL: + case UCHAR_BIDI_MIRRORED: + case UCHAR_CASE_IGNORABLE: + case UCHAR_CASED: + case UCHAR_CHANGES_WHEN_CASEFOLDED: + case UCHAR_CHANGES_WHEN_CASEMAPPED: + case UCHAR_CHANGES_WHEN_LOWERCASED: + case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: + case UCHAR_CHANGES_WHEN_TITLECASED: + case UCHAR_CHANGES_WHEN_UPPERCASED: + case UCHAR_DASH: + case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: + case UCHAR_DEPRECATED: + case UCHAR_DIACRITIC: + case UCHAR_EMOJI: + case UCHAR_EMOJI_COMPONENT: + case UCHAR_EMOJI_MODIFIER_BASE: + case UCHAR_EMOJI_MODIFIER: + case UCHAR_EMOJI_PRESENTATION: + case UCHAR_EXTENDED_PICTOGRAPHIC: + case UCHAR_EXTENDER: + case UCHAR_GRAPHEME_BASE: + case UCHAR_GRAPHEME_EXTEND: + case UCHAR_HEX_DIGIT: + case UCHAR_ID_CONTINUE: + case UCHAR_ID_START: + case UCHAR_IDEOGRAPHIC: + case UCHAR_IDS_BINARY_OPERATOR: + case UCHAR_IDS_TRINARY_OPERATOR: + case UCHAR_JOIN_CONTROL: + case UCHAR_LOGICAL_ORDER_EXCEPTION: + case UCHAR_LOWERCASE: + case UCHAR_MATH: + case UCHAR_NONCHARACTER_CODE_POINT: + case UCHAR_PATTERN_SYNTAX: + case UCHAR_PATTERN_WHITE_SPACE: + case UCHAR_QUOTATION_MARK: + case UCHAR_RADICAL: + case UCHAR_REGIONAL_INDICATOR: + case UCHAR_S_TERM: + case UCHAR_SOFT_DOTTED: + case UCHAR_TERMINAL_PUNCTUATION: + case UCHAR_UNIFIED_IDEOGRAPH: + case UCHAR_UPPERCASE: + case UCHAR_VARIATION_SELECTOR: + case UCHAR_WHITE_SPACE: + case UCHAR_XID_CONTINUE: + case UCHAR_XID_START: + return true; + default: + break; + } + return false; +} + +bool IsUnicodePropertyValueCharacter(char c) { + // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ + // + // Note that using this to validate each parsed char is quite conservative. 
+ // A possible alternative solution would be to only ensure the parsed + // property name/value candidate string does not contain '\0' characters and + // let ICU lookups trigger the final failure. + if (Utils::IsAlphaNumeric(c)) return true; + return (c == '_'); +} + +} // anonymous namespace + +bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray* name_1, + ZoneGrowableArray* name_2) { + ASSERT(name_1->is_empty()); + ASSERT(name_2->is_empty()); + // Parse the property class as follows: + // - In \p{name}, 'name' is interpreted + // - either as a general category property value name. + // - or as a binary property name. + // - In \p{name=value}, 'name' is interpreted as an enumerated property name, + // and 'value' is interpreted as one of the available property value names. + // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. + // - Loose matching is not applied. + if (current() == '{') { + // Parse \p{[PropertyName=]PropertyNameValue} + for (Advance(); current() != '}' && current() != '='; Advance()) { + if (!IsUnicodePropertyValueCharacter(current())) return false; + if (!has_next()) return false; + name_1->Add(static_cast(current())); + } + if (current() == '=') { + for (Advance(); current() != '}'; Advance()) { + if (!IsUnicodePropertyValueCharacter(current())) return false; + if (!has_next()) return false; + name_2->Add(static_cast(current())); + } + name_2->Add(0); // null-terminate string. + } + } else { + return false; + } + Advance(); + name_1->Add(0); // null-terminate string. 
+ + ASSERT(static_cast(name_1->length() - 1) == strlen(name_1->data())); + ASSERT(name_2->is_empty() || + static_cast(name_2->length() - 1) == strlen(name_2->data())); + return true; +} + +bool RegExpParser::AddPropertyClassRange( + ZoneGrowableArray* add_to, + bool negate, + ZoneGrowableArray* name_1, + ZoneGrowableArray* name_2) { + ASSERT(name_1->At(name_1->length() - 1) == '\0'); + ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0'); + if (name_2->is_empty()) { + // First attempt to interpret as general category property value name. + const char* name = name_1->data(); + if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, + add_to)) { + return true; + } + // Interpret "Any", "ASCII", and "Assigned". + if (LookupSpecialPropertyValueName(name, add_to, negate)) { + return true; + } + // Then attempt to interpret as binary property name with value name 'Y'. + UProperty property = u_getPropertyEnum(name); + if (!IsSupportedBinaryProperty(property)) return false; + if (!IsExactPropertyAlias(name, property)) return false; + return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to); + } else { + // Both property name and value name are specified. Attempt to interpret + // the property name as enumerated property. + const char* property_name = name_1->data(); + const char* value_name = name_2->data(); + UProperty property = u_getPropertyEnum(property_name); + if (!IsExactPropertyAlias(property_name, property)) return false; + if (property == UCHAR_GENERAL_CATEGORY) { + // We want to allow aggregate value names such as "Letter". 
+ property = UCHAR_GENERAL_CATEGORY_MASK; + } else if (property != UCHAR_SCRIPT && + property != UCHAR_SCRIPT_EXTENSIONS) { + return false; + } + return LookupPropertyValueName(property, value_name, negate, add_to); + } +} + +bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value, + uint32_t* value) { + uint32_t x = 0; + int d = HexValue(current()); + if (d < 0) { + return false; + } + while (d >= 0) { + x = x * 16 + d; + if (x > max_value) { + return false; + } + Advance(); + d = HexValue(current()); + } + *value = x; + return true; +} + uint32_t RegExpParser::ParseClassCharacterEscape() { ASSERT(current() == '\\'); DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next())); @@ -1147,35 +1752,47 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { uint32_t letter = controlLetter & ~('A' ^ 'a'); // For compatibility with JSC, inside a character class // we also accept digits and underscore as control characters. - if ((controlLetter >= '0' && controlLetter <= '9') || - controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { + if (letter >= 'A' && letter <= 'Z') { Advance(2); // Control letters mapped to ASCII control characters in the range // 0x00-0x1f. return controlLetter & 0x1f; } + if (is_unicode()) { + // With /u, \c# or \c_ are invalid. + ReportError("Invalid class escape"); + UNREACHABLE(); + } + if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') { + Advance(2); + return controlLetter & 0x1f; + } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. return '\\'; } case '0': + // With /u, \0 is interpreted as NUL if not followed by another digit. 
+ if (is_unicode() && !(Next() >= '0' && Next() <= '9')) { + Advance(); + return 0; + } FALL_THROUGH; case '1': - FALL_THROUGH; case '2': - FALL_THROUGH; case '3': - FALL_THROUGH; case '4': - FALL_THROUGH; case '5': - FALL_THROUGH; case '6': - FALL_THROUGH; case '7': // For compatibility, we interpret a decimal escape that isn't // a back reference (and therefore either \0 or not valid according // to the specification) as a 1..3 digit octal character code. + if (is_unicode()) { + // With \u, decimal escape is not interpreted as octal character code. + ReportError("Invalid class escape"); + UNREACHABLE(); + } return ParseOctalLiteral(); case 'x': { Advance(); @@ -1183,6 +1800,11 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { if (ParseHexEscape(2, &value)) { return value; } + if (is_unicode()) { + // With \u, invalid escapes are not treated as identity escapes. + ReportError("Invalid escape"); + UNREACHABLE(); + } // If \x is not followed by a two-digit hexadecimal, treat it // as an identity escape. return 'x'; @@ -1190,9 +1812,14 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { case 'u': { Advance(); uint32_t value; - if (ParseHexEscape(4, &value)) { + if (ParseUnicodeEscape(&value)) { return value; } + if (is_unicode()) { + // With \u, invalid escapes are not treated as identity escapes. + ReportError(kUnicodeIdentity); + UNREACHABLE(); + } // If \u is not followed by a four-digit hexadecimal, treat it // as an identity escape. return 'u'; @@ -1202,15 +1829,20 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { // been matched by a more specific case, not just the subset required // by the ECMAScript specification. 
uint32_t result = current(); - Advance(); - return result; + if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { + Advance(); + return result; + } + ReportError(kUnicodeIdentity); + UNREACHABLE(); } } return 0; } -CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) { - ASSERT(0 == *char_class); +bool RegExpParser::ParseClassEscape(ZoneGrowableArray* ranges, + bool add_unicode_case_equivalents, + uint32_t* char_out) { uint32_t first = current(); if (first == '\\') { switch (Next()) { @@ -1220,40 +1852,42 @@ CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) { case 'D': case 's': case 'S': { - *char_class = Next(); + CharacterRange::AddClassEscape(static_cast(Next()), ranges, + add_unicode_case_equivalents); Advance(2); - return CharacterRange::Singleton(0); // Return dummy value. + return true; + } + case 'p': + case 'P': { + if (!is_unicode()) break; + bool negate = Next() == 'P'; + Advance(2); + auto name_1 = new (Z) ZoneGrowableArray(); + auto name_2 = new (Z) ZoneGrowableArray(); + if (!ParsePropertyClassName(name_1, name_2) || + !AddPropertyClassRange(ranges, negate, name_1, name_2)) { + ReportError("Invalid property name in character class"); + UNREACHABLE(); + } + return true; } case kEndMarker: ReportError("\\ at end of pattern"); UNREACHABLE(); default: - uint32_t c = ParseClassCharacterEscape(); - return CharacterRange::Singleton(c); + break; } - } else { - Advance(); - return CharacterRange::Singleton(first); + *char_out = ParseClassCharacterEscape(); + return false; } + Advance(); + *char_out = first; + return false; } -static const uint16_t kNoCharClass = 0; - -// Adds range or pre-defined character class to character ranges. -// If char_class is not kInvalidClass, it's interpreted as a class -// escape (i.e., 's' means whitespace, from '\s'). 
-static inline void AddRangeOrEscape(ZoneGrowableArray* ranges, - uint16_t char_class, - CharacterRange range) { - if (char_class != kNoCharClass) { - CharacterRange::AddClassEscape(char_class, ranges); - } else { - ranges->Add(range); - } -} - -RegExpTree* RegExpParser::ParseCharacterClass() { +RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { static const char* kUnterminated = "Unterminated character class"; + static const char* kRangeInvalid = "Invalid character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; ASSERT(current() == '['); @@ -1265,9 +1899,11 @@ RegExpTree* RegExpParser::ParseCharacterClass() { } ZoneGrowableArray* ranges = new (Z) ZoneGrowableArray(2); + bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case(); while (has_more() && current() != ']') { - uint16_t char_class = kNoCharClass; - CharacterRange first = ParseClassAtom(&char_class); + uint32_t char_1; + bool is_class_1 = + ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1); if (current() == '-') { Advance(); if (current() == kEndMarker) { @@ -1275,26 +1911,32 @@ RegExpTree* RegExpParser::ParseCharacterClass() { // following code report an error. break; } else if (current() == ']') { - AddRangeOrEscape(ranges, char_class, first); + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); ranges->Add(CharacterRange::Singleton('-')); break; } - uint16_t char_class_2 = kNoCharClass; - CharacterRange next = ParseClassAtom(&char_class_2); - if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { + uint32_t char_2; + bool is_class_2 = + ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2); + if (is_class_1 || is_class_2) { // Either end is an escaped character class. Treat the '-' verbatim. - AddRangeOrEscape(ranges, char_class, first); + if (is_unicode()) { + // ES2015 21.2.2.15.1 step 1. 
+ ReportError(kRangeInvalid); + UNREACHABLE(); + } + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); ranges->Add(CharacterRange::Singleton('-')); - AddRangeOrEscape(ranges, char_class_2, next); + if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2)); continue; } - if (first.from() > next.to()) { + if (char_1 > char_2) { ReportError(kRangeOutOfOrder); UNREACHABLE(); } - ranges->Add(CharacterRange::Range(first.from(), next.to())); + ranges->Add(CharacterRange::Range(char_1, char_2)); } else { - AddRangeOrEscape(ranges, char_class, first); + if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); } } if (!has_more()) { @@ -1302,21 +1944,21 @@ RegExpTree* RegExpParser::ParseCharacterClass() { UNREACHABLE(); } Advance(); - if (ranges->length() == 0) { - ranges->Add(CharacterRange::Everything()); - is_negated = !is_negated; - } - return new (Z) RegExpCharacterClass(ranges, is_negated); + RegExpCharacterClass::CharacterClassFlags character_class_flags = + RegExpCharacterClass::DefaultFlags(); + if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED; + return new (Z) + RegExpCharacterClass(ranges, builder->flags(), character_class_flags); } // ---------------------------------------------------------------------------- // The Parser interface. void RegExpParser::ParseRegExp(const String& input, - bool multiline, + RegExpFlags flags, RegExpCompileData* result) { ASSERT(result != NULL); - RegExpParser parser(input, &result->error, multiline); + RegExpParser parser(input, &result->error, flags); // Throws an exception if 'input' is not valid. RegExpTree* tree = parser.ParsePattern(); ASSERT(tree != NULL); diff --git a/runtime/vm/regexp_parser.h b/runtime/vm/regexp_parser.h index a2b626b97a6..63c237a3568 100644 --- a/runtime/vm/regexp_parser.h +++ b/runtime/vm/regexp_parser.h @@ -14,13 +14,18 @@ namespace dart { // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 
class RegExpBuilder : public ZoneAllocated { public: - RegExpBuilder(); + explicit RegExpBuilder(RegExpFlags flags); void AddCharacter(uint16_t character); + void AddUnicodeCharacter(uint32_t character); + void AddEscapedUnicodeCharacter(uint32_t character); // "Adds" an empty expression. Does nothing except consume a // following quantifier void AddEmpty(); + void AddCharacterClass(RegExpCharacterClass* cc); + void AddCharacterClassForDesugaring(uint32_t c); void AddAtom(RegExpTree* tree); + void AddTerm(RegExpTree* tree); void AddAssertion(RegExpTree* tree); void NewAlternative(); // '|' // Attempt to add a quantifier to the last atom added. The return value @@ -30,17 +35,30 @@ class RegExpBuilder : public ZoneAllocated { intptr_t max, RegExpQuantifier::QuantifierType type); RegExpTree* ToRegExp(); + RegExpFlags flags() const { return flags_; } + bool ignore_case() const { return flags_.IgnoreCase(); } + bool is_multi_line() const { return flags_.IsMultiLine(); } + bool is_dot_all() const { return flags_.IsDotAll(); } private: + static const uint16_t kNoPendingSurrogate = 0; + void AddLeadSurrogate(uint16_t lead_surrogate); + void AddTrailSurrogate(uint16_t trail_surrogate); + void FlushPendingSurrogate(); void FlushCharacters(); void FlushText(); void FlushTerms(); + bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); + bool NeedsDesugaringForIgnoreCase(uint32_t c); Zone* zone() const { return zone_; } + bool is_unicode() const { return flags_.IsUnicode(); } Zone* zone_; bool pending_empty_; + RegExpFlags flags_; ZoneGrowableArray* characters_; + uint16_t pending_surrogate_; GrowableArray terms_; GrowableArray text_; GrowableArray alternatives_; @@ -56,16 +74,15 @@ using RegExpCaptureName = ZoneGrowableArray; class RegExpParser : public ValueObject { public: - RegExpParser(const String& in, String* error, bool multiline_mode); + RegExpParser(const String& in, String* error, RegExpFlags regexp_flags); static void ParseRegExp(const String& input, - bool 
multiline, + RegExpFlags regexp_flags, RegExpCompileData* result); RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); RegExpTree* ParseGroup(); - RegExpTree* ParseCharacterClass(); // Parses a {...,...} quantifier and stores the range in the given // out parameters. @@ -78,6 +95,24 @@ class RegExpParser : public ValueObject { // Checks whether the following is a length-digit hexadecimal number, // and sets the value if it is. bool ParseHexEscape(intptr_t length, uint32_t* value); + bool ParseUnicodeEscape(uint32_t* value); + bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value); + + // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or + // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the + // result in the given out parameters. If the shorthand is used, + // nothing will be added to name_2. + bool ParsePropertyClassName(ZoneGrowableArray* name_1, + ZoneGrowableArray* name_2); + // Adds the specified unicode property to the provided character range. + bool AddPropertyClassRange(ZoneGrowableArray* add_to, + bool negate, + ZoneGrowableArray* name_1, + ZoneGrowableArray* name_2); + // Returns a regexp node that corresponds to one of these unicode + // property sequences: "Any", "ASCII", "Assigned". + RegExpTree* GetPropertySequence(ZoneGrowableArray* name_1); + RegExpTree* ParseCharacterClass(const RegExpBuilder* builder); uint32_t ParseOctalLiteral(); @@ -87,7 +122,10 @@ class RegExpParser : public ValueObject { // can be reparsed. bool ParseBackReferenceIndex(intptr_t* index_out); - CharacterRange ParseClassAtom(uint16_t* char_class); + // Attempts to parse a possible escape within a character class. 
+ bool ParseClassEscape(ZoneGrowableArray* ranges, + bool add_unicode_case_equivalents, + uint32_t* char_out); void ReportError(const char* message); void Advance(); void Advance(intptr_t dist); @@ -100,6 +138,9 @@ class RegExpParser : public ValueObject { void set_contains_anchor() { contains_anchor_ = true; } intptr_t captures_started() { return captures_started_; } intptr_t position() { return next_pos_ - 1; } + bool is_unicode() const { return top_level_flags_.IsUnicode(); } + + static bool IsSyntaxCharacterOrSlash(uint32_t c); static const intptr_t kMaxCaptures = 1 << 16; static const uint32_t kEndMarker = (1 << 21); @@ -120,9 +161,10 @@ class RegExpParser : public ValueObject { RegExpLookaround::Type lookaround_type, intptr_t disjunction_capture_index, const RegExpCaptureName* capture_name, + RegExpFlags flags, Zone* zone) : previous_state_(previous_state), - builder_(new (zone) RegExpBuilder()), + builder_(new (zone) RegExpBuilder(flags)), group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), @@ -198,6 +240,7 @@ class RegExpParser : public ValueObject { bool has_more() { return has_more_; } bool has_next() { return next_pos_ < in().Length(); } uint32_t Next(); + uint32_t ReadNext(bool update_position); const String& in() { return in_; } void ScanForCaptures(); @@ -212,7 +255,7 @@ class RegExpParser : public ValueObject { // The capture count is only valid after we have scanned for captures. 
intptr_t capture_count_; bool has_more_; - bool multiline_; + RegExpFlags top_level_flags_; bool simple_; bool contains_anchor_; bool is_scanned_for_captures_; diff --git a/runtime/vm/regexp_test.cc b/runtime/vm/regexp_test.cc index 531f12d5b97..f8009312e5b 100644 --- a/runtime/vm/regexp_test.cc +++ b/runtime/vm/regexp_test.cc @@ -16,7 +16,7 @@ static RawArray* Match(const String& pat, const String& str) { Thread* thread = Thread::Current(); Zone* zone = thread->zone(); const RegExp& regexp = - RegExp::Handle(RegExpEngine::CreateRegExp(thread, pat, false, false)); + RegExp::Handle(RegExpEngine::CreateRegExp(thread, pat, RegExpFlags())); const Smi& idx = Smi::Handle(Smi::New(0)); return IRRegExpMacroAssembler::Execute(regexp, str, idx, /*sticky=*/false, zone); diff --git a/runtime/vm/runtime_entry_list.h b/runtime/vm/runtime_entry_list.h index ddcd350bd43..dd4159d851e 100644 --- a/runtime/vm/runtime_entry_list.h +++ b/runtime/vm/runtime_entry_list.h @@ -79,7 +79,9 @@ namespace dart { V(double, LibcAsin, double) \ V(double, LibcAtan, double) \ V(double, LibcAtan2, double, double) \ - V(RawBool*, CaseInsensitiveCompareUC16, RawString*, RawSmi*, RawSmi*, \ + V(RawBool*, CaseInsensitiveCompareUCS2, RawString*, RawSmi*, RawSmi*, \ + RawSmi*) \ + V(RawBool*, CaseInsensitiveCompareUTF16, RawString*, RawSmi*, RawSmi*, \ RawSmi*) \ V(void, EnterSafepoint) \ V(void, ExitSafepoint) diff --git a/runtime/vm/splay-tree.h b/runtime/vm/splay-tree.h new file mode 100644 index 00000000000..54c5d9faf0e --- /dev/null +++ b/runtime/vm/splay-tree.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019, the Dart project authors. Please see the AUTHORS file +// for details. All rights reserved. Use of this source code is governed by a +// BSD-style license that can be found in the LICENSE file. + +#ifndef RUNTIME_VM_SPLAY_TREE_H_ +#define RUNTIME_VM_SPLAY_TREE_H_ + +#include "platform/splay-tree.h" +#include "vm/zone.h" + +namespace dart { + +// A zone splay tree. 
The config type parameter encapsulates the +// different configurations of a concrete splay tree (see +// platform/splay-tree.h). The tree itself and all its elements are allocated +// in the Zone. +template +class ZoneSplayTree final : public SplayTree { + public: + explicit ZoneSplayTree(Zone* zone) + : SplayTree(ASSERT_NOTNULL(zone)) {} + ~ZoneSplayTree() { + // Reset the root to avoid unneeded iteration over all tree nodes + // in the destructor. For a zone-allocated tree, nodes will be + // freed by the Zone. + SplayTree::ResetRoot(); + } +}; + +} // namespace dart + +#endif // RUNTIME_VM_SPLAY_TREE_H_ diff --git a/runtime/vm/stub_code_arm64_test.cc b/runtime/vm/stub_code_arm64_test.cc index 4e69a5a842d..7471c7de8fb 100644 --- a/runtime/vm/stub_code_arm64_test.cc +++ b/runtime/vm/stub_code_arm64_test.cc @@ -83,7 +83,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(R1, lhs_index); __ LoadObject(R2, rhs_index); __ LoadObject(R3, length); - __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); __ LeaveDartFrame(); __ ret(); // Return value is in R0. } diff --git a/runtime/vm/stub_code_arm_test.cc b/runtime/vm/stub_code_arm_test.cc index 2a664780063..d849dc0a292 100644 --- a/runtime/vm/stub_code_arm_test.cc +++ b/runtime/vm/stub_code_arm_test.cc @@ -82,7 +82,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(R1, lhs_index); __ LoadObject(R2, rhs_index); __ LoadObject(R3, length); - __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); __ LeaveDartFrameAndReturn(); // Return value is in R0. 
} diff --git a/runtime/vm/stub_code_ia32_test.cc b/runtime/vm/stub_code_ia32_test.cc index a4f79fcf25a..578afde029f 100644 --- a/runtime/vm/stub_code_ia32_test.cc +++ b/runtime/vm/stub_code_ia32_test.cc @@ -86,7 +86,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ movl(Address(ESP, 2 * kWordSize), EAX); // Push argument 3. __ LoadObject(EAX, length); __ movl(Address(ESP, 3 * kWordSize), EAX); // Push argument 4. - __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); __ leave(); __ ret(); // Return value is in EAX. } diff --git a/runtime/vm/stub_code_x64_test.cc b/runtime/vm/stub_code_x64_test.cc index a71484ab6b6..4cdf61386e2 100644 --- a/runtime/vm/stub_code_x64_test.cc +++ b/runtime/vm/stub_code_x64_test.cc @@ -83,7 +83,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(CallingConventions::kArg2Reg, lhs_index); __ LoadObject(CallingConventions::kArg3Reg, rhs_index); __ LoadObject(CallingConventions::kArg4Reg, length); - __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); __ LeaveStubFrame(); __ ret(); // Return value is in RAX. 
} diff --git a/sdk/lib/_internal/js_runtime/lib/core_patch.dart b/sdk/lib/_internal/js_runtime/lib/core_patch.dart index e31fe29db10..998f42b774d 100644 --- a/sdk/lib/_internal/js_runtime/lib/core_patch.dart +++ b/sdk/lib/_internal/js_runtime/lib/core_patch.dart @@ -528,9 +528,15 @@ class RegExp { @pragma('dart2js:noInline') @patch factory RegExp(String source, - {bool multiLine: false, bool caseSensitive: true}) => + {bool multiLine: false, + bool caseSensitive: true, + bool unicode: false, + bool dotAll: false}) => new JSSyntaxRegExp(source, - multiLine: multiLine, caseSensitive: caseSensitive); + multiLine: multiLine, + caseSensitive: caseSensitive, + unicode: unicode, + dotAll: dotAll); @patch static String escape(String text) => quoteStringForRegExp(text); diff --git a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart index 51b4bfbe143..3dacc06ae0a 100644 --- a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart +++ b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart @@ -42,18 +42,22 @@ class JSSyntaxRegExp implements RegExp { var _nativeGlobalRegExp; var _nativeAnchoredRegExp; - String toString() => 'RegExp/$pattern/'; + String toString() => + 'RegExp/$pattern/' + JS('String', '#.flags', _nativeRegExp); JSSyntaxRegExp(String source, - {bool multiLine: false, bool caseSensitive: true}) + {bool multiLine: false, + bool caseSensitive: true, + bool unicode: false, + bool dotAll: false}) : this.pattern = source, - this._nativeRegExp = - makeNative(source, multiLine, caseSensitive, false); + this._nativeRegExp = makeNative( + source, multiLine, caseSensitive, unicode, dotAll, false); get _nativeGlobalVersion { if (_nativeGlobalRegExp != null) return _nativeGlobalRegExp; - return _nativeGlobalRegExp = - makeNative(pattern, _isMultiLine, _isCaseSensitive, true); + return _nativeGlobalRegExp = makeNative( + pattern, _isMultiLine, _isCaseSensitive, _isUnicode, _isDotAll, true); } get _nativeAnchoredVersion { @@ 
-63,18 +67,22 @@ class JSSyntaxRegExp implements RegExp { // that it tries, and you can see if the original regexp matched, or it // was the added zero-width match that matched, by looking at the last // capture. If it is a String, the match participated, otherwise it didn't. - return _nativeAnchoredRegExp = - makeNative('$pattern|()', _isMultiLine, _isCaseSensitive, true); + return _nativeAnchoredRegExp = makeNative('$pattern|()', _isMultiLine, + _isCaseSensitive, _isUnicode, _isDotAll, true); } bool get _isMultiLine => JS('bool', '#.multiline', _nativeRegExp); bool get _isCaseSensitive => JS('bool', '!#.ignoreCase', _nativeRegExp); + bool get _isUnicode => JS('bool', '#.unicode', _nativeRegExp); + bool get _isDotAll => JS('bool', '#.dotAll', _nativeRegExp); - static makeNative( - String source, bool multiLine, bool caseSensitive, bool global) { + static makeNative(String source, bool multiLine, bool caseSensitive, + bool unicode, bool dotAll, bool global) { checkString(source); String m = multiLine == true ? 'm' : ''; String i = caseSensitive == true ? '' : 'i'; + String u = unicode ? 'u' : ''; + String s = dotAll ? 's' : ''; String g = global ? 'g' : ''; // We're using the JavaScript's try catch instead of the Dart one to avoid // dragging in Dart runtime support just because of using RegExp. @@ -87,10 +95,12 @@ class JSSyntaxRegExp implements RegExp { } catch (e) { return e; } - })(#, # + # + #)''', + })(#, # + # + # + # + #)''', source, m, i, + u, + s, g); if (JS('bool', '# instanceof RegExp', regexp)) return regexp; // The returned value is the JavaScript exception. 
Turn it into a @@ -99,7 +109,7 @@ class JSSyntaxRegExp implements RegExp { throw new FormatException('Illegal RegExp pattern ($errorMessage)', source); } - Match firstMatch(String string) { + RegExpMatch firstMatch(String string) { List m = JS('JSExtendableArray|Null', r'#.exec(#)', _nativeRegExp, checkString(string)); if (m == null) return null; @@ -116,7 +126,7 @@ class JSSyntaxRegExp implements RegExp { return null; } - Iterable allMatches(String string, [int start = 0]) { + Iterable allMatches(String string, [int start = 0]) { checkString(string); checkInt(start); if (start < 0 || start > string.length) { @@ -125,7 +135,7 @@ class JSSyntaxRegExp implements RegExp { return new _AllMatchesIterable(this, string, start); } - Match _execGlobal(String string, int start) { + RegExpMatch _execGlobal(String string, int start) { Object regexp = _nativeGlobalVersion; JS('void', '#.lastIndex = #', regexp, start); List match = JS('JSExtendableArray|Null', '#.exec(#)', regexp, string); @@ -133,7 +143,7 @@ class JSSyntaxRegExp implements RegExp { return new _MatchImplementation(this, match); } - Match _execAnchored(String string, int start) { + RegExpMatch _execAnchored(String string, int start) { Object regexp = _nativeAnchoredVersion; JS('void', '#.lastIndex = #', regexp, start); List match = JS('JSExtendableArray|Null', '#.exec(#)', regexp, string); @@ -144,7 +154,7 @@ class JSSyntaxRegExp implements RegExp { return new _MatchImplementation(this, match); } - Match matchAsPrefix(String string, [int start = 0]) { + RegExpMatch matchAsPrefix(String string, [int start = 0]) { if (start < 0 || start > string.length) { throw new RangeError.range(start, 0, string.length); } @@ -153,6 +163,8 @@ class JSSyntaxRegExp implements RegExp { bool get isMultiLine => _isMultiLine; bool get isCaseSensitive => _isCaseSensitive; + bool get isUnicode => _isUnicode; + bool get isDotAll => _isDotAll; } class _MatchImplementation implements RegExpMatch { @@ -219,25 +231,34 @@ class 
_MatchImplementation implements RegExpMatch { } } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final JSSyntaxRegExp _re; final String _string; final int _start; _AllMatchesIterable(this._re, this._string, this._start); - Iterator get iterator => new _AllMatchesIterator(_re, _string, _start); + Iterator get iterator => + new _AllMatchesIterator(_re, _string, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final JSSyntaxRegExp _regExp; String _string; int _nextIndex; - Match _current; + RegExpMatch _current; _AllMatchesIterator(this._regExp, this._string, this._nextIndex); - Match get current => _current; + RegExpMatch get current => _current; + + static bool _isLeadSurrogate(int c) { + return c >= 0xd800 && c <= 0xdbff; + } + + static bool _isTrailSurrogate(int c) { + return c >= 0xdc00 && c <= 0xdfff; + } bool moveNext() { if (_string == null) return false; @@ -247,6 +268,15 @@ class _AllMatchesIterator implements Iterator { _current = match; int nextIndex = match.end; if (match.start == nextIndex) { + // Zero-width match. Advance by one more, unless the regexp + // is in unicode mode and it would put us within a surrogate + // pair. In that case, advance past the code point as a whole. + if (_regExp.isUnicode && + _nextIndex + 1 < _string.length && + _isLeadSurrogate(_string.codeUnitAt(_nextIndex)) && + _isTrailSurrogate(_string.codeUnitAt(_nextIndex + 1))) { + nextIndex++; + } nextIndex++; } _nextIndex = nextIndex; @@ -260,6 +290,6 @@ class _AllMatchesIterator implements Iterator { } /// Find the first match of [regExp] in [string] at or after [start]. 
-Match firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { +RegExpMatch firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { return regExp._execGlobal(string, start); } diff --git a/sdk/lib/core/regexp.dart b/sdk/lib/core/regexp.dart index c0bd153653e..b6044139297 100644 --- a/sdk/lib/core/regexp.dart +++ b/sdk/lib/core/regexp.dart @@ -16,7 +16,7 @@ part of dart.core; * for the specification of JavaScript regular expressions. * * [firstMatch] is the main implementation method that applies a regular - * expression to a string and returns the first [Match]. All + * expression to a string and returns the first [RegExpMatch]. All * other methods in [RegExp] can build on it. * * Use [allMatches] to look for all matches of a regular expression in @@ -27,7 +27,7 @@ part of dart.core; * ```dart * RegExp exp = new RegExp(r"(\w+)"); * String str = "Parse my string"; - * Iterable matches = exp.allMatches(str); + * Iterable matches = exp.allMatches(str); * ``` * * Note the use of a _raw string_ (a string prefixed with `r`) @@ -47,6 +47,12 @@ abstract class RegExp implements Pattern { * * If `caseSensitive` is disabled, then case is ignored. * + * If `unicode` is enabled, then the pattern is treated as a Unicode + * pattern as described by the ECMAScript standard. + * + * If `dotAll` is enabled, then the `.` pattern will match _all_ characters, + * including line terminators. + * * Example: * * ```dart @@ -60,7 +66,10 @@ abstract class RegExp implements Pattern { * interpolation is required. */ external factory RegExp(String source, - {bool multiLine = false, bool caseSensitive = true}); + {bool multiLine = false, + bool caseSensitive = true, + bool unicode = false, + bool dotAll = false}); /** * Returns a regular expression that matches [text]. @@ -79,14 +88,14 @@ abstract class RegExp implements Pattern { * Searches for the first match of the regular expression * in the string [input]. Returns `null` if there is no match. 
*/ - Match firstMatch(String input); + RegExpMatch firstMatch(String input); /** * Returns an iterable of the matches of the regular expression on [input]. * * If [start] is provided, only start looking for matches at `start`. */ - Iterable<Match> allMatches(String input, [int start = 0]); + Iterable<RegExpMatch> allMatches(String input, [int start = 0]); /** * Returns whether the regular expression has a match in the string [input]. @@ -120,6 +129,33 @@ abstract class RegExp implements Pattern { * versions of the same letter. */ bool get isCaseSensitive; + + /** + * Whether this regular expression is in Unicode mode. + * + * In Unicode mode, UTF-16 surrogate pairs in the original string will be + * treated as a single code point and will not match separately. Otherwise, + * the target string will be treated purely as a sequence of individual code + * units and surrogates will not be treated specially. + * + * In Unicode mode, the syntax of the RegExp pattern is more restricted, but + * some pattern features, like Unicode property escapes, are only available in + * this mode. + */ + bool get isUnicode; + + /** + * Whether "." in this regular expression matches line terminators. + * + * When false, the "." character matches a single character, unless that + * character is a line terminator. When true, then the "." character will + * match any single character including line terminators. + * + * This feature is distinct from [isMultiLine], as they affect the behavior + * of different pattern characters, and so they can be used together or + * separately. + */ + bool get isDotAll; } /** diff --git a/tests/corelib_2/regexp/dot-all_test.dart b/tests/corelib_2/regexp/dot-all_test.dart new file mode 100644 index 00000000000..5610e7fd797 --- /dev/null +++ b/tests/corelib_2/regexp/dot-all_test.dart @@ -0,0 +1,117 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2017 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + // The flags accessors. 
+ var re = new RegExp(r".", dotAll: true); + assertTrue(re.isCaseSensitive); + assertFalse(re.isMultiLine); + assertFalse(re.isUnicode); + assertTrue(re.isDotAll); + + re = new RegExp(r".", + caseSensitive: false, multiLine: true, unicode: true, dotAll: true); + assertFalse(re.isCaseSensitive); + assertTrue(re.isMultiLine); + assertTrue(re.isUnicode); + assertTrue(re.isDotAll); + + re = new RegExp(r".", caseSensitive: false, multiLine: true, unicode: true); + assertFalse(re.isCaseSensitive); + assertTrue(re.isMultiLine); + assertTrue(re.isUnicode); + assertFalse(re.isDotAll); + + // Default '.' behavior. + re = new RegExp(r"^.$"); + assertTrue(re.hasMatch("a")); + assertTrue(re.hasMatch("3")); + assertTrue(re.hasMatch("π")); + assertTrue(re.hasMatch("\u2027")); + assertTrue(re.hasMatch("\u0085")); + assertTrue(re.hasMatch("\v")); + assertTrue(re.hasMatch("\f")); + assertTrue(re.hasMatch("\u180E")); + assertFalse(re.hasMatch("\u{10300}")); // Supplementary plane. + assertFalse(re.hasMatch("\n")); + assertFalse(re.hasMatch("\r")); + assertFalse(re.hasMatch("\u2028")); + assertFalse(re.hasMatch("\u2029")); + + // Default '.' behavior (unicode). + re = new RegExp(r"^.$", unicode: true); + assertTrue(re.hasMatch("a")); + assertTrue(re.hasMatch("3")); + assertTrue(re.hasMatch("π")); + assertTrue(re.hasMatch("\u2027")); + assertTrue(re.hasMatch("\u0085")); + assertTrue(re.hasMatch("\v")); + assertTrue(re.hasMatch("\f")); + assertTrue(re.hasMatch("\u180E")); + assertTrue(re.hasMatch("\u{10300}")); // Supplementary plane. + assertFalse(re.hasMatch("\n")); + assertFalse(re.hasMatch("\r")); + assertFalse(re.hasMatch("\u2028")); + assertFalse(re.hasMatch("\u2029")); + + // DotAll '.' behavior. 
+ re = new RegExp(r"^.$", dotAll: true); + assertTrue(re.hasMatch("a")); + assertTrue(re.hasMatch("3")); + assertTrue(re.hasMatch("π")); + assertTrue(re.hasMatch("\u2027")); + assertTrue(re.hasMatch("\u0085")); + assertTrue(re.hasMatch("\v")); + assertTrue(re.hasMatch("\f")); + assertTrue(re.hasMatch("\u180E")); + assertFalse(re.hasMatch("\u{10300}")); // Supplementary plane. + assertTrue(re.hasMatch("\n")); + assertTrue(re.hasMatch("\r")); + assertTrue(re.hasMatch("\u2028")); + assertTrue(re.hasMatch("\u2029")); + + // DotAll '.' behavior (unicode). + re = new RegExp(r"^.$", unicode: true, dotAll: true); + assertTrue(re.hasMatch("a")); + assertTrue(re.hasMatch("3")); + assertTrue(re.hasMatch("π")); + assertTrue(re.hasMatch("\u2027")); + assertTrue(re.hasMatch("\u0085")); + assertTrue(re.hasMatch("\v")); + assertTrue(re.hasMatch("\f")); + assertTrue(re.hasMatch("\u180E")); + assertTrue(re.hasMatch("\u{10300}")); // Supplementary plane. + assertTrue(re.hasMatch("\n")); + assertTrue(re.hasMatch("\r")); + assertTrue(re.hasMatch("\u2028")); + assertTrue(re.hasMatch("\u2029")); +} diff --git a/tests/corelib_2/regexp/lookbehind_test.dart b/tests/corelib_2/regexp/lookbehind_test.dart index 5ccdcdb6253..3fd11af5a67 100644 --- a/tests/corelib_2/regexp/lookbehind_test.dart +++ b/tests/corelib_2/regexp/lookbehind_test.dart @@ -433,8 +433,6 @@ void main() { assertThrows(() => new RegExp(r"(?<=.)?")); //# 01: ok assertThrows(() => new RegExp(r"(?<=.)+")); //# 01: ok - // No unicode flag (yet), so can't test these. - // See https://github.com/dart-lang/sdk/issues/36170. 
- // assertThrows("/(?<=.)*/u", SyntaxError); - // assertThrows("/(?<=.){1,2}/u", SyntaxError); + assertThrows(() => new RegExp(r"(?<=.)*", unicode: true)); //# 01: ok + assertThrows(() => new RegExp(r"(?<=.){1,2}", unicode: true)); //# 01: ok } diff --git a/tests/corelib_2/regexp/named-captures_test.dart b/tests/corelib_2/regexp/named-captures_test.dart index aff61b82d8b..ee2d843b385 100644 --- a/tests/corelib_2/regexp/named-captures_test.dart +++ b/tests/corelib_2/regexp/named-captures_test.dart @@ -31,35 +31,90 @@ import 'package:expect/expect.dart'; import 'v8_regexp_utils.dart'; void main() { - void testRE(RegExp re, String input, bool expectedResult) { - if (expectedResult) { - assertTrue(re.hasMatch(input)); - } else { - assertFalse(re.hasMatch(input)); - } - } - void execRE(RegExp re, String input, List expectedResult) { assertTrue(re.hasMatch(input)); shouldBe(re.firstMatch(input), expectedResult); } + void execString(String pattern, String input, List expectedResult, + {bool unicode = true, bool caseSensitive: false}) { + execRE(RegExp(pattern, unicode: unicode, caseSensitive: caseSensitive), + input, expectedResult); + } + void namedRE(RegExp re, String input, Map expectedResults) { assertTrue(re.hasMatch(input)); - var match = re.firstMatch(input) as RegExpMatch; + var match = re.firstMatch(input); for (var s in expectedResults.keys) { assertEquals(match.namedGroup(s), expectedResults[s]); } } + void execStringGroups( + String pattern, String input, Map expectedResults, + {bool unicode = true, bool caseSensitive: false}) { + namedRE(RegExp(pattern, unicode: unicode, caseSensitive: caseSensitive), + input, expectedResults); + } + void hasNames(RegExp re, String input, List expectedResults) { assertTrue(re.hasMatch(input)); - var match = re.firstMatch(input) as RegExpMatch; + var match = re.firstMatch(input); for (var s in match.groupNames) { assertTrue(expectedResults.contains(s)); } } + void matchesIndexEqual(String input, RegExp re1, RegExp re2) { + 
var m1 = re1.firstMatch(input); + var m2 = re2.firstMatch(input); + if (m2 == null) { + assertNull(m1); + } else { + assertTrue(m1 != null); + assertEquals(m1.groupCount, m2.groupCount); + for (int i = 0; i < m1.groupCount; i++) { + assertEquals(m1.group(i), m2.group(i)); + } + } + } + + // Malformed named captures. + // Empty name. + assertThrows(() => RegExp(r"(?<>a)", unicode: true)); + // Unterminated name. + assertThrows(() => RegExp(r"(? RegExp(r"(?<42a>a)", unicode: true)); + // Name starting with invalid char. + assertThrows(() => RegExp(r"(?<:a>a)", unicode: true)); + // Name containing invalid char. + assertThrows(() => RegExp(r"(?a)", unicode: true)); + // Duplicate name. + assertThrows(() => RegExp(r"(?a)(?a)", unicode: true)); + // Duplicate name. + assertThrows(() => RegExp(r"(?a)(?b)(?a)", unicode: true)); + // Invalid reference. + assertThrows(() => RegExp(r"\k", unicode: true)); + // Unterminated reference. + assertThrows(() => RegExp(r"\k RegExp(r"\k", unicode: true)); + // Lone \k. + assertThrows(() => RegExp(r"(?.)\k", unicode: true)); + // Unterminated reference. + assertThrows(() => RegExp(r"(?.)\k RegExp(r"(?.)\k", unicode: true)); + // Invalid reference. + assertThrows(() => RegExp(r"(?a)\k", unicode: true)); + // Invalid reference. + assertThrows(() => RegExp(r"(?a)\k", unicode: true)); + // Invalid reference. + assertThrows(() => RegExp(r"\k(?a)", unicode: true)); + // Identity escape in capture. + assertThrows(() => RegExp(r"(?\a)", unicode: true)); + // Behavior in non-unicode mode. assertThrows(() => RegExp(r"(?<>a)")); assertThrows(() => RegExp(r"(? RegExp(r"\k.)")); assertThrows(() => RegExp(r"\k(?.)")); - // TODO(sstrickl): Add more tests when unicode flag support is in. - // https://github.com/dart-lang/sdk/issues/36170 + // Basic named groups. 
+ execString(r"(?a)", "bab", ["a", "a"]); + execString(r"(?a)", "bab", ["a", "a"]); + execString(r"(?<_>a)", "bab", ["a", "a"]); + execString(r"(?<$>a)", "bab", ["a", "a"]); + execString(r".(?<$>a).", "bab", ["bab", "a"]); + execString(r".(?a)(.)", "bab", ["bab", "a", "b"]); + execString(r".(?a)(?.)", "bab", ["bab", "a", "b"]); + execString(r".(?\w\w)", "bab", ["bab", "ab"]); + execString(r"(?\w\w\w)", "bab", ["bab", "bab"]); + execString(r"(?\w\w)(?\w)", "bab", ["bab", "ba", "b"]); + + execString(r"(?a)", "bab", ["a", "a"], unicode: false); + execString(r"(?a)", "bab", ["a", "a"], unicode: false); + execString(r"(?<_>a)", "bab", ["a", "a"], unicode: false); + execString(r"(?<$>a)", "bab", ["a", "a"], unicode: false); + execString(r".(?<$>a).", "bab", ["bab", "a"], unicode: false); + execString(r".(?a)(.)", "bab", ["bab", "a", "b"], unicode: false); + execString(r".(?a)(?.)", "bab", ["bab", "a", "b"], unicode: false); + execString(r".(?\w\w)", "bab", ["bab", "ab"], unicode: false); + execString(r"(?\w\w\w)", "bab", ["bab", "bab"], unicode: false); + execString(r"(?\w\w)(?\w)", "bab", ["bab", "ba", "b"], unicode: false); + + matchesIndexEqual( + "bab", RegExp(r"(?a)", unicode: true), RegExp(r"(a)", unicode: true)); + matchesIndexEqual("bab", RegExp(r"(?a)", unicode: true), + RegExp(r"(a)", unicode: true)); + matchesIndexEqual( + "bab", RegExp(r"(?<_>a)", unicode: true), RegExp(r"(a)", unicode: true)); + matchesIndexEqual( + "bab", RegExp(r"(?<$>a)", unicode: true), RegExp(r"(a)", unicode: true)); + matchesIndexEqual("bab", RegExp(r".(?<$>a).", unicode: true), + RegExp(r".(a).", unicode: true)); + matchesIndexEqual("bab", RegExp(r".(?a)(.)", unicode: true), + RegExp(r".(a)(.)", unicode: true)); + matchesIndexEqual("bab", RegExp(r".(?a)(?.)", unicode: true), + RegExp(r".(a)(.)", unicode: true)); + matchesIndexEqual("bab", RegExp(r".(?\w\w)", unicode: true), + RegExp(r".(\w\w)", unicode: true)); + matchesIndexEqual("bab", RegExp(r"(?\w\w\w)", unicode: true), + 
RegExp(r"(\w\w\w)", unicode: true)); + matchesIndexEqual("bab", RegExp(r"(?\w\w)(?\w)", unicode: true), + RegExp(r"(\w\w)(\w)", unicode: true)); + + execString(r"(?b).\1", "bab", ["bab", "b"]); + execString(r"(.)(?a)\1\2", "baba", ["baba", "b", "a"]); + execString(r"(.)(?a)(?\1)(\2)", "baba", ["baba", "b", "a", "b", "a"]); + execString(r"(?<)a", ">)a", ">a", [">a", ">"]); + + // Named references. + var pattern = r"(?.).\k"; + execString(pattern, "bab", ["bab", "b"]); + assertFalse(RegExp(pattern, unicode: true).hasMatch("baa")); + + // Nested groups. + pattern = r"(?.(?.(?.)))"; + execString(pattern, "bab", ["bab", "bab", "ab", "b"]); + execStringGroups(pattern, "bab", {"a": "bab", "b": "ab", "c": "b"}); + + // Reference inside group. + pattern = r"(?\k\w).."; + execString(pattern, "bab", ["bab", "b"]); + execStringGroups(pattern, "bab", {"a": "b"}); + + // Reference before group. + pattern = r"\k(?b)\w\k"; + execString(pattern, "bab", ["bab", "b"], unicode: false); + execString(pattern, "bab", ["bab", "b"]); + execStringGroups(pattern, "bab", {"a": "b"}); + + pattern = r"(?b)\k(?a)\k"; + execString(pattern, "bab", ["bab", "b", "a"], unicode: false); + execString(pattern, "bab", ["bab", "b", "a"]); + execStringGroups(pattern, "bab", {"a": "a", "b": "b"}); + + // Reference named groups. + var match = RegExp(r"(?a)(?b)\k", unicode: true).firstMatch("aba"); + assertEquals("a", match.namedGroup("a")); + assertEquals("b", match.namedGroup("b")); + assertFalse(match.groupNames.contains("c")); + + match = + RegExp(r"(?a)(?b)\k|(?c)", unicode: true).firstMatch("aba"); + assertNull(match.namedGroup("c")); + + // Unicode names. 
+ execStringGroups(r"(?<π>a)", "bab", {"π": "a"}); + execStringGroups(r"(?<\u{03C0}>a)", "bab", {"π": "a"}); + execStringGroups(r"(?<π>a)", "bab", {"\u03C0": "a"}); + execStringGroups(r"(?<\u{03C0}>a)", "bab", {"\u03C0": "a"}); + execStringGroups(r"(?<$>a)", "bab", {"\$": "a"}); + execStringGroups(r"(?<_>a)", "bab", {"_": "a"}); + execStringGroups(r"(?<$𐒤>a)", "bab", {"\$𐒤": "a"}); + execStringGroups(r"(?<_\u200C>a)", "bab", {"_\u200C": "a"}); + execStringGroups(r"(?<_\u200D>a)", "bab", {"_\u200D": "a"}); + execStringGroups(r"(?<ಠ_ಠ>a)", "bab", {"ಠ_ಠ": "a"}); + // ID_Continue but not ID_Start. + assertThrows(() => RegExp(r"/(?<❤>a)", unicode: true)); + assertThrows(() => RegExp(r"/(?<𐒤>a)", unicode: true)); + + execStringGroups(r"(?<π>a)", "bab", {"π": "a"}, unicode: false); + execStringGroups(r"(?<$>a)", "bab", {"\$": "a"}, unicode: false); + execStringGroups(r"(?<_>a)", "bab", {"_": "a"}, unicode: false); + assertThrows(() => RegExp(r"(?<$𐒤>a)")); + execStringGroups(r"(?<ಠ_ಠ>a)", "bab", {"ಠ_ಠ": "a"}, unicode: false); + // ID_Continue but not ID_Start. + assertThrows(() => RegExp(r"/(?<❤>a)")); + assertThrows(() => RegExp(r"/(?<𐒤>a)")); + + // Interaction with lookbehind assertions. + pattern = r"(?<=(?\w){3})f"; + execString(pattern, "abcdef", ["f", "c"]); + execStringGroups(pattern, "abcdef", {"a": "c"}); + + execStringGroups(r"(?<=(?\w){4})f", "abcdef", {"a": "b"}); + execStringGroups(r"(?<=(?\w)+)f", "abcdef", {"a": "a"}); + assertFalse(RegExp(r"(?<=(?\w){6})f", unicode: true).hasMatch("abcdef")); + + execString(r"((?<=\w{3}))f", "abcdef", ["f", ""]); + execString(r"(?(?<=\w{3}))f", "abcdef", ["f", ""]); + + execString(r"(?\d){3})f", "abcdef", ["f", null]); + assertFalse(RegExp(r"(?\D){3})f", unicode: true).hasMatch("abcdef")); + + execString(r"(?\D){3})f|f", "abcdef", ["f", null]); + execString(r"(?(?.)|(?.)", unicode: true).firstMatch("abcd"); + Expect.setEquals(["fst", "snd"], match.groupNames); + + // Backslash as ID_Start and ID_Continue (v8:5868). 
+ assertThrows(() => RegExp("(?<\\>.)")); // '\' misclassified as ID_Start. + assertThrows(() => RegExp("(?.)")); // '\' misclassified as ID_Continue. + + // Backreference before the group (exercises the capture mini-parser). + assertThrows(() => RegExp(r"/\1(?:.)", unicode: true)); + assertThrows(() => RegExp(r"/\1(?<=a).", unicode: true)); + assertThrows(() => RegExp(r"/\1(?.)", "abcd", ["a", "a"]); + + // Unicode escapes in capture names. (Testing both unicode interpreted by + // Dart string handling and also escaped unicode making it to RegExp parser.) + + // \u Lead \u Trail + assertTrue(RegExp("(?.)", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"(?.)", unicode: true).hasMatch("a")); + assertThrows(() => RegExp("(?.)", unicode: true)); // \u Lead + assertThrows(() => RegExp(r"(?.)", unicode: true)); // \u Lead + assertThrows(() => RegExp("(?.)", unicode: true)); // \u Trail + assertThrows(() => RegExp(r"(?.)", unicode: true)); // \u Trail + // \u NonSurrogate + assertTrue(RegExp("(?<\u0041>.)", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"(?<\u0041>.)", unicode: true).hasMatch("a")); + // \u{ Surrogate, ID_Continue } + assertTrue(RegExp("(?.)", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"(?.)", unicode: true).hasMatch("a")); + + // \u{ Out-of-bounds } -- only need to test RegExp parser for this. + assertThrows(() => RegExp(r"(?.)", unicode: true)); + + // Also checking non-unicode patterns, where surrogate pairs will not + // be combined (so only \u0041 will have any success). 
+ + assertThrows(() => RegExp("(?.)")); + assertThrows(() => RegExp(r"(?.)")); + assertThrows(() => RegExp("(?.)")); + assertThrows(() => RegExp(r"(?.)")); + assertThrows(() => RegExp("(?.)")); + assertThrows(() => RegExp(r"(?.)")); + assertTrue(RegExp("(?<\u0041>.)").hasMatch("a")); + assertTrue(RegExp(r"(?<\u0041>.)").hasMatch("a")); + assertThrows(() => RegExp("(?.)")); + assertThrows(() => RegExp(r"(?.)")); + assertThrows(() => RegExp("(?.)")); + assertThrows(() => RegExp(r"(?.)")); + assertThrows(() => RegExp(r"(?.)")); } diff --git a/tests/corelib_2/regexp/unicode-character-ranges_test.dart b/tests/corelib_2/regexp/unicode-character-ranges_test.dart new file mode 100644 index 00000000000..7154d0fa0bd --- /dev/null +++ b/tests/corelib_2/regexp/unicode-character-ranges_test.dart @@ -0,0 +1,160 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2011 the V8 project authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR ITS CONTRIBUTORS BE LIABLE FOR ANY +// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void execl(List expectation, RegExp re, String subject) { + shouldBe(re.firstMatch(subject), expectation); +} + +void execs(List expectation, String pattern, String subject) { + final re = RegExp(pattern, unicode: true); + shouldBe(re.firstMatch(subject), expectation); +} + +void main() { + // Character ranges. + execs(["A"], r"[A-D]", "A"); + execs(["ABCD"], r"[A-D]+", "ZABCDEF"); + + execs(["\u{12345}"], r"[\u1234-\u{12345}]", "\u{12345}"); + execs(null, r"[^\u1234-\u{12345}]", "\u{12345}"); + + execs(["\u{1234}"], r"[\u1234-\u{12345}]", "\u{1234}"); + execs(null, r"[^\u1234-\u{12345}]", "\u{1234}"); + + execs(null, r"[\u1234-\u{12345}]", "\u{1233}"); + execs(["\u{1233}"], r"[^\u1234-\u{12345}]", "\u{1233}"); + + execs(["\u{12346}"], r"[^\u1234-\u{12345}]", "\u{12346}"); + execs(null, r"[\u1234-\u{12345}]", "\u{12346}"); + + execs(["\u{12342}"], r"[\u{12340}-\u{12345}]", "\u{12342}"); + execs(["\u{12342}"], r"[\ud808\udf40-\ud808\udf45]", "\u{12342}"); + execs(null, r"[^\u{12340}-\u{12345}]", "\u{12342}"); + execs(null, r"[^\ud808\udf40-\ud808\udf45]", "\u{12342}"); + + execs(["\u{ffff}"], r"[\u{ff80}-\u{12345}]", "\u{ffff}"); + execs(["\u{ffff}"], r"[\u{ff80}-\ud808\udf45]", "\u{ffff}"); + execs(null, r"[^\u{ff80}-\u{12345}]", "\u{ffff}"); + execs(null, r"[^\u{ff80}-\ud808\udf45]", "\u{ffff}"); + + // Lone surrogate + execs(["\udc00"], r"[^\u{ff80}-\u{12345}]", "\uff99\u{dc00}A"); + 
execs(["\udc01"], r"[\u0100-\u{10ffff}]", "A\udc01"); + execs(["\udc03"], r"[\udc01-\udc03]", "\ud801\udc02\udc03"); + execs(["\ud801"], r"[\ud801-\ud803]", "\ud802\udc01\ud801"); + + // Paired surrogate. + execs(null, r"[^\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}"); + execs(["\ud800\udc00"], r"[\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}"); + execs(["foo\u{10e6d}bar"], r"foo\ud803\ude6dbar", "foo\u{10e6d}bar"); + + // Lone surrogates + execs(["\ud801\ud801"], r"\ud801+", "\ud801\udc01\ud801\ud801"); + execs(["\udc01\udc01"], r"\udc01+", "\ud801\ud801\udc01\udc01\udc01"); + + execs(["\udc02\udc03A"], r"\W\WA", "\ud801\udc01A\udc02\udc03A"); + execs(["\ud801\ud802"], r"\ud801.", "\ud801\udc01\ud801\ud802"); + execs(["\udc02\udc03A"], r"[\ud800-\udfff][\ud800-\udfff]A", + "\ud801\udc01A\udc02\udc03A"); + + // Character classes + execs(null, r"\w", "\ud801\udc01"); + execl(["\ud801"], RegExp(r"[^\w]"), "\ud801\udc01"); + execs(["\ud801\udc01"], r"[^\w]", "\ud801\udc01"); + execl(["\ud801"], RegExp(r"\W"), "\ud801\udc01"); + execs(["\ud801\udc01"], r"\W", "\ud801\udc01"); + + execs(["\ud800X"], r".X", "\ud800XaX"); + execs(["aX"], r".(? 
RegExp(r"\u", unicode: true)); + assertThrows(() => RegExp(r"\u12", unicode: true)); + assertThrows(() => RegExp(r"\ufoo", unicode: true)); + assertThrows(() => RegExp(r"\x", unicode: true)); + assertThrows(() => RegExp(r"\xfoo", unicode: true)); + assertThrows(() => RegExp(r"\z", unicode: true)); + assertThrows(() => RegExp(r"\8", unicode: true)); + assertThrows(() => RegExp(r"\9", unicode: true)); + + assertThrows(() => RegExp("\\u", unicode: true)); + assertThrows(() => RegExp("\\u12", unicode: true)); + assertThrows(() => RegExp("\\ufoo", unicode: true)); + assertThrows(() => RegExp("\\x", unicode: true)); + assertThrows(() => RegExp("\\xfoo", unicode: true)); + assertThrows(() => RegExp("\\z", unicode: true)); + assertThrows(() => RegExp("\\8", unicode: true)); + assertThrows(() => RegExp("\\9", unicode: true)); +} + +void TestTooBigHexEscape() { + // The hex number inside \u{} has a maximum value. + RegExp(r"\u{10ffff}", unicode: true); + RegExp("\\u{10ffff}", unicode: true); + assertThrows(() => RegExp(r"\u{110000}", unicode: true)); + assertThrows(() => RegExp("\\u{110000}", unicode: true)); + + // Without the u flag, they're of course fine ({x} is the count). + RegExp(r"\u{110000}"); + RegExp("\\u{110000}"); +} + +void TestSyntaxEscapes() { + // Syntax escapes work the same with or without the u flag. + void helper(RegExp r) { + assertTrue(r.hasMatch("foo[bar")); + assertFalse(r.hasMatch("foo]bar")); + } + + helper(RegExp(r"foo\[bar")); + helper(RegExp("foo\\[bar")); + helper(RegExp(r"foo\[bar", unicode: true)); + helper(RegExp("foo\\[bar", unicode: true)); +} + +void TestUnicodeSurrogates() { + // U+10E6D corresponds to the surrogate pair [U+D803, U+DE6D]. 
+ void helper(RegExp r) { + assertTrue(r.hasMatch("foo\u{10e6d}bar")); + } + + helper(RegExp(r"foo\ud803\ude6dbar", unicode: true)); + helper(RegExp("foo\\ud803\\ude6dbar", unicode: true)); +} + +void main() { + TestUnicodeEscapes(); + TestUnicodeEscapesInCharacterClasses(); + TestBraceEscapesWithoutUnicodeFlag(); + TestInvalidEscapes(); + TestTooBigHexEscape(); + TestSyntaxEscapes(); + TestUnicodeSurrogates(); + + // Non-BMP patterns. + // Single character atom. + assertTrue(RegExp("\u{12345}", unicode: true).hasMatch("\u{12345}")); + assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\u{12345}")); + assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\ud808\udf45")); + assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\ud808\udf45")); + assertFalse(RegExp(r"\u{12345}", unicode: true).hasMatch("\udf45")); + assertFalse(RegExp(r"\u{12345}", unicode: true).hasMatch("\udf45")); + + // Multi-character atom. + assertTrue(RegExp(r"\u{12345}\u{23456}", unicode: true) + .hasMatch("a\u{12345}\u{23456}b")); + assertTrue(RegExp(r"\u{12345}\u{23456}", unicode: true) + .hasMatch("b\u{12345}\u{23456}c")); + assertFalse(RegExp(r"\u{12345}\u{23456}", unicode: true) + .hasMatch("a\udf45\u{23456}b")); + assertFalse(RegExp(r"\u{12345}\u{23456}", unicode: true) + .hasMatch("b\udf45\u{23456}c")); + + // Disjunction. + assertTrue(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) + .hasMatch("a\u{12345}\u{23456}b")); + assertTrue(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) + .hasMatch("b\u{12345}\u{23456}c")); + assertFalse(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) + .hasMatch("a\udf45\u{23456}b")); + assertFalse(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) + .hasMatch("b\udf45\u{23456}c")); + + // Alternative. 
+ assertTrue( + RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("a\u{12345}b")); + assertTrue( + RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("b\u{23456}c")); + assertFalse( + RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("a\udf45\ud84db")); + assertFalse( + RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("b\udf45\ud808c")); + + // Capture. + assertTrue(RegExp("(\u{12345}|\u{23456}).\\1", unicode: true) + .hasMatch("\u{12345}b\u{12345}")); + assertTrue(RegExp(r"(\u{12345}|\u{23456}).\1", unicode: true) + .hasMatch("\u{12345}b\u{12345}")); + assertFalse(RegExp("(\u{12345}|\u{23456}).\\1", unicode: true) + .hasMatch("\u{12345}b\u{23456}")); + assertFalse(RegExp(r"(\u{12345}|\u{23456}).\1", unicode: true) + .hasMatch("\u{12345}b\u{23456}")); + + // Quantifier. + assertTrue(RegExp("\u{12345}{3}", unicode: true) + .hasMatch("\u{12345}\u{12345}\u{12345}")); + assertTrue(RegExp(r"\u{12345}{3}", unicode: true) + .hasMatch("\u{12345}\u{12345}\u{12345}")); + assertTrue(RegExp("\u{12345}{3}").hasMatch("\u{12345}\udf45\udf45")); + assertFalse(RegExp(r"\ud808\udf45{3}", unicode: true) + .hasMatch("\u{12345}\udf45\udf45")); + assertTrue(RegExp(r"\ud808\udf45{3}", unicode: true) + .hasMatch("\u{12345}\u{12345}\u{12345}")); + assertFalse( + RegExp("\u{12345}{3}", unicode: true).hasMatch("\u{12345}\udf45\udf45")); + assertFalse( + RegExp(r"\u{12345}{3}", unicode: true).hasMatch("\u{12345}\udf45\udf45")); + + // Literal surrogates. 
+ shouldBe( + RegExp("\ud800\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}"), + ["\u{10000}\u{10000}"]); + shouldBe( + RegExp("\\ud800\\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}"), + ["\u{10000}\u{10000}"]); + + shouldBe( + RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", unicode: true) + .firstMatch("\u{10003}\u{50001}"), + ["\u{10003}\u{50001}"]); + shouldBe( + RegExp("[\ud800\udc03-\u{50001}\]+", unicode: true) + .firstMatch("\u{10003}\u{50001}"), + ["\u{10003}\u{50001}"]); + + // Unicode escape sequences to represent a non-BMP character cannot have + // mixed notation, and must follow the rules for RegExpUnicodeEscapeSequence. + assertThrows(() => RegExp("[\\ud800\udc03-\ud900\\udc01\]+", unicode: true)); + assertNull( + RegExp("\\ud800\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}")); + assertNull( + RegExp("\ud800\\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}")); + + assertNull(RegExp("[\\ud800\udc00]", unicode: true).firstMatch("\u{10000}")); + assertNull( + RegExp("[\\{ud800}\udc00]", unicode: true).firstMatch("\u{10000}")); + assertNull(RegExp("[\ud800\\udc00]", unicode: true).firstMatch("\u{10000}")); + assertNull( + RegExp("[\ud800\\{udc00}]", unicode: true).firstMatch("\u{10000}")); + + assertNull(RegExp(r"\u{d800}\u{dc00}+", unicode: true) + .firstMatch("\ud800\udc00\udc00")); + assertNull(RegExp(r"\ud800\u{dc00}+", unicode: true) + .firstMatch("\ud800\udc00\udc00")); + assertNull(RegExp(r"\u{d800}\udc00+", unicode: true) + .firstMatch("\ud800\udc00\udc00")); +} diff --git a/tests/corelib_2/regexp/unicode-property-binary_test.dart b/tests/corelib_2/regexp/unicode-property-binary_test.dart new file mode 100644 index 00000000000..b8981a7fb0b --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-binary_test.dart @@ -0,0 +1,201 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. 
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + void t(RegExp re, String s) { + assertTrue(re.hasMatch(s)); + } + + void f(RegExp re, String s) { + assertFalse(re.hasMatch(s)); + } + + assertThrows(() => RegExp("\\p{Hiragana}", unicode: true)); + assertThrows(() => RegExp("\\p{Bidi_Class}", unicode: true)); + assertThrows(() => RegExp("\\p{Bidi_C=False}", unicode: true)); + assertThrows(() => RegExp("\\P{Bidi_Control=Y}", unicode: true)); + assertThrows(() => RegExp("\\p{AHex=Yes}", unicode: true)); + + assertThrows(() => RegExp("\\p{Composition_Exclusion}", unicode: true)); + assertThrows(() => RegExp("\\p{CE}", unicode: true)); + assertThrows(() => RegExp("\\p{Full_Composition_Exclusion}", unicode: true)); + assertThrows(() => RegExp("\\p{Comp_Ex}", unicode: true)); + assertThrows(() => RegExp("\\p{Grapheme_Link}", unicode: true)); + assertThrows(() => RegExp("\\p{Gr_Link}", unicode: true)); + assertThrows(() => RegExp("\\p{Hyphen}", unicode: true)); + assertThrows(() => RegExp("\\p{NFD_Inert}", unicode: true)); + assertThrows(() => RegExp("\\p{NFDK_Inert}", unicode: true)); + assertThrows(() => RegExp("\\p{NFC_Inert}", unicode: true)); + assertThrows(() => RegExp("\\p{NFKC_Inert}", unicode: true)); + assertThrows(() => RegExp("\\p{Segment_Starter}", unicode: true)); + + t(RegExp(r"\p{Alphabetic}", unicode: true), "æ"); + f(RegExp(r"\p{Alpha}", unicode: true), "1"); + + t(RegExp(r"\p{ASCII_Hex_Digit}", unicode: true), "f"); + f(RegExp(r"\p{AHex}", unicode: true), "g"); + + t(RegExp(r"\p{Bidi_Control}", unicode: true), "\u200e"); + f(RegExp(r"\p{Bidi_C}", unicode: true), "g"); + + t(RegExp(r"\p{Bidi_Mirrored}", unicode: true), "("); + f(RegExp(r"\p{Bidi_M}", unicode: true), "-"); + + t(RegExp(r"\p{Case_Ignorable}", unicode: true), "\u02b0"); + f(RegExp(r"\p{CI}", unicode: true), "a"); + + t(RegExp(r"\p{Changes_When_Casefolded}", unicode: true), "B"); + f(RegExp(r"\p{CWCF}", unicode: true), "1"); + + 
t(RegExp(r"\p{Changes_When_Casemapped}", unicode: true), "b"); + f(RegExp(r"\p{CWCM}", unicode: true), "1"); + + t(RegExp(r"\p{Changes_When_Lowercased}", unicode: true), "B"); + f(RegExp(r"\p{CWL}", unicode: true), "1"); + + t(RegExp(r"\p{Changes_When_Titlecased}", unicode: true), "b"); + f(RegExp(r"\p{CWT}", unicode: true), "1"); + + t(RegExp(r"\p{Changes_When_Uppercased}", unicode: true), "b"); + f(RegExp(r"\p{CWU}", unicode: true), "1"); + + t(RegExp(r"\p{Dash}", unicode: true), "-"); + f(RegExp(r"\p{Dash}", unicode: true), "1"); + + t(RegExp(r"\p{Default_Ignorable_Code_Point}", unicode: true), "\u00ad"); + f(RegExp(r"\p{DI}", unicode: true), "1"); + + t(RegExp(r"\p{Deprecated}", unicode: true), "\u17a3"); + f(RegExp(r"\p{Dep}", unicode: true), "1"); + + t(RegExp(r"\p{Diacritic}", unicode: true), "\u0301"); + f(RegExp(r"\p{Dia}", unicode: true), "1"); + + t(RegExp(r"\p{Emoji}", unicode: true), "\u2603"); + f(RegExp(r"\p{Emoji}", unicode: true), "x"); + + t(RegExp(r"\p{Emoji_Component}", unicode: true), "\u{1F1E6}"); + f(RegExp(r"\p{Emoji_Component}", unicode: true), "x"); + + t(RegExp(r"\p{Emoji_Modifier_Base}", unicode: true), "\u{1F6CC}"); + f(RegExp(r"\p{Emoji_Modifier_Base}", unicode: true), "x"); + + t(RegExp(r"\p{Emoji_Modifier}", unicode: true), "\u{1F3FE}"); + f(RegExp(r"\p{Emoji_Modifier}", unicode: true), "x"); + + t(RegExp(r"\p{Emoji_Presentation}", unicode: true), "\u{1F308}"); + f(RegExp(r"\p{Emoji_Presentation}", unicode: true), "x"); + + t(RegExp(r"\p{Extender}", unicode: true), "\u3005"); + f(RegExp(r"\p{Ext}", unicode: true), "x"); + + t(RegExp(r"\p{Grapheme_Base}", unicode: true), " "); + f(RegExp(r"\p{Gr_Base}", unicode: true), "\u0010"); + + t(RegExp(r"\p{Grapheme_Extend}", unicode: true), "\u0300"); + f(RegExp(r"\p{Gr_Ext}", unicode: true), "x"); + + t(RegExp(r"\p{Hex_Digit}", unicode: true), "a"); + f(RegExp(r"\p{Hex}", unicode: true), "g"); + + t(RegExp(r"\p{ID_Continue}", unicode: true), "1"); + f(RegExp(r"\p{IDC}", unicode: true), "."); 
+ + t(RegExp(r"\p{ID_Start}", unicode: true), "a"); + f(RegExp(r"\p{IDS}", unicode: true), "1"); + + t(RegExp(r"\p{Ideographic}", unicode: true), "漢"); + f(RegExp(r"\p{Ideo}", unicode: true), "H"); + + t(RegExp(r"\p{IDS_Binary_Operator}", unicode: true), "\u2FF0"); + f(RegExp(r"\p{IDSB}", unicode: true), "a"); + + t(RegExp(r"\p{IDS_Trinary_Operator}", unicode: true), "\u2FF2"); + f(RegExp(r"\p{IDST}", unicode: true), "a"); + + t(RegExp(r"\p{Join_Control}", unicode: true), "\u200c"); + f(RegExp(r"\p{Join_C}", unicode: true), "a"); + + t(RegExp(r"\p{Logical_Order_Exception}", unicode: true), "\u0e40"); + f(RegExp(r"\p{LOE}", unicode: true), "a"); + + t(RegExp(r"\p{Lowercase}", unicode: true), "a"); + f(RegExp(r"\p{Lower}", unicode: true), "A"); + + t(RegExp(r"\p{Math}", unicode: true), "="); + f(RegExp(r"\p{Math}", unicode: true), "A"); + + t(RegExp(r"\p{Noncharacter_Code_Point}", unicode: true), "\uFDD0"); + f(RegExp(r"\p{NChar}", unicode: true), "A"); + + t(RegExp(r"\p{Pattern_Syntax}", unicode: true), "\u0021"); + f(RegExp(r"\p{NChar}", unicode: true), "A"); + + t(RegExp(r"\p{Pattern_White_Space}", unicode: true), "\u0009"); + f(RegExp(r"\p{Pat_Syn}", unicode: true), "A"); + + t(RegExp(r"\p{Quotation_Mark}", unicode: true), "'"); + f(RegExp(r"\p{QMark}", unicode: true), "A"); + + t(RegExp(r"\p{Radical}", unicode: true), "\u2FAD"); + f(RegExp(r"\p{Radical}", unicode: true), "A"); + + t(RegExp(r"\p{Regional_Indicator}", unicode: true), "\u{1F1E6}"); + f(RegExp(r"\p{Regional_Indicator}", unicode: true), "A"); + + t(RegExp(r"\p{Sentence_Terminal}", unicode: true), "!"); + f(RegExp(r"\p{STerm}", unicode: true), "A"); + + t(RegExp(r"\p{Soft_Dotted}", unicode: true), "i"); + f(RegExp(r"\p{SD}", unicode: true), "A"); + + t(RegExp(r"\p{Terminal_Punctuation}", unicode: true), "."); + f(RegExp(r"\p{Term}", unicode: true), "A"); + + t(RegExp(r"\p{Unified_Ideograph}", unicode: true), "\u4e00"); + f(RegExp(r"\p{UIdeo}", unicode: true), "A"); + + t(RegExp(r"\p{Uppercase}", 
unicode: true), "A"); + f(RegExp(r"\p{Upper}", unicode: true), "a"); + + t(RegExp(r"\p{Variation_Selector}", unicode: true), "\uFE00"); + f(RegExp(r"\p{VS}", unicode: true), "A"); + + t(RegExp(r"\p{White_Space}", unicode: true), " "); + f(RegExp(r"\p{WSpace}", unicode: true), "A"); + + t(RegExp(r"\p{XID_Continue}", unicode: true), "1"); + f(RegExp(r"\p{XIDC}", unicode: true), " "); + + t(RegExp(r"\p{XID_Start}", unicode: true), "A"); + f(RegExp(r"\p{XIDS}", unicode: true), " "); +} diff --git a/tests/corelib_2/regexp/unicode-property-char-class_test.dart b/tests/corelib_2/regexp/unicode-property-char-class_test.dart new file mode 100644 index 00000000000..db87fada0ad --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-char-class_test.dart @@ -0,0 +1,56 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2011 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Exercises \p{...} / \P{...} property escapes used inside character classes. +void main() { + assertThrows(() => RegExp("[\\p]", unicode: true)); + assertThrows(() => RegExp("[\\p{garbage}]", unicode: true)); + assertThrows(() => RegExp("[\\p{}]", unicode: true)); + assertThrows(() => RegExp("[\\p{]", unicode: true)); + assertThrows(() => RegExp("[\\p}]", unicode: true)); + assertThrows(() => RegExp("^[\\p{Lu}-\\p{Ll}]+\$", unicode: true)); + // Well-formed escapes (\p and negated \P) as members of a character class. + assertTrue(RegExp(r"^[\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABCabc")); + assertTrue(RegExp(r"^[\p{Lu}-]+$", unicode: true).hasMatch("ABC-")); + assertFalse(RegExp(r"^[\P{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABCabc")); + assertTrue(RegExp(r"^[\P{Lu}\p{Ll}]+$", unicode: true).hasMatch("abc")); + assertTrue(RegExp(r"^[\P{Lu}]+$", unicode: true).hasMatch("abc123")); + assertFalse(RegExp(r"^[\P{Lu}]+$", unicode: true).hasMatch("XYZ")); + assertTrue(RegExp(r"[\p{Math}]", unicode: true).hasMatch("+")); + assertTrue(RegExp(r"[\P{Bidi_M}]", unicode: true).hasMatch(" ")); + assertTrue(RegExp(r"[\p{Hex}]", unicode: true).hasMatch("A")); + // Negated character classes ([^...]) that contain property escapes. + assertTrue(RegExp(r"^[^\P{Lu}]+$", unicode: true).hasMatch("XYZ")); + assertFalse(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("abc")); + assertFalse(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABC")); + assertTrue(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("123")); + assertTrue(RegExp(r"^[^\p{Lu}\P{Ll}]+$", unicode:
true).hasMatch("abc")); +} diff --git a/tests/corelib_2/regexp/unicode-property-enumerated_test.dart b/tests/corelib_2/regexp/unicode-property-enumerated_test.dart new file mode 100644 index 00000000000..cfb3e029e15 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-enumerated_test.dart @@ -0,0 +1,55 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Of the name=value forms tried here, only General_Category / gc must parse. +void main() { + assertThrows(() => RegExp("\\p{Bidi_Class=L}+", unicode: true)); + assertThrows(() => RegExp("\\p{bc=Left_To_Right}+", unicode: true)); + assertThrows(() => RegExp("\\p{bc=AL}+", unicode: true)); + assertThrows(() => RegExp("\\p{bc=Arabic_Letter}+", unicode: true)); + + assertThrows(() => RegExp("\\p{Line_Break=Glue}", unicode: true)); + assertThrows(() => RegExp("\\p{lb=AL}", unicode: true)); + // Missing name and/or value around '=', or more than one '='. + assertThrows(() => RegExp("\\p{Block=}", unicode: true)); + assertThrows(() => RegExp("\\p{=}", unicode: true)); + assertThrows(() => RegExp("\\p{=L}", unicode: true)); + assertThrows(() => RegExp("\\p{=Hiragana}", unicode: true)); + assertThrows(() => RegExp("\\p{Block=CJK=}", unicode: true)); + + assertThrows(() => RegExp("\\p{Age=V8_0}", unicode: true)); + assertDoesNotThrow( + () => RegExp("\\p{General_Category=Letter}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{gc=L}", unicode: true)); + assertThrows( + () => RegExp("\\p{General_Category_Mask=Letter}", unicode: true)); + assertThrows(() => RegExp("\\p{gcm=L}", unicode: true)); +} diff --git a/tests/corelib_2/regexp/unicode-property-exact-match_test.dart b/tests/corelib_2/regexp/unicode-property-exact-match_test.dart new file mode 100644 index 00000000000..bbf99d0ab76 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-exact-match_test.dart @@ -0,0 +1,71 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Names and values are matched exactly; loosely spelled variants must throw. +void main() { + assertThrows(() => RegExp("\\p{In CJK}", unicode: true)); + assertThrows(() => RegExp("\\p{InCJKUnifiedIdeographs}", unicode: true)); + assertThrows(() => RegExp("\\p{InCJK}", unicode: true)); + assertThrows(() => RegExp("\\p{InCJK_Unified_Ideographs}", unicode: true)); + + assertThrows(() => RegExp("\\p{InCyrillic_Sup}", unicode: true)); + assertThrows(() => RegExp("\\p{InCyrillic_Supplement}", unicode: true)); + assertThrows(() => RegExp("\\p{InCyrillic_Supplementary}", unicode: true)); + assertThrows(() => RegExp("\\p{InCyrillicSupplementary}", unicode: true)); + assertThrows(() => RegExp("\\p{InCyrillic_supplementary}", unicode: true)); + // Bare General_Category values and aliases parse; a name with a space does not. + assertDoesNotThrow(() => RegExp("\\p{C}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{Other}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{Cc}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{Control}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{cntrl}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{M}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{Mark}", unicode: true)); + assertDoesNotThrow(() => RegExp("\\p{Combining_Mark}", unicode: true)); + assertThrows(() => RegExp("\\p{Combining Mark}", unicode: true)); + // Script values are accepted only in Script=... form, not as bare names. + assertDoesNotThrow(() => RegExp("\\p{Script=Copt}", unicode: true)); + assertThrows(() => RegExp("\\p{Coptic}", unicode: true)); + assertThrows(() => RegExp("\\p{Qaac}", unicode: true)); + assertThrows(() => RegExp("\\p{Egyp}", unicode: true)); + assertDoesNotThrow( + () => RegExp("\\p{Script=Egyptian_Hieroglyphs}", unicode: true)); + assertThrows(() => RegExp("\\p{EgyptianHieroglyphs}", unicode: true)); + + assertThrows(() => RegExp("\\p{BidiClass=LeftToRight}", unicode: true)); + assertThrows(() => RegExp("\\p{BidiC=LeftToRight}", unicode: true)); + assertThrows(() => RegExp("\\p{bidi_c=Left_To_Right}", unicode: true)); + + assertThrows(() =>
RegExp("\\p{Block=CJK}", unicode: true)); + assertThrows(() => RegExp("\\p{Block = CJK}", unicode: true)); + assertThrows(() => RegExp("\\p{Block=cjk}", unicode: true)); + assertThrows(() => RegExp("\\p{BLK=CJK}", unicode: true)); +} diff --git a/tests/corelib_2/regexp/unicode-property-general-category_test.dart b/tests/corelib_2/regexp/unicode-property-general-category_test.dart new file mode 100644 index 00000000000..cb94cacd43e --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-general-category_test.dart @@ -0,0 +1,114 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// General_Category escapes: malformed forms throw, valid ones match as listed. +void main() { + assertThrows(() => RegExp("\\p", unicode: true)); + assertThrows(() => RegExp("\\p{garbage}", unicode: true)); + assertThrows(() => RegExp("\\p{}", unicode: true)); + assertThrows(() => RegExp("\\p{", unicode: true)); + assertThrows(() => RegExp("\\p}", unicode: true)); + assertThrows(() => RegExp("\\pL", unicode: true)); + assertThrows(() => RegExp("\\P", unicode: true)); + assertThrows(() => RegExp("\\P{garbage}", unicode: true)); + assertThrows(() => RegExp("\\P{}", unicode: true)); + assertThrows(() => RegExp("\\P{", unicode: true)); + assertThrows(() => RegExp("\\P}", unicode: true)); + assertThrows(() => RegExp("\\PL", unicode: true)); + // Ll, case-sensitive, with both BMP and supplementary-plane subjects. + assertTrue(RegExp(r"\p{Ll}", unicode: true).hasMatch("a")); + assertFalse(RegExp(r"\P{Ll}", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\P{Ll}", unicode: true).hasMatch("A")); + assertFalse(RegExp(r"\p{Ll}", unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\p{Ll}", unicode: true).hasMatch("\u{1D7BE}")); + assertFalse(RegExp(r"\P{Ll}", unicode: true).hasMatch("\u{1D7BE}")); + assertFalse(RegExp(r"\p{Ll}", unicode: true).hasMatch("\u{1D5E3}")); + assertTrue(RegExp(r"\P{Ll}", unicode: true).hasMatch("\u{1D5E3}")); + // With caseSensitive: false every Ll case below is expected to match. + assertTrue( + RegExp(r"\p{Ll}", caseSensitive: false, unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\p{Ll}", caseSensitive: false, unicode: true) +
.hasMatch("\u{118D4}")); + assertTrue( + RegExp(r"\p{Ll}", caseSensitive: false, unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\p{Ll}", caseSensitive: false, unicode: true) + .hasMatch("\u{118B4}")); + assertTrue( + RegExp(r"\P{Ll}", caseSensitive: false, unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\P{Ll}", caseSensitive: false, unicode: true) + .hasMatch("\u{118D4}")); + assertTrue( + RegExp(r"\P{Ll}", caseSensitive: false, unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\P{Ll}", caseSensitive: false, unicode: true) + .hasMatch("\u{118B4}")); + // Lu, case-sensitive: mirror image of the Ll group above. + assertTrue(RegExp(r"\p{Lu}", unicode: true).hasMatch("A")); + assertFalse(RegExp(r"\P{Lu}", unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\P{Lu}", unicode: true).hasMatch("a")); + assertFalse(RegExp(r"\p{Lu}", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\p{Lu}", unicode: true).hasMatch("\u{1D5E3}")); + assertFalse(RegExp(r"\P{Lu}", unicode: true).hasMatch("\u{1D5E3}")); + assertFalse(RegExp(r"\p{Lu}", unicode: true).hasMatch("\u{1D7BE}")); + assertTrue(RegExp(r"\P{Lu}", unicode: true).hasMatch("\u{1D7BE}")); + // With caseSensitive: false every Lu case below is expected to match. + assertTrue( + RegExp(r"\p{Lu}", caseSensitive: false, unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\p{Lu}", caseSensitive: false, unicode: true) + .hasMatch("\u{118D4}")); + assertTrue( + RegExp(r"\p{Lu}", caseSensitive: false, unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\p{Lu}", caseSensitive: false, unicode: true) + .hasMatch("\u{118B4}")); + assertTrue( + RegExp(r"\P{Lu}", caseSensitive: false, unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\P{Lu}", caseSensitive: false, unicode: true) + .hasMatch("\u{118D4}")); + assertTrue( + RegExp(r"\P{Lu}", caseSensitive: false, unicode: true).hasMatch("A")); + assertTrue(RegExp(r"\P{Lu}", caseSensitive: false, unicode: true) + .hasMatch("\u{118B4}")); + + assertTrue(RegExp(r"\p{Sm}", unicode: true).hasMatch("+")); + assertFalse(RegExp(r"\P{Sm}", unicode: true).hasMatch("+")); + assertTrue(RegExp(r"\p{Sm}",
unicode: true).hasMatch("\u{1D6C1}")); + assertFalse(RegExp(r"\P{Sm}", unicode: true).hasMatch("\u{1D6C1}")); + + assertFalse(RegExp(r"\p{L}", unicode: true).hasMatch("\uA6EE")); + assertTrue(RegExp(r"\P{L}", unicode: true).hasMatch("\uA6EE")); + // Long value names, then the explicit gc= / General_Category= forms. + assertTrue(RegExp(r"\p{Lowercase_Letter}", unicode: true).hasMatch("a")); + assertTrue(RegExp(r"\p{Math_Symbol}", unicode: true).hasMatch("+")); + + assertTrue(RegExp(r"\p{gc=Ll}", unicode: true).hasMatch("a")); + assertTrue( + RegExp(r"\p{General_Category=Math_Symbol}", unicode: true).hasMatch("+")); + assertTrue(RegExp(r"\p{General_Category=L}", unicode: true).hasMatch("X")); +} diff --git a/tests/corelib_2/regexp/unicode-property-invalid_test.dart b/tests/corelib_2/regexp/unicode-property-invalid_test.dart new file mode 100644 index 00000000000..8dddbc1d5a3 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-invalid_test.dart @@ -0,0 +1,68 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Each unsupported escape must throw; "\\p" (not "\p") so the parser sees \p. +void main() { + assertThrows(() => RegExp("\\p{Block=ASCII}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=ASCII}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=Basic_Latin}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=Basic_Latin}+", unicode: true)); + + assertThrows(() => RegExp("\\p{blk=CJK}+", unicode: true)); + assertThrows(() => RegExp("\\p{blk=CJK_Unified_Ideographs}+", unicode: true)); + assertThrows(() => RegExp("\\p{blk=CJK}+", unicode: true)); + assertThrows(() => RegExp("\\p{blk=CJK_Unified_Ideographs}+", unicode: true)); + + assertThrows(() => RegExp("\\p{Block=ASCII}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=ASCII}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=Basic_Latin}+", unicode: true)); + assertThrows(() => RegExp("\\p{Block=Basic_Latin}+", unicode: true)); + + assertThrows(() => RegExp("\\p{NFKD_Quick_Check=Y}+", unicode: true)); + assertThrows(() => RegExp("\\p{NFKD_QC=Yes}+", unicode: true)); + + assertThrows(() => RegExp("\\p{Numeric_Type=Decimal}+", unicode: true)); + assertThrows(() => RegExp("\\p{nt=De}+", unicode: true)); +
+ assertThrows(() => RegExp("\\p{Bidi_Class=Arabic_Letter}+", unicode: true)); + assertThrows(() => RegExp("\\p{Bidi_Class=AN}+", unicode: true)); + + assertThrows(() => RegExp("\\p{ccc=OV}+", unicode: true)); + + assertThrows(() => RegExp("\\p{Sentence_Break=Format}+", unicode: true)); + + assertThrows(() => RegExp("\\p{In}", unicode: true)); + assertThrows(() => RegExp("\\pI", unicode: true)); + assertThrows(() => RegExp("\\p{I}", unicode: true)); + assertThrows(() => RegExp("\\p{CJK}", unicode: true)); + + assertThrows(() => RegExp("\\p{}", unicode: true)); +} diff --git a/tests/corelib_2/regexp/unicode-property-scripts_test.dart b/tests/corelib_2/regexp/unicode-property-scripts_test.dart new file mode 100644 index 00000000000..5adec29ae97 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-scripts_test.dart @@ -0,0 +1,77 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Script escapes (\p{Script=...} / \p{sc=...}) with long and short value names. +void main() { + void t(RegExp re, String s) { + assertTrue(re.hasMatch(s)); + } + // f asserts that re has no match in s; t above asserts that it has one. + void f(RegExp re, String s) { + assertFalse(re.hasMatch(s)); + } + + t(RegExp(r"\p{Script=Common}+", unicode: true), "."); + f(RegExp(r"\p{Script=Common}+", unicode: true), + "supercalifragilisticexpialidocious"); + + t(RegExp(r"\p{Script=Han}+", unicode: true), "话说天下大势,分久必合,合久必分"); + t(RegExp(r"\p{Script=Hani}+", unicode: true), "吾庄后有一桃园,花开正盛"); + f(RegExp(r"\p{Script=Han}+", unicode: true), "おはようございます"); + f(RegExp(r"\p{Script=Hani}+", unicode: true), + "Something is rotten in the state of Denmark"); + + t(RegExp(r"\p{Script=Latin}+", unicode: true), + "Wie froh bin ich, daß ich weg bin!"); + t(RegExp(r"\p{Script=Latn}+", unicode: true), + "It was a bright day in April, and the clocks were striking thirteen"); + f(RegExp(r"\p{Script=Latin}+", unicode: true), "奔腾千里荡尘埃,渡水登山紫雾开"); + f(RegExp(r"\p{Script=Latn}+", unicode: true), "いただきます"); + // From here on the short property name sc= is used instead of Script=. + t(RegExp(r"\p{sc=Hiragana}", unicode: true), "いただきます"); + t(RegExp(r"\p{sc=Hira}", unicode: true), "ありがとうございました"); + f(RegExp(r"\p{sc=Hiragana}", unicode: true), + "Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte"); + f(RegExp(r"\p{sc=Hira}", unicode: true), "Call me Ishmael"); + + t(RegExp(r"\p{sc=Phoenician}", unicode: true), "\u{10900}\u{1091a}"); + t(RegExp(r"\p{sc=Phnx}", unicode: true), "\u{1091f}\u{10916}"); +
f(RegExp(r"\p{sc=Phoenician}", unicode: true), "Arthur est un perroquet"); + f(RegExp(r"\p{sc=Phnx}", unicode: true), "设心狠毒非良士,操卓原来一路人"); + + t(RegExp(r"\p{sc=Grek}", unicode: true), + "ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ"); + t(RegExp(r"\p{sc=Greek}", unicode: true), + "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος"); + f(RegExp(r"\p{sc=Greek}", unicode: true), "高贤未服英雄志,屈节偏生杰士疑"); + f(RegExp(r"\p{sc=Greek}", unicode: true), + "Mr. Jones, of the Manor Farm, had locked the hen-houses for the night"); +} diff --git a/tests/corelib_2/regexp/unicode-property-special_test.dart b/tests/corelib_2/regexp/unicode-property-special_test.dart new file mode 100644 index 00000000000..1e0d2c18a33 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-property-special_test.dart @@ -0,0 +1,110 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Special properties ASCII, Any and Assigned; contributory ones must throw. +void main() { + void t(RegExp re, String s) { + assertTrue(re.hasMatch(s)); + } + // f asserts that re has no match in s; t above asserts that it has one. + void f(RegExp re, String s) { + assertFalse(re.hasMatch(s)); + } + + t(RegExp(r"\p{ASCII}+", unicode: true), "abc123"); + f(RegExp(r"\p{ASCII}+", unicode: true), "ⓐⓑⓒ①②③"); + f(RegExp(r"\p{ASCII}+", unicode: true), "🄰🄱🄲①②③"); + f(RegExp(r"\P{ASCII}+", unicode: true), "abcd123"); + t(RegExp(r"\P{ASCII}+", unicode: true), "ⓐⓑⓒ①②③"); + t(RegExp(r"\P{ASCII}+", unicode: true), "🄰🄱🄲①②③"); + // The same ASCII checks, with the escape placed inside a character class. + f(RegExp(r"[^\p{ASCII}]+", unicode: true), "abc123"); + f(RegExp(r"[\p{ASCII}]+", unicode: true), "ⓐⓑⓒ①②③"); + f(RegExp(r"[\p{ASCII}]+", unicode: true), "🄰🄱🄲①②③"); + t(RegExp(r"[^\P{ASCII}]+", unicode: true), "abcd123"); + t(RegExp(r"[\P{ASCII}]+", unicode: true), "ⓐⓑⓒ①②③"); + f(RegExp(r"[^\P{ASCII}]+", unicode: true), "🄰🄱🄲①②③"); + + t(RegExp(r"\p{Any}+", unicode: true), "🄰🄱🄲①②③"); + // \p{Any} against lone surrogates and a surrogate pair. + shouldBe( + RegExp(r"\p{Any}", unicode: true).firstMatch("\ud800\ud801"), ["\ud800"]); + shouldBe( + RegExp(r"\p{Any}", unicode: true).firstMatch("\udc00\udc01"), ["\udc00"]); + shouldBe(RegExp(r"\p{Any}", unicode: true).firstMatch("\ud800\udc01"), + ["\ud800\udc01"]); + shouldBe(RegExp(r"\p{Any}", unicode: true).firstMatch("\udc01"), ["\udc01"]); + + f(RegExp(r"\P{Any}+", unicode: true), "123"); + f(RegExp(r"[\P{Any}]+", unicode: true), "123"); + t(RegExp(r"[\P{Any}\d]+", unicode: true), "123"); +
t(RegExp(r"[^\P{Any}]+", unicode: true), "123"); + + t(RegExp(r"\p{Assigned}+", unicode: true), "123"); + t(RegExp(r"\p{Assigned}+", unicode: true), "🄰🄱🄲"); + f(RegExp(r"\p{Assigned}+", unicode: true), "\ufdd0"); + f(RegExp(r"\p{Assigned}+", unicode: true), "\u{fffff}"); + + f(RegExp(r"\P{Assigned}+", unicode: true), "123"); + f(RegExp(r"\P{Assigned}+", unicode: true), "🄰🄱🄲"); + t(RegExp(r"\P{Assigned}+", unicode: true), "\ufdd0"); + t(RegExp(r"\P{Assigned}+", unicode: true), "\u{fffff}"); + f(RegExp(r"\P{Assigned}", unicode: true), ""); + + t(RegExp(r"[^\P{Assigned}]+", unicode: true), "123"); + f(RegExp(r"[\P{Assigned}]+", unicode: true), "🄰🄱🄲"); + f(RegExp(r"[^\P{Assigned}]+", unicode: true), "\ufdd0"); + t(RegExp(r"[\P{Assigned}]+", unicode: true), "\u{fffff}"); + f(RegExp(r"[\P{Assigned}]", unicode: true), ""); + // Property escapes mixed with ordinary class members. + f(RegExp(r"[^\u1234\p{ASCII}]+", unicode: true), "\u1234"); + t(RegExp(r"[x\P{ASCII}]+", unicode: true), "x"); + t(RegExp(r"[\u1234\p{ASCII}]+", unicode: true), "\u1234"); + +// Contributory binary properties are not supported.
+ assertThrows(() => RegExp("\\p{Other_Alphabetic}", unicode: true)); + assertThrows(() => RegExp("\\P{OAlpha}", unicode: true)); + assertThrows( + () => RegExp("\\p{Other_Default_Ignorable_Code_Point}", unicode: true)); + assertThrows(() => RegExp("\\P{ODI}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_Grapheme_Extend}", unicode: true)); + assertThrows(() => RegExp("\\P{OGr_Ext}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_ID_Continue}", unicode: true)); + assertThrows(() => RegExp("\\P{OIDC}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_ID_Start}", unicode: true)); + assertThrows(() => RegExp("\\P{OIDS}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_Lowercase}", unicode: true)); + assertThrows(() => RegExp("\\P{OLower}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_Math}", unicode: true)); + assertThrows(() => RegExp("\\P{OMath}", unicode: true)); + assertThrows(() => RegExp("\\p{Other_Uppercase}", unicode: true)); + assertThrows(() => RegExp("\\P{OUpper}", unicode: true)); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart b/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart new file mode 100644 index 00000000000..2fcb7c1d580 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart @@ -0,0 +1,81 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution.
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; +// Expands test shorthand: L -> \ud800, l -> \ud801, T -> \udc00, "." -> "[^]". +String replace(String string) { + return string + .replaceAll("L", "\ud800") + .replaceAll("l", "\ud801") + .replaceAll("T", "\udc00") + .replaceAll(".", "[^]"); +} +// Expands all three arguments via replace(), then checks the first unicode match. +void test(List expectation, String regexp_source, String subject) { + if (expectation != null) expectation = expectation.map(replace).toList(); + subject = replace(subject); + regexp_source = replace(regexp_source); + shouldBe(new RegExp(regexp_source, unicode: true).firstMatch(subject), + expectation); +} +// A null expectation is passed where no match is expected (NOTE: confirm in shouldBe). +void main() { + // Back reference does not end in the middle of a surrogate pair. + test(null, "(L)\\1", "LLT"); + test(["LLTLl", "L", "l"], "(L).*\\1(.)", "LLTLl"); + test(null, "(aL).*\\1", "aLaLT"); + test(["aLaLTaLl", "aL", "l"], "(aL).*\\1(.)", "aLaLTaLl"); + + var s = "TabcLxLTabcLxTabcLTyTabcLz"; + test([s, "TabcL", "z"], "([^x]+).*\\1(.)", s); + + // Back reference does not start in the middle of a surrogate pair.
+ test(["TLTabTc", "T", "c"], "(T).*\\1(.)", "TLTabTc"); + + // Lookbehinds. + test(null, "(?<=\\1(T)x)", "LTTx"); + test(["", "b", "T"], "(?<=(.)\\2.*(T)x)", "bTaLTTx"); + test(null, "(?<=\\1.*(L)x)", "LTLx"); + test(["", "b", "L"], "(?<=(.)\\2.*(L)x)", "bLaLTLx"); + // One pattern against subjects that differ in where L and T are placed. + test(null, "([^x]+)x*\\1", "LxLT"); + test(null, "([^x]+)x*\\1", "TxLT"); + test(null, "([^x]+)x*\\1", "LTxL"); + test(null, "([^x]+)x*\\1", "LTxT"); + test(null, "([^x]+)x*\\1", "xLxLT"); + test(null, "([^x]+)x*\\1", "xTxLT"); + test(null, "([^x]+)x*\\1", "xLTxL"); + test(null, "([^x]+)x*\\1", "xLTxT"); + test(null, "([^x]+)x*\\1", "xxxLxxLTxx"); + test(null, "([^x]+)x*\\1", "xxxTxxLTxx"); + test(null, "([^x]+)x*\\1", "xxxLTxxLxx"); + test(null, "([^x]+)x*\\1", "xxxLTxxTxx"); + test(["LTTxxLTT", "LTT"], "([^x]+)x*\\1", "xxxLTTxxLTTxx"); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart b/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart new file mode 100644 index 00000000000..c87047c5c06 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart @@ -0,0 +1,135 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission.
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + // Non-unicode patterns use toUpperCase mappings, so U+00E5 (which upper-cases to U+00C5) does not match U+212B. + assertFalse(RegExp(r"[\u00e5]", caseSensitive: false).hasMatch("\u212b")); + assertFalse( + RegExp(r"[\u212b]", caseSensitive: false).hasMatch("\u00e5\u1234")); + assertFalse(RegExp(r"[\u212b]", caseSensitive: false).hasMatch("\u00e5")); + + assertTrue("\u212b".toLowerCase() == "\u00e5"); + assertTrue("\u00c5".toLowerCase() == "\u00e5"); + assertTrue("\u00e5".toUpperCase() == "\u00c5"); + + // Unicode patterns use case folding mappings, so U+00E5, U+00C5 and U+212B all match one another.
+ assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) + .hasMatch("\u212b")); + assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) + .hasMatch("\u00c5")); + assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) + .hasMatch("\u00e5")); + assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) + .hasMatch("\u212b")); + assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) + .hasMatch("\u00e5")); + assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) + .hasMatch("\u212b")); + assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) + .hasMatch("\u00c5")); + assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) + .hasMatch("\u00c5")); + assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) + .hasMatch("\u00e5")); + assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) + .hasMatch("\u212b")); + + // Non-BMP: U+10400 and U+10428 match each other only when the unicode flag is set. + assertFalse(RegExp(r"\u{10400}", caseSensitive: false).hasMatch("\u{10428}")); + assertTrue(RegExp(r"\u{10400}", caseSensitive: false, unicode: true) + .hasMatch("\u{10428}")); + assertTrue(RegExp(r"\ud801\udc00", caseSensitive: false, unicode: true) + .hasMatch("\u{10428}")); + assertTrue(RegExp(r"[\u{10428}]", caseSensitive: false, unicode: true) + .hasMatch("\u{10400}")); + assertTrue(RegExp(r"[\ud801\udc28]", caseSensitive: false, unicode: true) + .hasMatch("\u{10400}")); + shouldBe( + RegExp(r"[\uff40-\u{10428}]+", caseSensitive: false, unicode: true) + .firstMatch("\uff21\u{10400}abc"), + ["\uff21\u{10400}"]); + shouldBe( + RegExp(r"[^\uff40-\u{10428}]+", caseSensitive: false, unicode: true) + .firstMatch("\uff21\u{10400}abc\uff23"), + ["abc"]); + shouldBe( + RegExp(r"[\u24d5-\uff33]+", caseSensitive: false, unicode: true) + .firstMatch("\uff54\uff53\u24bb\u24ba"), + ["\uff53\u24bb"]); + + // Full (multi-character) case mappings are ignored: U+00DF does not match "SS".
+ assertFalse( + RegExp(r"\u00df", caseSensitive: false, unicode: true).hasMatch("SS")); + assertFalse(RegExp(r"\u1f8d", caseSensitive: false, unicode: true) + .hasMatch("\u1f05\u03b9")); + + // Simple mappings work. + assertTrue(RegExp(r"\u1f8d", caseSensitive: false, unicode: true) + .hasMatch("\u1f85")); + + // Common mappings work. + assertTrue(RegExp(r"\u1f6b", caseSensitive: false, unicode: true) + .hasMatch("\u1f63")); + + // Back references are compared with the same case folding as literals. + shouldBe( + RegExp(r"(.)\1\1", caseSensitive: false, unicode: true) + .firstMatch("\u00e5\u212b\u00c5"), + ["\u00e5\u212b\u00c5", "\u00e5"]); + shouldBe( + RegExp(r"(.)\1", caseSensitive: false, unicode: true) + .firstMatch("\u{118aa}\u{118ca}"), + ["\u{118aa}\u{118ca}", "\u{118aa}"]); + + // Misc. + assertTrue(RegExp(r"\u00e5\u00e5\u00e5", caseSensitive: false, unicode: true) + .hasMatch("\u212b\u00e5\u00c5")); + assertTrue(RegExp(r"AB\u{10400}", caseSensitive: false, unicode: true) + .hasMatch("ab\u{10428}")); + + // Non-Latin1 maps to Latin1: U+017F in the pattern matches "s". + shouldBe( + RegExp(r"^\u017F", caseSensitive: false, unicode: true).firstMatch("s"), + ["s"]); + shouldBe( + RegExp(r"^\u017F", caseSensitive: false, unicode: true) + .firstMatch("s\u1234"), + ["s"]); + shouldBe( + RegExp(r"^a[\u017F]", caseSensitive: false, unicode: true) + .firstMatch("as"), + ["as"]); + shouldBe( + RegExp(r"^a[\u017F]", caseSensitive: false, unicode: true) + .firstMatch("as\u1234"), + ["as"]); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart b/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart new file mode 100644 index 00000000000..3e696584ac1 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart @@ -0,0 +1,155 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + // Testing handling of paired and non-paired surrogates in unicode mode: a start index that falls inside a surrogate pair still matches the whole pair. + var r = new RegExp(r".", unicode: true); + + var m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800\udc00"]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); + shouldBe(m, ["\ud800\udc00"]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); + shouldBe(m, ["\ud801\udc01"]); + assertEquals(m.end, 4); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); + shouldBe(m, ["\ud801\udc01"]); + assertEquals(m.end, 4); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); + + m = r.matchAsPrefix("\ud800\udc00\ud801\ud802", 3); + shouldBe(m, ["\ud802"]); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\ud802", 4)); + + // Testing handling of paired and non-paired surrogates in non-unicode mode: every code unit is matched on its own. + r = new RegExp(r"."); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800"]); + assertEquals(m.end, 1); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); + shouldBe(m, ["\udc00"]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); + shouldBe(m, ["\ud801"]); + assertEquals(m.end, 3); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); + shouldBe(m, ["\udc01"]); + assertEquals(m.end, 4); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); + + // Testing same with start anchor, unicode mode. Only start indices 0 and 1 (inside the first pair) match.
+ r = new RegExp("^.", unicode: true); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800\udc00"]); + assertEquals(2, m.end); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); + shouldBe(m, ["\ud800\udc00"]); + assertEquals(2, m.end); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2)); + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3)); + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); + + // Testing same with start anchor, non-unicode mode. Only start index 0 matches. + r = new RegExp("^."); + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800"]); + assertEquals(1, m.end); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1)); + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2)); + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3)); + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); + + // Now with both anchored and not as alternatives (with the anchored + // version as a captured group), unicode mode. + r = new RegExp(r"(?:(^.)|.)", unicode: true); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800\udc00", "\ud800\udc00"]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); + shouldBe(m, ["\ud800\udc00", "\ud800\udc00"]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); + shouldBe(m, ["\ud801\udc01", null]); + assertEquals(m.end, 4); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); + shouldBe(m, ["\ud801\udc01", null]); + assertEquals(m.end, 4); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); + + m = r.matchAsPrefix("\ud800\udc00\ud801\ud802", 3); + shouldBe(m, ["\ud802", null]); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\ud802", 4)); + + // Now with both anchored and not as alternatives (with the anchored + // version as a captured group), non-unicode mode.
+ r = new RegExp(r"(?:(^.)|.)"); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); + shouldBe(m, ["\ud800", "\ud800"]); + assertEquals(m.end, 1); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); + shouldBe(m, ["\udc00", null]); + assertEquals(m.end, 2); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); + shouldBe(m, ["\ud801", null]); + assertEquals(m.end, 3); + + m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); + shouldBe(m, ["\udc01", null]); + assertEquals(m.end, 4); + + assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart b/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart new file mode 100644 index 00000000000..d894c7bb859 --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart @@ -0,0 +1,74 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { // Patterns that must throw, or must still be accepted, when unicode: true. + // test262/data/test/language/literals/regexp/u-dec-esc + assertThrows(() => RegExp(r"\1", unicode: true)); + // test262/language/literals/regexp/u-invalid-char-range-a + assertThrows(() => RegExp(r"[\w-a]", unicode: true)); + // test262/language/literals/regexp/u-invalid-char-range-b + assertThrows(() => RegExp(r"[a-\w]", unicode: true)); + // test262/language/literals/regexp/u-invalid-char-esc + assertThrows(() => RegExp(r"\c", unicode: true)); + assertThrows(() => RegExp(r"\c0", unicode: true)); + // test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion + assertThrows(() => RegExp(r"(?=.)*", unicode: true)); + assertThrows(() => RegExp(r"(?=.){1,2}", unicode: true)); + // test262/built-ins/RegExp/unicode_restricted_octal_escape (escapes such as \1, \00 and \09 are rejected) + assertThrows(() => RegExp(r"[\1]", unicode: true)); + assertThrows(() => RegExp(r"\00", unicode: true)); + assertThrows(() => RegExp(r"\09", unicode: true)); + // test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha + assertThrows(() => RegExp(r"[\c]", unicode: true)); + // test262/built-ins/RegExp/unicode_restricted_identity_escape_c + assertThrows(() => RegExp(r"[\c0]", unicode: true)); + // test262/built-ins/RegExp/unicode_restricted_incomple_quantifier + assertThrows(() => RegExp(r"a{", unicode: true)); + assertThrows(() => RegExp(r"a{1,", unicode: true)); + assertThrows(() =>
RegExp(r"{", unicode: true)); + assertThrows(() => RegExp(r"}", unicode: true)); + // test262/data/test/built-ins/RegExp/unicode_restricted_brackets + assertThrows(() => RegExp(r"]", unicode: true)); + // test262/built-ins/RegExp/unicode_identity_escape (an escaped "/" is still accepted) + assertDoesNotThrow(() => RegExp(r"\/", unicode: true)); + + // escaped \0 (as NUL) is allowed inside a character class. + shouldBe(RegExp(r"[\0]", unicode: true).firstMatch("\u0000"), ["\u0000"]); + // unless it is followed by another digit. + assertThrows(() => RegExp(r"[\00]", unicode: true)); + assertThrows(() => RegExp(r"[\01]", unicode: true)); + assertThrows(() => RegExp(r"[\09]", unicode: true)); + shouldBe(RegExp(r"[1\0a]+", unicode: true).firstMatch("b\u{0}1\u{0}a\u{0}2"), + ["\u{0}1\u{0}a\u{0}"]); + // escaped \- is allowed inside a character class. + shouldBe(RegExp(r"[a\-z]", unicode: true).firstMatch("12-34"), ["-"]); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart b/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart new file mode 100644 index 00000000000..e57d810d16f --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart @@ -0,0 +1,39 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2013 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc.
nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + final s = "a" * 10000000 + "\u1234"; // Ten million "a"s precede the only character the pattern can match. + shouldBe( + RegExp(r"(\u1234)", unicode: true).firstMatch(s), ["\u1234", "\u1234"]); +} diff --git a/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart b/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart new file mode 100644 index 00000000000..1d07628b74a --- /dev/null +++ b/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart @@ -0,0 +1,93 @@ +// Copyright (c) 2019, the Dart project authors. All rights reserved. +// Copyright 2016 the V8 project authors. All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import 'package:expect/expect.dart'; + +import 'v8_regexp_utils.dart'; + +void main() { + const L = "\ud800"; + const T = "\udc00"; + const x = "x"; + + var r = RegExp(r"()"); // Not unicode. + // Zero-length matches do not advance lastIndex. + var m = r.matchAsPrefix(L + T + L + T); + shouldBe(m, ["", ""]); + assertEquals(0, m.end); + + m = r.matchAsPrefix(L + T + L + T, 1); + shouldBe(m, ["", ""]); + assertEquals(1, m.end); + + var u = RegExp(r"()", unicode: true); + + // Zero-length matches do not advance lastIndex (but do respect paired + // surrogates).
+ m = u.matchAsPrefix(L + T + L + T); + shouldBe(m, ["", ""]); + assertEquals(0, m.end); + + m = u.matchAsPrefix(L + T + L + T, 1); + shouldBe(m, ["", ""]); + assertEquals(0, m.end); + + // However, with repeating matches, we do advance from match to match. + var ms = r.allMatches(L + T + L + T); + assertEquals(5, ms.length); + for (var i = 0; i < ms.length; i++) { + shouldBe(ms.elementAt(i), ["", ""]); + } + + // With unicode flag, we advance code point by code point. + ms = u.allMatches(L + T + L + T); + assertEquals(3, ms.length); + for (var i = 0; i < ms.length; i++) { + shouldBe(ms.elementAt(i), ["", ""]); + } + + // Test with a lot of copies. + const c = 1000; + ms = u.allMatches((L + T) * c); + assertEquals(c + 1, ms.length); + for (var i = 0; i < ms.length; i++) { + shouldBe(ms.elementAt(i), ["", ""]); + } + + // Same with replaceAll(). + assertEquals( + x + L + x + T + x + L + x + T + x, (L + T + L + T).replaceAll(r, "x")); + + assertEquals(x + L + T + x + L + T + x, (L + T + L + T).replaceAll(u, "x")); + + assertEquals((x + L + T) * c + x, ((L + T) * c).replaceAll(u, "x")); + + // Also test String#split: an empty unicode pattern leaves the single code point U+12345 in one piece. + Expect.deepEquals( + ["\u{12345}"], "\u{12345}".split(RegExp(r"(?:)", unicode: true))); +} diff --git a/tests/corelib_2/regexp/v8_regexp_utils.dart b/tests/corelib_2/regexp/v8_regexp_utils.dart index cde750a7294..111ab90f365 100644 --- a/tests/corelib_2/regexp/v8_regexp_utils.dart +++ b/tests/corelib_2/regexp/v8_regexp_utils.dart @@ -22,6 +22,10 @@ void assertThrows(fn, [num testid = null]) { Expect.throws(fn, null, "Test $testid"); } +void assertDoesNotThrow(fn, [num testid = null]) { + fn(); // No Expect call is made: an exception from fn simply propagates to the caller, and testid is unused. +} + void assertNull(actual, [num testid = null]) { Expect.isNull(actual, "Test $testid"); }