From 9238e253055b00281a5cf402c8608f3444f2a0fe Mon Sep 17 00:00:00 2001 From: Keerti Parthasarathy Date: Thu, 25 Apr 2019 14:29:51 +0000 Subject: [PATCH] Revert "[vm] Finish adding support for ECMAScript 2018 features." This reverts commit 5ebb640a67908e7694f1dacf3bc8d138ebb530eb. Reason for revert: Original change's description: > [vm] Finish adding support for ECMAScript 2018 features. > > This work pulls in v8 support for these features with > appropriate changes for Dart and closes > https://github.com/dart-lang/sdk/issues/34935. > > This adds support for the following features: > > * Interpreting patterns as Unicode patterns instead of > BMP patterns > * the dotAll flag (`/s`) for changing the behavior > of '.' to also match line terminators > * Escapes for character classes described by Unicode > property groups (e.g., \p{Greek} to match all Greek > characters, or \P{Greek} for all non-Greek characters). > > The following TC39 proposals describe some of the added features: > > * https://github.com/tc39/proposal-regexp-dotall-flag > * https://github.com/tc39/proposal-regexp-unicode-property-escapes > > These additional changes are included: > > * Extends named capture group names to include the full > range of identifier characters supported by ECMAScript, > not just ASCII. > * Changing the RegExp interface to return RegExpMatch > objects, not Match objects, so that downcasting is > not necessary to use named capture groups from Dart > > **Note**: The changes to the RegExp interface are a > breaking change for implementers of the RegExp interface. > Current users of the RegExp interface (i.e., code using Dart > RegExp objects) will not be affected. > > Change-Id: I0709ed0a8d5db36680e32bbad585594857b9ace4 > Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/95651 > Commit-Queue: Stevie Strickland > Reviewed-by: Johnni Winther > Reviewed-by: Lasse R.H. 
Nielsen > Reviewed-by: Martin Kustermann TBR=lrn@google.com,kustermann@google.com,jmesserly@google.com,johnniwinther@google.com,sstrickl@google.com # Not skipping CQ checks because original CL landed > 1 day ago. Change-Id: I1eda0fee4fd9e94df095944049833a67b07277e2 Reviewed-on: https://dart-review.googlesource.com/c/sdk/+/100560 Reviewed-by: Keerti Parthasarathy Reviewed-by: Martin Kustermann Commit-Queue: Keerti Parthasarathy --- CHANGELOG.md | 29 - DEPS | 5 - build/config/android/config.gni | 3 - build/config/android/rules.gni | 10 - build/config/host_byteorder.gni | 30 - .../tool/input_sdk/patch/core_patch.dart | 10 +- .../tool/input_sdk/private/regexp_helper.dart | 72 +- runtime/lib/regexp.cc | 35 +- runtime/lib/regexp_patch.dart | 65 +- runtime/platform/splay-tree-inl.h | 273 ----- runtime/platform/splay-tree.h | 172 --- runtime/platform/unicode.h | 34 +- runtime/platform/utils.h | 9 +- runtime/vm/BUILD.gn | 3 +- runtime/vm/bootstrap_natives.h | 4 +- runtime/vm/clustered_snapshot.cc | 6 +- .../compiler/backend/constant_propagator.cc | 4 +- runtime/vm/compiler/backend/il.cc | 4 + runtime/vm/compiler/backend/il.h | 32 +- runtime/vm/compiler/backend/il_arm.cc | 5 +- runtime/vm/compiler/backend/il_arm64.cc | 5 +- runtime/vm/compiler/backend/il_dbc.cc | 2 +- runtime/vm/compiler/backend/il_ia32.cc | 7 +- runtime/vm/compiler/backend/il_x64.cc | 5 +- .../vm/compiler/backend/type_propagator.cc | 2 +- runtime/vm/compiler/jit/compiler.cc | 3 +- runtime/vm/object.cc | 42 +- runtime/vm/object.h | 117 +- runtime/vm/object_service.cc | 4 +- runtime/vm/raw_object.h | 10 +- runtime/vm/raw_object_snapshot.cc | 7 +- runtime/vm/regexp.cc | 1026 ++++------------- runtime/vm/regexp.h | 184 +-- runtime/vm/regexp_assembler.cc | 68 +- runtime/vm/regexp_assembler.h | 31 +- runtime/vm/regexp_assembler_bytecode.cc | 20 +- runtime/vm/regexp_assembler_bytecode.h | 1 - runtime/vm/regexp_assembler_ir.cc | 15 +- runtime/vm/regexp_assembler_ir.h | 1 - runtime/vm/regexp_ast.h | 61 +- 
runtime/vm/regexp_bytecodes.h | 24 +- runtime/vm/regexp_interpreter.cc | 42 +- runtime/vm/regexp_parser.cc | 898 ++------------- runtime/vm/regexp_parser.h | 57 +- runtime/vm/regexp_test.cc | 2 +- runtime/vm/runtime_entry_list.h | 4 +- runtime/vm/splay-tree.h | 32 - runtime/vm/stub_code_arm64_test.cc | 2 +- runtime/vm/stub_code_arm_test.cc | 2 +- runtime/vm/stub_code_ia32_test.cc | 2 +- runtime/vm/stub_code_x64_test.cc | 2 +- .../_internal/js_runtime/lib/core_patch.dart | 10 +- .../js_runtime/lib/regexp_helper.dart | 74 +- sdk/lib/core/regexp.dart | 46 +- tests/corelib_2/regexp/dot-all_test.dart | 117 -- tests/corelib_2/regexp/lookbehind_test.dart | 6 +- .../corelib_2/regexp/named-captures_test.dart | 255 +--- .../regexp/unicode-character-ranges_test.dart | 160 --- .../unicode-escapes-in-regexps_test.dart | 307 ----- .../regexp/unicode-property-binary_test.dart | 201 ---- .../unicode-property-char-class_test.dart | 56 - .../unicode-property-enumerated_test.dart | 55 - .../unicode-property-exact-match_test.dart | 71 -- ...nicode-property-general-category_test.dart | 114 -- .../regexp/unicode-property-invalid_test.dart | 68 -- .../regexp/unicode-property-scripts_test.dart | 77 -- .../regexp/unicode-property-special_test.dart | 110 -- .../regexp/unicode-regexp-backrefs_test.dart | 81 -- .../unicode-regexp-ignore-case_test.dart | 135 --- .../unicode-regexp-match-index_test.dart | 155 --- ...unicode-regexp-restricted-syntax_test.dart | 74 -- ...nicode-regexp-unanchored-advance_test.dart | 39 - .../unicode-regexp-zero-length_test.dart | 93 -- tests/corelib_2/regexp/v8_regexp_utils.dart | 4 - 74 files changed, 635 insertions(+), 5156 deletions(-) delete mode 100644 build/config/android/rules.gni delete mode 100644 build/config/host_byteorder.gni delete mode 100644 runtime/platform/splay-tree-inl.h delete mode 100644 runtime/platform/splay-tree.h delete mode 100644 runtime/vm/splay-tree.h delete mode 100644 tests/corelib_2/regexp/dot-all_test.dart delete mode 100644 
tests/corelib_2/regexp/unicode-character-ranges_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-escapes-in-regexps_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-binary_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-char-class_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-enumerated_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-exact-match_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-general-category_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-invalid_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-scripts_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-property-special_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-match-index_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart delete mode 100644 tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d28400389f..21265c16a34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,32 +1,3 @@ -## 2.3.0-dev.XX.0 -(Add new changes here, and they will be copied to the change section for the - next dev version) - -### Core library changes - -#### `dart:core` - -* **Breaking change**: The `RegExp` interface has been extended with two new - constructor named parameters: - - * `unicode:` (`bool`, default: `false`), for Unicode patterns , and - * `dotAll:` (`bool`, default: `false`), to change the matching behavior of - '.' to also match line terminating characters. - - Appropriate properties for these named parameters have also been added so - their use can be detected after construction. 
- - In addition, `RegExp` methods that originally returned `Match` objects - now return a more specific subtype, `RegExpMatch`, which adds two features: - - * `Iterable groupNames`, a property that contains the names of all - named capture groups, and - * `String namedGroup(String name)`: a method that retrieves the match for - the given named capture group - - This change only affects implementers of the `RegExp` interface; current - code using Dart regular expressions will not be affected. - ## 2.3.0 The focus in this release is on the new "UI-as-code" language features which diff --git a/DEPS b/DEPS index 6e37b9b12b4..0ad497ff0bc 100644 --- a/DEPS +++ b/DEPS @@ -91,7 +91,6 @@ vars = { "http_retry_tag": "0.1.1", "http_tag" : "0.12.0+2", "http_throttle_tag" : "1.0.2", - "icu_rev" : "c56c671998902fcc4fc9ace88c83daa99f980793", "idl_parser_rev": "5fb1ebf49d235b5a70c9f49047e83b0654031eb7", "intl_tag": "0.15.7", "jinja2_rev": "2222b31554f03e62600cd7e383376a7c187967a1", @@ -211,10 +210,6 @@ deps = { Var("chromium_git") + "/chromium/src/third_party/ply.git" + "@" + Var("ply_rev"), - Var("dart_root") + "/third_party/icu": - Var("chromium_git") + "/chromium/deps/icu.git" + - "@" + Var("icu_rev"), - Var("dart_root") + "/tools/idl_parser": Var("chromium_git") + "/chromium/src/tools/idl_parser.git" + "@" + Var("idl_parser_rev"), diff --git a/build/config/android/config.gni b/build/config/android/config.gni index c61d41acad6..38e3f8e218f 100644 --- a/build/config/android/config.gni +++ b/build/config/android/config.gni @@ -15,9 +15,6 @@ if (is_android) { android_sdk_root = default_android_sdk_root android_sdk_version = default_android_sdk_version android_sdk_build_tools_version = default_android_sdk_build_tools_version - - # Unused by Dart. Required for GN files in the third_party package ICU. 
- enable_java_templates = false } # Host stuff ----------------------------------------------------------------- diff --git a/build/config/android/rules.gni b/build/config/android/rules.gni deleted file mode 100644 index f22ba846b2e..00000000000 --- a/build/config/android/rules.gni +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2019, the Dart project authors. Please see the AUTHORS file -# for details. All rights reserved. Use of this source code is governed by a -# BSD-style license that can be found in the LICENSE file. - -# A trivial rules file that allows for the Chromium third_party ICU to -# be built successfully. - -import("//build/config/android/config.gni") - -assert(is_android) diff --git a/build/config/host_byteorder.gni b/build/config/host_byteorder.gni deleted file mode 100644 index 50ea94bca41..00000000000 --- a/build/config/host_byteorder.gni +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2017 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# Originally from v8, included in our repository as the ICU third party -# import depends on it for building. - -# This header file defines the "host_byteorder" variable. -# Not that this is currently used only for building v8. -# The chromium code generally assumes little-endianness. 
-declare_args() { - host_byteorder = "undefined" -} - -# Detect host byteorder -# ppc64 can be either BE or LE -if (host_cpu == "ppc64") { - if (current_os == "aix") { - host_byteorder = "big" - } else { - # Only use the script when absolutely necessary - host_byteorder = - exec_script("//build/config/get_host_byteorder.py", [], "trim string") - } -} else if (host_cpu == "ppc" || host_cpu == "s390" || host_cpu == "s390x" || - host_cpu == "mips" || host_cpu == "mips64") { - host_byteorder = "big" -} else { - host_byteorder = "little" -} diff --git a/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart b/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart index da07a3c0d31..192b5f68d6e 100644 --- a/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart +++ b/pkg/dev_compiler/tool/input_sdk/patch/core_patch.dart @@ -547,15 +547,9 @@ class bool { class RegExp { @patch factory RegExp(String source, - {bool multiLine = false, - bool caseSensitive = true, - bool unicode = false, - bool dotAll = false}) => + {bool multiLine = false, bool caseSensitive = true}) => JSSyntaxRegExp(source, - multiLine: multiLine, - caseSensitive: caseSensitive, - unicode: unicode, - dotAll: dotAll); + multiLine: multiLine, caseSensitive: caseSensitive); @patch static String escape(String text) => quoteStringForRegExp(text); diff --git a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart index 744effc1bee..92066850dd8 100644 --- a/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart +++ b/pkg/dev_compiler/tool/input_sdk/private/regexp_helper.dart @@ -47,22 +47,18 @@ class JSSyntaxRegExp implements RegExp { var _nativeGlobalRegExp; var _nativeAnchoredRegExp; - String toString() => - 'RegExp/$pattern/' + JS('String', '#.flags', _nativeRegExp); + String toString() => "RegExp/$pattern/"; JSSyntaxRegExp(String source, - {bool multiLine = false, - bool caseSensitive = true, - bool unicode = false, - bool dotAll = false}) + 
{bool multiLine = false, bool caseSensitive = true}) : this.pattern = source, - this._nativeRegExp = makeNative( - source, multiLine, caseSensitive, unicode, dotAll, false); + this._nativeRegExp = + makeNative(source, multiLine, caseSensitive, false); get _nativeGlobalVersion { if (_nativeGlobalRegExp != null) return _nativeGlobalRegExp; - return _nativeGlobalRegExp = makeNative( - pattern, _isMultiLine, _isCaseSensitive, _isUnicode, _isDotAll, true); + return _nativeGlobalRegExp = + makeNative(pattern, _isMultiLine, _isCaseSensitive, true); } get _nativeAnchoredVersion { @@ -72,21 +68,17 @@ class JSSyntaxRegExp implements RegExp { // that it tries, and you can see if the original regexp matched, or it // was the added zero-width match that matched, by looking at the last // capture. If it is a String, the match participated, otherwise it didn't. - return _nativeAnchoredRegExp = makeNative("$pattern|()", _isMultiLine, - _isCaseSensitive, _isUnicode, _isDotAll, true); + return _nativeAnchoredRegExp = + makeNative("$pattern|()", _isMultiLine, _isCaseSensitive, true); } bool get _isMultiLine => JS("bool", "#.multiline", _nativeRegExp); bool get _isCaseSensitive => JS("bool", "!#.ignoreCase", _nativeRegExp); - bool get _isUnicode => JS("bool", "#.unicode", _nativeRegExp); - bool get _isDotAll => JS("bool", "#.dotAll", _nativeRegExp); static makeNative(@nullCheck String source, bool multiLine, - bool caseSensitive, bool unicode, bool dotAll, bool global) { + bool caseSensitive, bool global) { String m = multiLine ? 'm' : ''; String i = caseSensitive ? '' : 'i'; - String u = unicode ? 'u' : ''; - String s = dotAll ? 's' : ''; String g = global ? 
'g' : ''; // We're using the JavaScript's try catch instead of the Dart one // to avoid dragging in Dart runtime support just because of using @@ -95,7 +87,7 @@ class JSSyntaxRegExp implements RegExp { '', '(function() {' 'try {' - 'return new RegExp(#, # + # + # + # + #);' + 'return new RegExp(#, # + # + #);' '} catch (e) {' 'return e;' '}' @@ -103,8 +95,6 @@ class JSSyntaxRegExp implements RegExp { source, m, i, - u, - s, g); if (JS('bool', '# instanceof RegExp', regexp)) return regexp; // The returned value is the JavaScript exception. Turn it into a @@ -113,7 +103,7 @@ class JSSyntaxRegExp implements RegExp { throw FormatException("Illegal RegExp pattern: $source, $errorMessage"); } - RegExpMatch firstMatch(@nullCheck String string) { + Match firstMatch(@nullCheck String string) { List m = JS('JSExtendableArray|Null', r'#.exec(#)', _nativeRegExp, string); if (m == null) return null; return _MatchImplementation(this, JSArray.of(m)); @@ -130,7 +120,7 @@ class JSSyntaxRegExp implements RegExp { return null; } - Iterable allMatches(@nullCheck String string, + Iterable allMatches(@nullCheck String string, [@nullCheck int start = 0]) { if (start < 0 || start > string.length) { throw RangeError.range(start, 0, string.length); @@ -138,7 +128,7 @@ class JSSyntaxRegExp implements RegExp { return _AllMatchesIterable(this, string, start); } - RegExpMatch _execGlobal(String string, int start) { + Match _execGlobal(String string, int start) { Object regexp = _nativeGlobalVersion; JS("void", "#.lastIndex = #", regexp, start); List match = JS("JSExtendableArray|Null", "#.exec(#)", regexp, string); @@ -146,7 +136,7 @@ class JSSyntaxRegExp implements RegExp { return _MatchImplementation(this, JSArray.of(match)); } - RegExpMatch _execAnchored(String string, int start) { + Match _execAnchored(String string, int start) { Object regexp = _nativeAnchoredVersion; JS("void", "#.lastIndex = #", regexp, start); List match = JS("JSExtendableArray|Null", "#.exec(#)", regexp, string); @@ 
-158,7 +148,7 @@ class JSSyntaxRegExp implements RegExp { return _MatchImplementation(this, JSArray.of(match)); } - RegExpMatch matchAsPrefix(String string, [int start = 0]) { + Match matchAsPrefix(String string, [int start = 0]) { if (start < 0 || start > string.length) { throw RangeError.range(start, 0, string.length); } @@ -167,8 +157,6 @@ class JSSyntaxRegExp implements RegExp { bool get isMultiLine => _isMultiLine; bool get isCaseSensitive => _isCaseSensitive; - bool get isUnicode => _isUnicode; - bool get isDotAll => _isDotAll; } class _MatchImplementation implements RegExpMatch { @@ -219,34 +207,25 @@ class _MatchImplementation implements RegExpMatch { } } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final JSSyntaxRegExp _re; final String _string; final int _start; _AllMatchesIterable(this._re, this._string, this._start); - Iterator get iterator => - _AllMatchesIterator(_re, _string, _start); + Iterator get iterator => _AllMatchesIterator(_re, _string, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final JSSyntaxRegExp _regExp; String _string; int _nextIndex; - RegExpMatch _current; + Match _current; _AllMatchesIterator(this._regExp, this._string, this._nextIndex); - RegExpMatch get current => _current; - - static bool _isLeadSurrogate(int c) { - return c >= 0xd800 && c <= 0xdbff; - } - - static bool _isTrailSurrogate(int c) { - return c >= 0xdc00 && c <= 0xdfff; - } + Match get current => _current; bool moveNext() { if (_string == null) return false; @@ -256,15 +235,6 @@ class _AllMatchesIterator implements Iterator { _current = match; int nextIndex = match.end; if (match.start == nextIndex) { - // Zero-width match. Advance by one more, unless the regexp - // is in unicode mode and it would put us within a surrogate - // pair. In that case, advance past the code point as a whole. 
- if (_regExp.isUnicode && - _nextIndex + 1 < _string.length && - _isLeadSurrogate(_string.codeUnitAt(_nextIndex)) && - _isTrailSurrogate(_string.codeUnitAt(_nextIndex + 1))) { - nextIndex++; - } nextIndex++; } _nextIndex = nextIndex; @@ -278,6 +248,6 @@ class _AllMatchesIterator implements Iterator { } /** Find the first match of [regExp] in [string] at or after [start]. */ -RegExpMatch firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { +Match firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { return regExp._execGlobal(string, start); } diff --git a/runtime/lib/regexp.cc b/runtime/lib/regexp.cc index eb4237c5954..4dba76e6eb7 100644 --- a/runtime/lib/regexp.cc +++ b/runtime/lib/regexp.cc @@ -14,7 +14,7 @@ namespace dart { -DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 6) { +DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 4) { ASSERT( TypeArguments::CheckedHandle(zone, arguments->NativeArgAt(0)).IsNull()); GET_NON_NULL_NATIVE_ARGUMENT(String, pattern, arguments->NativeArgAt(1)); @@ -22,30 +22,17 @@ DEFINE_NATIVE_ENTRY(RegExp_factory, 0, 6) { arguments->NativeArgAt(2)); GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_case_sensitive, arguments->NativeArgAt(3)); - GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_unicode, - arguments->NativeArgAt(4)); - GET_NON_NULL_NATIVE_ARGUMENT(Instance, handle_dot_all, - arguments->NativeArgAt(5)); bool ignore_case = handle_case_sensitive.raw() != Bool::True().raw(); bool multi_line = handle_multi_line.raw() == Bool::True().raw(); - bool unicode = handle_unicode.raw() == Bool::True().raw(); - bool dot_all = handle_dot_all.raw() == Bool::True().raw(); - - RegExpFlags flags; - - if (ignore_case) flags.SetIgnoreCase(); - if (multi_line) flags.SetMultiLine(); - if (unicode) flags.SetUnicode(); - if (dot_all) flags.SetDotAll(); // Parse the pattern once in order to throw any format exceptions within // the factory constructor. It is parsed again upon compilation. 
RegExpCompileData compileData; // Throws an exception on parsing failure. - RegExpParser::ParseRegExp(pattern, flags, &compileData); + RegExpParser::ParseRegExp(pattern, multi_line, &compileData); // Create a RegExp object containing only the initial parameters. - return RegExpEngine::CreateRegExp(thread, pattern, flags); + return RegExpEngine::CreateRegExp(thread, pattern, multi_line, ignore_case); } DEFINE_NATIVE_ENTRY(RegExp_getPattern, 0, 1) { @@ -57,25 +44,13 @@ DEFINE_NATIVE_ENTRY(RegExp_getPattern, 0, 1) { DEFINE_NATIVE_ENTRY(RegExp_getIsMultiLine, 0, 1) { const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); ASSERT(!regexp.IsNull()); - return Bool::Get(regexp.flags().IsMultiLine()).raw(); -} - -DEFINE_NATIVE_ENTRY(RegExp_getIsUnicode, 0, 1) { - const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); - ASSERT(!regexp.IsNull()); - return Bool::Get(regexp.flags().IsUnicode()).raw(); -} - -DEFINE_NATIVE_ENTRY(RegExp_getIsDotAll, 0, 1) { - const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); - ASSERT(!regexp.IsNull()); - return Bool::Get(regexp.flags().IsDotAll()).raw(); + return Bool::Get(regexp.is_multi_line()).raw(); } DEFINE_NATIVE_ENTRY(RegExp_getIsCaseSensitive, 0, 1) { const RegExp& regexp = RegExp::CheckedHandle(zone, arguments->NativeArgAt(0)); ASSERT(!regexp.IsNull()); - return Bool::Get(!regexp.flags().IgnoreCase()).raw(); + return Bool::Get(!regexp.is_ignore_case()).raw(); } DEFINE_NATIVE_ENTRY(RegExp_getGroupCount, 0, 1) { diff --git a/runtime/lib/regexp_patch.dart b/runtime/lib/regexp_patch.dart index 35a13b1a937..4eac4461187 100644 --- a/runtime/lib/regexp_patch.dart +++ b/runtime/lib/regexp_patch.dart @@ -8,12 +8,8 @@ class RegExp { @patch factory RegExp(String source, - {bool multiLine: false, - bool caseSensitive: true, - bool unicode: false, - bool dotAll: false}) { - _RegExpHashKey key = - new _RegExpHashKey(source, multiLine, caseSensitive, unicode, dotAll); + {bool 
multiLine: false, bool caseSensitive: true}) { + _RegExpHashKey key = new _RegExpHashKey(source, multiLine, caseSensitive); _RegExpHashValue value = _cache[key]; if (value == null) { @@ -25,10 +21,7 @@ class RegExp { value = new _RegExpHashValue( new _RegExp(source, - multiLine: multiLine, - caseSensitive: caseSensitive, - unicode: unicode, - dotAll: dotAll), + multiLine: multiLine, caseSensitive: caseSensitive), key); _cache[key] = value; } else { @@ -121,20 +114,15 @@ class _RegExpHashKey extends LinkedListEntry<_RegExpHashKey> { final String pattern; final bool multiLine; final bool caseSensitive; - final bool unicode; - final bool dotAll; - _RegExpHashKey(this.pattern, this.multiLine, this.caseSensitive, this.unicode, - this.dotAll); + _RegExpHashKey(this.pattern, this.multiLine, this.caseSensitive); int get hashCode => pattern.hashCode; bool operator ==(that) { return (that is _RegExpHashKey) && (this.pattern == that.pattern) && (this.multiLine == that.multiLine) && - (this.caseSensitive == that.caseSensitive) && - (this.unicode == that.unicode) && - (this.dotAll == that.dotAll); + (this.caseSensitive == that.caseSensitive); } } @@ -212,11 +200,9 @@ class _RegExpMatch implements RegExpMatch { class _RegExp implements RegExp { factory _RegExp(String pattern, {bool multiLine: false, - bool caseSensitive: true, - bool unicode: false, - bool dotAll: false}) native "RegExp_factory"; + bool caseSensitive: true}) native "RegExp_factory"; - RegExpMatch firstMatch(String str) { + Match firstMatch(String str) { if (str is! String) throw new ArgumentError(str); List match = _ExecuteMatch(str, 0); if (match == null) { @@ -225,7 +211,7 @@ class _RegExp implements RegExp { return new _RegExpMatch(this, str, match); } - Iterable allMatches(String string, [int start = 0]) { + Iterable allMatches(String string, [int start = 0]) { if (string is! String) throw new ArgumentError(string); if (start is! 
int) throw new ArgumentError(start); if (0 > start || start > string.length) { @@ -234,7 +220,7 @@ class _RegExp implements RegExp { return new _AllMatchesIterable(this, string, start); } - RegExpMatch matchAsPrefix(String string, [int start = 0]) { + Match matchAsPrefix(String string, [int start = 0]) { if (string is! String) throw new ArgumentError(string); if (start is! int) throw new ArgumentError(start); if (start < 0 || start > string.length) { @@ -266,10 +252,6 @@ class _RegExp implements RegExp { bool get isCaseSensitive native "RegExp_getIsCaseSensitive"; - bool get isUnicode native "RegExp_getIsUnicode"; - - bool get isDotAll native "RegExp_getIsDotAll"; - int get _groupCount native "RegExp_getGroupCount"; // Returns a List [String, int, String, int, ...] where each @@ -345,34 +327,25 @@ class _RegExp implements RegExp { native "RegExp_ExecuteMatchSticky"; } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final _RegExp _re; final String _str; final int _start; _AllMatchesIterable(this._re, this._str, this._start); - Iterator get iterator => - new _AllMatchesIterator(_re, _str, _start); + Iterator get iterator => new _AllMatchesIterator(_re, _str, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final String _str; int _nextIndex; _RegExp _re; - RegExpMatch _current; + Match _current; _AllMatchesIterator(this._re, this._str, this._nextIndex); - RegExpMatch get current => _current; - - static bool _isLeadSurrogate(int c) { - return c >= 0xd800 && c <= 0xdbff; - } - - static bool _isTrailSurrogate(int c) { - return c >= 0xdc00 && c <= 0xdfff; - } + Match get current => _current; bool moveNext() { if (_re == null) return false; // Cleared after a failed match. 
@@ -382,15 +355,7 @@ class _AllMatchesIterator implements Iterator { _current = new _RegExpMatch(_re, _str, match); _nextIndex = _current.end; if (_nextIndex == _current.start) { - // Zero-width match. Advance by one more, unless the regexp - // is in unicode mode and it would put us within a surrogate - // pair. In that case, advance past the code point as a whole. - if (_re.isUnicode && - _nextIndex + 1 < _str.length && - _isLeadSurrogate(_str.codeUnitAt(_nextIndex)) && - _isTrailSurrogate(_str.codeUnitAt(_nextIndex + 1))) { - _nextIndex++; - } + // Zero-width match. Advance by one more. _nextIndex++; } return true; diff --git a/runtime/platform/splay-tree-inl.h b/runtime/platform/splay-tree-inl.h deleted file mode 100644 index 7322a56ce4c..00000000000 --- a/runtime/platform/splay-tree-inl.h +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. -// Copyright 2010 the V8 project authors. -// Please see the AUTHORS file for details. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -// -// The original file can be found at: -// https://github.com/v8/v8/blob/master/src/splay-tree-inl.h - -#ifndef RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ -#define RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ - -#include - -#include "platform/splay-tree.h" - -namespace dart { - -template -SplayTree::~SplayTree() { - NodeDeleter deleter; - ForEachNode(&deleter); -} - -template -bool SplayTree::Insert(const Key& key, Locator* locator) { - if (is_empty()) { - // If the tree is empty, insert the new node. - root_ = new (allocator_) Node(key, Config::NoValue()); - } else { - // Splay on the key to move the last node on the search path - // for the key to the root of the tree. - Splay(key); - // Ignore repeated insertions with the same key. - int cmp = Config::Compare(key, root_->key_); - if (cmp == 0) { - locator->bind(root_); - return false; - } - // Insert the new node. 
- Node* node = new (allocator_) Node(key, Config::NoValue()); - InsertInternal(cmp, node); - } - locator->bind(root_); - return true; -} - -template -void SplayTree::InsertInternal(int cmp, Node* node) { - if (cmp > 0) { - node->left_ = root_; - node->right_ = root_->right_; - root_->right_ = nullptr; - } else { - node->right_ = root_; - node->left_ = root_->left_; - root_->left_ = nullptr; - } - root_ = node; -} - -template -bool SplayTree::FindInternal(const Key& key) { - if (is_empty()) return false; - Splay(key); - return Config::Compare(key, root_->key_) == 0; -} - -template -bool SplayTree::Contains(const Key& key) { - return FindInternal(key); -} - -template -bool SplayTree::Find(const Key& key, Locator* locator) { - if (FindInternal(key)) { - locator->bind(root_); - return true; - } else { - return false; - } -} - -template -bool SplayTree::FindGreatestLessThan(const Key& key, - Locator* locator) { - if (is_empty()) return false; - // Splay on the key to move the node with the given key or the last - // node on the search path to the top of the tree. - Splay(key); - // Now the result is either the root node or the greatest node in - // the left subtree. - int cmp = Config::Compare(root_->key_, key); - if (cmp <= 0) { - locator->bind(root_); - return true; - } else { - Node* temp = root_; - root_ = root_->left_; - bool result = FindGreatest(locator); - root_ = temp; - return result; - } -} - -template -bool SplayTree::FindLeastGreaterThan(const Key& key, - Locator* locator) { - if (is_empty()) return false; - // Splay on the key to move the node with the given key or the last - // node on the search path to the top of the tree. - Splay(key); - // Now the result is either the root node or the least node in - // the right subtree. 
- int cmp = Config::Compare(root_->key_, key); - if (cmp >= 0) { - locator->bind(root_); - return true; - } else { - Node* temp = root_; - root_ = root_->right_; - bool result = FindLeast(locator); - root_ = temp; - return result; - } -} - -template -bool SplayTree::FindGreatest(Locator* locator) { - if (is_empty()) return false; - Node* current = root_; - while (current->right_ != nullptr) - current = current->right_; - locator->bind(current); - return true; -} - -template -bool SplayTree::FindLeast(Locator* locator) { - if (is_empty()) return false; - Node* current = root_; - while (current->left_ != nullptr) - current = current->left_; - locator->bind(current); - return true; -} - -template -bool SplayTree::Move(const Key& old_key, - const Key& new_key) { - if (!FindInternal(old_key)) return false; - Node* node_to_move = root_; - RemoveRootNode(old_key); - Splay(new_key); - int cmp = Config::Compare(new_key, root_->key_); - if (cmp == 0) { - // A node with the target key already exists. - delete node_to_move; - return false; - } - node_to_move->key_ = new_key; - InsertInternal(cmp, node_to_move); - return true; -} - -template -bool SplayTree::Remove(const Key& key) { - if (!FindInternal(key)) return false; - Node* node_to_remove = root_; - RemoveRootNode(key); - delete node_to_remove; - return true; -} - -template -void SplayTree::RemoveRootNode(const Key& key) { - if (root_->left_ == nullptr) { - // No left child, so the new tree is just the right child. - root_ = root_->right_; - } else { - // Left child exists. - Node* right = root_->right_; - // Make the original left child the new root. - root_ = root_->left_; - // Splay to make sure that the new root has an empty right child. - Splay(key); - // Insert the original right child as the right child of the new - // root. - root_->right_ = right; - } -} - -template -void SplayTree::Splay(const Key& key) { - if (is_empty()) return; - Node dummy_node(Config::kNoKey, Config::NoValue()); - // Create a dummy node. 
The use of the dummy node is a bit - // counter-intuitive: The right child of the dummy node will hold - // the L tree of the algorithm. The left child of the dummy node - // will hold the R tree of the algorithm. Using a dummy node, left - // and right will always be nodes and we avoid special cases. - Node* dummy = &dummy_node; - Node* left = dummy; - Node* right = dummy; - Node* current = root_; - while (true) { - int cmp = Config::Compare(key, current->key_); - if (cmp < 0) { - if (current->left_ == nullptr) break; - if (Config::Compare(key, current->left_->key_) < 0) { - // Rotate right. - Node* temp = current->left_; - current->left_ = temp->right_; - temp->right_ = current; - current = temp; - if (current->left_ == nullptr) break; - } - // Link right. - right->left_ = current; - right = current; - current = current->left_; - } else if (cmp > 0) { - if (current->right_ == nullptr) break; - if (Config::Compare(key, current->right_->key_) > 0) { - // Rotate left. - Node* temp = current->right_; - current->right_ = temp->left_; - temp->left_ = current; - current = temp; - if (current->right_ == nullptr) break; - } - // Link left. - left->right_ = current; - left = current; - current = current->right_; - } else { - break; - } - } - // Assemble. - left->right_ = current->left_; - right->left_ = current->right_; - current->left_ = dummy->right_; - current->right_ = dummy->left_; - root_ = current; -} - -template -template -void SplayTree::ForEach(Callback* callback) { - NodeToPairAdaptor callback_adaptor(callback); - ForEachNode(&callback_adaptor); -} - -template -template -void SplayTree::ForEachNode(Callback* callback) { - if (root_ == nullptr) return; - // Pre-allocate some space for tiny trees. 
- std::vector nodes_to_visit; - nodes_to_visit.push_back(root_); - size_t pos = 0; - while (pos < nodes_to_visit.size()) { - Node* node = nodes_to_visit[pos++]; - if (node->left() != nullptr) nodes_to_visit.push_back(node->left()); - if (node->right() != nullptr) nodes_to_visit.push_back(node->right()); - callback->Call(node); - } -} - -} // namespace dart - -#endif // RUNTIME_PLATFORM_SPLAY_TREE_INL_H_ diff --git a/runtime/platform/splay-tree.h b/runtime/platform/splay-tree.h deleted file mode 100644 index a85488b154a..00000000000 --- a/runtime/platform/splay-tree.h +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. -// Copyright 2010 the V8 project authors. -// Please see the AUTHORS file for details. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -// -// The original file can be found at: -// https://github.com/v8/v8/blob/master/src/splay-tree.h - -#ifndef RUNTIME_PLATFORM_SPLAY_TREE_H_ -#define RUNTIME_PLATFORM_SPLAY_TREE_H_ - -#include "platform/allocation.h" - -namespace dart { - -// A splay tree. The config type parameter encapsulates the different -// configurations of a concrete splay tree: -// -// typedef Key: the key type -// typedef Value: the value type -// static const Key kNoKey: the dummy key used when no key is set -// static Value kNoValue(): the dummy value used to initialize nodes -// static int (Compare)(Key& a, Key& b) -> {-1, 0, 1}: comparison function -// -// The tree is also parameterized by an allocation policy -// (Allocator). The policy is used for allocating lists in the C free -// store or the zone; see zone.h. 
- -template -class SplayTree : public B { - public: - typedef typename Config::Key Key; - typedef typename Config::Value Value; - - class Locator; - - explicit SplayTree(Allocator* allocator) - : root_(nullptr), allocator_(allocator) {} - ~SplayTree(); - - Allocator* allocator() { return allocator_; } - - // Checks if there is a mapping for the key. - bool Contains(const Key& key); - - // Inserts the given key in this tree with the given value. Returns - // true if a node was inserted, otherwise false. If found the locator - // is enabled and provides access to the mapping for the key. - bool Insert(const Key& key, Locator* locator); - - // Looks up the key in this tree and returns true if it was found, - // otherwise false. If the node is found the locator is enabled and - // provides access to the mapping for the key. - bool Find(const Key& key, Locator* locator); - - // Finds the mapping with the greatest key less than or equal to the - // given key. - bool FindGreatestLessThan(const Key& key, Locator* locator); - - // Find the mapping with the greatest key in this tree. - bool FindGreatest(Locator* locator); - - // Finds the mapping with the least key greater than or equal to the - // given key. - bool FindLeastGreaterThan(const Key& key, Locator* locator); - - // Find the mapping with the least key in this tree. - bool FindLeast(Locator* locator); - - // Move the node from one key to another. - bool Move(const Key& old_key, const Key& new_key); - - // Remove the node with the given key from the tree. - bool Remove(const Key& key); - - // Remove all keys from the tree. - void Clear() { ResetRoot(); } - - bool is_empty() { return root_ == nullptr; } - - // Perform the splay operation for the given key. Moves the node with - // the given key to the top of the tree. If no node has the given - // key, the last node on the search path is moved to the top of the - // tree. 
- void Splay(const Key& key); - - class Node : public B { - public: - Node(const Key& key, const Value& value) - : key_(key), value_(value), left_(nullptr), right_(nullptr) {} - - Key key() { return key_; } - Value value() { return value_; } - Node* left() { return left_; } - Node* right() { return right_; } - - private: - friend class SplayTree; - friend class Locator; - Key key_; - Value value_; - Node* left_; - Node* right_; - }; - - // A locator provides access to a node in the tree without actually - // exposing the node. - class Locator : public B { - public: - explicit Locator(Node* node) : node_(node) {} - Locator() : node_(nullptr) {} - const Key& key() { return node_->key_; } - Value& value() { return node_->value_; } - void set_value(const Value& value) { node_->value_ = value; } - inline void bind(Node* node) { node_ = node; } - - private: - Node* node_; - }; - - template - void ForEach(Callback* callback); - - protected: - // Resets tree root. Existing nodes become unreachable. - void ResetRoot() { root_ = nullptr; } - - private: - // Search for a node with a given key. If found, root_ points - // to the node. - bool FindInternal(const Key& key); - - // Inserts a node assuming that root_ is already set up. - void InsertInternal(int cmp, Node* node); - - // Removes root_ node. 
- void RemoveRootNode(const Key& key); - - template - class NodeToPairAdaptor : public B { - public: - explicit NodeToPairAdaptor(Callback* callback) : callback_(callback) {} - void Call(Node* node) { callback_->Call(node->key(), node->value()); } - - private: - Callback* callback_; - - DISALLOW_COPY_AND_ASSIGN(NodeToPairAdaptor); - }; - - class NodeDeleter : public B { - public: - NodeDeleter() = default; - void Call(Node* node) { delete node; } - - private: - DISALLOW_COPY_AND_ASSIGN(NodeDeleter); - }; - - template - void ForEachNode(Callback* callback); - - Node* root_; - Allocator* allocator_; - - DISALLOW_COPY_AND_ASSIGN(SplayTree); -}; - -} // namespace dart - -#endif // RUNTIME_PLATFORM_SPLAY_TREE_H_ diff --git a/runtime/platform/unicode.h b/runtime/platform/unicode.h index b784823e528..f3c0778f7e4 100644 --- a/runtime/platform/unicode.h +++ b/runtime/platform/unicode.h @@ -15,7 +15,6 @@ class String; class Utf : AllStatic { public: static const int32_t kMaxCodePoint = 0x10FFFF; - static const int32_t kInvalidChar = 0xFFFFFFFF; static bool IsLatin1(int32_t code_point) { return (code_point >= 0) && (code_point <= 0xFF); @@ -30,7 +29,7 @@ class Utf : AllStatic { } // Returns true if the code point value is above Plane 17. 
- static bool IsOutOfRange(int32_t code_point) { + static bool IsOutOfRange(intptr_t code_point) { return (code_point < 0) || (code_point > kMaxCodePoint); } }; @@ -57,11 +56,11 @@ class Utf8 : AllStatic { static intptr_t Length(const String& str); static intptr_t Encode(int32_t ch, char* dst); - static intptr_t Encode(const String& src, char* dst, intptr_t len); static intptr_t Decode(const uint8_t* utf8_array, intptr_t array_len, int32_t* ch); + static intptr_t Encode(const String& src, char* dst, intptr_t len); static bool DecodeToLatin1(const uint8_t* utf8_array, intptr_t array_len, @@ -153,10 +152,6 @@ class Utf16 : AllStatic { static void Encode(int32_t codepoint, uint16_t* dst); static const int32_t kMaxCodeUnit = 0xFFFF; - static const int32_t kLeadSurrogateStart = 0xD800; - static const int32_t kLeadSurrogateEnd = 0xDBFF; - static const int32_t kTrailSurrogateStart = 0xDC00; - static const int32_t kTrailSurrogateEnd = 0xDFFF; private: static const int32_t kLeadSurrogateOffset = (0xD800 - (0x10000 >> 10)); @@ -192,11 +187,11 @@ class CaseMapping : AllStatic { // The size of the stage 1 index. // TODO(cshapiro): improve indexing so this value is unnecessary. - static const intptr_t kStage1Size = 261; + static const int kStage1Size = 261; // The size of a stage 2 block in bytes. - static const intptr_t kBlockSizeLog2 = 8; - static const intptr_t kBlockSize = 1 << kBlockSizeLog2; + static const int kBlockSizeLog2 = 8; + static const int kBlockSize = 1 << kBlockSizeLog2; static int32_t Convert(int32_t ch, int32_t mapping) { if (Utf::IsLatin1(ch)) { @@ -227,25 +222,6 @@ class CaseMapping : AllStatic { static const int32_t stage2_exception_[][2]; }; -class Latin1 { - public: - static const int32_t kMaxChar = 0xff; - // Convert the character to Latin-1 case equivalent if possible. - static inline uint16_t TryConvertToLatin1(uint16_t c) { - switch (c) { - // This are equivalent characters in unicode. 
- case 0x39c: - case 0x3bc: - return 0xb5; - // This is an uppercase of a Latin-1 character - // outside of Latin-1. - case 0x178: - return 0xff; - } - return c; - } -}; - } // namespace dart #endif // RUNTIME_PLATFORM_UNICODE_H_ diff --git a/runtime/platform/utils.h b/runtime/platform/utils.h index 993ff26be8d..9f72205dd6e 100644 --- a/runtime/platform/utils.h +++ b/runtime/platform/utils.h @@ -203,14 +203,7 @@ class Utils { return (static_cast(high) << 32) | (low & 0x0ffffffffLL); } - static inline constexpr bool IsAlphaNumeric(uint32_t c) { - return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || - IsDecimalDigit(c); - } - - static inline constexpr bool IsDecimalDigit(uint32_t c) { - return ('0' <= c) && (c <= '9'); - } + static bool IsDecimalDigit(char c) { return ('0' <= c) && (c <= '9'); } static bool IsHexDigit(char c) { return IsDecimalDigit(c) || (('A' <= c) && (c <= 'F')) || diff --git a/runtime/vm/BUILD.gn b/runtime/vm/BUILD.gn index f2737c98f68..680ab8411cd 100644 --- a/runtime/vm/BUILD.gn +++ b/runtime/vm/BUILD.gn @@ -60,9 +60,8 @@ config("libdart_vm_config") { library_for_all_configs("libdart_vm") { target_type = "source_set" - extra_deps = [ "//third_party/icu" ] if (is_fuchsia) { - extra_deps += [ + extra_deps = [ # TODO(US-399): Remove time_service specific code when it is no longer # necessary. 
"//sdk/lib/sys/cpp", diff --git a/runtime/vm/bootstrap_natives.h b/runtime/vm/bootstrap_natives.h index 34cfbc43044..da1dc36e8c4 100644 --- a/runtime/vm/bootstrap_natives.h +++ b/runtime/vm/bootstrap_natives.h @@ -98,12 +98,10 @@ namespace dart { V(Double_toStringAsExponential, 2) \ V(Double_toStringAsPrecision, 2) \ V(Double_flipSignBit, 1) \ - V(RegExp_factory, 6) \ + V(RegExp_factory, 4) \ V(RegExp_getPattern, 1) \ V(RegExp_getIsMultiLine, 1) \ V(RegExp_getIsCaseSensitive, 1) \ - V(RegExp_getIsUnicode, 1) \ - V(RegExp_getIsDotAll, 1) \ V(RegExp_getGroupCount, 1) \ V(RegExp_getGroupNameMap, 1) \ V(RegExp_ExecuteMatch, 3) \ diff --git a/runtime/vm/clustered_snapshot.cc b/runtime/vm/clustered_snapshot.cc index 033ffbc63e3..4f6602719f0 100644 --- a/runtime/vm/clustered_snapshot.cc +++ b/runtime/vm/clustered_snapshot.cc @@ -3630,8 +3630,7 @@ class RegExpSerializationCluster : public SerializationCluster { RawRegExp* regexp = objects_[i]; AutoTraceObject(regexp); WriteFromTo(regexp); - s->Write(regexp->ptr()->num_one_byte_registers_); - s->Write(regexp->ptr()->num_two_byte_registers_); + s->Write(regexp->ptr()->num_registers_); s->Write(regexp->ptr()->type_flags_); } } @@ -3662,8 +3661,7 @@ class RegExpDeserializationCluster : public DeserializationCluster { Deserializer::InitializeHeader(regexp, kRegExpCid, RegExp::InstanceSize()); ReadFromTo(regexp); - regexp->ptr()->num_one_byte_registers_ = d->Read(); - regexp->ptr()->num_two_byte_registers_ = d->Read(); + regexp->ptr()->num_registers_ = d->Read(); regexp->ptr()->type_flags_ = d->Read(); } } diff --git a/runtime/vm/compiler/backend/constant_propagator.cc b/runtime/vm/compiler/backend/constant_propagator.cc index acf4f9c484d..d27b6e21a87 100644 --- a/runtime/vm/compiler/backend/constant_propagator.cc +++ b/runtime/vm/compiler/backend/constant_propagator.cc @@ -1272,8 +1272,8 @@ void ConstantPropagator::VisitMathMinMax(MathMinMaxInstr* instr) { } } -void ConstantPropagator::VisitCaseInsensitiveCompare( - 
CaseInsensitiveCompareInstr* instr) { +void ConstantPropagator::VisitCaseInsensitiveCompareUC16( + CaseInsensitiveCompareUC16Instr* instr) { SetValue(instr, non_constant_); } diff --git a/runtime/vm/compiler/backend/il.cc b/runtime/vm/compiler/backend/il.cc index dda6e01bdf0..4ee0e27619f 100644 --- a/runtime/vm/compiler/backend/il.cc +++ b/runtime/vm/compiler/backend/il.cc @@ -5161,6 +5161,10 @@ const char* MathUnaryInstr::KindToCString(MathUnaryKind kind) { return ""; } +const RuntimeEntry& CaseInsensitiveCompareUC16Instr::TargetFunction() const { + return kCaseInsensitiveCompareUC16RuntimeEntry; +} + TruncDivModInstr::TruncDivModInstr(Value* lhs, Value* rhs, intptr_t deopt_id) : TemplateDefinition(deopt_id) { SetInputAt(0, lhs); diff --git a/runtime/vm/compiler/backend/il.h b/runtime/vm/compiler/backend/il.h index 4c22692810f..c5997de1682 100644 --- a/runtime/vm/compiler/backend/il.h +++ b/runtime/vm/compiler/backend/il.h @@ -18,7 +18,6 @@ #include "vm/native_entry.h" #include "vm/object.h" #include "vm/parser.h" -#include "vm/runtime_entry.h" #include "vm/static_type_exactness_state.h" #include "vm/token_position.h" @@ -423,7 +422,7 @@ struct InstrAttrs { M(Unbox, kNoGC) \ M(BoxInt64, _) \ M(UnboxInt64, kNoGC) \ - M(CaseInsensitiveCompare, _) \ + M(CaseInsensitiveCompareUC16, _) \ M(BinaryInt64Op, kNoGC) \ M(ShiftInt64Op, kNoGC) \ M(SpeculativeShiftInt64Op, kNoGC) \ @@ -5955,18 +5954,18 @@ class MathUnaryInstr : public TemplateDefinition<1, NoThrow, Pure> { // Calls into the runtime and performs a case-insensitive comparison of the // UTF16 strings (i.e. TwoByteString or ExternalTwoByteString) located at // str[lhs_index:lhs_index + length] and str[rhs_index:rhs_index + length]. -// Depending on the runtime entry passed, we will treat the strings as either -// UCS2 (no surrogate handling) or UTF16 (surrogates handled appropriately). 
-class CaseInsensitiveCompareInstr +// +// TODO(zerny): Remove this once (if) functions inherited from unibrow +// are moved to dart code. +class CaseInsensitiveCompareUC16Instr : public TemplateDefinition<4, NoThrow, Pure> { public: - CaseInsensitiveCompareInstr(Value* str, - Value* lhs_index, - Value* rhs_index, - Value* length, - const RuntimeEntry& entry, - intptr_t cid) - : entry_(entry), cid_(cid) { + CaseInsensitiveCompareUC16Instr(Value* str, + Value* lhs_index, + Value* rhs_index, + Value* length, + intptr_t cid) + : cid_(cid) { ASSERT(cid == kTwoByteStringCid || cid == kExternalTwoByteStringCid); ASSERT(index_scale() == 2); SetInputAt(0, str); @@ -5980,7 +5979,7 @@ class CaseInsensitiveCompareInstr Value* rhs_index() const { return inputs_[2]; } Value* length() const { return inputs_[3]; } - const RuntimeEntry& TargetFunction() const { return entry_; } + const RuntimeEntry& TargetFunction() const; bool IsExternal() const { return cid_ == kExternalTwoByteStringCid; } intptr_t class_id() const { return cid_; } intptr_t index_scale() const { return Instance::ElementSizeFor(cid_); } @@ -5989,18 +5988,17 @@ class CaseInsensitiveCompareInstr virtual Representation representation() const { return kTagged; } - DECLARE_INSTRUCTION(CaseInsensitiveCompare) + DECLARE_INSTRUCTION(CaseInsensitiveCompareUC16) virtual CompileType ComputeType() const; virtual bool AttributesEqual(Instruction* other) const { - return other->AsCaseInsensitiveCompare()->cid_ == cid_; + return other->AsCaseInsensitiveCompareUC16()->cid_ == cid_; } private: - const RuntimeEntry& entry_; const intptr_t cid_; - DISALLOW_COPY_AND_ASSIGN(CaseInsensitiveCompareInstr); + DISALLOW_COPY_AND_ASSIGN(CaseInsensitiveCompareUC16Instr); }; // Represents Math's static min and max functions. 
diff --git a/runtime/vm/compiler/backend/il_arm.cc b/runtime/vm/compiler/backend/il_arm.cc index d2c6d25bf98..c7ce0c06649 100644 --- a/runtime/vm/compiler/backend/il_arm.cc +++ b/runtime/vm/compiler/backend/il_arm.cc @@ -5137,7 +5137,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -5151,7 +5151,8 @@ LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareUC16Instr::EmitNativeCode( + FlowGraphCompiler* compiler) { // Call the function. __ CallRuntime(TargetFunction(), TargetFunction().argument_count()); } diff --git a/runtime/vm/compiler/backend/il_arm64.cc b/runtime/vm/compiler/backend/il_arm64.cc index f1ac89aa06d..949490b4cc0 100644 --- a/runtime/vm/compiler/backend/il_arm64.cc +++ b/runtime/vm/compiler/backend/il_arm64.cc @@ -4361,7 +4361,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4375,7 +4375,8 @@ LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareUC16Instr::EmitNativeCode( + FlowGraphCompiler* compiler) { // Call the function. 
__ CallRuntime(TargetFunction(), TargetFunction().argument_count()); } diff --git a/runtime/vm/compiler/backend/il_dbc.cc b/runtime/vm/compiler/backend/il_dbc.cc index dedc848073f..14d5596ad7c 100644 --- a/runtime/vm/compiler/backend/il_dbc.cc +++ b/runtime/vm/compiler/backend/il_dbc.cc @@ -55,7 +55,7 @@ DECLARE_FLAG(int, optimization_counter_threshold); // - Optimized RegExps, // - Precompilation. #define FOR_EACH_UNREACHABLE_INSTRUCTION(M) \ - M(CaseInsensitiveCompare) \ + M(CaseInsensitiveCompareUC16) \ M(GenericCheckBound) \ M(IndirectGoto) \ M(Int64ToDouble) \ diff --git a/runtime/vm/compiler/backend/il_ia32.cc b/runtime/vm/compiler/backend/il_ia32.cc index 933a52d99b8..6b8c38095fd 100644 --- a/runtime/vm/compiler/backend/il_ia32.cc +++ b/runtime/vm/compiler/backend/il_ia32.cc @@ -4459,7 +4459,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4473,7 +4473,8 @@ LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareUC16Instr::EmitNativeCode( + FlowGraphCompiler* compiler) { // Save ESP. EDI is chosen because it is callee saved so we do not need to // back it up before calling into the runtime. static const Register kSavedSPReg = EDI; @@ -4488,7 +4489,7 @@ void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { // Call the function. __ CallRuntime(TargetFunction(), TargetFunction().argument_count()); - // Restore ESP and pop the old value off the stack. + // Restore ESP. 
__ movl(ESP, kSavedSPReg); } diff --git a/runtime/vm/compiler/backend/il_x64.cc b/runtime/vm/compiler/backend/il_x64.cc index 2e055a03928..2ff2880bf4c 100644 --- a/runtime/vm/compiler/backend/il_x64.cc +++ b/runtime/vm/compiler/backend/il_x64.cc @@ -4511,7 +4511,7 @@ void MathUnaryInstr::EmitNativeCode(FlowGraphCompiler* compiler) { } } -LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( +LocationSummary* CaseInsensitiveCompareUC16Instr::MakeLocationSummary( Zone* zone, bool opt) const { const intptr_t kNumTemps = 0; @@ -4525,7 +4525,8 @@ LocationSummary* CaseInsensitiveCompareInstr::MakeLocationSummary( return summary; } -void CaseInsensitiveCompareInstr::EmitNativeCode(FlowGraphCompiler* compiler) { +void CaseInsensitiveCompareUC16Instr::EmitNativeCode( + FlowGraphCompiler* compiler) { // Save RSP. R13 is chosen because it is callee saved so we do not need to // back it up before calling into the runtime. static const Register kSavedSPReg = R13; diff --git a/runtime/vm/compiler/backend/type_propagator.cc b/runtime/vm/compiler/backend/type_propagator.cc index 0352767a6e2..0dec0c96690 100644 --- a/runtime/vm/compiler/backend/type_propagator.cc +++ b/runtime/vm/compiler/backend/type_propagator.cc @@ -1515,7 +1515,7 @@ CompileType MathMinMaxInstr::ComputeType() const { return CompileType::FromCid(result_cid_); } -CompileType CaseInsensitiveCompareInstr::ComputeType() const { +CompileType CaseInsensitiveCompareUC16Instr::ComputeType() const { return CompileType::FromCid(kBoolCid); } diff --git a/runtime/vm/compiler/jit/compiler.cc b/runtime/vm/compiler/jit/compiler.cc index d3343307d0d..db5a703d752 100644 --- a/runtime/vm/compiler/jit/compiler.cc +++ b/runtime/vm/compiler/jit/compiler.cc @@ -166,10 +166,11 @@ void IrregexpCompilationPipeline::ParseFunction( RegExp& regexp = RegExp::Handle(parsed_function->function().regexp()); const String& pattern = String::Handle(regexp.pattern()); + const bool multiline = regexp.is_multi_line(); RegExpCompileData* 
compile_data = new (zone) RegExpCompileData(); // Parsing failures are handled in the RegExp factory constructor. - RegExpParser::ParseRegExp(pattern, regexp.flags(), compile_data); + RegExpParser::ParseRegExp(pattern, multiline, compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); regexp.set_capture_name_map(compile_data->capture_name_map); diff --git a/runtime/vm/object.cc b/runtime/vm/object.cc index e25bf06d4d1..9804f6935c7 100644 --- a/runtime/vm/object.cc +++ b/runtime/vm/object.cc @@ -21751,45 +21751,23 @@ RawRegExp* RegExp::New(Heap::Space space) { NoSafepointScope no_safepoint; result ^= raw; result.set_type(kUninitialized); - result.set_flags(RegExpFlags()); - result.set_num_registers(/*is_one_byte=*/false, -1); - result.set_num_registers(/*is_one_byte=*/true, -1); + result.set_flags(0); + result.set_num_registers(-1); } return result.raw(); } -const char* RegExpFlags::ToCString() const { - switch (value_ & ~kGlobal) { - case kIgnoreCase | kMultiLine | kDotAll | kUnicode: - return "imsu"; - case kIgnoreCase | kMultiLine | kDotAll: - return "ims"; - case kIgnoreCase | kMultiLine | kUnicode: - return "imu"; - case kIgnoreCase | kUnicode | kDotAll: - return "ius"; - case kMultiLine | kDotAll | kUnicode: - return "msu"; +const char* RegExp::Flags() const { + switch (flags()) { + case kGlobal | kIgnoreCase | kMultiLine: case kIgnoreCase | kMultiLine: return "im"; - case kIgnoreCase | kDotAll: - return "is"; - case kIgnoreCase | kUnicode: - return "iu"; - case kMultiLine | kDotAll: - return "ms"; - case kMultiLine | kUnicode: - return "mu"; - case kDotAll | kUnicode: - return "su"; + case kGlobal | kIgnoreCase: case kIgnoreCase: return "i"; + case kGlobal | kMultiLine: case kMultiLine: return "m"; - case kDotAll: - return "s"; - case kUnicode: - return "u"; default: break; } @@ -21811,7 +21789,9 @@ bool RegExp::CanonicalizeEquals(const Instance& other) const { return false; } // Match the flags. 
- if (flags() != other_js.flags()) { + if ((is_global() != other_js.is_global()) || + (is_ignore_case() != other_js.is_ignore_case()) || + (is_multi_line() != other_js.is_multi_line())) { return false; } return true; @@ -21820,7 +21800,7 @@ bool RegExp::CanonicalizeEquals(const Instance& other) const { const char* RegExp::ToCString() const { const String& str = String::Handle(pattern()); return OS::SCreate(Thread::Current()->zone(), "RegExp: pattern=%s flags=%s", - str.ToCString(), flags().ToCString()); + str.ToCString(), Flags()); } RawWeakProperty* WeakProperty::New(Heap::Space space) { diff --git a/runtime/vm/object.h b/runtime/vm/object.h index f0040cf35b3..5b0bd63b4e8 100644 --- a/runtime/vm/object.h +++ b/runtime/vm/object.h @@ -9105,55 +9105,6 @@ class StackTrace : public Instance { friend class Debugger; }; -class RegExpFlags { - public: - // Flags are passed to a regex object as follows: - // 'i': ignore case, 'g': do global matches, 'm': pattern is multi line, - // 'u': pattern is full Unicode, not just BMP, 's': '.' in pattern matches - // all characters including line terminators. - enum Flags { - kNone = 0, - kGlobal = 1, - kIgnoreCase = 2, - kMultiLine = 4, - kUnicode = 8, - kDotAll = 16, - }; - - static const int kDefaultFlags = 0; - - RegExpFlags() : value_(kDefaultFlags) {} - explicit RegExpFlags(int value) : value_(value) {} - - inline bool IsGlobal() const { return (value_ & kGlobal) != 0; } - inline bool IgnoreCase() const { return (value_ & kIgnoreCase) != 0; } - inline bool IsMultiLine() const { return (value_ & kMultiLine) != 0; } - inline bool IsUnicode() const { return (value_ & kUnicode) != 0; } - inline bool IsDotAll() const { return (value_ & kDotAll) != 0; } - - inline bool NeedsUnicodeCaseEquivalents() { - // Both unicode and ignore_case flags are set. We need to use ICU to find - // the closure over case equivalents. 
- return IsUnicode() && IgnoreCase(); - } - - void SetGlobal() { value_ |= kGlobal; } - void SetIgnoreCase() { value_ |= kIgnoreCase; } - void SetMultiLine() { value_ |= kMultiLine; } - void SetUnicode() { value_ |= kUnicode; } - void SetDotAll() { value_ |= kDotAll; } - - const char* ToCString() const; - - int value() const { return value_; } - - bool operator==(const RegExpFlags& other) { return value_ == other.value_; } - bool operator!=(const RegExpFlags& other) { return value_ != other.value_; } - - private: - int value_; -}; - // Internal JavaScript regular expression object. class RegExp : public Instance { public: @@ -9167,11 +9118,20 @@ class RegExp : public Instance { kComplex = 2, }; + // Flags are passed to a regex object as follows: + // 'i': ignore case, 'g': do global matches, 'm': pattern is multi line. + enum Flags { + kNone = 0, + kGlobal = 1, + kIgnoreCase = 2, + kMultiLine = 4, + }; + enum { kTypePos = 0, kTypeSize = 2, kFlagsPos = 2, - kFlagsSize = 5, + kFlagsSize = 4, }; class TypeBits : public BitField {}; @@ -9181,10 +9141,11 @@ class RegExp : public Instance { bool is_simple() const { return (type() == kSimple); } bool is_complex() const { return (type() == kComplex); } - intptr_t num_registers(bool is_one_byte) const { - return is_one_byte ? 
raw_ptr()->num_one_byte_registers_ - : raw_ptr()->num_two_byte_registers_; - } + bool is_global() const { return (flags() & kGlobal); } + bool is_ignore_case() const { return (flags() & kIgnoreCase); } + bool is_multi_line() const { return (flags() & kMultiLine); } + + intptr_t num_registers() const { return raw_ptr()->num_registers_; } RawString* pattern() const { return raw_ptr()->pattern_; } RawSmi* num_bracket_expressions() const { @@ -9248,48 +9209,15 @@ class RegExp : public Instance { void set_num_bracket_expressions(intptr_t value) const; void set_capture_name_map(const Array& array) const; - void set_is_global() const { - RegExpFlags f = flags(); - f.SetGlobal(); - set_flags(f); - } - void set_is_ignore_case() const { - RegExpFlags f = flags(); - f.SetIgnoreCase(); - set_flags(f); - } - void set_is_multi_line() const { - RegExpFlags f = flags(); - f.SetMultiLine(); - set_flags(f); - } - void set_is_unicode() const { - RegExpFlags f = flags(); - f.SetUnicode(); - set_flags(f); - } - void set_is_dot_all() const { - RegExpFlags f = flags(); - f.SetDotAll(); - set_flags(f); - } + void set_is_global() const { set_flags(flags() | kGlobal); } + void set_is_ignore_case() const { set_flags(flags() | kIgnoreCase); } + void set_is_multi_line() const { set_flags(flags() | kMultiLine); } void set_is_simple() const { set_type(kSimple); } void set_is_complex() const { set_type(kComplex); } - void set_num_registers(bool is_one_byte, intptr_t value) const { - if (is_one_byte) { - StoreNonPointer(&raw_ptr()->num_one_byte_registers_, value); - } else { - StoreNonPointer(&raw_ptr()->num_two_byte_registers_, value); - } + void set_num_registers(intptr_t value) const { + StoreNonPointer(&raw_ptr()->num_registers_, value); } - RegExpFlags flags() const { - return RegExpFlags(FlagsBits::decode(raw_ptr()->type_flags_)); - } - void set_flags(RegExpFlags flags) const { - StoreNonPointer(&raw_ptr()->type_flags_, - FlagsBits::update(flags.value(), raw_ptr()->type_flags_)); - } const 
char* Flags() const; virtual bool CanonicalizeEquals(const Instance& other) const; @@ -9305,8 +9233,13 @@ class RegExp : public Instance { StoreNonPointer(&raw_ptr()->type_flags_, TypeBits::update(type, raw_ptr()->type_flags_)); } + void set_flags(intptr_t value) const { + StoreNonPointer(&raw_ptr()->type_flags_, + FlagsBits::update(value, raw_ptr()->type_flags_)); + } RegExType type() const { return TypeBits::decode(raw_ptr()->type_flags_); } + intptr_t flags() const { return FlagsBits::decode(raw_ptr()->type_flags_); } FINAL_HEAP_OBJECT_IMPLEMENTATION(RegExp, Instance); friend class Class; diff --git a/runtime/vm/object_service.cc b/runtime/vm/object_service.cc index 9895c421f4c..0397f6e3181 100644 --- a/runtime/vm/object_service.cc +++ b/runtime/vm/object_service.cc @@ -1487,8 +1487,8 @@ void RegExp::PrintJSONImpl(JSONStream* stream, bool ref) const { return; } - jsobj.AddProperty("isCaseSensitive", !flags().IgnoreCase()); - jsobj.AddProperty("isMultiLine", flags().IsMultiLine()); + jsobj.AddProperty("isCaseSensitive", !is_ignore_case()); + jsobj.AddProperty("isMultiLine", is_multi_line()); if (!FLAG_interpret_irregexp) { Function& func = Function::Handle(); diff --git a/runtime/vm/raw_object.h b/runtime/vm/raw_object.h index 75e7a92d4db..05d60ddf6dc 100644 --- a/runtime/vm/raw_object.h +++ b/runtime/vm/raw_object.h @@ -2428,17 +2428,11 @@ class RawRegExp : public RawInstance { VISIT_TO(RawObject*, external_two_byte_sticky_function_) RawObject** to_snapshot(Snapshot::Kind kind) { return to(); } - // The same pattern may use different amount of registers if compiled - // for a one-byte target than a two-byte target. For example, we do not - // need to allocate registers to check whether the current position is within - // a surrogate pair when matching a Unicode pattern against a one-byte string. 
- intptr_t num_one_byte_registers_; - intptr_t num_two_byte_registers_; + intptr_t num_registers_; // A bitfield with two fields: // type: Uninitialized, simple or complex. - // flags: Represents global/local, case insensitive, multiline, unicode, - // dotAll. + // flags: Represents global/local, case insensitive, multiline. int8_t type_flags_; }; diff --git a/runtime/vm/raw_object_snapshot.cc b/runtime/vm/raw_object_snapshot.cc index 72ef288d6f7..20e39835386 100644 --- a/runtime/vm/raw_object_snapshot.cc +++ b/runtime/vm/raw_object_snapshot.cc @@ -2172,9 +2172,7 @@ RawRegExp* RegExp::ReadFrom(SnapshotReader* reader, *reader->StringHandle() ^= reader->ReadObjectImpl(kAsInlinedObject); regex.set_pattern(*reader->StringHandle()); - regex.StoreNonPointer(®ex.raw_ptr()->num_one_byte_registers_, - reader->Read()); - regex.StoreNonPointer(®ex.raw_ptr()->num_two_byte_registers_, + regex.StoreNonPointer(®ex.raw_ptr()->num_registers_, reader->Read()); regex.StoreNonPointer(®ex.raw_ptr()->type_flags_, reader->Read()); @@ -2204,8 +2202,7 @@ void RawRegExp::WriteTo(SnapshotWriter* writer, // Write out all the other fields. 
writer->Write(ptr()->num_bracket_expressions_); writer->WriteObjectImpl(ptr()->pattern_, kAsInlinedObject); - writer->Write(ptr()->num_one_byte_registers_); - writer->Write(ptr()->num_two_byte_registers_); + writer->Write(ptr()->num_registers_); writer->Write(ptr()->type_flags_); } diff --git a/runtime/vm/regexp.cc b/runtime/vm/regexp.cc index 23eff8f9daa..dd559bd9ee1 100644 --- a/runtime/vm/regexp.cc +++ b/runtime/vm/regexp.cc @@ -4,11 +4,7 @@ #include "vm/regexp.h" -#include "platform/splay-tree-inl.h" #include "platform/unicode.h" - -#include "unicode/uniset.h" - #include "vm/dart_entry.h" #include "vm/regexp_assembler.h" #include "vm/regexp_assembler_bytecode.h" @@ -29,14 +25,14 @@ static const bool kRegexpOptimization = true; static const intptr_t kMaxLookaheadForBoyerMoore = 8; ContainedInLattice AddRange(ContainedInLattice containment, - const int32_t* ranges, + const intptr_t* ranges, intptr_t ranges_length, Interval new_range) { ASSERT((ranges_length & 1) == 1); - ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1); + ASSERT(ranges[ranges_length - 1] == Utf16::kMaxCodeUnit + 1); if (containment == kLatticeUnknown) return containment; bool inside = false; - int32_t last = 0; + intptr_t last = 0; for (intptr_t i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) { // Consider the range from last to ranges[i]. @@ -286,26 +282,10 @@ class FrequencyCollator : public ValueObject { class RegExpCompiler : public ValueObject { public: - RegExpCompiler(intptr_t capture_count, bool is_one_byte); + RegExpCompiler(intptr_t capture_count, bool ignore_case, bool is_one_byte); intptr_t AllocateRegister() { return next_register_++; } - // Lookarounds to match lone surrogates for unicode character class matches - // are never nested. We can therefore reuse registers. 
- intptr_t UnicodeLookaroundStackRegister() { - if (unicode_lookaround_stack_register_ == kNoRegister) { - unicode_lookaround_stack_register_ = AllocateRegister(); - } - return unicode_lookaround_stack_register_; - } - - intptr_t UnicodeLookaroundPositionRegister() { - if (unicode_lookaround_position_register_ == kNoRegister) { - unicode_lookaround_position_register_ = AllocateRegister(); - } - return unicode_lookaround_position_register_; - } - #if !defined(DART_PRECOMPILED_RUNTIME) RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler* assembler, RegExpNode* start, @@ -335,6 +315,7 @@ class RegExpCompiler : public ValueObject { void SetRegExpTooBig() { reg_exp_too_big_ = true; } + inline bool ignore_case() { return ignore_case_; } inline bool one_byte() const { return is_one_byte_; } bool read_backward() { return read_backward_; } void set_read_backward(bool value) { read_backward_ = value; } @@ -352,11 +333,10 @@ class RegExpCompiler : public ValueObject { private: EndNode* accept_; intptr_t next_register_; - intptr_t unicode_lookaround_stack_register_; - intptr_t unicode_lookaround_position_register_; ZoneGrowableArray* work_list_; intptr_t recursion_depth_; RegExpMacroAssembler* macro_assembler_; + bool ignore_case_; bool is_one_byte_; bool reg_exp_too_big_; bool read_backward_; @@ -382,12 +362,13 @@ static RegExpEngine::CompilationResult IrregexpRegExpTooBig() { // Attempts to compile the regexp using an Irregexp code generator. Returns // a fixed array or a null handle depending on whether it succeeded. 
-RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte) +RegExpCompiler::RegExpCompiler(intptr_t capture_count, + bool ignore_case, + bool is_one_byte) : next_register_(2 * (capture_count + 1)), - unicode_lookaround_stack_register_(kNoRegister), - unicode_lookaround_position_register_(kNoRegister), work_list_(NULL), recursion_depth_(0), + ignore_case_(ignore_case), is_one_byte_(is_one_byte), reg_exp_too_big_(false), read_backward_(false), @@ -1018,7 +999,7 @@ static inline bool EmitAtomLetter(Zone* zone, } static void EmitBoundaryTest(RegExpMacroAssembler* masm, - uint16_t border, + intptr_t border, BlockLabel* fall_through, BlockLabel* above_or_equal, BlockLabel* below) { @@ -1031,8 +1012,8 @@ static void EmitBoundaryTest(RegExpMacroAssembler* masm, } static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, - uint16_t first, - uint16_t last, + intptr_t first, + intptr_t last, BlockLabel* fall_through, BlockLabel* in_range, BlockLabel* out_of_range) { @@ -1055,10 +1036,10 @@ static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm, // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even. // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd. static void EmitUseLookupTable(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, - uint16_t min_char, + intptr_t min_char, BlockLabel* fall_through, BlockLabel* even_label, BlockLabel* odd_label) { @@ -1112,7 +1093,7 @@ static void EmitUseLookupTable(RegExpMacroAssembler* masm, } static void CutOutRange(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, intptr_t cut_index, @@ -1138,17 +1119,17 @@ static void CutOutRange(RegExpMacroAssembler* masm, // Unicode case. Split the search space into kSize spaces that are handled // with recursion. 
-static void SplitSearchSpace(ZoneGrowableArray* ranges, +static void SplitSearchSpace(ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, intptr_t* new_start_index, intptr_t* new_end_index, - uint16_t* border) { + intptr_t* border) { static const intptr_t kSize = RegExpMacroAssembler::kTableSize; static const intptr_t kMask = RegExpMacroAssembler::kTableMask; - uint16_t first = ranges->At(start_index); - uint16_t last = ranges->At(end_index) - 1; + intptr_t first = ranges->At(start_index); + intptr_t last = ranges->At(end_index) - 1; *new_start_index = start_index; *border = (ranges->At(start_index) & ~kMask) + kSize; @@ -1208,7 +1189,7 @@ static void SplitSearchSpace(ZoneGrowableArray* ranges, // Either label can be NULL indicating backtracking. Either label can also be // equal to the fall_through label. static void GenerateBranches(RegExpMacroAssembler* masm, - ZoneGrowableArray* ranges, + ZoneGrowableArray* ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, @@ -1216,8 +1197,8 @@ static void GenerateBranches(RegExpMacroAssembler* masm, BlockLabel* fall_through, BlockLabel* even_label, BlockLabel* odd_label) { - uint16_t first = ranges->At(start_index); - uint16_t last = ranges->At(end_index) - 1; + intptr_t first = ranges->At(start_index); + intptr_t last = ranges->At(end_index) - 1; ASSERT(min_char < first); @@ -1277,7 +1258,7 @@ static void GenerateBranches(RegExpMacroAssembler* masm, intptr_t new_start_index = 0; intptr_t new_end_index = 0; - uint16_t border = 0; + intptr_t border = 0; SplitSearchSpace(ranges, start_index, end_index, &new_start_index, &new_end_index, &border); @@ -1334,7 +1315,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, CharacterRange::Canonicalize(ranges); } - uint16_t max_char; + intptr_t max_char; if (one_byte) { max_char = Symbols::kMaxOneCharCodeSymbol; } else { @@ -1345,7 +1326,7 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, intptr_t last_valid_range = 
range_count - 1; while (last_valid_range >= 0) { - const CharacterRange& range = ranges->At(last_valid_range); + CharacterRange& range = (*ranges)[last_valid_range]; if (range.from() <= max_char) { break; } @@ -1373,6 +1354,14 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, } return; } + if (last_valid_range == 0 && !cc->is_negated() && + ranges->At(0).IsEverything(max_char)) { + // This is a common case hit by non-anchored expressions. + if (check_offset) { + macro_assembler->CheckPosition(cp_offset, on_failure); + } + return; + } if (!preloaded) { macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset); @@ -1389,24 +1378,25 @@ static void EmitCharClass(RegExpMacroAssembler* macro_assembler, // entry at zero which goes to the failure label, but if there // was already one there we fall through for success on that entry. // Subsequent entries have alternating meaning (success/failure). - ZoneGrowableArray* range_boundaries = - new (zone) ZoneGrowableArray(last_valid_range); + ZoneGrowableArray* range_boundaries = + new (zone) ZoneGrowableArray(last_valid_range); bool zeroth_entry_is_failure = !cc->is_negated(); for (intptr_t i = 0; i <= last_valid_range; i++) { - const CharacterRange& range = ranges->At(i); + CharacterRange& range = (*ranges)[i]; if (range.from() == 0) { ASSERT(i == 0); zeroth_entry_is_failure = !zeroth_entry_is_failure; } else { range_boundaries->Add(range.from()); } - if (range.to() + 1 <= max_char) { - range_boundaries->Add(range.to() + 1); - } + range_boundaries->Add(range.to() + 1); } intptr_t end_index = range_boundaries->length() - 1; + if (range_boundaries->At(end_index) > max_char) { + end_index--; + } BlockLabel fall_through; GenerateBranches(macro_assembler, range_boundaries, @@ -1704,7 +1694,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, if (read_backward()) return; ASSERT(characters_filled_in < details->characters()); intptr_t characters = details->characters(); - int32_t 
char_mask; + intptr_t char_mask; if (compiler->one_byte()) { char_mask = Symbols::kMaxOneCharCodeSymbol; } else { @@ -1729,7 +1719,7 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, pos->determines_perfectly = false; return; } - if (elm.atom()->ignore_case()) { + if (compiler->ignore_case()) { int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; intptr_t length = GetCaseIndependentLetters(c, compiler->one_byte(), chars); @@ -1779,7 +1769,6 @@ void TextNode::GetQuickCheckDetails(QuickCheckDetails* details, details->positions(characters_filled_in); RegExpCharacterClass* tree = elm.char_class(); ZoneGrowableArray* ranges = tree->ranges(); - ASSERT(!ranges->is_empty()); if (tree->is_negated()) { // A quick check uses multi-character mask and compare. There is no // useful way to incorporate a negative char class into this scheme @@ -1918,16 +1907,16 @@ class VisitMarker : public ValueObject { NodeInfo* info_; }; -RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth) { +RegExpNode* SeqRegExpNode::FilterOneByte(intptr_t depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; ASSERT(!info()->visited); VisitMarker marker(info()); - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, ignore_case); } -RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth) { - RegExpNode* next = on_success_->FilterOneByte(depth - 1); +RegExpNode* SeqRegExpNode::FilterSuccessor(intptr_t depth, bool ignore_case) { + RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case); if (next == NULL) return set_replacement(NULL); on_success_ = next; return set_replacement(this); @@ -1964,7 +1953,7 @@ static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) { return 0; } -RegExpNode* TextNode::FilterOneByte(intptr_t depth) { +RegExpNode* TextNode::FilterOneByte(intptr_t depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; 
ASSERT(!info()->visited); @@ -1977,7 +1966,7 @@ RegExpNode* TextNode::FilterOneByte(intptr_t depth) { for (intptr_t j = 0; j < quarks->length(); j++) { uint16_t c = quarks->At(j); if (c <= Symbols::kMaxOneCharCodeSymbol) continue; - if (!elm.atom()->ignore_case()) return set_replacement(NULL); + if (!ignore_case) return set_replacement(NULL); // Here, we need to check for characters whose upper and lower cases // are outside the Latin-1 range. uint16_t converted = ConvertNonLatin1ToLatin1(c); @@ -1999,44 +1988,40 @@ RegExpNode* TextNode::FilterOneByte(intptr_t depth) { if (range_count != 0 && ranges->At(0).from() == 0 && ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) { // This will be handled in a later filter. - if (cc->flags().IgnoreCase() && - RangesContainLatin1Equivalents(ranges)) { - continue; - } + if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; return set_replacement(NULL); } } else { if (range_count == 0 || ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) { // This will be handled in a later filter. - if (cc->flags().IgnoreCase() && - RangesContainLatin1Equivalents(ranges)) - continue; + if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue; return set_replacement(NULL); } } } } - return FilterSuccessor(depth - 1); + return FilterSuccessor(depth - 1, ignore_case); } -RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth) { +RegExpNode* LoopChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; { VisitMarker marker(info()); - RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1); + RegExpNode* continue_replacement = + continue_node_->FilterOneByte(depth - 1, ignore_case); // If we can't continue after the loop then there is no sense in doing the // loop. 
if (continue_replacement == NULL) return set_replacement(NULL); } - return ChoiceNode::FilterOneByte(depth - 1); + return ChoiceNode::FilterOneByte(depth - 1, ignore_case); } -RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) { +RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth, bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2055,7 +2040,8 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) { RegExpNode* survivor = NULL; for (intptr_t i = 0; i < choice_count; i++) { GuardedAlternative alternative = alternatives_->At(i); - RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1); + RegExpNode* replacement = + alternative.node()->FilterOneByte(depth - 1, ignore_case); ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK. if (replacement != NULL) { (*alternatives_)[i].set_node(replacement); @@ -2075,7 +2061,7 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) { new (Z) ZoneGrowableArray(surviving); for (intptr_t i = 0; i < choice_count; i++) { RegExpNode* replacement = - (*alternatives_)[i].node()->FilterOneByte(depth - 1); + (*alternatives_)[i].node()->FilterOneByte(depth - 1, ignore_case); if (replacement != NULL) { (*alternatives_)[i].set_node(replacement); new_alternatives->Add((*alternatives_)[i]); @@ -2085,7 +2071,8 @@ RegExpNode* ChoiceNode::FilterOneByte(intptr_t depth) { return this; } -RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) { +RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth, + bool ignore_case) { if (info()->replacement_calculated) return replacement(); if (depth < 0) return this; if (info()->visited) return this; @@ -2093,12 +2080,12 @@ RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(intptr_t depth) { // Alternative 0 is the negative lookahead, alternative 1 is what comes // afterwards. 
RegExpNode* node = (*alternatives_)[1].node(); - RegExpNode* replacement = node->FilterOneByte(depth - 1); + RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case); if (replacement == NULL) return set_replacement(NULL); (*alternatives_)[1].set_node(replacement); RegExpNode* neg_node = (*alternatives_)[0].node(); - RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1); + RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case); // If the negative lookahead is always going to fail then // we don't need to check it. if (neg_replacement == NULL) return set_replacement(replacement); @@ -2396,21 +2383,13 @@ void TextNode::TextEmitPass(RegExpCompiler* compiler, if (elm.text_type() == TextElement::ATOM) { ZoneGrowableArray* quarks = elm.atom()->data(); for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) { - if (SkipPass(pass, elm.atom()->ignore_case())) continue; if (first_element_checked && i == 0 && j == 0) continue; if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue; EmitCharacterFunction* emit_function = NULL; - uint16_t quark = quarks->At(j); - if (elm.atom()->ignore_case()) { - // Everywhere else we assume that a non-Latin-1 character cannot match - // a Latin-1 character. Avoid the cases where this is assumption is - // invalid by using the Latin1 equivalent instead. 
- quark = Latin1::TryConvertToLatin1(quark); - } switch (pass) { case NON_LATIN1_MATCH: ASSERT(one_byte); - if (quark > Symbols::kMaxOneCharCodeSymbol) { + if (quarks->At(j) > Symbols::kMaxOneCharCodeSymbol) { assembler->GoTo(backtrack); return; } @@ -2466,34 +2445,6 @@ bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) { } } -TextNode* TextNode::CreateForCharacterRanges( - ZoneGrowableArray* ranges, - bool read_backward, - RegExpNode* on_success, - RegExpFlags flags) { - ASSERT(ranges != nullptr); - ZoneGrowableArray* elms = new ZoneGrowableArray(1); - elms->Add(TextElement::CharClass(new RegExpCharacterClass(ranges, flags))); - return new TextNode(elms, read_backward, on_success); -} - -TextNode* TextNode::CreateForSurrogatePair(CharacterRange lead, - CharacterRange trail, - bool read_backward, - RegExpNode* on_success, - RegExpFlags flags) { - auto lead_ranges = CharacterRange::List(on_success->zone(), lead); - auto trail_ranges = CharacterRange::List(on_success->zone(), trail); - auto elms = new ZoneGrowableArray(2); - - elms->Add( - TextElement::CharClass(new RegExpCharacterClass(lead_ranges, flags))); - elms->Add( - TextElement::CharClass(new RegExpCharacterClass(trail_ranges, flags))); - - return new TextNode(elms, read_backward, on_success); -} - // This generates the code to match a text node. A text node can contain // straight character sequences (possibly to be matched in a case-independent // way) and character classes. For efficiency we do not do this in a single @@ -2523,15 +2474,19 @@ void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) { // check that now. 
if (trace->characters_preloaded() == 1) { for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), true, trace, - false, &bound_checked_to); + if (!SkipPass(pass, compiler->ignore_case())) { + TextEmitPass(compiler, static_cast(pass), true, trace, + false, &bound_checked_to); + } } first_elt_done = true; } for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) { - TextEmitPass(compiler, static_cast(pass), false, trace, - first_elt_done, &bound_checked_to); + if (!SkipPass(pass, compiler->ignore_case())) { + TextEmitPass(compiler, static_cast(pass), false, trace, + first_elt_done, &bound_checked_to); + } } Trace successor_trace(*trace); @@ -2573,13 +2528,13 @@ void TextNode::MakeCaseIndependent(bool is_one_byte) { TextElement elm = elms_->At(i); if (elm.text_type() == TextElement::CHAR_CLASS) { RegExpCharacterClass* cc = elm.char_class(); - bool case_equivalents_already_added = - cc->flags().NeedsUnicodeCaseEquivalents(); - if (cc->flags().IgnoreCase() && !case_equivalents_already_added) { - // None of the standard character classes is different in the case - // independent case and it slows us down if we don't know that. - if (cc->is_standard()) continue; - CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z); + // None of the standard character classes is different in the case + // independent case and it slows us down if we don't know that. + if (cc->is_standard()) continue; + ZoneGrowableArray* ranges = cc->ranges(); + intptr_t range_count = ranges->length(); + for (intptr_t j = 0; j < range_count; j++) { + (*ranges)[j].AddCaseEquivalents(ranges, is_one_byte, Z); } } } @@ -2619,7 +2574,7 @@ RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode( // length nodes or other complications in the way then return a sentinel // value indicating that a greedy loop cannot be constructed. 
intptr_t ChoiceNode::GreedyLoopTextLengthForAlternative( - const GuardedAlternative* alternative) { + GuardedAlternative* alternative) { intptr_t length = 0; RegExpNode* node = alternative->node(); // Later we will generate code for all these text nodes using recursion @@ -2657,7 +2612,7 @@ void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { if (trace->stop_node() == this) { // Back edge of greedy optimized loop node graph. intptr_t text_length = - GreedyLoopTextLengthForAlternative(&alternatives_->At(0)); + GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); ASSERT(text_length != kNodeIsTooComplexForGreedyLoops); // Update the counter-based backtracking info on the stack. This is an // optimization for greedy loops (see below). @@ -2738,27 +2693,25 @@ class AlternativeGenerationList { DISALLOW_ALLOCATION(); }; -static const int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1; - // The '2' variant is inclusive from and exclusive to. // This covers \s as defined in ECMA-262 5.1, 15.10.2.12, // which include WhiteSpace (7.2) or LineTerminator (7.3) values. // 0x180E has been removed from Unicode's Zs category and thus // from ECMAScript's WhiteSpace category as of Unicode 6.3. 
-static const int32_t kSpaceRanges[] = { +static const intptr_t kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030, - 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker}; + 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, 0x10000}; static const intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges); -static const int32_t kWordRanges[] = { - '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker}; +static const intptr_t kWordRanges[] = {'0', '9' + 1, 'A', 'Z' + 1, '_', + '_' + 1, 'a', 'z' + 1, 0x10000}; static const intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges); -static const int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker}; +static const intptr_t kDigitRanges[] = {'0', '9' + 1, 0x10000}; static const intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges); -static const int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker}; +static const intptr_t kSurrogateRanges[] = {0xd800, 0xe000, 0x10000}; static const intptr_t kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges); -static const int32_t kLineTerminatorRanges[] = { - 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker}; +static const intptr_t kLineTerminatorRanges[] = {0x000A, 0x000B, 0x000D, 0x000E, + 0x2028, 0x202A, 0x10000}; static const intptr_t kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges); @@ -3104,11 +3057,6 @@ void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler, void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { intptr_t choice_count = alternatives_->length(); - if (choice_count == 1 && alternatives_->At(0).guards() == nullptr) { - alternatives_->At(0).node()->Emit(compiler, trace); - return; - } - AssertGuardsMentionRegisters(trace); LimitResult limit_result = LimitVersions(compiler, trace); @@ -3129,7 +3077,7 @@ void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { GreedyLoopState greedy_loop_state(not_at_start()); intptr_t 
text_length = - GreedyLoopTextLengthForAlternative(&alternatives_->At(0)); + GreedyLoopTextLengthForAlternative(&((*alternatives_)[0])); AlternativeGenerationList alt_gens(choice_count); if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) { @@ -3515,9 +3463,9 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { RecursionCheck rc(compiler); ASSERT(start_reg_ + 1 == end_reg_); - if (flags_.IgnoreCase()) { - assembler->CheckNotBackReferenceIgnoreCase( - start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack()); + if (compiler->ignore_case()) { + assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), + trace->backtrack()); } else { assembler->CheckNotBackReference(start_reg_, read_backward(), trace->backtrack()); @@ -3525,11 +3473,6 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { // We are going to advance backward, so we may end up at the start. if (read_backward()) trace->set_at_start(Trace::UNKNOWN); - // Check that the back reference does not end inside a surrogate pair. - if (flags_.IsUnicode() && !compiler->one_byte()) { - assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack()); - } - on_success()->Emit(compiler, trace); } @@ -3788,10 +3731,10 @@ RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler, } static bool CompareInverseRanges(ZoneGrowableArray* ranges, - const int32_t* special_class, + const intptr_t* special_class, intptr_t length) { - length--; // Remove final kRangeEndMarker. - ASSERT(special_class[length] == kRangeEndMarker); + length--; // Remove final 0x10000. 
+ ASSERT(special_class[length] == 0x10000); ASSERT(ranges->length() != 0); ASSERT(length != 0); ASSERT(special_class[0] != 0); @@ -3811,17 +3754,17 @@ static bool CompareInverseRanges(ZoneGrowableArray* ranges, return false; } } - if (range.to() != Utf::kMaxCodePoint) { + if (range.to() != 0xffff) { return false; } return true; } static bool CompareRanges(ZoneGrowableArray* ranges, - const int32_t* special_class, + const intptr_t* special_class, intptr_t length) { - length--; // Remove final kRangeEndMarker. - ASSERT(special_class[length] == kRangeEndMarker); + length--; // Remove final 0x10000. + ASSERT(special_class[length] == 0x10000); if (ranges->length() * 2 != length) { return false; } @@ -3838,7 +3781,7 @@ static bool CompareRanges(ZoneGrowableArray* ranges, bool RegExpCharacterClass::is_standard() { // TODO(lrn): Remove need for this function, by not throwing away information // along the way. - if (is_negated()) { + if (is_negated_) { return false; } if (set_.is_standard()) { @@ -3873,307 +3816,8 @@ bool RegExpCharacterClass::is_standard() { return false; } -UnicodeRangeSplitter::UnicodeRangeSplitter( - Zone* zone, - ZoneGrowableArray* base) - : zone_(zone), - table_(zone), - bmp_(nullptr), - lead_surrogates_(nullptr), - trail_surrogates_(nullptr), - non_bmp_(nullptr) { - // The unicode range splitter categorizes given character ranges into: - // - Code points from the BMP representable by one code unit. - // - Code points outside the BMP that need to be split into surrogate pairs. - // - Lone lead surrogates. - // - Lone trail surrogates. - // Lone surrogates are valid code points, even though no actual characters. - // They require special matching to make sure we do not split surrogate pairs. - // We use the dispatch table to accomplish this. The base range is split up - // by the table by the overlay ranges, and the Call callback is used to - // filter and collect ranges for each category. 
- for (intptr_t i = 0; i < base->length(); i++) { - table_.AddRange(base->At(i), kBase, zone_); - } - // Add overlay ranges. - table_.AddRange(CharacterRange::Range(0, Utf16::kLeadSurrogateStart - 1), - kBmpCodePoints, zone_); - table_.AddRange(CharacterRange::Range(Utf16::kLeadSurrogateStart, - Utf16::kLeadSurrogateEnd), - kLeadSurrogates, zone_); - table_.AddRange(CharacterRange::Range(Utf16::kTrailSurrogateStart, - Utf16::kTrailSurrogateEnd), - kTrailSurrogates, zone_); - table_.AddRange( - CharacterRange::Range(Utf16::kTrailSurrogateEnd + 1, Utf16::kMaxCodeUnit), - kBmpCodePoints, zone_); - table_.AddRange( - CharacterRange::Range(Utf16::kMaxCodeUnit + 1, Utf::kMaxCodePoint), - kNonBmpCodePoints, zone_); - table_.ForEach(this); -} - -void UnicodeRangeSplitter::Call(uint32_t from, DispatchTable::Entry entry) { - OutSet* outset = entry.out_set(); - if (!outset->Get(kBase)) return; - ZoneGrowableArray** target = nullptr; - if (outset->Get(kBmpCodePoints)) { - target = &bmp_; - } else if (outset->Get(kLeadSurrogates)) { - target = &lead_surrogates_; - } else if (outset->Get(kTrailSurrogates)) { - target = &trail_surrogates_; - } else { - ASSERT(outset->Get(kNonBmpCodePoints)); - target = &non_bmp_; - } - if (*target == nullptr) { - *target = new (zone_) ZoneGrowableArray(2); - } - (*target)->Add(CharacterRange::Range(entry.from(), entry.to())); -} - -void AddBmpCharacters(RegExpCompiler* compiler, - ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - ZoneGrowableArray* bmp = splitter->bmp(); - if (bmp == nullptr) return; - result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges( - bmp, compiler->read_backward(), on_success, RegExpFlags()))); -} - -void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, - ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - ZoneGrowableArray* non_bmp = splitter->non_bmp(); - if (non_bmp == nullptr) return; - ASSERT(!compiler->one_byte()); - 
CharacterRange::Canonicalize(non_bmp); - for (int i = 0; i < non_bmp->length(); i++) { - // Match surrogate pair. - // E.g. [\u10005-\u11005] becomes - // \ud800[\udc05-\udfff]| - // [\ud801-\ud803][\udc00-\udfff]| - // \ud804[\udc00-\udc05] - uint32_t from = non_bmp->At(i).from(); - uint32_t to = non_bmp->At(i).to(); - uint16_t from_points[2]; - Utf16::Encode(from, from_points); - uint16_t to_points[2]; - Utf16::Encode(to, to_points); - if (from_points[0] == to_points[0]) { - // The lead surrogate is the same. - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - CharacterRange::Singleton(from_points[0]), - CharacterRange::Range(from_points[1], to_points[1]), - compiler->read_backward(), on_success, RegExpFlags()))); - } else { - if (from_points[1] != Utf16::kTrailSurrogateStart) { - // Add [from_l][from_t-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - CharacterRange::Singleton(from_points[0]), - CharacterRange::Range(from_points[1], - Utf16::kTrailSurrogateEnd), - compiler->read_backward(), on_success, RegExpFlags()))); - from_points[0]++; - } - if (to_points[1] != Utf16::kTrailSurrogateEnd) { - // Add [to_l][\udc00-to_t] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - CharacterRange::Singleton(to_points[0]), - CharacterRange::Range(Utf16::kTrailSurrogateStart, - to_points[1]), - compiler->read_backward(), on_success, RegExpFlags()))); - to_points[0]--; - } - if (from_points[0] <= to_points[0]) { - // Add [from_l-to_l][\udc00-\udfff] - result->AddAlternative( - GuardedAlternative(TextNode::CreateForSurrogatePair( - CharacterRange::Range(from_points[0], to_points[0]), - CharacterRange::Range(Utf16::kTrailSurrogateStart, - Utf16::kTrailSurrogateEnd), - compiler->read_backward(), on_success, RegExpFlags()))); - } - } - } -} - -RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch( - RegExpCompiler* compiler, - ZoneGrowableArray* lookbehind, - 
ZoneGrowableArray* match, - RegExpNode* on_success, - bool read_backward, - RegExpFlags flags) { - RegExpNode* match_node = TextNode::CreateForCharacterRanges( - match, read_backward, on_success, flags); - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpLookaround::Builder lookaround(false, match_node, stack_register, - position_register); - RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - lookbehind, !read_backward, lookaround.on_match_success(), flags); - return lookaround.ForMatch(negative_match); -} - -RegExpNode* MatchAndNegativeLookaroundInReadDirection( - RegExpCompiler* compiler, - ZoneGrowableArray* match, - ZoneGrowableArray* lookahead, - RegExpNode* on_success, - bool read_backward, - RegExpFlags flags) { - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpLookaround::Builder lookaround(false, on_success, stack_register, - position_register); - RegExpNode* negative_match = TextNode::CreateForCharacterRanges( - lookahead, read_backward, lookaround.on_match_success(), flags); - return TextNode::CreateForCharacterRanges( - match, read_backward, lookaround.ForMatch(negative_match), flags); -} - -void AddLoneLeadSurrogates(RegExpCompiler* compiler, - ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - auto lead_surrogates = splitter->lead_surrogates(); - if (lead_surrogates == nullptr) return; - // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). - auto trail_surrogates = CharacterRange::List( - on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart, - Utf16::kTrailSurrogateEnd)); - - RegExpNode* match; - if (compiler->read_backward()) { - // Reading backward. Assert that reading forward, there is no trail - // surrogate, and then backward match the lead surrogate. 
- match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, trail_surrogates, lead_surrogates, on_success, true, - RegExpFlags()); - } else { - // Reading forward. Forward match the lead surrogate and assert that - // no trail surrogate follows. - match = MatchAndNegativeLookaroundInReadDirection( - compiler, lead_surrogates, trail_surrogates, on_success, false, - RegExpFlags()); - } - result->AddAlternative(GuardedAlternative(match)); -} - -void AddLoneTrailSurrogates(RegExpCompiler* compiler, - ChoiceNode* result, - RegExpNode* on_success, - UnicodeRangeSplitter* splitter) { - auto trail_surrogates = splitter->trail_surrogates(); - if (trail_surrogates == nullptr) return; - // E.g. \udc01 becomes (?zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart, - Utf16::kLeadSurrogateEnd)); - - RegExpNode* match; - if (compiler->read_backward()) { - // Reading backward. Backward match the trail surrogate and assert that no - // lead surrogate precedes it. - match = MatchAndNegativeLookaroundInReadDirection( - compiler, trail_surrogates, lead_surrogates, on_success, true, - RegExpFlags()); - } else { - // Reading forward. Assert that reading backward, there is no lead - // surrogate, and then forward match the trail surrogate. - match = NegativeLookaroundAgainstReadDirectionAndMatch( - compiler, lead_surrogates, trail_surrogates, on_success, false, - RegExpFlags()); - } - result->AddAlternative(GuardedAlternative(match)); -} - -RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, - RegExpNode* on_success) { - // This implements ES2015 21.2.5.2.3, AdvanceStringIndex. - ASSERT(!compiler->read_backward()); - // Advance any character. If the character happens to be a lead surrogate and - // we advanced into the middle of a surrogate pair, it will work out, as - // nothing will match from there. We will have to advance again, consuming - // the associated trail surrogate. 
- auto range = CharacterRange::List( - on_success->zone(), CharacterRange::Range(0, Utf16::kMaxCodeUnit)); - return TextNode::CreateForCharacterRanges(range, false, on_success, - RegExpFlags()); -} - -void AddUnicodeCaseEquivalents(ZoneGrowableArray* ranges) { - ASSERT(CharacterRange::IsCanonical(ranges)); - - // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver. - // See also https://crbug.com/v8/6727. - // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF} - // range, which we use frequently internally. But large ranges can also easily - // be created by the user. We might want to have a more general caching - // mechanism for such ranges. - if (ranges->length() == 1 && ranges->At(0).IsEverything(Utf::kMaxCodePoint)) { - return; - } - - icu::UnicodeSet set; - for (int i = 0; i < ranges->length(); i++) { - set.add(ranges->At(i).from(), ranges->At(i).to()); - } - ranges->Clear(); - set.closeOver(USET_CASE_INSENSITIVE); - // Full case mapping map single characters to multiple characters. - // Those are represented as strings in the set. Remove them so that - // we end up with only simple and common case mappings. - set.removeAllStrings(); - for (int i = 0; i < set.getRangeCount(); i++) { - ranges->Add( - CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i))); - } - // No errors and everything we collected have been ranges. 
- CharacterRange::Canonicalize(ranges); -} - RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { - set_.Canonicalize(); - ZoneGrowableArray* ranges = this->ranges(); - if (flags_.NeedsUnicodeCaseEquivalents()) { - AddUnicodeCaseEquivalents(ranges); - } - if (flags_.IsUnicode() && !compiler->one_byte() && - !contains_split_surrogate()) { - if (is_negated()) { - ZoneGrowableArray* negated = - new ZoneGrowableArray(2); - CharacterRange::Negate(ranges, negated); - ranges = negated; - } - if (ranges->length() == 0) { - RegExpCharacterClass* fail = - new RegExpCharacterClass(ranges, RegExpFlags()); - return new TextNode(fail, compiler->read_backward(), on_success); - } - if (standard_type() == '*') { - return UnanchoredAdvance(compiler, on_success); - } else { - ChoiceNode* result = new (OZ) ChoiceNode(2, OZ); - UnicodeRangeSplitter splitter(OZ, ranges); - AddBmpCharacters(compiler, result, on_success, &splitter); - AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); - AddLoneLeadSurrogates(compiler, result, on_success, &splitter); - AddLoneTrailSurrogates(compiler, result, on_success, &splitter); - return result; - } - } else { - return new TextNode(this, compiler->read_backward(), on_success); - } return new (OZ) TextNode(this, compiler->read_backward(), on_success); } @@ -4370,43 +4014,6 @@ RegExpNode* RegExpQuantifier::ToNode(intptr_t min, } } -namespace { -// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and -// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) -RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, - RegExpNode* on_success, - RegExpAssertion::AssertionType type, - RegExpFlags flags) { - ASSERT(flags.NeedsUnicodeCaseEquivalents()); - ZoneGrowableArray* word_range = - new ZoneGrowableArray(2); - CharacterRange::AddClassEscape('w', word_range, true); - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - ChoiceNode* 
result = new (OZ) ChoiceNode(2, OZ); - // Add two choices. The (non-)boundary could start with a word or - // a non-word-character. - for (int i = 0; i < 2; i++) { - bool lookbehind_for_word = i == 0; - bool lookahead_for_word = - (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; - // Look to the left. - RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, - stack_register, position_register); - RegExpNode* backward = TextNode::CreateForCharacterRanges( - word_range, true, lookbehind.on_match_success(), flags); - // Look to the right. - RegExpLookaround::Builder lookahead(lookahead_for_word, - lookbehind.ForMatch(backward), - stack_register, position_register); - RegExpNode* forward = TextNode::CreateForCharacterRanges( - word_range, false, lookahead.on_match_success(), flags); - result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); - } - return result; -} -} // anonymous namespace - RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { switch (assertion_type()) { @@ -4415,15 +4022,9 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, case START_OF_INPUT: return AssertionNode::AtStart(on_success); case BOUNDARY: - return flags_.NeedsUnicodeCaseEquivalents() - ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY, - flags_) - : AssertionNode::AtBoundary(on_success); + return AssertionNode::AtBoundary(on_success); case NON_BOUNDARY: - return flags_.NeedsUnicodeCaseEquivalents() - ? 
BoundaryAssertionAsLookaround(compiler, on_success, - NON_BOUNDARY, flags_) - : AssertionNode::AtNonBoundary(on_success); + return AssertionNode::AtNonBoundary(on_success); case END_OF_INPUT: return AssertionNode::AtEnd(on_success); case END_OF_LINE: { @@ -4438,8 +4039,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, ZoneGrowableArray* newline_ranges = new ZoneGrowableArray(3); CharacterRange::AddClassEscape('n', newline_ranges); - RegExpCharacterClass* newline_atom = - new RegExpCharacterClass('n', RegExpFlags()); + RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n'); TextNode* newline_matcher = new TextNode(newline_atom, /*read_backwards=*/false, ActionNode::PositiveSubmatchSuccess( @@ -4466,7 +4066,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler, RegExpNode* on_success) { return new (OZ) BackReferenceNode(RegExpCapture::StartRegister(index()), - RegExpCapture::EndRegister(index()), flags_, + RegExpCapture::EndRegister(index()), compiler->read_backward(), on_success); } @@ -4574,24 +4174,24 @@ RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler, return current; } -static void AddClass(const int32_t* elmv, +static void AddClass(const intptr_t* elmv, intptr_t elmc, ZoneGrowableArray* ranges) { elmc--; - ASSERT(elmv[elmc] == kRangeEndMarker); + ASSERT(elmv[elmc] == 0x10000); for (intptr_t i = 0; i < elmc; i += 2) { ASSERT(elmv[i] < elmv[i + 1]); ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1)); } } -static void AddClassNegated(const int32_t* elmv, +static void AddClassNegated(const intptr_t* elmv, intptr_t elmc, ZoneGrowableArray* ranges) { elmc--; - ASSERT(elmv[elmc] == kRangeEndMarker); + ASSERT(elmv[elmc] == 0x10000); ASSERT(elmv[0] != 0x0000); - ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint); + ASSERT(elmv[elmc - 1] != Utf16::kMaxCodeUnit); uint16_t last = 0x0000; for (intptr_t i = 0; i < elmc; i += 2) { ASSERT(last <= elmv[i] - 1); 
@@ -4599,30 +4199,7 @@ static void AddClassNegated(const int32_t* elmv, ranges->Add(CharacterRange(last, elmv[i] - 1)); last = elmv[i + 1]; } - ranges->Add(CharacterRange(last, Utf::kMaxCodePoint)); -} - -void CharacterRange::AddClassEscape(uint16_t type, - ZoneGrowableArray* ranges, - bool add_unicode_case_equivalents) { - if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { - // See #sec-runtime-semantics-wordcharacters-abstract-operation - // In case of unicode and ignore_case, we need to create the closure over - // case equivalent characters before negating. - ZoneGrowableArray* new_ranges = - new ZoneGrowableArray(2); - AddClass(kWordRanges, kWordRangeCount, new_ranges); - AddUnicodeCaseEquivalents(new_ranges); - if (type == 'W') { - ZoneGrowableArray* negated = - new ZoneGrowableArray(2); - CharacterRange::Negate(new_ranges, negated); - new_ranges = negated; - } - ranges->AddArray(*new_ranges); - return; - } - AddClassEscape(type, ranges); + ranges->Add(CharacterRange(last, Utf16::kMaxCodeUnit)); } void CharacterRange::AddClassEscape(uint16_t type, @@ -4669,79 +4246,69 @@ void CharacterRange::AddCaseEquivalents( ZoneGrowableArray* ranges, bool is_one_byte, Zone* zone) { - CharacterRange::Canonicalize(ranges); - int range_count = ranges->length(); - for (intptr_t i = 0; i < range_count; i++) { - CharacterRange range = ranges->At(i); - int32_t bottom = range.from(); - if (bottom > Utf16::kMaxCodeUnit) continue; - int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit); - // Nothing to be done for surrogates - if (bottom >= Utf16::kLeadSurrogateStart && - top <= Utf16::kTrailSurrogateEnd) { - continue; - } - if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { - if (bottom > Symbols::kMaxOneCharCodeSymbol) continue; - if (top > Symbols::kMaxOneCharCodeSymbol) { - top = Symbols::kMaxOneCharCodeSymbol; - } + uint16_t bottom = from(); + uint16_t top = to(); + if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { + if (bottom > 
Symbols::kMaxOneCharCodeSymbol) return; + if (top > Symbols::kMaxOneCharCodeSymbol) { + top = Symbols::kMaxOneCharCodeSymbol; } + } - unibrow::Mapping jsregexp_uncanonicalize; - unibrow::Mapping jsregexp_canonrange; - int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - if (top == bottom) { - // If this is a singleton we just expand the one character. - intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars); + unibrow::Mapping jsregexp_uncanonicalize; + unibrow::Mapping jsregexp_canonrange; + int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + if (top == bottom) { + // If this is a singleton we just expand the one character. + intptr_t length = + jsregexp_uncanonicalize.get(bottom, '\0', chars); // NOLINT + for (intptr_t i = 0; i < length; i++) { + uint32_t chr = chars[i]; + if (chr != bottom) { + ranges->Add(CharacterRange::Singleton(chars[i])); + } + } + } else { + // If this is a range we expand the characters block by block, + // expanding contiguous subranges (blocks) one at a time. + // The approach is as follows. For a given start character we + // look up the remainder of the block that contains it (represented + // by the end point), for instance we find 'z' if the character + // is 'c'. A block is characterized by the property + // that all characters uncanonicalize in the same way, except that + // each entry in the result is incremented by the distance from the first + // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and + // the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. + // Once we've found the end point we look up its uncanonicalization + // and produce a range for each element. For instance for [c-f] + // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only + // add a range if it is not already contained in the input, so [c-f] + // will be skipped but [C-F] will be added. 
If this range is not + // completely contained in a block we do this for all the blocks + // covered by the range (handling characters that is not in a block + // as a "singleton block"). + int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; + intptr_t pos = bottom; + while (pos <= top) { + intptr_t length = jsregexp_canonrange.get(pos, '\0', range); + uint16_t block_end; + if (length == 0) { + block_end = pos; + } else { + ASSERT(length == 1); + block_end = range[0]; + } + intptr_t end = (block_end > top) ? top : block_end; + length = jsregexp_uncanonicalize.get(block_end, '\0', range); // NOLINT for (intptr_t i = 0; i < length; i++) { - int32_t chr = chars[i]; - if (chr != bottom) { - ranges->Add(CharacterRange::Singleton(chars[i])); + uint32_t c = range[i]; + uint16_t range_from = c - (block_end - pos); + uint16_t range_to = c - (block_end - end); + if (!(bottom <= range_from && range_to <= top)) { + ranges->Add(CharacterRange(range_from, range_to)); } } - } else { - // If this is a range we expand the characters block by block, - // expanding contiguous subranges (blocks) one at a time. - // The approach is as follows. For a given start character we - // look up the remainder of the block that contains it (represented - // by the end point), for instance we find 'z' if the character - // is 'c'. A block is characterized by the property - // that all characters uncanonicalize in the same way, except that - // each entry in the result is incremented by the distance from the first - // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] - // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. - // Once we've found the end point we look up its uncanonicalization - // and produce a range for each element. For instance for [c-f] - // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only - // add a range if it is not already contained in the input, so [c-f] - // will be skipped but [C-F] will be added. 
If this range is not - // completely contained in a block we do this for all the blocks - // covered by the range (handling characters that is not in a block - // as a "singleton block"). - int32_t range[unibrow::Ecma262UnCanonicalize::kMaxWidth]; - intptr_t pos = bottom; - while (pos <= top) { - intptr_t length = jsregexp_canonrange.get(pos, '\0', range); - int32_t block_end; - if (length == 0) { - block_end = pos; - } else { - ASSERT(length == 1); - block_end = range[0]; - } - intptr_t end = (block_end > top) ? top : block_end; - length = jsregexp_uncanonicalize.get(block_end, '\0', range); - for (intptr_t i = 0; i < length; i++) { - int32_t c = range[i]; - int32_t range_from = c - (block_end - pos); - int32_t range_to = c - (block_end - end); - if (!(bottom <= range_from && range_to <= top)) { - ranges->Add(CharacterRange(range_from, range_to)); - } - } - pos = end + 1; - } + pos = end + 1; } } } @@ -4794,8 +4361,8 @@ static intptr_t InsertRangeInCanonicalList( // list[0..count] for the result. Returns the number of resulting // canonicalized ranges. Inserting a range may collapse existing ranges into // fewer ranges, so the return value can be anything in the range 1..count+1. 
- int32_t from = insert.from(); - int32_t to = insert.to(); + uint16_t from = insert.from(); + uint16_t to = insert.to(); intptr_t start_pos = 0; intptr_t end_pos = count; for (intptr_t i = count - 1; i >= 0; i--) { @@ -4891,7 +4458,7 @@ void CharacterRange::Negate(ZoneGrowableArray* ranges, ASSERT(CharacterRange::IsCanonical(ranges)); ASSERT(negated_ranges->length() == 0); intptr_t range_count = ranges->length(); - uint32_t from = 0; + uint16_t from = 0; intptr_t i = 0; if (range_count > 0 && ranges->At(0).from() == 0) { from = ranges->At(0).to(); @@ -4903,8 +4470,8 @@ void CharacterRange::Negate(ZoneGrowableArray* ranges, from = range.to(); i++; } - if (from < Utf::kMaxCodePoint) { - negated_ranges->Add(CharacterRange(from + 1, Utf::kMaxCodePoint)); + if (from < Utf16::kMaxCodeUnit) { + negated_ranges->Add(CharacterRange(from + 1, Utf16::kMaxCodeUnit)); } } @@ -4921,22 +4488,6 @@ static bool ArrayContains(ZoneGrowableArray* array, unsigned value) { return false; } -OutSet* OutSet::Extend(unsigned value, Zone* zone) { - if (Get(value)) return this; - if (successors() != nullptr) { - for (int i = 0; i < successors()->length(); i++) { - OutSet* successor = successors()->At(i); - if (successor->Get(value)) return successor; - } - } else { - successors_ = new (zone) ZoneGrowableArray(2); - } - OutSet* result = new (zone) OutSet(first_, remaining_); - result->Set(value, zone); - successors()->Add(result); - return result; -} - void OutSet::Set(unsigned value, Zone* zone) { if (value < kFirstLimit) { first_ |= (1 << value); @@ -4961,108 +4512,6 @@ bool OutSet::Get(unsigned value) const { } } -const int32_t DispatchTable::Config::kNoKey = Utf::kInvalidChar; - -void DispatchTable::AddRange(CharacterRange full_range, - int32_t value, - Zone* zone) { - CharacterRange current = full_range; - if (tree()->is_empty()) { - // If this is the first range we just insert into the table. 
- ZoneSplayTree::Locator loc; - bool inserted = tree()->Insert(current.from(), &loc); - ASSERT(inserted); - USE(inserted); - loc.set_value( - Entry(current.from(), current.to(), empty()->Extend(value, zone))); - return; - } - // First see if there is a range to the left of this one that - // overlaps. - ZoneSplayTree::Locator loc; - if (tree()->FindGreatestLessThan(current.from(), &loc)) { - Entry* entry = &loc.value(); - // If we've found a range that overlaps with this one, and it - // starts strictly to the left of this one, we have to fix it - // because the following code only handles ranges that start on - // or after the start point of the range we're adding. - if (entry->from() < current.from() && entry->to() >= current.from()) { - // Snap the overlapping range in half around the start point of - // the range we're adding. - CharacterRange left = - CharacterRange::Range(entry->from(), current.from() - 1); - CharacterRange right = CharacterRange::Range(current.from(), entry->to()); - // The left part of the overlapping range doesn't overlap. - // Truncate the whole entry to be just the left part. - entry->set_to(left.to()); - // The right part is the one that overlaps. We add this part - // to the map and let the next step deal with merging it with - // the range we're adding. - ZoneSplayTree::Locator loc; - bool inserted = tree()->Insert(right.from(), &loc); - ASSERT(inserted); - USE(inserted); - loc.set_value(Entry(right.from(), right.to(), entry->out_set())); - } - } - while (current.is_valid()) { - if (tree()->FindLeastGreaterThan(current.from(), &loc) && - (loc.value().from() <= current.to()) && - (loc.value().to() >= current.from())) { - Entry* entry = &loc.value(); - // We have overlap. If there is space between the start point of - // the range we're adding and where the overlapping range starts - // then we have to add a range covering just that space. 
- if (current.from() < entry->from()) { - ZoneSplayTree::Locator ins; - bool inserted = tree()->Insert(current.from(), &ins); - ASSERT(inserted); - USE(inserted); - ins.set_value(Entry(current.from(), entry->from() - 1, - empty()->Extend(value, zone))); - current.set_from(entry->from()); - } - ASSERT(current.from() == entry->from()); - // If the overlapping range extends beyond the one we want to add - // we have to snap the right part off and add it separately. - if (entry->to() > current.to()) { - ZoneSplayTree::Locator ins; - bool inserted = tree()->Insert(current.to() + 1, &ins); - ASSERT(inserted); - USE(inserted); - ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set())); - entry->set_to(current.to()); - } - ASSERT(entry->to() <= current.to()); - // The overlapping range is now completely contained by the range - // we're adding so we can just update it and move the start point - // of the range we're adding just past it. - entry->AddValue(value, zone); - ASSERT(entry->to() + 1 > current.from()); - current.set_from(entry->to() + 1); - } else { - // There is no overlap so we can just add the range - ZoneSplayTree::Locator ins; - bool inserted = tree()->Insert(current.from(), &ins); - ASSERT(inserted); - USE(inserted); - ins.set_value( - Entry(current.from(), current.to(), empty()->Extend(value, zone))); - break; - } - } -} - -OutSet* DispatchTable::Get(int32_t value) { - ZoneSplayTree::Locator loc; - if (!tree()->FindGreatestLessThan(value, &loc)) return empty(); - Entry* entry = &loc.value(); - if (value <= entry->to()) - return entry->out_set(); - else - return empty(); -} - // ------------------------------------------------------------------- // Analysis @@ -5091,7 +4540,9 @@ void TextNode::CalculateOffsets() { } void Analysis::VisitText(TextNode* that) { - that->MakeCaseIndependent(is_one_byte_); + if (ignore_case_) { + that->MakeCaseIndependent(is_one_byte_); + } EnsureAnalyzed(that->on_success()); if (!has_failed()) { 
that->CalculateOffsets(); @@ -5198,7 +4649,7 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, return; } uint16_t character = atom->data()->At(j); - if (atom->flags().IgnoreCase()) { + if (bm->compiler()->ignore_case()) { int32_t chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; intptr_t length = GetCaseIndependentLetters( character, bm->max_char() == Symbols::kMaxOneCharCodeSymbol, @@ -5218,7 +4669,7 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, bm->SetAll(offset); } else { for (intptr_t k = 0; k < ranges->length(); k++) { - const CharacterRange& range = ranges->At(k); + CharacterRange& range = (*ranges)[k]; if (range.from() > max_char) continue; intptr_t to = Utils::Minimum(max_char, static_cast(range.to())); @@ -5237,40 +4688,6 @@ void TextNode::FillInBMInfo(intptr_t initial_offset, if (initial_offset == 0) set_bm_info(not_at_start, bm); } -RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler, - RegExpNode* on_success, - RegExpFlags flags) { - // If the regexp matching starts within a surrogate pair, step back - // to the lead surrogate and start matching from there. 
- ASSERT(!compiler->read_backward()); - Zone* zone = compiler->zone(); - - auto lead_surrogates = CharacterRange::List( - on_success->zone(), CharacterRange::Range(Utf16::kLeadSurrogateStart, - Utf16::kLeadSurrogateEnd)); - auto trail_surrogates = CharacterRange::List( - on_success->zone(), CharacterRange::Range(Utf16::kTrailSurrogateStart, - Utf16::kTrailSurrogateEnd)); - - ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone); - - int stack_register = compiler->UnicodeLookaroundStackRegister(); - int position_register = compiler->UnicodeLookaroundPositionRegister(); - RegExpNode* step_back = TextNode::CreateForCharacterRanges( - lead_surrogates, /*read_backward=*/true, on_success, flags); - RegExpLookaround::Builder builder(/*is_positive=*/true, step_back, - stack_register, position_register); - RegExpNode* match_trail = TextNode::CreateForCharacterRanges( - trail_surrogates, /*read_backward=*/false, builder.on_match_success(), - flags); - - optional_step_back->AddAlternative( - GuardedAlternative(builder.ForMatch(match_trail))); - optional_step_back->AddAlternative(GuardedAlternative(on_success)); - - return optional_step_back; -} - #if !defined(DART_PRECOMPILED_RUNTIME) RegExpEngine::CompilationResult RegExpEngine::CompileIR( RegExpCompileData* data, @@ -5291,10 +4708,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( ASSERT(!regexp.IsNull()); ASSERT(!pattern.IsNull()); - const bool is_global = regexp.flags().IsGlobal(); - const bool is_unicode = regexp.flags().IsUnicode(); + const bool ignore_case = regexp.is_ignore_case(); + const bool is_global = regexp.is_global(); - RegExpCompiler compiler(data->capture_count, is_one_byte); + RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte); // TODO(zerny): Frequency sampling is currently disabled because of several // issues. 
We do not want to store subject strings in the regexp object since @@ -5317,37 +4734,34 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning or is sticky. RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, - new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler, - captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), + &compiler, captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start // at the start of input. ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); - first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( - new (zone) RegExpCharacterClass('*', RegExpFlags()), - /*read_backwards=*/false, loop_node))); + first_step_node->AddAlternative(GuardedAlternative( + new (zone) TextNode(new (zone) RegExpCharacterClass('*'), + /*read_backwards=*/false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. 
if (node != NULL) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); } - } else if (is_unicode && (is_global || is_sticky)) { - node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags()); } if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(is_one_byte); + Analysis analysis(ignore_case, is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); @@ -5369,13 +4783,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileIR( } if (is_global) { - RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; - if (data->tree->min_match() > 0) { - mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; - } else if (is_unicode) { - mode = RegExpMacroAssembler::GLOBAL_UNICODE; - } - macro_assembler->set_global_mode(mode); + macro_assembler->set_global_mode( + (data->tree->min_match() > 0) + ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK + : RegExpMacroAssembler::GLOBAL); } RegExpEngine::CompilationResult result = @@ -5401,10 +4812,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( ASSERT(!regexp.IsNull()); ASSERT(!pattern.IsNull()); - const bool is_global = regexp.flags().IsGlobal(); - const bool is_unicode = regexp.flags().IsUnicode(); + const bool ignore_case = regexp.is_ignore_case(); + const bool is_global = regexp.is_global(); - RegExpCompiler compiler(data->capture_count, is_one_byte); + RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte); // TODO(zerny): Frequency sampling is currently disabled because of several // issues. We do not want to store subject strings in the regexp object since @@ -5427,37 +4838,34 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( // Add a .*? at the beginning, outside the body capture, unless // this expression is anchored at the beginning. 
RegExpNode* loop_node = RegExpQuantifier::ToNode( - 0, RegExpTree::kInfinity, false, - new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler, - captured_body, data->contains_anchor); + 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'), + &compiler, captured_body, data->contains_anchor); if (data->contains_anchor) { // Unroll loop once, to take care of the case that might start // at the start of input. ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone); first_step_node->AddAlternative(GuardedAlternative(captured_body)); - first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode( - new (zone) RegExpCharacterClass('*', RegExpFlags()), - /*read_backwards=*/false, loop_node))); + first_step_node->AddAlternative(GuardedAlternative( + new (zone) TextNode(new (zone) RegExpCharacterClass('*'), + /*read_backwards=*/false, loop_node))); node = first_step_node; } else { node = loop_node; } } if (is_one_byte) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); // Do it again to propagate the new nodes to places where they were not // put because they had not been calculated yet. 
if (node != NULL) { - node = node->FilterOneByte(RegExpCompiler::kMaxRecursion); + node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); } - } else if (is_unicode && (is_global || is_sticky)) { - node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags()); } if (node == NULL) node = new (zone) EndNode(EndNode::BACKTRACK, zone); data->node = node; - Analysis analysis(is_one_byte); + Analysis analysis(ignore_case, is_one_byte); analysis.EnsureAnalyzed(node); if (analysis.has_failed()) { const char* error_message = analysis.error_message(); @@ -5479,13 +4887,10 @@ RegExpEngine::CompilationResult RegExpEngine::CompileBytecode( } if (is_global) { - RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL; - if (data->tree->min_match() > 0) { - mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK; - } else if (is_unicode) { - mode = RegExpMacroAssembler::GLOBAL_UNICODE; - } - macro_assembler->set_global_mode(mode); + macro_assembler->set_global_mode( + (data->tree->min_match() > 0) + ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK + : RegExpMacroAssembler::GLOBAL); } RegExpEngine::CompilationResult result = @@ -5547,12 +4952,19 @@ static void CreateSpecializedFunction(Thread* thread, RawRegExp* RegExpEngine::CreateRegExp(Thread* thread, const String& pattern, - RegExpFlags flags) { + bool multi_line, + bool ignore_case) { Zone* zone = thread->zone(); const RegExp& regexp = RegExp::Handle(RegExp::New()); regexp.set_pattern(pattern); - regexp.set_flags(flags); + + if (multi_line) { + regexp.set_is_multi_line(); + } + if (ignore_case) { + regexp.set_is_ignore_case(); + } // TODO(zerny): We might want to use normal string searching algorithms // for simple patterns. 
diff --git a/runtime/vm/regexp.h b/runtime/vm/regexp.h index febcb91bec3..092a06ad4bb 100644 --- a/runtime/vm/regexp.h +++ b/runtime/vm/regexp.h @@ -5,14 +5,11 @@ #ifndef RUNTIME_VM_REGEXP_H_ #define RUNTIME_VM_REGEXP_H_ -#include "platform/unicode.h" - #include "vm/compiler/assembler/assembler.h" #include "vm/compiler/backend/flow_graph_compiler.h" #include "vm/compiler/backend/il.h" #include "vm/object.h" #include "vm/regexp_assembler.h" -#include "vm/splay-tree.h" namespace dart { @@ -28,42 +25,32 @@ class BoyerMooreLookahead; class CharacterRange { public: CharacterRange() : from_(0), to_(0) {} - CharacterRange(int32_t from, int32_t to) : from_(from), to_(to) {} + CharacterRange(uint16_t from, uint16_t to) : from_(from), to_(to) {} static void AddClassEscape(uint16_t type, ZoneGrowableArray* ranges); - // Add class escapes with case equivalent closure for \w and \W if necessary. - static void AddClassEscape(uint16_t type, - ZoneGrowableArray* ranges, - bool add_unicode_case_equivalents); static GrowableArray GetWordBounds(); - static inline CharacterRange Singleton(int32_t value) { + static inline CharacterRange Singleton(uint16_t value) { return CharacterRange(value, value); } - static inline CharacterRange Range(int32_t from, int32_t to) { + static inline CharacterRange Range(uint16_t from, uint16_t to) { ASSERT(from <= to); return CharacterRange(from, to); } static inline CharacterRange Everything() { - return CharacterRange(0, Utf::kMaxCodePoint); + return CharacterRange(0, 0xFFFF); } - static inline ZoneGrowableArray* List(Zone* zone, - CharacterRange range) { - auto list = new (zone) ZoneGrowableArray(1); - list->Add(range); - return list; - } - bool Contains(int32_t i) const { return from_ <= i && i <= to_; } - int32_t from() const { return from_; } - void set_from(int32_t value) { from_ = value; } - int32_t to() const { return to_; } - void set_to(int32_t value) { to_ = value; } + bool Contains(uint16_t i) const { return from_ <= i && i <= to_; } + 
uint16_t from() const { return from_; } + void set_from(uint16_t value) { from_ = value; } + uint16_t to() const { return to_; } + void set_to(uint16_t value) { to_ = value; } bool is_valid() const { return from_ <= to_; } - bool IsEverything(int32_t max) const { return from_ == 0 && to_ >= max; } + bool IsEverything(uint16_t max) const { return from_ == 0 && to_ >= max; } bool IsSingleton() const { return (from_ == to_); } - static void AddCaseEquivalents(ZoneGrowableArray* ranges, - bool is_one_byte, - Zone* zone); + void AddCaseEquivalents(ZoneGrowableArray* ranges, + bool is_one_byte, + Zone* zone); static void Split(ZoneGrowableArray* base, GrowableArray overlay, ZoneGrowableArray** included, @@ -84,8 +71,8 @@ class CharacterRange { static const intptr_t kPayloadMask = (1 << 24) - 1; private: - int32_t from_; - int32_t to_; + uint16_t from_; + uint16_t to_; DISALLOW_ALLOCATION(); }; @@ -118,98 +105,6 @@ class OutSet : public ZoneAllocated { friend class Trace; }; -// A mapping from integers, specified as ranges, to a set of integers. -// Used for mapping character ranges to choices. 
-class DispatchTable : public ValueObject { - public: - explicit DispatchTable(Zone* zone) : tree_(zone) {} - - class Entry { - public: - Entry() : from_(0), to_(0), out_set_(nullptr) {} - Entry(int32_t from, int32_t to, OutSet* out_set) - : from_(from), to_(to), out_set_(out_set) { - ASSERT(from <= to); - } - int32_t from() { return from_; } - int32_t to() { return to_; } - void set_to(int32_t value) { to_ = value; } - void AddValue(int value, Zone* zone) { - out_set_ = out_set_->Extend(value, zone); - } - OutSet* out_set() { return out_set_; } - - private: - int32_t from_; - int32_t to_; - OutSet* out_set_; - }; - - class Config { - public: - typedef int32_t Key; - typedef Entry Value; - static const int32_t kNoKey; - static const Entry NoValue() { return Value(); } - static inline int Compare(int32_t a, int32_t b) { - if (a == b) - return 0; - else if (a < b) - return -1; - else - return 1; - } - }; - - void AddRange(CharacterRange range, int32_t value, Zone* zone); - OutSet* Get(int32_t value); - void Dump(); - - template - void ForEach(Callback* callback) { - return tree()->ForEach(callback); - } - - private: - // There can't be a static empty set since it allocates its - // successors in a zone and caches them. - OutSet* empty() { return &empty_; } - OutSet empty_; - ZoneSplayTree* tree() { return &tree_; } - ZoneSplayTree tree_; -}; - -// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates. 
-class UnicodeRangeSplitter : public ValueObject { - public: - UnicodeRangeSplitter(Zone* zone, ZoneGrowableArray* base); - void Call(uint32_t from, DispatchTable::Entry entry); - - ZoneGrowableArray* bmp() { return bmp_; } - ZoneGrowableArray* lead_surrogates() { - return lead_surrogates_; - } - ZoneGrowableArray* trail_surrogates() { - return trail_surrogates_; - } - ZoneGrowableArray* non_bmp() const { return non_bmp_; } - - private: - static const int kBase = 0; - // Separate ranges into - static const int kBmpCodePoints = 1; - static const int kLeadSurrogates = 2; - static const int kTrailSurrogates = 3; - static const int kNonBmpCodePoints = 4; - - Zone* zone_; - DispatchTable table_; - ZoneGrowableArray* bmp_; - ZoneGrowableArray* lead_surrogates_; - ZoneGrowableArray* trail_surrogates_; - ZoneGrowableArray* non_bmp_; -}; - #define FOR_EACH_NODE_TYPE(VISIT) \ VISIT(End) \ VISIT(Action) \ @@ -449,7 +344,9 @@ class RegExpNode : public ZoneAllocated { // If we know that the input is one-byte then there are some nodes that can // never match. This method returns a node that can be substituted for // itself, or NULL if the node can never match. - virtual RegExpNode* FilterOneByte(intptr_t depth) { return this; } + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case) { + return this; + } // Helper for FilterOneByte. 
RegExpNode* replacement() { ASSERT(info()->replacement_calculated); @@ -546,7 +443,7 @@ class SeqRegExpNode : public RegExpNode { : RegExpNode(on_success->zone()), on_success_(on_success) {} RegExpNode* on_success() { return on_success_; } void set_on_success(RegExpNode* node) { on_success_ = node; } - virtual RegExpNode* FilterOneByte(intptr_t depth); + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead* bm, @@ -556,7 +453,7 @@ class SeqRegExpNode : public RegExpNode { } protected: - RegExpNode* FilterSuccessor(intptr_t depth); + RegExpNode* FilterSuccessor(intptr_t depth, bool ignore_case); private: RegExpNode* on_success_; @@ -664,19 +561,6 @@ class TextNode : public SeqRegExpNode { read_backward_(read_backward) { elms_->Add(TextElement::CharClass(that)); } - // Create TextNode for a single character class for the given ranges. - static TextNode* CreateForCharacterRanges( - ZoneGrowableArray* ranges, - bool read_backward, - RegExpNode* on_success, - RegExpFlags flags); - // Create TextNode for a surrogate pair with a range given for the - // lead and the trail surrogate each. 
- static TextNode* CreateForSurrogatePair(CharacterRange lead, - CharacterRange trail, - bool read_backward, - RegExpNode* on_success, - RegExpFlags flags); virtual void Accept(NodeVisitor* visitor); virtual void Emit(RegExpCompiler* compiler, Trace* trace); virtual intptr_t EatsAtLeast(intptr_t still_to_find, @@ -697,7 +581,7 @@ class TextNode : public SeqRegExpNode { BoyerMooreLookahead* bm, bool not_at_start); void CalculateOffsets(); - virtual RegExpNode* FilterOneByte(intptr_t depth); + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); private: enum TextEmitPassType { @@ -775,13 +659,11 @@ class BackReferenceNode : public SeqRegExpNode { public: BackReferenceNode(intptr_t start_reg, intptr_t end_reg, - RegExpFlags flags, bool read_backward, RegExpNode* on_success) : SeqRegExpNode(on_success), start_reg_(start_reg), end_reg_(end_reg), - flags_(flags), read_backward_(read_backward) {} virtual void Accept(NodeVisitor* visitor); intptr_t start_register() { return start_reg_; } @@ -805,7 +687,6 @@ class BackReferenceNode : public SeqRegExpNode { private: intptr_t start_reg_; intptr_t end_reg_; - RegExpFlags flags_; bool read_backward_; }; @@ -880,9 +761,9 @@ class GuardedAlternative { public: explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) {} void AddGuard(Guard* guard, Zone* zone); - RegExpNode* node() const { return node_; } + RegExpNode* node() { return node_; } void set_node(RegExpNode* node) { node_ = node; } - ZoneGrowableArray* guards() const { return guards_; } + ZoneGrowableArray* guards() { return guards_; } private: RegExpNode* node_; @@ -930,12 +811,11 @@ class ChoiceNode : public RegExpNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return true; } - virtual RegExpNode* FilterOneByte(intptr_t depth); + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); virtual bool read_backward() { return false; } protected: - intptr_t GreedyLoopTextLengthForAlternative( - 
const GuardedAlternative* alternative); + intptr_t GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative); ZoneGrowableArray* alternatives_; private: @@ -1006,7 +886,7 @@ class NegativeLookaroundChoiceNode : public ChoiceNode { virtual bool try_to_emit_quick_check_for_alternative(bool is_first) { return !is_first; } - virtual RegExpNode* FilterOneByte(intptr_t depth); + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); }; class LoopChoiceNode : public ChoiceNode { @@ -1038,7 +918,7 @@ class LoopChoiceNode : public ChoiceNode { bool body_can_be_zero_length() { return body_can_be_zero_length_; } virtual bool read_backward() { return read_backward_; } virtual void Accept(NodeVisitor* visitor); - virtual RegExpNode* FilterOneByte(intptr_t depth); + virtual RegExpNode* FilterOneByte(intptr_t depth, bool ignore_case); private: // AddAlternative is made private for loop nodes because alternatives @@ -1406,8 +1286,10 @@ class NodeVisitor : public ValueObject { // +-------+ +------------+ class Analysis : public NodeVisitor { public: - explicit Analysis(bool is_one_byte) - : is_one_byte_(is_one_byte), error_message_(NULL) {} + Analysis(bool ignore_case, bool is_one_byte) + : ignore_case_(ignore_case), + is_one_byte_(is_one_byte), + error_message_(NULL) {} void EnsureAnalyzed(RegExpNode* node); #define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that); @@ -1423,6 +1305,7 @@ class Analysis : public NodeVisitor { void fail(const char* error_message) { error_message_ = error_message; } private: + bool ignore_case_; bool is_one_byte_; const char* error_message_; @@ -1515,7 +1398,8 @@ class RegExpEngine : public AllStatic { static RawRegExp* CreateRegExp(Thread* thread, const String& pattern, - RegExpFlags flags); + bool multi_line, + bool ignore_case); static void DotPrint(const char* label, RegExpNode* node, bool ignore_case); }; diff --git a/runtime/vm/regexp_assembler.cc b/runtime/vm/regexp_assembler.cc index 
ab242c4bd54..fb7dc746fc8 100644 --- a/runtime/vm/regexp_assembler.cc +++ b/runtime/vm/regexp_assembler.cc @@ -4,10 +4,6 @@ #include "vm/regexp_assembler.h" -#include "unicode/uchar.h" - -#include "platform/unicode.h" - #include "vm/flags.h" #include "vm/regexp.h" #include "vm/unibrow-inl.h" @@ -20,10 +16,11 @@ void PrintUtf16(uint16_t c) { OS::PrintErr(format, c); } -RawBool* CaseInsensitiveCompareUCS2(RawString* str_raw, - RawSmi* lhs_index_raw, - RawSmi* rhs_index_raw, - RawSmi* length_raw) { + +static RawBool* CaseInsensitiveCompareUC16(RawString* str_raw, + RawSmi* lhs_index_raw, + RawSmi* rhs_index_raw, + RawSmi* length_raw) { const String& str = String::Handle(str_raw); const Smi& lhs_index = Smi::Handle(lhs_index_raw); const Smi& rhs_index = Smi::Handle(rhs_index_raw); @@ -51,50 +48,13 @@ RawBool* CaseInsensitiveCompareUCS2(RawString* str_raw, return Bool::True().raw(); } -RawBool* CaseInsensitiveCompareUTF16(RawString* str_raw, - RawSmi* lhs_index_raw, - RawSmi* rhs_index_raw, - RawSmi* length_raw) { - const String& str = String::Handle(str_raw); - const Smi& lhs_index = Smi::Handle(lhs_index_raw); - const Smi& rhs_index = Smi::Handle(rhs_index_raw); - const Smi& length = Smi::Handle(length_raw); - - for (intptr_t i = 0; i < length.Value(); i++) { - int32_t c1 = str.CharAt(lhs_index.Value() + i); - int32_t c2 = str.CharAt(rhs_index.Value() + i); - if (Utf16::IsLeadSurrogate(c1)) { - // Non-BMP characters do not have case-equivalents in the BMP. - // Both have to be non-BMP for them to be able to match. 
- if (!Utf16::IsLeadSurrogate(c2)) return Bool::False().raw(); - if (i + 1 < length.Value()) { - uint16_t c1t = str.CharAt(lhs_index.Value() + i + 1); - uint16_t c2t = str.CharAt(rhs_index.Value() + i + 1); - if (Utf16::IsTrailSurrogate(c1t) && Utf16::IsTrailSurrogate(c2t)) { - c1 = Utf16::Decode(c1, c1t); - c2 = Utf16::Decode(c2, c2t); - i++; - } - } - } - c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT); - c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT); - if (c1 != c2) return Bool::False().raw(); - } - return Bool::True().raw(); -} DEFINE_RAW_LEAF_RUNTIME_ENTRY( - CaseInsensitiveCompareUCS2, + CaseInsensitiveCompareUC16, 4, false /* is_float */, - reinterpret_cast(&CaseInsensitiveCompareUCS2)); + reinterpret_cast(&CaseInsensitiveCompareUC16)); -DEFINE_RAW_LEAF_RUNTIME_ENTRY( - CaseInsensitiveCompareUTF16, - 4, - false /* is_float */, - reinterpret_cast(&CaseInsensitiveCompareUTF16)); BlockLabel::BlockLabel() : block_(NULL), is_bound_(false), is_linked_(false), pos_(-1) { @@ -112,18 +72,4 @@ RegExpMacroAssembler::RegExpMacroAssembler(Zone* zone) RegExpMacroAssembler::~RegExpMacroAssembler() {} -void RegExpMacroAssembler::CheckNotInSurrogatePair(intptr_t cp_offset, - BlockLabel* on_failure) { - BlockLabel ok; - // Check that current character is not a trail surrogate. - LoadCurrentCharacter(cp_offset, &ok); - CheckCharacterNotInRange(Utf16::kTrailSurrogateStart, - Utf16::kTrailSurrogateEnd, &ok); - // Check that previous character is not a lead surrogate. - LoadCurrentCharacter(cp_offset - 1, &ok); - CheckCharacterInRange(Utf16::kLeadSurrogateStart, Utf16::kLeadSurrogateEnd, - on_failure); - BindBlock(&ok); -} - } // namespace dart diff --git a/runtime/vm/regexp_assembler.h b/runtime/vm/regexp_assembler.h index 818f5730c50..a7b087e23ff 100644 --- a/runtime/vm/regexp_assembler.h +++ b/runtime/vm/regexp_assembler.h @@ -13,19 +13,6 @@ namespace dart { // Utility function for the DotPrinter void PrintUtf16(uint16_t c); -// Compares two-byte strings case insensitively as UCS2. 
-// Called from generated RegExp code. -RawBool* CaseInsensitiveCompareUCS2(RawString* str_raw, - RawSmi* lhs_index_raw, - RawSmi* rhs_index_raw, - RawSmi* length_raw); - -// Compares two-byte strings case insensitively as UTF16. -// Called from generated RegExp code. -RawBool* CaseInsensitiveCompareUTF16(RawString* str_raw, - RawSmi* lhs_index_raw, - RawSmi* rhs_index_raw, - RawSmi* length_raw); /// Convenience wrapper around a BlockEntryInstr pointer. class BlockLabel : public ValueObject { @@ -140,7 +127,6 @@ class RegExpMacroAssembler : public ZoneAllocated { BlockLabel* on_no_match) = 0; virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, - bool unicode, BlockLabel* on_no_match) = 0; // Check the current character for a match with a literal character. If we // fail to match then goto the on_failure label. End of input always @@ -229,33 +215,22 @@ class RegExpMacroAssembler : public ZoneAllocated { virtual void ClearRegisters(intptr_t reg_from, intptr_t reg_to) = 0; virtual void WriteStackPointerToRegister(intptr_t reg) = 0; - // Check that we are not in the middle of a surrogate pair. - void CheckNotInSurrogatePair(intptr_t cp_offset, BlockLabel* on_failure); - // Controls the generation of large inlined constants in the code. void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } bool slow_safe() { return slow_safe_compiler_; } - enum GlobalMode { - NOT_GLOBAL, - GLOBAL, - GLOBAL_NO_ZERO_LENGTH_CHECK, - GLOBAL_UNICODE - }; + enum GlobalMode { NOT_GLOBAL, GLOBAL, GLOBAL_NO_ZERO_LENGTH_CHECK }; // Set whether the regular expression has the global flag. Exiting due to // a failure in a global regexp may still mean success overall. 
inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } inline bool global() { return global_mode_ != NOT_GLOBAL; } - inline bool global_with_zero_length_check() { - return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; - } - inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; } + inline bool global_with_zero_length_check() { return global_mode_ == GLOBAL; } Zone* zone() const { return zone_; } private: bool slow_safe_compiler_; - GlobalMode global_mode_; + bool global_mode_; Zone* zone_; }; diff --git a/runtime/vm/regexp_assembler_bytecode.cc b/runtime/vm/regexp_assembler_bytecode.cc index 0318a27a810..8288e1b15df 100644 --- a/runtime/vm/regexp_assembler_bytecode.cc +++ b/runtime/vm/regexp_assembler_bytecode.cc @@ -349,14 +349,11 @@ void BytecodeRegExpMacroAssembler::CheckNotBackReference( void BytecodeRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( intptr_t start_reg, bool read_backward, - bool unicode, BlockLabel* on_not_equal) { ASSERT(start_reg >= 0); ASSERT(start_reg <= kMaxRegister); - Emit(read_backward ? (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD - : BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) - : (unicode ? BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE - : BC_CHECK_NOT_BACK_REF_NO_CASE), + Emit(read_backward ? BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD + : BC_CHECK_NOT_BACK_REF_NO_CASE, start_reg); EmitOrLink(on_not_equal); } @@ -437,10 +434,11 @@ static intptr_t Prepare(const RegExp& regexp, } #endif // !defined(PRODUCT) + const bool multiline = regexp.is_multi_line(); RegExpCompileData* compile_data = new (zone) RegExpCompileData(); // Parsing failures are handled in the RegExp factory constructor. 
- RegExpParser::ParseRegExp(pattern, regexp.flags(), compile_data); + RegExpParser::ParseRegExp(pattern, multiline, compile_data); regexp.set_num_bracket_expressions(compile_data->capture_count); regexp.set_capture_name_map(compile_data->capture_name_map); @@ -453,15 +451,15 @@ static intptr_t Prepare(const RegExp& regexp, RegExpEngine::CompilationResult result = RegExpEngine::CompileBytecode( compile_data, regexp, is_one_byte, sticky, zone); ASSERT(result.bytecode != NULL); - ASSERT(regexp.num_registers(is_one_byte) == -1 || - regexp.num_registers(is_one_byte) == result.num_registers); - regexp.set_num_registers(is_one_byte, result.num_registers); + ASSERT((regexp.num_registers() == -1) || + (regexp.num_registers() == result.num_registers)); + regexp.set_num_registers(result.num_registers); regexp.set_bytecode(is_one_byte, sticky, *(result.bytecode)); } - ASSERT(regexp.num_registers(is_one_byte) != -1); + ASSERT(regexp.num_registers() != -1); - return regexp.num_registers(is_one_byte) + + return regexp.num_registers() + (Smi::Value(regexp.num_bracket_expressions()) + 1) * 2; } diff --git a/runtime/vm/regexp_assembler_bytecode.h b/runtime/vm/regexp_assembler_bytecode.h index 59cf68f4575..3e17d493238 100644 --- a/runtime/vm/regexp_assembler_bytecode.h +++ b/runtime/vm/regexp_assembler_bytecode.h @@ -83,7 +83,6 @@ class BytecodeRegExpMacroAssembler : public RegExpMacroAssembler { BlockLabel* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, - bool unicode, BlockLabel* on_no_match); virtual void IfRegisterLT(intptr_t register_index, intptr_t comparand, diff --git a/runtime/vm/regexp_assembler_ir.cc b/runtime/vm/regexp_assembler_ir.cc index f7f18ac1380..b1efebec92b 100644 --- a/runtime/vm/regexp_assembler_ir.cc +++ b/runtime/vm/regexp_assembler_ir.cc @@ -821,7 +821,6 @@ void IRRegExpMacroAssembler::CheckGreedyLoop(BlockLabel* on_equal) { void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( intptr_t start_reg, 
bool read_backward, - bool unicode, BlockLabel* on_no_match) { TAG(); ASSERT(start_reg + 1 <= registers_count_); @@ -968,17 +967,9 @@ void IRRegExpMacroAssembler::CheckNotBackReferenceIgnoreCase( Value* rhs_index_value = Bind(LoadLocal(capture_start_index_)); Value* length_value = Bind(LoadLocal(capture_length_)); - Definition* is_match_def; - - if (unicode) { - is_match_def = new (Z) CaseInsensitiveCompareInstr( - string_value, lhs_index_value, rhs_index_value, length_value, - kCaseInsensitiveCompareUTF16RuntimeEntry, specialization_cid_); - } else { - is_match_def = new (Z) CaseInsensitiveCompareInstr( - string_value, lhs_index_value, rhs_index_value, length_value, - kCaseInsensitiveCompareUCS2RuntimeEntry, specialization_cid_); - } + Definition* is_match_def = new (Z) CaseInsensitiveCompareUC16Instr( + string_value, lhs_index_value, rhs_index_value, length_value, + specialization_cid_); BranchOrBacktrack(Comparison(kNE, is_match_def, BoolConstant(true)), on_no_match); diff --git a/runtime/vm/regexp_assembler_ir.h b/runtime/vm/regexp_assembler_ir.h index d8c222a568f..c4f6e1fd2b7 100644 --- a/runtime/vm/regexp_assembler_ir.h +++ b/runtime/vm/regexp_assembler_ir.h @@ -67,7 +67,6 @@ class IRRegExpMacroAssembler : public RegExpMacroAssembler { BlockLabel* on_no_match); virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, - bool unicode, BlockLabel* on_no_match); virtual void CheckNotCharacter(uint32_t c, BlockLabel* on_not_equal); virtual void CheckNotCharacterAfterAnd(uint32_t c, diff --git a/runtime/vm/regexp_ast.h b/runtime/vm/regexp_ast.h index 4fb64a3ae20..17a175e3d2d 100644 --- a/runtime/vm/regexp_ast.h +++ b/runtime/vm/regexp_ast.h @@ -108,8 +108,7 @@ class RegExpAssertion : public RegExpTree { BOUNDARY, NON_BOUNDARY }; - RegExpAssertion(AssertionType type, RegExpFlags flags) - : assertion_type_(type), flags_(flags) {} + explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} virtual void* Accept(RegExpVisitor* 
visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpAssertion* AsAssertion(); @@ -122,7 +121,6 @@ class RegExpAssertion : public RegExpTree { private: AssertionType assertion_type_; - RegExpFlags flags_; }; class CharacterSet : public ValueObject { @@ -152,40 +150,18 @@ class CharacterSet : public ValueObject { class RegExpCharacterClass : public RegExpTree { public: - enum Flag { - // The character class is negated and should match everything but the - // specified ranges. - NEGATED = 1 << 0, - // The character class contains part of a split surrogate and should not - // be unicode-desugared. - CONTAINS_SPLIT_SURROGATE = 1 << 1, - }; - using CharacterClassFlags = intptr_t; - static inline CharacterClassFlags DefaultFlags() { return 0; } - - RegExpCharacterClass( - ZoneGrowableArray* ranges, - RegExpFlags flags, - CharacterClassFlags character_class_flags = DefaultFlags()) - : set_(ranges), - flags_(flags), - character_class_flags_(character_class_flags) { - // Convert the empty set of ranges to the negated Everything() range. - if (ranges->is_empty()) { - ranges->Add(CharacterRange::Everything()); - character_class_flags_ ^= NEGATED; - } - } - RegExpCharacterClass(uint16_t type, RegExpFlags flags) - : set_(type), flags_(flags), character_class_flags_(0) {} + RegExpCharacterClass(ZoneGrowableArray* ranges, + bool is_negated) + : set_(ranges), is_negated_(is_negated) {} + explicit RegExpCharacterClass(uint16_t type) + : set_(type), is_negated_(false) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpCharacterClass* AsCharacterClass(); virtual bool IsCharacterClass() const; virtual bool IsTextElement() const { return true; } virtual intptr_t min_match() const { return 1; } - // The character class may match two code units for unicode regexps. 
- virtual intptr_t max_match() const { return 2; } + virtual intptr_t max_match() const { return 1; } virtual void AppendToText(RegExpText* text); CharacterSet character_set() const { return set_; } // TODO(lrn): Remove need for complex version if is_standard that @@ -204,22 +180,16 @@ class RegExpCharacterClass : public RegExpTree { // * : All characters uint16_t standard_type() const { return set_.standard_set_type(); } ZoneGrowableArray* ranges() { return set_.ranges(); } - bool is_negated() const { return character_class_flags_ & NEGATED; } - RegExpFlags flags() const { return flags_; } - bool contains_split_surrogate() const { - return character_class_flags_ & CONTAINS_SPLIT_SURROGATE; - } + bool is_negated() const { return is_negated_; } private: CharacterSet set_; - RegExpFlags flags_; - CharacterClassFlags character_class_flags_; + bool is_negated_; }; class RegExpAtom : public RegExpTree { public: - RegExpAtom(ZoneGrowableArray* data, RegExpFlags flags) - : data_(data), flags_(flags) {} + explicit RegExpAtom(ZoneGrowableArray* data) : data_(data) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpAtom* AsAtom(); @@ -230,12 +200,9 @@ class RegExpAtom : public RegExpTree { virtual void AppendToText(RegExpText* text); ZoneGrowableArray* data() const { return data_; } intptr_t length() const { return data_->length(); } - RegExpFlags flags() const { return flags_; } - bool ignore_case() const { return flags_.IgnoreCase(); } private: ZoneGrowableArray* data_; - const RegExpFlags flags_; }; class RegExpText : public RegExpTree { @@ -403,10 +370,9 @@ class RegExpLookaround : public RegExpTree { class RegExpBackReference : public RegExpTree { public: - explicit RegExpBackReference(RegExpFlags flags) - : capture_(nullptr), name_(nullptr), flags_(flags) {} - RegExpBackReference(RegExpCapture* capture, RegExpFlags flags) - : capture_(capture), name_(nullptr), 
flags_(flags) {} + RegExpBackReference() : capture_(nullptr), name_(nullptr) {} + explicit RegExpBackReference(RegExpCapture* capture) + : capture_(capture), name_(nullptr) {} virtual void* Accept(RegExpVisitor* visitor, void* data); virtual RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success); virtual RegExpBackReference* AsBackReference(); @@ -425,7 +391,6 @@ class RegExpBackReference : public RegExpTree { private: RegExpCapture* capture_; const ZoneGrowableArray* name_; - RegExpFlags flags_; }; class RegExpEmpty : public RegExpTree { diff --git a/runtime/vm/regexp_bytecodes.h b/runtime/vm/regexp_bytecodes.h index 178c9500318..858eef5ee23 100644 --- a/runtime/vm/regexp_bytecodes.h +++ b/runtime/vm/regexp_bytecodes.h @@ -55,19 +55,17 @@ V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32 */ \ V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \ V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) /*bc8 reg_idx24 addr32*/ \ -V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \ -V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \ -V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \ -V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \ -V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \ -V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \ -V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \ -V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ +V(CHECK_NOT_BACK_REF_BACKWARD, 39, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_NOT_REGS_EQUAL, 41, 
12) /* bc8 regidx24 reg_idx32 addr32 */ \ +V(CHECK_REGISTER_LT, 42, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_GE, 43, 12) /* bc8 reg_idx24 value32 addr32 */ \ +V(CHECK_REGISTER_EQ_POS, 44, 8) /* bc8 reg_idx24 addr32 */ \ +V(CHECK_AT_START, 45, 8) /* bc8 pad24 addr32 */ \ +V(CHECK_NOT_AT_START, 46, 8) /* bc8 offset24 addr32 */ \ +V(CHECK_GREEDY, 47, 8) /* bc8 pad24 addr32 */ \ +V(ADVANCE_CP_AND_GOTO, 48, 8) /* bc8 offset24 addr32 */ \ +V(SET_CURRENT_POSITION_FROM_END, 49, 4) /* bc8 idx24 */ // clang-format on diff --git a/runtime/vm/regexp_interpreter.cc b/runtime/vm/regexp_interpreter.cc index 2921e3ecc26..94c44ec0799 100644 --- a/runtime/vm/regexp_interpreter.cc +++ b/runtime/vm/regexp_interpreter.cc @@ -24,25 +24,27 @@ static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& subject, - bool unicode); + const String& subject); template <> bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& subject, - bool unicode) { - Bool& ret = Bool::Handle(); - if (unicode) { - ret = CaseInsensitiveCompareUTF16(subject.raw(), Smi::New(from), - Smi::New(current), Smi::New(len)); - } else { - ret = CaseInsensitiveCompareUCS2(subject.raw(), Smi::New(from), - Smi::New(current), Smi::New(len)); + const String& subject) { + for (int i = 0; i < len; i++) { + int32_t old_char = subject.CharAt(from++); + int32_t new_char = subject.CharAt(current++); + if (old_char == new_char) continue; + int32_t old_string[1] = {old_char}; + int32_t new_string[1] = {new_char}; + interp_canonicalize->get(old_char, '\0', old_string); + interp_canonicalize->get(new_char, '\0', new_string); + if (old_string[0] != new_string[0]) { + return false; + } } - return ret.value(); + return true; } template <> @@ -50,9 +52,7 @@ bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize, intptr_t from, intptr_t current, intptr_t len, - const String& 
subject, - bool unicode) { - // For Latin1 characters the unicode flag makes no difference. + const String& subject) { for (int i = 0; i < len; i++) { unsigned int old_char = subject.CharAt(from++); unsigned int new_char = subject.CharAt(current++); @@ -513,11 +513,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, pc += BC_CHECK_NOT_BACK_REF_LENGTH; break; } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE) - FALL_THROUGH; BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) { - const bool unicode = - (insn & BYTECODE_MASK) == BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE; int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { @@ -529,7 +525,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, break; } else { if (BackRefMatchesNoCase(&canonicalize, from, current, len, - subject, unicode)) { + subject)) { current += len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH; } else { @@ -566,11 +562,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, pc += BC_CHECK_NOT_BACK_REF_BACKWARD_LENGTH; break; } - BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD) - FALL_THROUGH; BYTECODE(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD) { - bool unicode = (insn & BYTECODE_MASK) == - BC_CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD; int from = registers[insn >> BYTECODE_SHIFT]; int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from; if (from < 0 || len <= 0) { @@ -582,7 +574,7 @@ static IrregexpInterpreter::IrregexpResult RawMatch(const uint8_t* code_base, break; } else { if (BackRefMatchesNoCase(&canonicalize, from, current - len, - len, subject, unicode)) { + len, subject)) { current -= len; pc += BC_CHECK_NOT_BACK_REF_NO_CASE_BACKWARD_LENGTH; } else { diff --git a/runtime/vm/regexp_parser.cc b/runtime/vm/regexp_parser.cc index 31aff3fb052..b70b945ad0d 100644 --- a/runtime/vm/regexp_parser.cc +++ b/runtime/vm/regexp_parser.cc @@ -3,12 +3,6 @@ // 
BSD-style license that can be found in the LICENSE file. #include "vm/regexp_parser.h" - -#include "unicode/uchar.h" -#include "unicode/uniset.h" - -#include "platform/unicode.h" - #include "vm/longjump.h" #include "vm/object_store.h" @@ -19,12 +13,10 @@ namespace dart { // Enables possessive quantifier syntax for testing. static const bool FLAG_regexp_possessive_quantifier = false; -RegExpBuilder::RegExpBuilder(RegExpFlags flags) +RegExpBuilder::RegExpBuilder() : zone_(Thread::Current()->zone()), pending_empty_(false), - flags_(flags), characters_(NULL), - pending_surrogate_(kNoPendingSurrogate), terms_(), text_(), alternatives_() @@ -35,49 +27,10 @@ RegExpBuilder::RegExpBuilder(RegExpFlags flags) { } -void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) { - ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); - FlushPendingSurrogate(); - // Hold onto the lead surrogate, waiting for a trail surrogate to follow. - pending_surrogate_ = lead_surrogate; -} - -void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) { - ASSERT(Utf16::IsTrailSurrogate(trail_surrogate)); - if (pending_surrogate_ != kNoPendingSurrogate) { - uint16_t lead_surrogate = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - ASSERT(Utf16::IsLeadSurrogate(lead_surrogate)); - uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate); - if (NeedsDesugaringForIgnoreCase(combined)) { - AddCharacterClassForDesugaring(combined); - } else { - auto surrogate_pair = new (Z) ZoneGrowableArray(2); - surrogate_pair->Add(lead_surrogate); - surrogate_pair->Add(trail_surrogate); - RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_); - AddAtom(atom); - } - } else { - pending_surrogate_ = trail_surrogate; - FlushPendingSurrogate(); - } -} - -void RegExpBuilder::FlushPendingSurrogate() { - if (pending_surrogate_ != kNoPendingSurrogate) { - ASSERT(is_unicode()); - uint32_t c = pending_surrogate_; - pending_surrogate_ = kNoPendingSurrogate; - 
AddCharacterClassForDesugaring(c); - } -} - void RegExpBuilder::FlushCharacters() { - FlushPendingSurrogate(); pending_empty_ = false; if (characters_ != NULL) { - RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_); + RegExpTree* atom = new (Z) RegExpAtom(characters_); characters_ = NULL; text_.Add(atom); LAST(ADD_ATOM); @@ -101,62 +54,18 @@ void RegExpBuilder::FlushText() { } void RegExpBuilder::AddCharacter(uint16_t c) { - FlushPendingSurrogate(); pending_empty_ = false; - if (NeedsDesugaringForIgnoreCase(c)) { - AddCharacterClassForDesugaring(c); - } else { - if (characters_ == NULL) { - characters_ = new (Z) ZoneGrowableArray(4); - } - characters_->Add(c); - LAST(ADD_CHAR); + if (characters_ == NULL) { + characters_ = new (Z) ZoneGrowableArray(4); } -} - -void RegExpBuilder::AddUnicodeCharacter(uint32_t c) { - if (c > static_cast(Utf16::kMaxCodeUnit)) { - ASSERT(is_unicode()); - uint16_t surrogates[2]; - Utf16::Encode(c, surrogates); - AddLeadSurrogate(surrogates[0]); - AddTrailSurrogate(surrogates[1]); - } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) { - AddLeadSurrogate(c); - } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) { - AddTrailSurrogate(c); - } else { - AddCharacter(static_cast(c)); - } -} - -void RegExpBuilder::AddEscapedUnicodeCharacter(uint32_t character) { - // A lead or trail surrogate parsed via escape sequence will not - // pair up with any preceding lead or following trail surrogate. - FlushPendingSurrogate(); - AddUnicodeCharacter(character); - FlushPendingSurrogate(); + characters_->Add(c); + LAST(ADD_CHAR); } void RegExpBuilder::AddEmpty() { pending_empty_ = true; } -void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { - if (NeedsDesugaringForUnicode(cc)) { - // With /u, character class needs to be desugared, so it - // must be a standalone term instead of being part of a RegExpText. 
- AddTerm(cc); - } else { - AddAtom(cc); - } -} - -void RegExpBuilder::AddCharacterClassForDesugaring(uint32_t c) { - auto ranges = CharacterRange::List(Z, CharacterRange::Singleton(c)); - AddTerm(new (Z) RegExpCharacterClass(ranges, flags_)); -} - void RegExpBuilder::AddAtom(RegExpTree* term) { if (term->IsEmpty()) { AddEmpty(); @@ -172,12 +81,6 @@ void RegExpBuilder::AddAtom(RegExpTree* term) { LAST(ADD_ATOM); } -void RegExpBuilder::AddTerm(RegExpTree* term) { - FlushText(); - terms_.Add(term); - LAST(ADD_ATOM); -} - void RegExpBuilder::AddAssertion(RegExpTree* assert) { FlushText(); terms_.Add(assert); @@ -209,37 +112,6 @@ void RegExpBuilder::FlushTerms() { LAST(ADD_NONE); } -bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { - if (!is_unicode()) return false; - // TODO(yangguo): we could be smarter than this. Case-insensitivity does not - // necessarily mean that we need to desugar. It's probably nicer to have a - // separate pass to figure out unicode desugarings. - if (ignore_case()) return true; - ZoneGrowableArray* ranges = cc->ranges(); - CharacterRange::Canonicalize(ranges); - for (int i = ranges->length() - 1; i >= 0; i--) { - uint32_t from = ranges->At(i).from(); - uint32_t to = ranges->At(i).to(); - // Check for non-BMP characters. - if (to >= Utf16::kMaxCodeUnit) return true; - // Check for lone surrogates. 
- if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) { - return true; - } - } - return false; -} - -bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) { - if (is_unicode() && ignore_case()) { - icu::UnicodeSet set(c, c); - set.closeOver(USET_CASE_INSENSITIVE); - set.removeAllStrings(); - return set.size() > 1; - } - return false; -} - RegExpTree* RegExpBuilder::ToRegExp() { FlushTerms(); intptr_t num_alternatives = alternatives_.length(); @@ -280,13 +152,13 @@ bool RegExpBuilder::AddQuantifierToAtom( for (intptr_t i = 0; i < num_chars - 1; i++) { prefix->Add(char_vector->At(i)); } - text_.Add(new (Z) RegExpAtom(prefix, flags_)); + text_.Add(new (Z) RegExpAtom(prefix)); ZoneGrowableArray* tail = new (Z) ZoneGrowableArray(); tail->Add(char_vector->At(num_chars - 1)); char_vector = tail; } characters_ = NULL; - atom = new (Z) RegExpAtom(char_vector, flags_); + atom = new (Z) RegExpAtom(char_vector); FlushText(); } else if (text_.length() > 0) { DEBUG_ASSERT(last_added_ == ADD_ATOM); @@ -296,8 +168,6 @@ bool RegExpBuilder::AddQuantifierToAtom( DEBUG_ASSERT(last_added_ == ADD_ATOM); atom = terms_.RemoveLast(); if (auto lookaround = atom->AsLookaround()) { - // With /u, lookarounds are not quantifiable. - if (is_unicode()) return false; // Lookbehinds are not quantifiable. 
if (lookaround->type() == RegExpLookaround::LOOKBEHIND) { return false; @@ -324,7 +194,7 @@ bool RegExpBuilder::AddQuantifierToAtom( // ---------------------------------------------------------------------------- // Implementation of Parser -RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags) +RegExpParser::RegExpParser(const String& in, String* error, bool multiline) : zone_(Thread::Current()->zone()), captures_(nullptr), named_captures_(nullptr), @@ -335,7 +205,7 @@ RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags) captures_started_(0), capture_count_(0), has_more_(true), - top_level_flags_(flags), + multiline_(multiline), simple_(false), contains_anchor_(false), is_scanned_for_captures_(false), @@ -343,38 +213,20 @@ RegExpParser::RegExpParser(const String& in, String* error, RegExpFlags flags) Advance(); } -inline uint32_t RegExpParser::ReadNext(bool update_position) { - intptr_t position = next_pos_; - const uint16_t c0 = in().CharAt(position); - uint32_t c = c0; - position++; - if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) { - const uint16_t c1 = in().CharAt(position); - if (Utf16::IsTrailSurrogate(c1)) { - c = Utf16::Decode(c0, c1); - position++; - } - } - if (update_position) next_pos_ = position; - return c; -} - uint32_t RegExpParser::Next() { if (has_next()) { - return ReadNext(false); + return in().CharAt(next_pos_); } else { return kEndMarker; } } void RegExpParser::Advance() { - if (has_next()) { - current_ = ReadNext(true); + if (next_pos_ < in().Length()) { + current_ = in().CharAt(next_pos_); + next_pos_++; } else { current_ = kEndMarker; - // Advance so that position() points to 1 after the last character. This is - // important so that Reset() to this position works correctly. 
- next_pos_ = in().Length() + 1; has_more_ = false; } } @@ -394,30 +246,6 @@ bool RegExpParser::simple() { return simple_; } -bool RegExpParser::IsSyntaxCharacterOrSlash(uint32_t c) { - switch (c) { - case '^': - case '$': - case '\\': - case '.': - case '*': - case '+': - case '?': - case '(': - case ')': - case '[': - case ']': - case '{': - case '}': - case '|': - case '/': - return true; - default: - break; - } - return false; -} - void RegExpParser::ReportError(const char* message) { // Zip to the end to make sure the no more input is read. current_ = kEndMarker; @@ -446,11 +274,6 @@ RegExpTree* RegExpParser::ParsePattern() { return result; } -// Used for error messages where we would have fallen back on treating an -// escape as the identity escape, but we are in Unicode mode. -static const char* kUnicodeIdentity = - "Invalid identity escape in Unicode pattern"; - // Disjunction :: // Alternative // Alternative | Disjunction @@ -464,7 +287,7 @@ static const char* kUnicodeIdentity = RegExpTree* RegExpParser::ParseDisjunction() { // Used to store current state while parsing subexpressions. RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD, - 0, nullptr, top_level_flags_, Z); + 0, nullptr, Z); RegExpParserState* stored_state = &initial_state; // Cache the builder in a local variable for quick access. 
RegExpBuilder* builder = initial_state.builder(); @@ -535,12 +358,12 @@ RegExpTree* RegExpParser::ParseDisjunction() { UNREACHABLE(); case '^': { Advance(); - if (builder->is_multi_line()) { - builder->AddAssertion(new (Z) RegExpAssertion( - RegExpAssertion::START_OF_LINE, builder->flags())); + if (multiline_) { + builder->AddAssertion( + new (Z) RegExpAssertion(RegExpAssertion::START_OF_LINE)); } else { - builder->AddAssertion(new (Z) RegExpAssertion( - RegExpAssertion::START_OF_INPUT, builder->flags())); + builder->AddAssertion( + new (Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); set_contains_anchor(); } continue; @@ -548,29 +371,19 @@ RegExpTree* RegExpParser::ParseDisjunction() { case '$': { Advance(); RegExpAssertion::AssertionType assertion_type = - builder->is_multi_line() ? RegExpAssertion::END_OF_LINE - : RegExpAssertion::END_OF_INPUT; - builder->AddAssertion( - new (Z) RegExpAssertion(assertion_type, builder->flags())); + multiline_ ? RegExpAssertion::END_OF_LINE + : RegExpAssertion::END_OF_INPUT; + builder->AddAssertion(new RegExpAssertion(assertion_type)); continue; } case '.': { Advance(); - auto ranges = new (Z) ZoneGrowableArray(2); - if (builder->is_dot_all()) { - // Everything. 
- CharacterRange::AddClassEscape( - '*', ranges, - /*add_unicode_case_equivalents=*/false); - } else { - // everything except \x0a, \x0d, \u2028 and \u2029 - CharacterRange::AddClassEscape( - '.', ranges, - /*add_unicode_case_equivalents=*/false); - } - RegExpCharacterClass* cc = - new (Z) RegExpCharacterClass(ranges, builder->flags()); - builder->AddCharacterClass(cc); + // everything except \x0a, \x0d, \u2028 and \u2029 + ZoneGrowableArray* ranges = + new ZoneGrowableArray(2); + CharacterRange::AddClassEscape('.', ranges); + RegExpTree* atom = new RegExpCharacterClass(ranges, false); + builder->AddAtom(atom); break; } case '(': { @@ -579,8 +392,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { continue; } case '[': { - RegExpTree* atom = ParseCharacterClass(builder); - builder->AddCharacterClass(atom->AsCharacterClass()); + RegExpTree* atom = ParseCharacterClass(); + builder->AddAtom(atom); break; } // Atom :: @@ -592,13 +405,13 @@ RegExpTree* RegExpParser::ParseDisjunction() { UNREACHABLE(); case 'b': Advance(2); - builder->AddAssertion(new (Z) RegExpAssertion( - RegExpAssertion::BOUNDARY, builder->flags())); + builder->AddAssertion( + new RegExpAssertion(RegExpAssertion::BOUNDARY)); continue; case 'B': Advance(2); - builder->AddAssertion(new (Z) RegExpAssertion( - RegExpAssertion::NON_BOUNDARY, builder->flags())); + builder->AddAssertion( + new RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); continue; // AtomEscape :: // CharacterClassEscape @@ -613,36 +426,11 @@ RegExpTree* RegExpParser::ParseDisjunction() { case 'W': { uint32_t c = Next(); Advance(2); - auto ranges = new (Z) ZoneGrowableArray(2); - CharacterRange::AddClassEscape( - c, ranges, is_unicode() && builder->ignore_case()); - RegExpCharacterClass* cc = - new (Z) RegExpCharacterClass(ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - case 'p': - case 'P': { - uint32_t p = Next(); - Advance(2); - - if (is_unicode()) { - auto name_1 = new (Z) ZoneGrowableArray(); - auto 
name_2 = new (Z) ZoneGrowableArray(); - auto ranges = new (Z) ZoneGrowableArray(2); - if (ParsePropertyClassName(name_1, name_2)) { - if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) { - RegExpCharacterClass* cc = - new (Z) RegExpCharacterClass(ranges, builder->flags()); - builder->AddCharacterClass(cc); - break; - } - } - ReportError("Invalid property name"); - UNREACHABLE(); - } else { - builder->AddCharacter(p); - } + ZoneGrowableArray* ranges = + new ZoneGrowableArray(2); + CharacterRange::AddClassEscape(c, ranges); + RegExpTree* atom = new RegExpCharacterClass(ranges, false); + builder->AddAtom(atom); break; } case '1': @@ -665,20 +453,14 @@ RegExpTree* RegExpParser::ParseDisjunction() { builder->AddEmpty(); } else { RegExpCapture* capture = GetCapture(index); - RegExpTree* atom = - new (Z) RegExpBackReference(capture, builder->flags()); + RegExpTree* atom = new RegExpBackReference(capture); builder->AddAtom(atom); } break; } - // With /u, no identity escapes except for syntax characters are - // allowed. Otherwise, all identity escapes are allowed. - if (is_unicode()) { - ReportError(kUnicodeIdentity); - UNREACHABLE(); - } uint32_t first_digit = Next(); if (first_digit == '8' || first_digit == '9') { + // Treat as identity escape builder->AddCharacter(first_digit); Advance(2); break; @@ -687,11 +469,6 @@ RegExpTree* RegExpParser::ParseDisjunction() { } case '0': { Advance(); - if (is_unicode() && Next() >= '0' && Next() <= '9') { - // With /u, decimal escape with leading 0 are not parsed as octal. - ReportError("Invalid decimal escape"); - UNREACHABLE(); - } uint32_t octal = ParseOctalLiteral(); builder->AddCharacter(octal); break; @@ -729,11 +506,6 @@ RegExpTree* RegExpParser::ParseDisjunction() { // This is outside the specification. We match JSC in // reading the backslash as a literal character instead // of as starting an escape. - if (is_unicode()) { - // With /u, invalid escapes are not treated as identity escapes. 
- ReportError(kUnicodeIdentity); - UNREACHABLE(); - } builder->AddCharacter('\\'); } else { Advance(2); @@ -746,26 +518,18 @@ RegExpTree* RegExpParser::ParseDisjunction() { uint32_t value; if (ParseHexEscape(2, &value)) { builder->AddCharacter(value); - } else if (!is_unicode()) { - builder->AddCharacter('x'); } else { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(kUnicodeIdentity); - UNREACHABLE(); + builder->AddCharacter('x'); } break; } case 'u': { Advance(2); uint32_t value; - if (ParseUnicodeEscape(&value)) { - builder->AddEscapedUnicodeCharacter(value); - } else if (!is_unicode()) { - builder->AddCharacter('u'); + if (ParseHexEscape(4, &value)) { + builder->AddCharacter(value); } else { - // With /u, invalid escapes are not treated as identity escapes. - ReportError(kUnicodeIdentity); - UNREACHABLE(); + builder->AddCharacter('u'); } break; } @@ -775,24 +539,16 @@ RegExpTree* RegExpParser::ParseDisjunction() { // an identity escape for non-Unicode patterns without named // capture groups, and as the beginning of a named back-reference // in all other cases. - if (is_unicode() || HasNamedCaptures()) { + if (HasNamedCaptures()) { Advance(2); ParseNamedBackReference(builder, stored_state); break; } FALL_THROUGH; default: - Advance(); - // With the unicode flag, no identity escapes except for syntax - // characters are allowed. Otherwise, all identity escapes are - // allowed. - if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) { - builder->AddCharacter(current()); - Advance(); - } else { - ReportError(kUnicodeIdentity); - UNREACHABLE(); - } + // Identity escape. 
+ builder->AddCharacter(Next()); + Advance(2); break; } break; @@ -804,15 +560,8 @@ RegExpTree* RegExpParser::ParseDisjunction() { } FALL_THROUGH; } - case '}': - case ']': - if (is_unicode()) { - ReportError("Lone quantifier brackets"); - UNREACHABLE(); - } - FALL_THROUGH; default: - builder->AddUnicodeCharacter(current()); + builder->AddCharacter(current()); Advance(); break; } // end switch(current()) @@ -944,9 +693,8 @@ RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis( } } // Store current state and begin new disjunction parsing. - return new (Z) - RegExpParserState(state, subexpr_type, lookaround_type, captures_started_, - capture_name, state->builder()->flags(), Z); + return new RegExpParserState(state, subexpr_type, lookaround_type, + captures_started_, capture_name, Z); } // In order to know whether an escape is a backreference or not we have to scan @@ -1008,6 +756,10 @@ void RegExpParser::ScanForCaptures() { Reset(saved_position); } +static inline bool IsDecimalDigit(int32_t c) { + return '0' <= c && c <= '9'; +} + bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { ASSERT('\\' == current()); ASSERT('1' <= Next() && Next() <= '9'); @@ -1018,7 +770,7 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { Advance(2); while (true) { uint32_t c = current(); - if (Utils::IsDecimalDigit(c)) { + if (IsDecimalDigit(c)) { value = 10 * value + (c - '0'); if (value > kMaxCaptures) { Reset(start); @@ -1042,43 +794,17 @@ bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) { namespace { -static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) { - return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$'; +inline constexpr bool IsIdentifierStart(uint16_t ch) { + return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '_' || + ch == '$'; } -// ES#sec-names-and-keywords Names and Keywords -// UnicodeIDStart, '$', '_' and '\' -static bool IsIdentifierStartSlow(uint32_t c) { - // cannot use 
u_isIDStart because it does not work for - // Other_ID_Start characters. - return u_hasBinaryProperty(c, UCHAR_ID_START) || - (c < 0x60 && (c == '$' || c == '\\' || c == '_')); +inline constexpr bool IsIdentifierPart(uint16_t ch) { + return IsIdentifierStart(ch) || (ch >= '0' && ch <= '9'); } -// ES#sec-names-and-keywords Names and Keywords -// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ -static bool IsIdentifierPartSlow(uint32_t c) { - const uint32_t kZeroWidthNonJoiner = 0x200C; - const uint32_t kZeroWidthJoiner = 0x200D; - // Can't use u_isIDPart because it does not work for - // Other_ID_Continue characters. - return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) || - (c < 0x60 && (c == '$' || c == '\\' || c == '_')) || - c == kZeroWidthNonJoiner || c == kZeroWidthJoiner; -} - -static inline bool IsIdentifierStart(uint32_t c) { - if (c > 127) return IsIdentifierStartSlow(c); - return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c); -} - -static inline bool IsIdentifierPart(uint32_t c) { - if (c > 127) return IsIdentifierPartSlow(c); - return IsAsciiIdentifierPart(c); -} - -static bool IsSameName(const RegExpCaptureName* name1, - const RegExpCaptureName* name2) { +bool IsSameName(const RegExpCaptureName* name1, + const RegExpCaptureName* name2) { if (name1->length() != name2->length()) return false; for (intptr_t i = 0; i < name1->length(); i++) { if (name1->At(i) != name2->At(i)) return false; @@ -1088,34 +814,14 @@ static bool IsSameName(const RegExpCaptureName* name1, } // end namespace -static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) { - if (code_unit <= Utf16::kMaxCodeUnit) { - v->Add(code_unit); - } else { - uint16_t units[2]; - Utf16::Encode(code_unit, units); - v->Add(units[0]); - v->Add(units[1]); - } -} - const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { auto name = new (Z) RegExpCaptureName(); bool at_start = true; while (true) { - uint32_t c = current(); + const uint16_t c = current(); Advance(); - // Convert 
unicode escapes. - if (c == '\\' && current() == 'u') { - Advance(); - if (!ParseUnicodeEscape(&c)) { - ReportError("Invalid Unicode escape sequence"); - UNREACHABLE(); - } - } - // The backslash char is misclassified as both ID_Start and ID_Continue. if (c == '\\') { ReportError("Invalid capture group name"); @@ -1127,13 +833,13 @@ const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() { ReportError("Invalid capture group name"); UNREACHABLE(); } - PushCodeUnit(name, c); + name->Add(c); at_start = false; } else { if (c == '>') { break; } else if (IsIdentifierPart(c)) { - PushCodeUnit(name, c); + name->Add(c); } else { ReportError("Invalid capture group name"); UNREACHABLE(); @@ -1190,7 +896,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, if (state->IsInsideCaptureGroup(name)) { builder->AddEmpty(); } else { - RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags()); + RegExpBackReference* atom = new (Z) RegExpBackReference(); atom->set_name(name); builder->AddAtom(atom); @@ -1309,17 +1015,17 @@ bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out, intptr_t start = position(); Advance(); intptr_t min = 0; - if (!Utils::IsDecimalDigit(current())) { + if (!IsDecimalDigit(current())) { Reset(start); return false; } - while (Utils::IsDecimalDigit(current())) { + while (IsDecimalDigit(current())) { intptr_t next = current() - '0'; if (min > (RegExpTree::kInfinity - next) / 10) { // Overflow. Skip past remaining decimal digits and return -1. 
do { Advance(); - } while (Utils::IsDecimalDigit(current())); + } while (IsDecimalDigit(current())); min = RegExpTree::kInfinity; break; } @@ -1336,12 +1042,12 @@ bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out, max = RegExpTree::kInfinity; Advance(); } else { - while (Utils::IsDecimalDigit(current())) { + while (IsDecimalDigit(current())) { intptr_t next = current() - '0'; if (max > (RegExpTree::kInfinity - next) / 10) { do { Advance(); - } while (Utils::IsDecimalDigit(current())); + } while (IsDecimalDigit(current())); max = RegExpTree::kInfinity; break; } @@ -1411,317 +1117,6 @@ bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) { return true; } -// This parses RegExpUnicodeEscapeSequence as described in ECMA262. -bool RegExpParser::ParseUnicodeEscape(uint32_t* value) { - // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are - // allowed). In the latter case, the number of hex digits between { } is - // arbitrary. \ and u have already been read. - if (current() == '{' && is_unicode()) { - int start = position(); - Advance(); - if (ParseUnlimitedLengthHexNumber(Utf::kMaxCodePoint, value)) { - if (current() == '}') { - Advance(); - return true; - } - } - Reset(start); - return false; - } - // \u but no {, or \u{...} escapes not allowed. - bool result = ParseHexEscape(4, value); - if (result && is_unicode() && Utf16::IsLeadSurrogate(*value) && - current() == '\\') { - // Attempt to read trail surrogate. 
- int start = position(); - if (Next() == 'u') { - Advance(2); - uint32_t trail; - if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) { - *value = Utf16::Decode(static_cast(*value), - static_cast(trail)); - return true; - } - } - Reset(start); - } - return result; -} - -namespace { - -bool IsExactPropertyAlias(const char* property_name, UProperty property) { - const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_name, short_name) == 0) { - return true; - } - for (int i = 0;; i++) { - const char* long_name = u_getPropertyName( - property, static_cast(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_name, long_name) == 0) return true; - } - return false; -} - -bool IsExactPropertyValueAlias(const char* property_value_name, - UProperty property, - int32_t property_value) { - const char* short_name = - u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME); - if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) { - return true; - } - for (int i = 0;; i++) { - const char* long_name = u_getPropertyValueName( - property, property_value, - static_cast(U_LONG_PROPERTY_NAME + i)); - if (long_name == nullptr) break; - if (strcmp(property_value_name, long_name) == 0) return true; - } - return false; -} - -bool LookupPropertyValueName(UProperty property, - const char* property_value_name, - bool negate, - ZoneGrowableArray* result) { - UProperty property_for_lookup = property; - if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) { - // For the property Script_Extensions, we have to do the property value - // name lookup as if the property is Script. 
- property_for_lookup = UCHAR_SCRIPT; - } - int32_t property_value = - u_getPropertyValueEnum(property_for_lookup, property_value_name); - if (property_value == UCHAR_INVALID_CODE) return false; - - // We require the property name to match exactly to one of the property value - // aliases. However, u_getPropertyValueEnum uses loose matching. - if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup, - property_value)) { - return false; - } - - UErrorCode ec = U_ZERO_ERROR; - icu::UnicodeSet set; - set.applyIntPropertyValue(property, property_value, ec); - bool success = ec == U_ZERO_ERROR && !set.isEmpty(); - - if (success) { - set.removeAllStrings(); - if (negate) set.complement(); - for (int i = 0; i < set.getRangeCount(); i++) { - result->Add( - CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i))); - } - } - return success; -} - -template -inline bool NameEquals(const char* name, const char (&literal)[N]) { - return strncmp(name, literal, N + 1) == 0; -} - -bool LookupSpecialPropertyValueName(const char* name, - ZoneGrowableArray* result, - bool negate) { - if (NameEquals(name, "Any")) { - if (negate) { - // Leave the list of character ranges empty, since the negation of 'Any' - // is the empty set. - } else { - result->Add(CharacterRange::Everything()); - } - } else if (NameEquals(name, "ASCII")) { - result->Add(negate ? CharacterRange::Range(0x80, Utf::kMaxCodePoint) - : CharacterRange::Range(0x0, 0x7F)); - } else if (NameEquals(name, "Assigned")) { - return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned", - !negate, result); - } else { - return false; - } - return true; -} - -// Explicitly whitelist supported binary properties. The spec forbids supporting -// properties outside of this set to ensure interoperability. -bool IsSupportedBinaryProperty(UProperty property) { - switch (property) { - case UCHAR_ALPHABETIC: - // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName. 
- // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_ASCII_HEX_DIGIT: - // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName. - case UCHAR_BIDI_CONTROL: - case UCHAR_BIDI_MIRRORED: - case UCHAR_CASE_IGNORABLE: - case UCHAR_CASED: - case UCHAR_CHANGES_WHEN_CASEFOLDED: - case UCHAR_CHANGES_WHEN_CASEMAPPED: - case UCHAR_CHANGES_WHEN_LOWERCASED: - case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: - case UCHAR_CHANGES_WHEN_TITLECASED: - case UCHAR_CHANGES_WHEN_UPPERCASED: - case UCHAR_DASH: - case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: - case UCHAR_DEPRECATED: - case UCHAR_DIACRITIC: - case UCHAR_EMOJI: - case UCHAR_EMOJI_COMPONENT: - case UCHAR_EMOJI_MODIFIER_BASE: - case UCHAR_EMOJI_MODIFIER: - case UCHAR_EMOJI_PRESENTATION: - case UCHAR_EXTENDED_PICTOGRAPHIC: - case UCHAR_EXTENDER: - case UCHAR_GRAPHEME_BASE: - case UCHAR_GRAPHEME_EXTEND: - case UCHAR_HEX_DIGIT: - case UCHAR_ID_CONTINUE: - case UCHAR_ID_START: - case UCHAR_IDEOGRAPHIC: - case UCHAR_IDS_BINARY_OPERATOR: - case UCHAR_IDS_TRINARY_OPERATOR: - case UCHAR_JOIN_CONTROL: - case UCHAR_LOGICAL_ORDER_EXCEPTION: - case UCHAR_LOWERCASE: - case UCHAR_MATH: - case UCHAR_NONCHARACTER_CODE_POINT: - case UCHAR_PATTERN_SYNTAX: - case UCHAR_PATTERN_WHITE_SPACE: - case UCHAR_QUOTATION_MARK: - case UCHAR_RADICAL: - case UCHAR_REGIONAL_INDICATOR: - case UCHAR_S_TERM: - case UCHAR_SOFT_DOTTED: - case UCHAR_TERMINAL_PUNCTUATION: - case UCHAR_UNIFIED_IDEOGRAPH: - case UCHAR_UPPERCASE: - case UCHAR_VARIATION_SELECTOR: - case UCHAR_WHITE_SPACE: - case UCHAR_XID_CONTINUE: - case UCHAR_XID_START: - return true; - default: - break; - } - return false; -} - -bool IsUnicodePropertyValueCharacter(char c) { - // https://tc39.github.io/proposal-regexp-unicode-property-escapes/ - // - // Note that using this to validate each parsed char is quite conservative. 
- // A possible alternative solution would be to only ensure the parsed - // property name/value candidate string does not contain '\0' characters and - // let ICU lookups trigger the final failure. - if (Utils::IsAlphaNumeric(c)) return true; - return (c == '_'); -} - -} // anonymous namespace - -bool RegExpParser::ParsePropertyClassName(ZoneGrowableArray* name_1, - ZoneGrowableArray* name_2) { - ASSERT(name_1->is_empty()); - ASSERT(name_2->is_empty()); - // Parse the property class as follows: - // - In \p{name}, 'name' is interpreted - // - either as a general category property value name. - // - or as a binary property name. - // - In \p{name=value}, 'name' is interpreted as an enumerated property name, - // and 'value' is interpreted as one of the available property value names. - // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used. - // - Loose matching is not applied. - if (current() == '{') { - // Parse \p{[PropertyName=]PropertyNameValue} - for (Advance(); current() != '}' && current() != '='; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_1->Add(static_cast(current())); - } - if (current() == '=') { - for (Advance(); current() != '}'; Advance()) { - if (!IsUnicodePropertyValueCharacter(current())) return false; - if (!has_next()) return false; - name_2->Add(static_cast(current())); - } - name_2->Add(0); // null-terminate string. - } - } else { - return false; - } - Advance(); - name_1->Add(0); // null-terminate string. 
- - ASSERT(static_cast(name_1->length() - 1) == strlen(name_1->data())); - ASSERT(name_2->is_empty() || - static_cast(name_2->length() - 1) == strlen(name_2->data())); - return true; -} - -bool RegExpParser::AddPropertyClassRange( - ZoneGrowableArray* add_to, - bool negate, - ZoneGrowableArray* name_1, - ZoneGrowableArray* name_2) { - ASSERT(name_1->At(name_1->length() - 1) == '\0'); - ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0'); - if (name_2->is_empty()) { - // First attempt to interpret as general category property value name. - const char* name = name_1->data(); - if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate, - add_to)) { - return true; - } - // Interpret "Any", "ASCII", and "Assigned". - if (LookupSpecialPropertyValueName(name, add_to, negate)) { - return true; - } - // Then attempt to interpret as binary property name with value name 'Y'. - UProperty property = u_getPropertyEnum(name); - if (!IsSupportedBinaryProperty(property)) return false; - if (!IsExactPropertyAlias(name, property)) return false; - return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to); - } else { - // Both property name and value name are specified. Attempt to interpret - // the property name as enumerated property. - const char* property_name = name_1->data(); - const char* value_name = name_2->data(); - UProperty property = u_getPropertyEnum(property_name); - if (!IsExactPropertyAlias(property_name, property)) return false; - if (property == UCHAR_GENERAL_CATEGORY) { - // We want to allow aggregate value names such as "Letter". 
- property = UCHAR_GENERAL_CATEGORY_MASK; - } else if (property != UCHAR_SCRIPT && - property != UCHAR_SCRIPT_EXTENSIONS) { - return false; - } - return LookupPropertyValueName(property, value_name, negate, add_to); - } -} - -bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t max_value, - uint32_t* value) { - uint32_t x = 0; - int d = HexValue(current()); - if (d < 0) { - return false; - } - while (d >= 0) { - x = x * 16 + d; - if (x > max_value) { - return false; - } - Advance(); - d = HexValue(current()); - } - *value = x; - return true; -} - uint32_t RegExpParser::ParseClassCharacterEscape() { ASSERT(current() == '\\'); DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next())); @@ -1752,47 +1147,35 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { uint32_t letter = controlLetter & ~('A' ^ 'a'); // For compatibility with JSC, inside a character class // we also accept digits and underscore as control characters. - if (letter >= 'A' && letter <= 'Z') { + if ((controlLetter >= '0' && controlLetter <= '9') || + controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { Advance(2); // Control letters mapped to ASCII control characters in the range // 0x00-0x1f. return controlLetter & 0x1f; } - if (is_unicode()) { - // With /u, \c# or \c_ are invalid. - ReportError("Invalid class escape"); - UNREACHABLE(); - } - if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') { - Advance(2); - return controlLetter & 0x1f; - } // We match JSC in reading the backslash as a literal // character instead of as starting an escape. return '\\'; } case '0': - // With /u, \0 is interpreted as NUL if not followed by another digit. 
- if (is_unicode() && !(Next() >= '0' && Next() <= '9')) { - Advance(); - return 0; - } FALL_THROUGH; case '1': + FALL_THROUGH; case '2': + FALL_THROUGH; case '3': + FALL_THROUGH; case '4': + FALL_THROUGH; case '5': + FALL_THROUGH; case '6': + FALL_THROUGH; case '7': // For compatibility, we interpret a decimal escape that isn't // a back reference (and therefore either \0 or not valid according // to the specification) as a 1..3 digit octal character code. - if (is_unicode()) { - // With \u, decimal escape is not interpreted as octal character code. - ReportError("Invalid class escape"); - UNREACHABLE(); - } return ParseOctalLiteral(); case 'x': { Advance(); @@ -1800,11 +1183,6 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { if (ParseHexEscape(2, &value)) { return value; } - if (is_unicode()) { - // With \u, invalid escapes are not treated as identity escapes. - ReportError("Invalid escape"); - UNREACHABLE(); - } // If \x is not followed by a two-digit hexadecimal, treat it // as an identity escape. return 'x'; @@ -1812,14 +1190,9 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { case 'u': { Advance(); uint32_t value; - if (ParseUnicodeEscape(&value)) { + if (ParseHexEscape(4, &value)) { return value; } - if (is_unicode()) { - // With \u, invalid escapes are not treated as identity escapes. - ReportError(kUnicodeIdentity); - UNREACHABLE(); - } // If \u is not followed by a four-digit hexadecimal, treat it // as an identity escape. return 'u'; @@ -1829,20 +1202,15 @@ uint32_t RegExpParser::ParseClassCharacterEscape() { // been matched by a more specific case, not just the subset required // by the ECMAScript specification. 
uint32_t result = current(); - if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') { - Advance(); - return result; - } - ReportError(kUnicodeIdentity); - UNREACHABLE(); + Advance(); + return result; } } return 0; } -bool RegExpParser::ParseClassEscape(ZoneGrowableArray* ranges, - bool add_unicode_case_equivalents, - uint32_t* char_out) { +CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) { + ASSERT(0 == *char_class); uint32_t first = current(); if (first == '\\') { switch (Next()) { @@ -1852,42 +1220,40 @@ bool RegExpParser::ParseClassEscape(ZoneGrowableArray* ranges, case 'D': case 's': case 'S': { - CharacterRange::AddClassEscape(static_cast(Next()), ranges, - add_unicode_case_equivalents); + *char_class = Next(); Advance(2); - return true; - } - case 'p': - case 'P': { - if (!is_unicode()) break; - bool negate = Next() == 'P'; - Advance(2); - auto name_1 = new (Z) ZoneGrowableArray(); - auto name_2 = new (Z) ZoneGrowableArray(); - if (!ParsePropertyClassName(name_1, name_2) || - !AddPropertyClassRange(ranges, negate, name_1, name_2)) { - ReportError("Invalid property name in character class"); - UNREACHABLE(); - } - return true; + return CharacterRange::Singleton(0); // Return dummy value. } case kEndMarker: ReportError("\\ at end of pattern"); UNREACHABLE(); default: - break; + uint32_t c = ParseClassCharacterEscape(); + return CharacterRange::Singleton(c); } - *char_out = ParseClassCharacterEscape(); - return false; + } else { + Advance(); + return CharacterRange::Singleton(first); } - Advance(); - *char_out = first; - return false; } -RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { +static const uint16_t kNoCharClass = 0; + +// Adds range or pre-defined character class to character ranges. +// If char_class is not kNoCharClass, it's interpreted as a class +// escape (i.e., 's' means whitespace, from '\s').
+static inline void AddRangeOrEscape(ZoneGrowableArray* ranges, + uint16_t char_class, + CharacterRange range) { + if (char_class != kNoCharClass) { + CharacterRange::AddClassEscape(char_class, ranges); + } else { + ranges->Add(range); + } +} + +RegExpTree* RegExpParser::ParseCharacterClass() { static const char* kUnterminated = "Unterminated character class"; - static const char* kRangeInvalid = "Invalid character class"; static const char* kRangeOutOfOrder = "Range out of order in character class"; ASSERT(current() == '['); @@ -1899,11 +1265,9 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { } ZoneGrowableArray* ranges = new (Z) ZoneGrowableArray(2); - bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case(); while (has_more() && current() != ']') { - uint32_t char_1; - bool is_class_1 = - ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1); + uint16_t char_class = kNoCharClass; + CharacterRange first = ParseClassAtom(&char_class); if (current() == '-') { Advance(); if (current() == kEndMarker) { @@ -1911,32 +1275,26 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { // following code report an error. break; } else if (current() == ']') { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); + AddRangeOrEscape(ranges, char_class, first); ranges->Add(CharacterRange::Singleton('-')); break; } - uint32_t char_2; - bool is_class_2 = - ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2); - if (is_class_1 || is_class_2) { + uint16_t char_class_2 = kNoCharClass; + CharacterRange next = ParseClassAtom(&char_class_2); + if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { // Either end is an escaped character class. Treat the '-' verbatim. - if (is_unicode()) { - // ES2015 21.2.2.15.1 step 1. 
- ReportError(kRangeInvalid); - UNREACHABLE(); - } - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); + AddRangeOrEscape(ranges, char_class, first); ranges->Add(CharacterRange::Singleton('-')); - if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2)); + AddRangeOrEscape(ranges, char_class_2, next); continue; } - if (char_1 > char_2) { + if (first.from() > next.to()) { ReportError(kRangeOutOfOrder); UNREACHABLE(); } - ranges->Add(CharacterRange::Range(char_1, char_2)); + ranges->Add(CharacterRange::Range(first.from(), next.to())); } else { - if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1)); + AddRangeOrEscape(ranges, char_class, first); } } if (!has_more()) { @@ -1944,21 +1302,21 @@ RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) { UNREACHABLE(); } Advance(); - RegExpCharacterClass::CharacterClassFlags character_class_flags = - RegExpCharacterClass::DefaultFlags(); - if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED; - return new (Z) - RegExpCharacterClass(ranges, builder->flags(), character_class_flags); + if (ranges->length() == 0) { + ranges->Add(CharacterRange::Everything()); + is_negated = !is_negated; + } + return new (Z) RegExpCharacterClass(ranges, is_negated); } // ---------------------------------------------------------------------------- // The Parser interface. void RegExpParser::ParseRegExp(const String& input, - RegExpFlags flags, + bool multiline, RegExpCompileData* result) { ASSERT(result != NULL); - RegExpParser parser(input, &result->error, flags); + RegExpParser parser(input, &result->error, multiline); // Throws an exception if 'input' is not valid. 
RegExpTree* tree = parser.ParsePattern(); ASSERT(tree != NULL); diff --git a/runtime/vm/regexp_parser.h b/runtime/vm/regexp_parser.h index 63c237a3568..a2b626b97a6 100644 --- a/runtime/vm/regexp_parser.h +++ b/runtime/vm/regexp_parser.h @@ -14,18 +14,13 @@ namespace dart { // Accumulates RegExp atoms and assertions into lists of terms and alternatives. class RegExpBuilder : public ZoneAllocated { public: - explicit RegExpBuilder(RegExpFlags flags); + RegExpBuilder(); void AddCharacter(uint16_t character); - void AddUnicodeCharacter(uint32_t character); - void AddEscapedUnicodeCharacter(uint32_t character); // "Adds" an empty expression. Does nothing except consume a // following quantifier void AddEmpty(); - void AddCharacterClass(RegExpCharacterClass* cc); - void AddCharacterClassForDesugaring(uint32_t c); void AddAtom(RegExpTree* tree); - void AddTerm(RegExpTree* tree); void AddAssertion(RegExpTree* tree); void NewAlternative(); // '|' // Attempt to add a quantifier to the last atom added. 
The return value @@ -35,30 +30,17 @@ class RegExpBuilder : public ZoneAllocated { intptr_t max, RegExpQuantifier::QuantifierType type); RegExpTree* ToRegExp(); - RegExpFlags flags() const { return flags_; } - bool ignore_case() const { return flags_.IgnoreCase(); } - bool is_multi_line() const { return flags_.IsMultiLine(); } - bool is_dot_all() const { return flags_.IsDotAll(); } private: - static const uint16_t kNoPendingSurrogate = 0; - void AddLeadSurrogate(uint16_t lead_surrogate); - void AddTrailSurrogate(uint16_t trail_surrogate); - void FlushPendingSurrogate(); void FlushCharacters(); void FlushText(); void FlushTerms(); - bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc); - bool NeedsDesugaringForIgnoreCase(uint32_t c); Zone* zone() const { return zone_; } - bool is_unicode() const { return flags_.IsUnicode(); } Zone* zone_; bool pending_empty_; - RegExpFlags flags_; ZoneGrowableArray* characters_; - uint16_t pending_surrogate_; GrowableArray terms_; GrowableArray text_; GrowableArray alternatives_; @@ -74,15 +56,16 @@ using RegExpCaptureName = ZoneGrowableArray; class RegExpParser : public ValueObject { public: - RegExpParser(const String& in, String* error, RegExpFlags regexp_flags); + RegExpParser(const String& in, String* error, bool multiline_mode); static void ParseRegExp(const String& input, - RegExpFlags regexp_flags, + bool multiline, RegExpCompileData* result); RegExpTree* ParsePattern(); RegExpTree* ParseDisjunction(); RegExpTree* ParseGroup(); + RegExpTree* ParseCharacterClass(); // Parses a {...,...} quantifier and stores the range in the given // out parameters. @@ -95,24 +78,6 @@ class RegExpParser : public ValueObject { // Checks whether the following is a length-digit hexadecimal number, // and sets the value if it is. 
bool ParseHexEscape(intptr_t length, uint32_t* value); - bool ParseUnicodeEscape(uint32_t* value); - bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t* value); - - // Parses either {UNICODE_PROPERTY_NAME=UNICODE_PROPERTY_VALUE} or - // the shorthand {UNICODE_PROPERTY_NAME_OR_VALUE} and stores the - // result in the given out parameters. If the shorthand is used, - // nothing will be added to name_2. - bool ParsePropertyClassName(ZoneGrowableArray* name_1, - ZoneGrowableArray* name_2); - // Adds the specified unicode property to the provided character range. - bool AddPropertyClassRange(ZoneGrowableArray* add_to, - bool negate, - ZoneGrowableArray* name_1, - ZoneGrowableArray* name_2); - // Returns a regexp node that corresponds to one of these unicode - // property sequences: "Any", "ASCII", "Assigned". - RegExpTree* GetPropertySequence(ZoneGrowableArray* name_1); - RegExpTree* ParseCharacterClass(const RegExpBuilder* builder); uint32_t ParseOctalLiteral(); @@ -122,10 +87,7 @@ class RegExpParser : public ValueObject { // can be reparsed. bool ParseBackReferenceIndex(intptr_t* index_out); - // Attempts to parse a possible escape within a character class. 
- bool ParseClassEscape(ZoneGrowableArray* ranges, - bool add_unicode_case_equivalents, - uint32_t* char_out); + CharacterRange ParseClassAtom(uint16_t* char_class); void ReportError(const char* message); void Advance(); void Advance(intptr_t dist); @@ -138,9 +100,6 @@ class RegExpParser : public ValueObject { void set_contains_anchor() { contains_anchor_ = true; } intptr_t captures_started() { return captures_started_; } intptr_t position() { return next_pos_ - 1; } - bool is_unicode() const { return top_level_flags_.IsUnicode(); } - - static bool IsSyntaxCharacterOrSlash(uint32_t c); static const intptr_t kMaxCaptures = 1 << 16; static const uint32_t kEndMarker = (1 << 21); @@ -161,10 +120,9 @@ class RegExpParser : public ValueObject { RegExpLookaround::Type lookaround_type, intptr_t disjunction_capture_index, const RegExpCaptureName* capture_name, - RegExpFlags flags, Zone* zone) : previous_state_(previous_state), - builder_(new (zone) RegExpBuilder(flags)), + builder_(new (zone) RegExpBuilder()), group_type_(group_type), lookaround_type_(lookaround_type), disjunction_capture_index_(disjunction_capture_index), @@ -240,7 +198,6 @@ class RegExpParser : public ValueObject { bool has_more() { return has_more_; } bool has_next() { return next_pos_ < in().Length(); } uint32_t Next(); - uint32_t ReadNext(bool update_position); const String& in() { return in_; } void ScanForCaptures(); @@ -255,7 +212,7 @@ class RegExpParser : public ValueObject { // The capture count is only valid after we have scanned for captures. 
intptr_t capture_count_; bool has_more_; - RegExpFlags top_level_flags_; + bool multiline_; bool simple_; bool contains_anchor_; bool is_scanned_for_captures_; diff --git a/runtime/vm/regexp_test.cc b/runtime/vm/regexp_test.cc index f8009312e5b..531f12d5b97 100644 --- a/runtime/vm/regexp_test.cc +++ b/runtime/vm/regexp_test.cc @@ -16,7 +16,7 @@ static RawArray* Match(const String& pat, const String& str) { Thread* thread = Thread::Current(); Zone* zone = thread->zone(); const RegExp& regexp = - RegExp::Handle(RegExpEngine::CreateRegExp(thread, pat, RegExpFlags())); + RegExp::Handle(RegExpEngine::CreateRegExp(thread, pat, false, false)); const Smi& idx = Smi::Handle(Smi::New(0)); return IRRegExpMacroAssembler::Execute(regexp, str, idx, /*sticky=*/false, zone); diff --git a/runtime/vm/runtime_entry_list.h b/runtime/vm/runtime_entry_list.h index dd4159d851e..ddcd350bd43 100644 --- a/runtime/vm/runtime_entry_list.h +++ b/runtime/vm/runtime_entry_list.h @@ -79,9 +79,7 @@ namespace dart { V(double, LibcAsin, double) \ V(double, LibcAtan, double) \ V(double, LibcAtan2, double, double) \ - V(RawBool*, CaseInsensitiveCompareUCS2, RawString*, RawSmi*, RawSmi*, \ - RawSmi*) \ - V(RawBool*, CaseInsensitiveCompareUTF16, RawString*, RawSmi*, RawSmi*, \ + V(RawBool*, CaseInsensitiveCompareUC16, RawString*, RawSmi*, RawSmi*, \ RawSmi*) \ V(void, EnterSafepoint) \ V(void, ExitSafepoint) diff --git a/runtime/vm/splay-tree.h b/runtime/vm/splay-tree.h deleted file mode 100644 index 54c5d9faf0e..00000000000 --- a/runtime/vm/splay-tree.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. Please see the AUTHORS file -// for details. All rights reserved. Use of this source code is governed by a -// BSD-style license that can be found in the LICENSE file. - -#ifndef RUNTIME_VM_SPLAY_TREE_H_ -#define RUNTIME_VM_SPLAY_TREE_H_ - -#include "platform/splay-tree.h" -#include "vm/zone.h" - -namespace dart { - -// A zone splay tree. 
The config type parameter encapsulates the -// different configurations of a concrete splay tree (see -// platform/splay-tree.h). The tree itself and all its elements are allocated -// in the Zone. -template -class ZoneSplayTree final : public SplayTree { - public: - explicit ZoneSplayTree(Zone* zone) - : SplayTree(ASSERT_NOTNULL(zone)) {} - ~ZoneSplayTree() { - // Reset the root to avoid unneeded iteration over all tree nodes - // in the destructor. For a zone-allocated tree, nodes will be - // freed by the Zone. - SplayTree::ResetRoot(); - } -}; - -} // namespace dart - -#endif // RUNTIME_VM_SPLAY_TREE_H_ diff --git a/runtime/vm/stub_code_arm64_test.cc b/runtime/vm/stub_code_arm64_test.cc index 7471c7de8fb..4e69a5a842d 100644 --- a/runtime/vm/stub_code_arm64_test.cc +++ b/runtime/vm/stub_code_arm64_test.cc @@ -83,7 +83,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(R1, lhs_index); __ LoadObject(R2, rhs_index); __ LoadObject(R3, length); - __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); __ LeaveDartFrame(); __ ret(); // Return value is in R0. } diff --git a/runtime/vm/stub_code_arm_test.cc b/runtime/vm/stub_code_arm_test.cc index d849dc0a292..2a664780063 100644 --- a/runtime/vm/stub_code_arm_test.cc +++ b/runtime/vm/stub_code_arm_test.cc @@ -82,7 +82,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(R1, lhs_index); __ LoadObject(R2, rhs_index); __ LoadObject(R3, length); - __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); __ LeaveDartFrameAndReturn(); // Return value is in R0. 
} diff --git a/runtime/vm/stub_code_ia32_test.cc b/runtime/vm/stub_code_ia32_test.cc index 578afde029f..a4f79fcf25a 100644 --- a/runtime/vm/stub_code_ia32_test.cc +++ b/runtime/vm/stub_code_ia32_test.cc @@ -86,7 +86,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ movl(Address(ESP, 2 * kWordSize), EAX); // Push argument 3. __ LoadObject(EAX, length); __ movl(Address(ESP, 3 * kWordSize), EAX); // Push argument 4. - __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); __ leave(); __ ret(); // Return value is in EAX. } diff --git a/runtime/vm/stub_code_x64_test.cc b/runtime/vm/stub_code_x64_test.cc index 4cdf61386e2..a71484ab6b6 100644 --- a/runtime/vm/stub_code_x64_test.cc +++ b/runtime/vm/stub_code_x64_test.cc @@ -83,7 +83,7 @@ static void GenerateCallToCallLeafRuntimeStub(Assembler* assembler, __ LoadObject(CallingConventions::kArg2Reg, lhs_index); __ LoadObject(CallingConventions::kArg3Reg, rhs_index); __ LoadObject(CallingConventions::kArg4Reg, length); - __ CallRuntime(kCaseInsensitiveCompareUCS2RuntimeEntry, 4); + __ CallRuntime(kCaseInsensitiveCompareUC16RuntimeEntry, 4); __ LeaveStubFrame(); __ ret(); // Return value is in RAX. 
} diff --git a/sdk/lib/_internal/js_runtime/lib/core_patch.dart b/sdk/lib/_internal/js_runtime/lib/core_patch.dart index 998f42b774d..e31fe29db10 100644 --- a/sdk/lib/_internal/js_runtime/lib/core_patch.dart +++ b/sdk/lib/_internal/js_runtime/lib/core_patch.dart @@ -528,15 +528,9 @@ class RegExp { @pragma('dart2js:noInline') @patch factory RegExp(String source, - {bool multiLine: false, - bool caseSensitive: true, - bool unicode: false, - bool dotAll: false}) => + {bool multiLine: false, bool caseSensitive: true}) => new JSSyntaxRegExp(source, - multiLine: multiLine, - caseSensitive: caseSensitive, - unicode: unicode, - dotAll: dotAll); + multiLine: multiLine, caseSensitive: caseSensitive); @patch static String escape(String text) => quoteStringForRegExp(text); diff --git a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart index 3dacc06ae0a..51b4bfbe143 100644 --- a/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart +++ b/sdk/lib/_internal/js_runtime/lib/regexp_helper.dart @@ -42,22 +42,18 @@ class JSSyntaxRegExp implements RegExp { var _nativeGlobalRegExp; var _nativeAnchoredRegExp; - String toString() => - 'RegExp/$pattern/' + JS('String', '#.flags', _nativeRegExp); + String toString() => 'RegExp/$pattern/'; JSSyntaxRegExp(String source, - {bool multiLine: false, - bool caseSensitive: true, - bool unicode: false, - bool dotAll: false}) + {bool multiLine: false, bool caseSensitive: true}) : this.pattern = source, - this._nativeRegExp = makeNative( - source, multiLine, caseSensitive, unicode, dotAll, false); + this._nativeRegExp = + makeNative(source, multiLine, caseSensitive, false); get _nativeGlobalVersion { if (_nativeGlobalRegExp != null) return _nativeGlobalRegExp; - return _nativeGlobalRegExp = makeNative( - pattern, _isMultiLine, _isCaseSensitive, _isUnicode, _isDotAll, true); + return _nativeGlobalRegExp = + makeNative(pattern, _isMultiLine, _isCaseSensitive, true); } get _nativeAnchoredVersion { @@ 
-67,22 +63,18 @@ class JSSyntaxRegExp implements RegExp { // that it tries, and you can see if the original regexp matched, or it // was the added zero-width match that matched, by looking at the last // capture. If it is a String, the match participated, otherwise it didn't. - return _nativeAnchoredRegExp = makeNative('$pattern|()', _isMultiLine, - _isCaseSensitive, _isUnicode, _isDotAll, true); + return _nativeAnchoredRegExp = + makeNative('$pattern|()', _isMultiLine, _isCaseSensitive, true); } bool get _isMultiLine => JS('bool', '#.multiline', _nativeRegExp); bool get _isCaseSensitive => JS('bool', '!#.ignoreCase', _nativeRegExp); - bool get _isUnicode => JS('bool', '#.unicode', _nativeRegExp); - bool get _isDotAll => JS('bool', '#.dotAll', _nativeRegExp); - static makeNative(String source, bool multiLine, bool caseSensitive, - bool unicode, bool dotAll, bool global) { + static makeNative( + String source, bool multiLine, bool caseSensitive, bool global) { checkString(source); String m = multiLine == true ? 'm' : ''; String i = caseSensitive == true ? '' : 'i'; - String u = unicode ? 'u' : ''; - String s = dotAll ? 's' : ''; String g = global ? 'g' : ''; // We're using the JavaScript's try catch instead of the Dart one to avoid // dragging in Dart runtime support just because of using RegExp. @@ -95,12 +87,10 @@ class JSSyntaxRegExp implements RegExp { } catch (e) { return e; } - })(#, # + # + # + # + #)''', + })(#, # + # + #)''', source, m, i, - u, - s, g); if (JS('bool', '# instanceof RegExp', regexp)) return regexp; // The returned value is the JavaScript exception. 
Turn it into a @@ -109,7 +99,7 @@ class JSSyntaxRegExp implements RegExp { throw new FormatException('Illegal RegExp pattern ($errorMessage)', source); } - RegExpMatch firstMatch(String string) { + Match firstMatch(String string) { List m = JS('JSExtendableArray|Null', r'#.exec(#)', _nativeRegExp, checkString(string)); if (m == null) return null; @@ -126,7 +116,7 @@ class JSSyntaxRegExp implements RegExp { return null; } - Iterable allMatches(String string, [int start = 0]) { + Iterable allMatches(String string, [int start = 0]) { checkString(string); checkInt(start); if (start < 0 || start > string.length) { @@ -135,7 +125,7 @@ class JSSyntaxRegExp implements RegExp { return new _AllMatchesIterable(this, string, start); } - RegExpMatch _execGlobal(String string, int start) { + Match _execGlobal(String string, int start) { Object regexp = _nativeGlobalVersion; JS('void', '#.lastIndex = #', regexp, start); List match = JS('JSExtendableArray|Null', '#.exec(#)', regexp, string); @@ -143,7 +133,7 @@ class JSSyntaxRegExp implements RegExp { return new _MatchImplementation(this, match); } - RegExpMatch _execAnchored(String string, int start) { + Match _execAnchored(String string, int start) { Object regexp = _nativeAnchoredVersion; JS('void', '#.lastIndex = #', regexp, start); List match = JS('JSExtendableArray|Null', '#.exec(#)', regexp, string); @@ -154,7 +144,7 @@ class JSSyntaxRegExp implements RegExp { return new _MatchImplementation(this, match); } - RegExpMatch matchAsPrefix(String string, [int start = 0]) { + Match matchAsPrefix(String string, [int start = 0]) { if (start < 0 || start > string.length) { throw new RangeError.range(start, 0, string.length); } @@ -163,8 +153,6 @@ class JSSyntaxRegExp implements RegExp { bool get isMultiLine => _isMultiLine; bool get isCaseSensitive => _isCaseSensitive; - bool get isUnicode => _isUnicode; - bool get isDotAll => _isDotAll; } class _MatchImplementation implements RegExpMatch { @@ -231,34 +219,25 @@ class 
_MatchImplementation implements RegExpMatch { } } -class _AllMatchesIterable extends IterableBase { +class _AllMatchesIterable extends IterableBase { final JSSyntaxRegExp _re; final String _string; final int _start; _AllMatchesIterable(this._re, this._string, this._start); - Iterator get iterator => - new _AllMatchesIterator(_re, _string, _start); + Iterator get iterator => new _AllMatchesIterator(_re, _string, _start); } -class _AllMatchesIterator implements Iterator { +class _AllMatchesIterator implements Iterator { final JSSyntaxRegExp _regExp; String _string; int _nextIndex; - RegExpMatch _current; + Match _current; _AllMatchesIterator(this._regExp, this._string, this._nextIndex); - RegExpMatch get current => _current; - - static bool _isLeadSurrogate(int c) { - return c >= 0xd800 && c <= 0xdbff; - } - - static bool _isTrailSurrogate(int c) { - return c >= 0xdc00 && c <= 0xdfff; - } + Match get current => _current; bool moveNext() { if (_string == null) return false; @@ -268,15 +247,6 @@ class _AllMatchesIterator implements Iterator { _current = match; int nextIndex = match.end; if (match.start == nextIndex) { - // Zero-width match. Advance by one more, unless the regexp - // is in unicode mode and it would put us within a surrogate - // pair. In that case, advance past the code point as a whole. - if (_regExp.isUnicode && - _nextIndex + 1 < _string.length && - _isLeadSurrogate(_string.codeUnitAt(_nextIndex)) && - _isTrailSurrogate(_string.codeUnitAt(_nextIndex + 1))) { - nextIndex++; - } nextIndex++; } _nextIndex = nextIndex; @@ -290,6 +260,6 @@ class _AllMatchesIterator implements Iterator { } /// Find the first match of [regExp] in [string] at or after [start]. 
-RegExpMatch firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { +Match firstMatchAfter(JSSyntaxRegExp regExp, String string, int start) { return regExp._execGlobal(string, start); } diff --git a/sdk/lib/core/regexp.dart b/sdk/lib/core/regexp.dart index 62a35f06b21..c0bd153653e 100644 --- a/sdk/lib/core/regexp.dart +++ b/sdk/lib/core/regexp.dart @@ -16,7 +16,7 @@ part of dart.core; * for the specification of JavaScript regular expressions. * * [firstMatch] is the main implementation method that applies a regular - * expression to a string and returns the first [RegExpMatch]. All + * expression to a string and returns the first [Match]. All * other methods in [RegExp] can build on it. * * Use [allMatches] to look for all matches of a regular expression in @@ -27,7 +27,7 @@ part of dart.core; * ```dart * RegExp exp = new RegExp(r"(\w+)"); * String str = "Parse my string"; - * Iterable matches = exp.allMatches(str); + * Iterable matches = exp.allMatches(str); * ``` * * Note the use of a _raw string_ (a string prefixed with `r`) @@ -47,12 +47,6 @@ abstract class RegExp implements Pattern { * * If `caseSensitive` is disabled, then case is ignored. * - * If `unicode` is enabled, then the pattern is treated as a Unicode - * pattern as described by the ECMAScript standard. - * - * If `dotAll` is enabled, then the `.` pattern will match _all_ characters, - * including line terminators. - * * Example: * * ```dart @@ -66,10 +60,7 @@ abstract class RegExp implements Pattern { * interpolation is required. */ external factory RegExp(String source, - {bool multiLine = false, - bool caseSensitive = true, - bool unicode = false, - bool dotAll = false}); + {bool multiLine = false, bool caseSensitive = true}); /** * Returns a regular expression that matches [text]. @@ -88,14 +79,14 @@ abstract class RegExp implements Pattern { * Searches for the first match of the regular expression * in the string [input]. Returns `null` if there is no match. 
*/ - RegExpMatch firstMatch(String input); + Match firstMatch(String input); /** * Returns an iterable of the matches of the regular expression on [input]. * * If [start] is provided, only start looking for matches at `start`. */ - Iterable allMatches(String input, [int start = 0]); + Iterable allMatches(String input, [int start = 0]); /** * Returns whether the regular expression has a match in the string [input]. @@ -129,33 +120,6 @@ abstract class RegExp implements Pattern { * versions of the same letter. */ bool get isCaseSensitive; - - /** - * Whether this regular expression uses full Unicode matching. - * - * In Unicode mode, UTF-16 surrogate pairs in the original string will be - * treated as a single code point and will not match separately. Otherwise, - * the target string will be treated purely as a sequence of individual code - * units and surrogates will not be treated specially. - * - * In Unicode mode, the syntax of the RegExp pattern is more restricted, but - * some pattern features, like Unicode property escapes, are only available in - * this mode. - */ - bool get isUnicode; - - /** - * Whether "." in this regular expression matches line terminators. - * - * Normally, the "." character matches a single character, unless that - * character is a line terminator. If this feature is active, then the "." - * character will match any single character including line terminators. - * - * This feature is distinct from [isMultiline], as they affect the behavior - * of different pattern characters, and so they can be used together or - * separately. - */ - bool get isDotAll; } /** diff --git a/tests/corelib_2/regexp/dot-all_test.dart b/tests/corelib_2/regexp/dot-all_test.dart deleted file mode 100644 index 5610e7fd797..00000000000 --- a/tests/corelib_2/regexp/dot-all_test.dart +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2017 the V8 project authors. All rights reserved. 
-// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - // The flags accessors. 
- var re = new RegExp(r".", dotAll: true); - assertTrue(re.isCaseSensitive); - assertFalse(re.isMultiLine); - assertFalse(re.isUnicode); - assertTrue(re.isDotAll); - - re = new RegExp(r".", - caseSensitive: false, multiLine: true, unicode: true, dotAll: true); - assertFalse(re.isCaseSensitive); - assertTrue(re.isMultiLine); - assertTrue(re.isUnicode); - assertTrue(re.isDotAll); - - re = new RegExp(r".", caseSensitive: false, multiLine: true, unicode: true); - assertFalse(re.isCaseSensitive); - assertTrue(re.isMultiLine); - assertTrue(re.isUnicode); - assertFalse(re.isDotAll); - - // Default '.' behavior. - re = new RegExp(r"^.$"); - assertTrue(re.hasMatch("a")); - assertTrue(re.hasMatch("3")); - assertTrue(re.hasMatch("π")); - assertTrue(re.hasMatch("\u2027")); - assertTrue(re.hasMatch("\u0085")); - assertTrue(re.hasMatch("\v")); - assertTrue(re.hasMatch("\f")); - assertTrue(re.hasMatch("\u180E")); - assertFalse(re.hasMatch("\u{10300}")); // Supplementary plane. - assertFalse(re.hasMatch("\n")); - assertFalse(re.hasMatch("\r")); - assertFalse(re.hasMatch("\u2028")); - assertFalse(re.hasMatch("\u2029")); - - // Default '.' behavior (unicode). - re = new RegExp(r"^.$", unicode: true); - assertTrue(re.hasMatch("a")); - assertTrue(re.hasMatch("3")); - assertTrue(re.hasMatch("π")); - assertTrue(re.hasMatch("\u2027")); - assertTrue(re.hasMatch("\u0085")); - assertTrue(re.hasMatch("\v")); - assertTrue(re.hasMatch("\f")); - assertTrue(re.hasMatch("\u180E")); - assertTrue(re.hasMatch("\u{10300}")); // Supplementary plane. - assertFalse(re.hasMatch("\n")); - assertFalse(re.hasMatch("\r")); - assertFalse(re.hasMatch("\u2028")); - assertFalse(re.hasMatch("\u2029")); - - // DotAll '.' behavior. 
- re = new RegExp(r"^.$", dotAll: true); - assertTrue(re.hasMatch("a")); - assertTrue(re.hasMatch("3")); - assertTrue(re.hasMatch("π")); - assertTrue(re.hasMatch("\u2027")); - assertTrue(re.hasMatch("\u0085")); - assertTrue(re.hasMatch("\v")); - assertTrue(re.hasMatch("\f")); - assertTrue(re.hasMatch("\u180E")); - assertFalse(re.hasMatch("\u{10300}")); // Supplementary plane. - assertTrue(re.hasMatch("\n")); - assertTrue(re.hasMatch("\r")); - assertTrue(re.hasMatch("\u2028")); - assertTrue(re.hasMatch("\u2029")); - - // DotAll '.' behavior (unicode). - re = new RegExp(r"^.$", unicode: true, dotAll: true); - assertTrue(re.hasMatch("a")); - assertTrue(re.hasMatch("3")); - assertTrue(re.hasMatch("π")); - assertTrue(re.hasMatch("\u2027")); - assertTrue(re.hasMatch("\u0085")); - assertTrue(re.hasMatch("\v")); - assertTrue(re.hasMatch("\f")); - assertTrue(re.hasMatch("\u180E")); - assertTrue(re.hasMatch("\u{10300}")); // Supplementary plane. - assertTrue(re.hasMatch("\n")); - assertTrue(re.hasMatch("\r")); - assertTrue(re.hasMatch("\u2028")); - assertTrue(re.hasMatch("\u2029")); -} diff --git a/tests/corelib_2/regexp/lookbehind_test.dart b/tests/corelib_2/regexp/lookbehind_test.dart index 3fd11af5a67..5ccdcdb6253 100644 --- a/tests/corelib_2/regexp/lookbehind_test.dart +++ b/tests/corelib_2/regexp/lookbehind_test.dart @@ -433,6 +433,8 @@ void main() { assertThrows(() => new RegExp(r"(?<=.)?")); //# 01: ok assertThrows(() => new RegExp(r"(?<=.)+")); //# 01: ok - assertThrows(() => new RegExp(r"(?<=.)*", unicode: true)); //# 01: ok - assertThrows(() => new RegExp(r"(?<=.){1,2}", unicode: true)); //# 01: ok + // No unicode flag (yet), so can't test these. + // See https://github.com/dart-lang/sdk/issues/36170. 
+ // assertThrows("/(?<=.)*/u", SyntaxError); + // assertThrows("/(?<=.){1,2}/u", SyntaxError); } diff --git a/tests/corelib_2/regexp/named-captures_test.dart b/tests/corelib_2/regexp/named-captures_test.dart index ee2d843b385..aff61b82d8b 100644 --- a/tests/corelib_2/regexp/named-captures_test.dart +++ b/tests/corelib_2/regexp/named-captures_test.dart @@ -31,90 +31,35 @@ import 'package:expect/expect.dart'; import 'v8_regexp_utils.dart'; void main() { + void testRE(RegExp re, String input, bool expectedResult) { + if (expectedResult) { + assertTrue(re.hasMatch(input)); + } else { + assertFalse(re.hasMatch(input)); + } + } + void execRE(RegExp re, String input, List expectedResult) { assertTrue(re.hasMatch(input)); shouldBe(re.firstMatch(input), expectedResult); } - void execString(String pattern, String input, List expectedResult, - {bool unicode = true, bool caseSensitive: false}) { - execRE(RegExp(pattern, unicode: unicode, caseSensitive: caseSensitive), - input, expectedResult); - } - void namedRE(RegExp re, String input, Map expectedResults) { assertTrue(re.hasMatch(input)); - var match = re.firstMatch(input); + var match = re.firstMatch(input) as RegExpMatch; for (var s in expectedResults.keys) { assertEquals(match.namedGroup(s), expectedResults[s]); } } - void execStringGroups( - String pattern, String input, Map expectedResults, - {bool unicode = true, bool caseSensitive: false}) { - namedRE(RegExp(pattern, unicode: unicode, caseSensitive: caseSensitive), - input, expectedResults); - } - void hasNames(RegExp re, String input, List expectedResults) { assertTrue(re.hasMatch(input)); - var match = re.firstMatch(input); + var match = re.firstMatch(input) as RegExpMatch; for (var s in match.groupNames) { assertTrue(expectedResults.contains(s)); } } - void matchesIndexEqual(String input, RegExp re1, RegExp re2) { - var m1 = re1.firstMatch(input); - var m2 = re2.firstMatch(input); - if (m2 == null) { - assertNull(m1); - } else { - assertTrue(m1 != null); - 
assertEquals(m1.groupCount, m2.groupCount); - for (int i = 0; i < m1.groupCount; i++) { - assertEquals(m1.group(i), m2.group(i)); - } - } - } - - // Malformed named captures. - // Empty name. - assertThrows(() => RegExp(r"(?<>a)", unicode: true)); - // Unterminated name. - assertThrows(() => RegExp(r"(? RegExp(r"(?<42a>a)", unicode: true)); - // Name starting with invalid char. - assertThrows(() => RegExp(r"(?<:a>a)", unicode: true)); - // Name containing invalid char. - assertThrows(() => RegExp(r"(?a)", unicode: true)); - // Duplicate name. - assertThrows(() => RegExp(r"(?a)(?a)", unicode: true)); - // Duplicate name. - assertThrows(() => RegExp(r"(?a)(?b)(?a)", unicode: true)); - // Invalid reference. - assertThrows(() => RegExp(r"\k", unicode: true)); - // Unterminated reference. - assertThrows(() => RegExp(r"\k RegExp(r"\k", unicode: true)); - // Lone \k. - assertThrows(() => RegExp(r"(?.)\k", unicode: true)); - // Unterminated reference. - assertThrows(() => RegExp(r"(?.)\k RegExp(r"(?.)\k", unicode: true)); - // Invalid reference. - assertThrows(() => RegExp(r"(?a)\k", unicode: true)); - // Invalid reference. - assertThrows(() => RegExp(r"(?a)\k", unicode: true)); - // Invalid reference. - assertThrows(() => RegExp(r"\k(?a)", unicode: true)); - // Identity escape in capture. - assertThrows(() => RegExp(r"(?\a)", unicode: true)); - // Behavior in non-unicode mode. assertThrows(() => RegExp(r"(?<>a)")); assertThrows(() => RegExp(r"(? RegExp(r"\k.)")); assertThrows(() => RegExp(r"\k(?.)")); - // Basic named groups. 
- execString(r"(?a)", "bab", ["a", "a"]); - execString(r"(?a)", "bab", ["a", "a"]); - execString(r"(?<_>a)", "bab", ["a", "a"]); - execString(r"(?<$>a)", "bab", ["a", "a"]); - execString(r".(?<$>a).", "bab", ["bab", "a"]); - execString(r".(?a)(.)", "bab", ["bab", "a", "b"]); - execString(r".(?a)(?.)", "bab", ["bab", "a", "b"]); - execString(r".(?\w\w)", "bab", ["bab", "ab"]); - execString(r"(?\w\w\w)", "bab", ["bab", "bab"]); - execString(r"(?\w\w)(?\w)", "bab", ["bab", "ba", "b"]); - - execString(r"(?a)", "bab", ["a", "a"], unicode: false); - execString(r"(?a)", "bab", ["a", "a"], unicode: false); - execString(r"(?<_>a)", "bab", ["a", "a"], unicode: false); - execString(r"(?<$>a)", "bab", ["a", "a"], unicode: false); - execString(r".(?<$>a).", "bab", ["bab", "a"], unicode: false); - execString(r".(?a)(.)", "bab", ["bab", "a", "b"], unicode: false); - execString(r".(?a)(?.)", "bab", ["bab", "a", "b"], unicode: false); - execString(r".(?\w\w)", "bab", ["bab", "ab"], unicode: false); - execString(r"(?\w\w\w)", "bab", ["bab", "bab"], unicode: false); - execString(r"(?\w\w)(?\w)", "bab", ["bab", "ba", "b"], unicode: false); - - matchesIndexEqual( - "bab", RegExp(r"(?a)", unicode: true), RegExp(r"(a)", unicode: true)); - matchesIndexEqual("bab", RegExp(r"(?a)", unicode: true), - RegExp(r"(a)", unicode: true)); - matchesIndexEqual( - "bab", RegExp(r"(?<_>a)", unicode: true), RegExp(r"(a)", unicode: true)); - matchesIndexEqual( - "bab", RegExp(r"(?<$>a)", unicode: true), RegExp(r"(a)", unicode: true)); - matchesIndexEqual("bab", RegExp(r".(?<$>a).", unicode: true), - RegExp(r".(a).", unicode: true)); - matchesIndexEqual("bab", RegExp(r".(?a)(.)", unicode: true), - RegExp(r".(a)(.)", unicode: true)); - matchesIndexEqual("bab", RegExp(r".(?a)(?.)", unicode: true), - RegExp(r".(a)(.)", unicode: true)); - matchesIndexEqual("bab", RegExp(r".(?\w\w)", unicode: true), - RegExp(r".(\w\w)", unicode: true)); - matchesIndexEqual("bab", RegExp(r"(?\w\w\w)", unicode: true), - 
RegExp(r"(\w\w\w)", unicode: true)); - matchesIndexEqual("bab", RegExp(r"(?\w\w)(?\w)", unicode: true), - RegExp(r"(\w\w)(\w)", unicode: true)); - - execString(r"(?b).\1", "bab", ["bab", "b"]); - execString(r"(.)(?a)\1\2", "baba", ["baba", "b", "a"]); - execString(r"(.)(?a)(?\1)(\2)", "baba", ["baba", "b", "a", "b", "a"]); - execString(r"(?<)a", ">)a", ">a", [">a", ">"]); - - // Named references. - var pattern = r"(?.).\k"; - execString(pattern, "bab", ["bab", "b"]); - assertFalse(RegExp(pattern, unicode: true).hasMatch("baa")); - - // Nested groups. - pattern = r"(?.(?.(?.)))"; - execString(pattern, "bab", ["bab", "bab", "ab", "b"]); - execStringGroups(pattern, "bab", {"a": "bab", "b": "ab", "c": "b"}); - - // Reference inside group. - pattern = r"(?\k\w).."; - execString(pattern, "bab", ["bab", "b"]); - execStringGroups(pattern, "bab", {"a": "b"}); - - // Reference before group. - pattern = r"\k(?b)\w\k"; - execString(pattern, "bab", ["bab", "b"], unicode: false); - execString(pattern, "bab", ["bab", "b"]); - execStringGroups(pattern, "bab", {"a": "b"}); - - pattern = r"(?b)\k(?a)\k"; - execString(pattern, "bab", ["bab", "b", "a"], unicode: false); - execString(pattern, "bab", ["bab", "b", "a"]); - execStringGroups(pattern, "bab", {"a": "a", "b": "b"}); - - // Reference named groups. - var match = RegExp(r"(?a)(?b)\k", unicode: true).firstMatch("aba"); - assertEquals("a", match.namedGroup("a")); - assertEquals("b", match.namedGroup("b")); - assertFalse(match.groupNames.contains("c")); - - match = - RegExp(r"(?a)(?b)\k|(?c)", unicode: true).firstMatch("aba"); - assertNull(match.namedGroup("c")); - - // Unicode names. 
- execStringGroups(r"(?<π>a)", "bab", {"π": "a"}); - execStringGroups(r"(?<\u{03C0}>a)", "bab", {"π": "a"}); - execStringGroups(r"(?<π>a)", "bab", {"\u03C0": "a"}); - execStringGroups(r"(?<\u{03C0}>a)", "bab", {"\u03C0": "a"}); - execStringGroups(r"(?<$>a)", "bab", {"\$": "a"}); - execStringGroups(r"(?<_>a)", "bab", {"_": "a"}); - execStringGroups(r"(?<$𐒤>a)", "bab", {"\$𐒤": "a"}); - execStringGroups(r"(?<_\u200C>a)", "bab", {"_\u200C": "a"}); - execStringGroups(r"(?<_\u200D>a)", "bab", {"_\u200D": "a"}); - execStringGroups(r"(?<ಠ_ಠ>a)", "bab", {"ಠ_ಠ": "a"}); - // ID_Continue but not ID_Start. - assertThrows(() => RegExp(r"/(?<❤>a)", unicode: true)); - assertThrows(() => RegExp(r"/(?<𐒤>a)", unicode: true)); - - execStringGroups(r"(?<π>a)", "bab", {"π": "a"}, unicode: false); - execStringGroups(r"(?<$>a)", "bab", {"\$": "a"}, unicode: false); - execStringGroups(r"(?<_>a)", "bab", {"_": "a"}, unicode: false); - assertThrows(() => RegExp(r"(?<$𐒤>a)")); - execStringGroups(r"(?<ಠ_ಠ>a)", "bab", {"ಠ_ಠ": "a"}, unicode: false); - // ID_Continue but not ID_Start. - assertThrows(() => RegExp(r"/(?<❤>a)")); - assertThrows(() => RegExp(r"/(?<𐒤>a)")); - - // Interaction with lookbehind assertions. - pattern = r"(?<=(?\w){3})f"; - execString(pattern, "abcdef", ["f", "c"]); - execStringGroups(pattern, "abcdef", {"a": "c"}); - - execStringGroups(r"(?<=(?\w){4})f", "abcdef", {"a": "b"}); - execStringGroups(r"(?<=(?\w)+)f", "abcdef", {"a": "a"}); - assertFalse(RegExp(r"(?<=(?\w){6})f", unicode: true).hasMatch("abcdef")); - - execString(r"((?<=\w{3}))f", "abcdef", ["f", ""]); - execString(r"(?(?<=\w{3}))f", "abcdef", ["f", ""]); - - execString(r"(?\d){3})f", "abcdef", ["f", null]); - assertFalse(RegExp(r"(?\D){3})f", unicode: true).hasMatch("abcdef")); - - execString(r"(?\D){3})f|f", "abcdef", ["f", null]); - execString(r"(?(?.)|(?.)", unicode: true).firstMatch("abcd"); - Expect.setEquals(["fst", "snd"], match.groupNames); - - // Backslash as ID_Start and ID_Continue (v8:5868). 
- assertThrows(() => RegExp("(?<\\>.)")); // '\' misclassified as ID_Start. - assertThrows(() => RegExp("(?.)")); // '\' misclassified as ID_Continue. - - // Backreference before the group (exercises the capture mini-parser). - assertThrows(() => RegExp(r"/\1(?:.)", unicode: true)); - assertThrows(() => RegExp(r"/\1(?<=a).", unicode: true)); - assertThrows(() => RegExp(r"/\1(?.)", "abcd", ["a", "a"]); - - // Unicode escapes in capture names. (Testing both unicode interpreted by - // Dart string handling and also escaped unicode making it to RegExp parser.) - - // \u Lead \u Trail - assertTrue(RegExp("(?.)", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"(?.)", unicode: true).hasMatch("a")); - assertThrows(() => RegExp("(?.)", unicode: true)); // \u Lead - assertThrows(() => RegExp(r"(?.)", unicode: true)); // \u Lead - assertThrows(() => RegExp("(?.)", unicode: true)); // \u Trail - assertThrows(() => RegExp(r"(?.)", unicode: true)); // \u Trail - // \u NonSurrogate - assertTrue(RegExp("(?<\u0041>.)", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"(?<\u0041>.)", unicode: true).hasMatch("a")); - // \u{ Surrogate, ID_Continue } - assertTrue(RegExp("(?.)", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"(?.)", unicode: true).hasMatch("a")); - - // \u{ Out-of-bounds } -- only need to test RegExp parser for this. - assertThrows(() => RegExp(r"(?.)", unicode: true)); - - // Also checking non-unicode patterns, where surrogate pairs will not - // be combined (so only \u0041 will have any success). 
- - assertThrows(() => RegExp("(?.)")); - assertThrows(() => RegExp(r"(?.)")); - assertThrows(() => RegExp("(?.)")); - assertThrows(() => RegExp(r"(?.)")); - assertThrows(() => RegExp("(?.)")); - assertThrows(() => RegExp(r"(?.)")); - assertTrue(RegExp("(?<\u0041>.)").hasMatch("a")); - assertTrue(RegExp(r"(?<\u0041>.)").hasMatch("a")); - assertThrows(() => RegExp("(?.)")); - assertThrows(() => RegExp(r"(?.)")); - assertThrows(() => RegExp("(?.)")); - assertThrows(() => RegExp(r"(?.)")); - assertThrows(() => RegExp(r"(?.)")); + // TODO(sstrickl): Add more tests when unicode flag support is in. + // https://github.com/dart-lang/sdk/issues/36170 } diff --git a/tests/corelib_2/regexp/unicode-character-ranges_test.dart b/tests/corelib_2/regexp/unicode-character-ranges_test.dart deleted file mode 100644 index 7154d0fa0bd..00000000000 --- a/tests/corelib_2/regexp/unicode-character-ranges_test.dart +++ /dev/null @@ -1,160 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2011 the V8 project authors. All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL APPLE INC. 
OR ITS CONTRIBUTORS BE LIABLE FOR ANY -// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -// ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void execl(List expectation, RegExp re, String subject) { - shouldBe(re.firstMatch(subject), expectation); -} - -void execs(List expectation, String pattern, String subject) { - final re = RegExp(pattern, unicode: true); - shouldBe(re.firstMatch(subject), expectation); -} - -void main() { - // Character ranges. - execs(["A"], r"[A-D]", "A"); - execs(["ABCD"], r"[A-D]+", "ZABCDEF"); - - execs(["\u{12345}"], r"[\u1234-\u{12345}]", "\u{12345}"); - execs(null, r"[^\u1234-\u{12345}]", "\u{12345}"); - - execs(["\u{1234}"], r"[\u1234-\u{12345}]", "\u{1234}"); - execs(null, r"[^\u1234-\u{12345}]", "\u{1234}"); - - execs(null, r"[\u1234-\u{12345}]", "\u{1233}"); - execs(["\u{1233}"], r"[^\u1234-\u{12345}]", "\u{1233}"); - - execs(["\u{12346}"], r"[^\u1234-\u{12345}]", "\u{12346}"); - execs(null, r"[\u1234-\u{12345}]", "\u{12346}"); - - execs(["\u{12342}"], r"[\u{12340}-\u{12345}]", "\u{12342}"); - execs(["\u{12342}"], r"[\ud808\udf40-\ud808\udf45]", "\u{12342}"); - execs(null, r"[^\u{12340}-\u{12345}]", "\u{12342}"); - execs(null, r"[^\ud808\udf40-\ud808\udf45]", "\u{12342}"); - - execs(["\u{ffff}"], r"[\u{ff80}-\u{12345}]", "\u{ffff}"); - execs(["\u{ffff}"], r"[\u{ff80}-\ud808\udf45]", "\u{ffff}"); - execs(null, r"[^\u{ff80}-\u{12345}]", "\u{ffff}"); - execs(null, r"[^\u{ff80}-\ud808\udf45]", "\u{ffff}"); - - // Lone surrogate - execs(["\udc00"], r"[^\u{ff80}-\u{12345}]", "\uff99\u{dc00}A"); - 
execs(["\udc01"], r"[\u0100-\u{10ffff}]", "A\udc01"); - execs(["\udc03"], r"[\udc01-\udc03]", "\ud801\udc02\udc03"); - execs(["\ud801"], r"[\ud801-\ud803]", "\ud802\udc01\ud801"); - - // Paired surrogate. - execs(null, r"[^\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}"); - execs(["\ud800\udc00"], r"[\u{ff80}-\u{12345}]", "\u{d800}\u{dc00}"); - execs(["foo\u{10e6d}bar"], r"foo\ud803\ude6dbar", "foo\u{10e6d}bar"); - - // Lone surrogates - execs(["\ud801\ud801"], r"\ud801+", "\ud801\udc01\ud801\ud801"); - execs(["\udc01\udc01"], r"\udc01+", "\ud801\ud801\udc01\udc01\udc01"); - - execs(["\udc02\udc03A"], r"\W\WA", "\ud801\udc01A\udc02\udc03A"); - execs(["\ud801\ud802"], r"\ud801.", "\ud801\udc01\ud801\ud802"); - execs(["\udc02\udc03A"], r"[\ud800-\udfff][\ud800-\udfff]A", - "\ud801\udc01A\udc02\udc03A"); - - // Character classes - execs(null, r"\w", "\ud801\udc01"); - execl(["\ud801"], RegExp(r"[^\w]"), "\ud801\udc01"); - execs(["\ud801\udc01"], r"[^\w]", "\ud801\udc01"); - execl(["\ud801"], RegExp(r"\W"), "\ud801\udc01"); - execs(["\ud801\udc01"], r"\W", "\ud801\udc01"); - - execs(["\ud800X"], r".X", "\ud800XaX"); - execs(["aX"], r".(? 
RegExp(r"\u", unicode: true)); - assertThrows(() => RegExp(r"\u12", unicode: true)); - assertThrows(() => RegExp(r"\ufoo", unicode: true)); - assertThrows(() => RegExp(r"\x", unicode: true)); - assertThrows(() => RegExp(r"\xfoo", unicode: true)); - assertThrows(() => RegExp(r"\z", unicode: true)); - assertThrows(() => RegExp(r"\8", unicode: true)); - assertThrows(() => RegExp(r"\9", unicode: true)); - - assertThrows(() => RegExp("\\u", unicode: true)); - assertThrows(() => RegExp("\\u12", unicode: true)); - assertThrows(() => RegExp("\\ufoo", unicode: true)); - assertThrows(() => RegExp("\\x", unicode: true)); - assertThrows(() => RegExp("\\xfoo", unicode: true)); - assertThrows(() => RegExp("\\z", unicode: true)); - assertThrows(() => RegExp("\\8", unicode: true)); - assertThrows(() => RegExp("\\9", unicode: true)); -} - -void TestTooBigHexEscape() { - // The hex number inside \u{} has a maximum value. - RegExp(r"\u{10ffff}", unicode: true); - RegExp("\\u{10ffff}", unicode: true); - assertThrows(() => RegExp(r"\u{110000}", unicode: true)); - assertThrows(() => RegExp("\\u{110000}", unicode: true)); - - // Without the u flag, they're of course fine ({x} is the count). - RegExp(r"\u{110000}"); - RegExp("\\u{110000}"); -} - -void TestSyntaxEscapes() { - // Syntax escapes work the same with or without the u flag. - void helper(RegExp r) { - assertTrue(r.hasMatch("foo[bar")); - assertFalse(r.hasMatch("foo]bar")); - } - - helper(RegExp(r"foo\[bar")); - helper(RegExp("foo\\[bar")); - helper(RegExp(r"foo\[bar", unicode: true)); - helper(RegExp("foo\\[bar", unicode: true)); -} - -void TestUnicodeSurrogates() { - // U+10E6D corresponds to the surrogate pair [U+D803, U+DE6D]. 
- void helper(RegExp r) { - assertTrue(r.hasMatch("foo\u{10e6d}bar")); - } - - helper(RegExp(r"foo\ud803\ude6dbar", unicode: true)); - helper(RegExp("foo\\ud803\\ude6dbar", unicode: true)); -} - -void main() { - TestUnicodeEscapes(); - TestUnicodeEscapesInCharacterClasses(); - TestBraceEscapesWithoutUnicodeFlag(); - TestInvalidEscapes(); - TestTooBigHexEscape(); - TestSyntaxEscapes(); - TestUnicodeSurrogates(); - - // Non-BMP patterns. - // Single character atom. - assertTrue(RegExp("\u{12345}", unicode: true).hasMatch("\u{12345}")); - assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\u{12345}")); - assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\ud808\udf45")); - assertTrue(RegExp(r"\u{12345}", unicode: true).hasMatch("\ud808\udf45")); - assertFalse(RegExp(r"\u{12345}", unicode: true).hasMatch("\udf45")); - assertFalse(RegExp(r"\u{12345}", unicode: true).hasMatch("\udf45")); - - // Multi-character atom. - assertTrue(RegExp(r"\u{12345}\u{23456}", unicode: true) - .hasMatch("a\u{12345}\u{23456}b")); - assertTrue(RegExp(r"\u{12345}\u{23456}", unicode: true) - .hasMatch("b\u{12345}\u{23456}c")); - assertFalse(RegExp(r"\u{12345}\u{23456}", unicode: true) - .hasMatch("a\udf45\u{23456}b")); - assertFalse(RegExp(r"\u{12345}\u{23456}", unicode: true) - .hasMatch("b\udf45\u{23456}c")); - - // Disjunction. - assertTrue(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) - .hasMatch("a\u{12345}\u{23456}b")); - assertTrue(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) - .hasMatch("b\u{12345}\u{23456}c")); - assertFalse(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) - .hasMatch("a\udf45\u{23456}b")); - assertFalse(RegExp(r"\u{12345}(?:\u{23456})", unicode: true) - .hasMatch("b\udf45\u{23456}c")); - - // Alternative. 
- assertTrue( - RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("a\u{12345}b")); - assertTrue( - RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("b\u{23456}c")); - assertFalse( - RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("a\udf45\ud84db")); - assertFalse( - RegExp(r"\u{12345}|\u{23456}", unicode: true).hasMatch("b\udf45\ud808c")); - - // Capture. - assertTrue(RegExp("(\u{12345}|\u{23456}).\\1", unicode: true) - .hasMatch("\u{12345}b\u{12345}")); - assertTrue(RegExp(r"(\u{12345}|\u{23456}).\1", unicode: true) - .hasMatch("\u{12345}b\u{12345}")); - assertFalse(RegExp("(\u{12345}|\u{23456}).\\1", unicode: true) - .hasMatch("\u{12345}b\u{23456}")); - assertFalse(RegExp(r"(\u{12345}|\u{23456}).\1", unicode: true) - .hasMatch("\u{12345}b\u{23456}")); - - // Quantifier. - assertTrue(RegExp("\u{12345}{3}", unicode: true) - .hasMatch("\u{12345}\u{12345}\u{12345}")); - assertTrue(RegExp(r"\u{12345}{3}", unicode: true) - .hasMatch("\u{12345}\u{12345}\u{12345}")); - assertTrue(RegExp("\u{12345}{3}").hasMatch("\u{12345}\udf45\udf45")); - assertFalse(RegExp(r"\ud808\udf45{3}", unicode: true) - .hasMatch("\u{12345}\udf45\udf45")); - assertTrue(RegExp(r"\ud808\udf45{3}", unicode: true) - .hasMatch("\u{12345}\u{12345}\u{12345}")); - assertFalse( - RegExp("\u{12345}{3}", unicode: true).hasMatch("\u{12345}\udf45\udf45")); - assertFalse( - RegExp(r"\u{12345}{3}", unicode: true).hasMatch("\u{12345}\udf45\udf45")); - - // Literal surrogates. 
- shouldBe( - RegExp("\ud800\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}"), - ["\u{10000}\u{10000}"]); - shouldBe( - RegExp("\\ud800\\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}"), - ["\u{10000}\u{10000}"]); - - shouldBe( - RegExp("[\\ud800\\udc03-\\ud900\\udc01\]+", unicode: true) - .firstMatch("\u{10003}\u{50001}"), - ["\u{10003}\u{50001}"]); - shouldBe( - RegExp("[\ud800\udc03-\u{50001}\]+", unicode: true) - .firstMatch("\u{10003}\u{50001}"), - ["\u{10003}\u{50001}"]); - - // Unicode escape sequences to represent a non-BMP character cannot have - // mixed notation, and must follow the rules for RegExpUnicodeEscapeSequence. - assertThrows(() => RegExp("[\\ud800\udc03-\ud900\\udc01\]+", unicode: true)); - assertNull( - RegExp("\\ud800\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}")); - assertNull( - RegExp("\ud800\\udc00+", unicode: true).firstMatch("\u{10000}\u{10000}")); - - assertNull(RegExp("[\\ud800\udc00]", unicode: true).firstMatch("\u{10000}")); - assertNull( - RegExp("[\\{ud800}\udc00]", unicode: true).firstMatch("\u{10000}")); - assertNull(RegExp("[\ud800\\udc00]", unicode: true).firstMatch("\u{10000}")); - assertNull( - RegExp("[\ud800\\{udc00}]", unicode: true).firstMatch("\u{10000}")); - - assertNull(RegExp(r"\u{d800}\u{dc00}+", unicode: true) - .firstMatch("\ud800\udc00\udc00")); - assertNull(RegExp(r"\ud800\u{dc00}+", unicode: true) - .firstMatch("\ud800\udc00\udc00")); - assertNull(RegExp(r"\u{d800}\udc00+", unicode: true) - .firstMatch("\ud800\udc00\udc00")); -} diff --git a/tests/corelib_2/regexp/unicode-property-binary_test.dart b/tests/corelib_2/regexp/unicode-property-binary_test.dart deleted file mode 100644 index b8981a7fb0b..00000000000 --- a/tests/corelib_2/regexp/unicode-property-binary_test.dart +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. 
-// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - void t(RegExp re, String s) { - assertTrue(re.hasMatch(s)); - } - - void f(RegExp re, String s) { - assertFalse(re.hasMatch(s)); - } - - assertThrows(() => RegExp("\\p{Hiragana}", unicode: true)); - assertThrows(() => RegExp("\\p{Bidi_Class}", unicode: true)); - assertThrows(() => RegExp("\\p{Bidi_C=False}", unicode: true)); - assertThrows(() => RegExp("\\P{Bidi_Control=Y}", unicode: true)); - assertThrows(() => RegExp("\\p{AHex=Yes}", unicode: true)); - - assertThrows(() => RegExp("\\p{Composition_Exclusion}", unicode: true)); - assertThrows(() => RegExp("\\p{CE}", unicode: true)); - assertThrows(() => RegExp("\\p{Full_Composition_Exclusion}", unicode: true)); - assertThrows(() => RegExp("\\p{Comp_Ex}", unicode: true)); - assertThrows(() => RegExp("\\p{Grapheme_Link}", unicode: true)); - assertThrows(() => RegExp("\\p{Gr_Link}", unicode: true)); - assertThrows(() => RegExp("\\p{Hyphen}", unicode: true)); - assertThrows(() => RegExp("\\p{NFD_Inert}", unicode: true)); - assertThrows(() => RegExp("\\p{NFDK_Inert}", unicode: true)); - assertThrows(() => RegExp("\\p{NFC_Inert}", unicode: true)); - assertThrows(() => RegExp("\\p{NFKC_Inert}", unicode: true)); - assertThrows(() => RegExp("\\p{Segment_Starter}", unicode: true)); - - t(RegExp(r"\p{Alphabetic}", unicode: true), "æ"); - f(RegExp(r"\p{Alpha}", unicode: true), "1"); - - t(RegExp(r"\p{ASCII_Hex_Digit}", unicode: true), "f"); - f(RegExp(r"\p{AHex}", unicode: true), "g"); - - t(RegExp(r"\p{Bidi_Control}", unicode: true), "\u200e"); - f(RegExp(r"\p{Bidi_C}", unicode: true), "g"); - - t(RegExp(r"\p{Bidi_Mirrored}", unicode: true), "("); - f(RegExp(r"\p{Bidi_M}", unicode: true), "-"); - - t(RegExp(r"\p{Case_Ignorable}", unicode: true), "\u02b0"); - f(RegExp(r"\p{CI}", unicode: true), "a"); - - t(RegExp(r"\p{Changes_When_Casefolded}", unicode: true), "B"); - f(RegExp(r"\p{CWCF}", unicode: true), "1"); - - 
t(RegExp(r"\p{Changes_When_Casemapped}", unicode: true), "b"); - f(RegExp(r"\p{CWCM}", unicode: true), "1"); - - t(RegExp(r"\p{Changes_When_Lowercased}", unicode: true), "B"); - f(RegExp(r"\p{CWL}", unicode: true), "1"); - - t(RegExp(r"\p{Changes_When_Titlecased}", unicode: true), "b"); - f(RegExp(r"\p{CWT}", unicode: true), "1"); - - t(RegExp(r"\p{Changes_When_Uppercased}", unicode: true), "b"); - f(RegExp(r"\p{CWU}", unicode: true), "1"); - - t(RegExp(r"\p{Dash}", unicode: true), "-"); - f(RegExp(r"\p{Dash}", unicode: true), "1"); - - t(RegExp(r"\p{Default_Ignorable_Code_Point}", unicode: true), "\u00ad"); - f(RegExp(r"\p{DI}", unicode: true), "1"); - - t(RegExp(r"\p{Deprecated}", unicode: true), "\u17a3"); - f(RegExp(r"\p{Dep}", unicode: true), "1"); - - t(RegExp(r"\p{Diacritic}", unicode: true), "\u0301"); - f(RegExp(r"\p{Dia}", unicode: true), "1"); - - t(RegExp(r"\p{Emoji}", unicode: true), "\u2603"); - f(RegExp(r"\p{Emoji}", unicode: true), "x"); - - t(RegExp(r"\p{Emoji_Component}", unicode: true), "\u{1F1E6}"); - f(RegExp(r"\p{Emoji_Component}", unicode: true), "x"); - - t(RegExp(r"\p{Emoji_Modifier_Base}", unicode: true), "\u{1F6CC}"); - f(RegExp(r"\p{Emoji_Modifier_Base}", unicode: true), "x"); - - t(RegExp(r"\p{Emoji_Modifier}", unicode: true), "\u{1F3FE}"); - f(RegExp(r"\p{Emoji_Modifier}", unicode: true), "x"); - - t(RegExp(r"\p{Emoji_Presentation}", unicode: true), "\u{1F308}"); - f(RegExp(r"\p{Emoji_Presentation}", unicode: true), "x"); - - t(RegExp(r"\p{Extender}", unicode: true), "\u3005"); - f(RegExp(r"\p{Ext}", unicode: true), "x"); - - t(RegExp(r"\p{Grapheme_Base}", unicode: true), " "); - f(RegExp(r"\p{Gr_Base}", unicode: true), "\u0010"); - - t(RegExp(r"\p{Grapheme_Extend}", unicode: true), "\u0300"); - f(RegExp(r"\p{Gr_Ext}", unicode: true), "x"); - - t(RegExp(r"\p{Hex_Digit}", unicode: true), "a"); - f(RegExp(r"\p{Hex}", unicode: true), "g"); - - t(RegExp(r"\p{ID_Continue}", unicode: true), "1"); - f(RegExp(r"\p{IDC}", unicode: true), "."); 
- - t(RegExp(r"\p{ID_Start}", unicode: true), "a"); - f(RegExp(r"\p{IDS}", unicode: true), "1"); - - t(RegExp(r"\p{Ideographic}", unicode: true), "漢"); - f(RegExp(r"\p{Ideo}", unicode: true), "H"); - - t(RegExp(r"\p{IDS_Binary_Operator}", unicode: true), "\u2FF0"); - f(RegExp(r"\p{IDSB}", unicode: true), "a"); - - t(RegExp(r"\p{IDS_Trinary_Operator}", unicode: true), "\u2FF2"); - f(RegExp(r"\p{IDST}", unicode: true), "a"); - - t(RegExp(r"\p{Join_Control}", unicode: true), "\u200c"); - f(RegExp(r"\p{Join_C}", unicode: true), "a"); - - t(RegExp(r"\p{Logical_Order_Exception}", unicode: true), "\u0e40"); - f(RegExp(r"\p{LOE}", unicode: true), "a"); - - t(RegExp(r"\p{Lowercase}", unicode: true), "a"); - f(RegExp(r"\p{Lower}", unicode: true), "A"); - - t(RegExp(r"\p{Math}", unicode: true), "="); - f(RegExp(r"\p{Math}", unicode: true), "A"); - - t(RegExp(r"\p{Noncharacter_Code_Point}", unicode: true), "\uFDD0"); - f(RegExp(r"\p{NChar}", unicode: true), "A"); - - t(RegExp(r"\p{Pattern_Syntax}", unicode: true), "\u0021"); - f(RegExp(r"\p{NChar}", unicode: true), "A"); - - t(RegExp(r"\p{Pattern_White_Space}", unicode: true), "\u0009"); - f(RegExp(r"\p{Pat_Syn}", unicode: true), "A"); - - t(RegExp(r"\p{Quotation_Mark}", unicode: true), "'"); - f(RegExp(r"\p{QMark}", unicode: true), "A"); - - t(RegExp(r"\p{Radical}", unicode: true), "\u2FAD"); - f(RegExp(r"\p{Radical}", unicode: true), "A"); - - t(RegExp(r"\p{Regional_Indicator}", unicode: true), "\u{1F1E6}"); - f(RegExp(r"\p{Regional_Indicator}", unicode: true), "A"); - - t(RegExp(r"\p{Sentence_Terminal}", unicode: true), "!"); - f(RegExp(r"\p{STerm}", unicode: true), "A"); - - t(RegExp(r"\p{Soft_Dotted}", unicode: true), "i"); - f(RegExp(r"\p{SD}", unicode: true), "A"); - - t(RegExp(r"\p{Terminal_Punctuation}", unicode: true), "."); - f(RegExp(r"\p{Term}", unicode: true), "A"); - - t(RegExp(r"\p{Unified_Ideograph}", unicode: true), "\u4e00"); - f(RegExp(r"\p{UIdeo}", unicode: true), "A"); - - t(RegExp(r"\p{Uppercase}", 
unicode: true), "A"); - f(RegExp(r"\p{Upper}", unicode: true), "a"); - - t(RegExp(r"\p{Variation_Selector}", unicode: true), "\uFE00"); - f(RegExp(r"\p{VS}", unicode: true), "A"); - - t(RegExp(r"\p{White_Space}", unicode: true), " "); - f(RegExp(r"\p{WSpace}", unicode: true), "A"); - - t(RegExp(r"\p{XID_Continue}", unicode: true), "1"); - f(RegExp(r"\p{XIDC}", unicode: true), " "); - - t(RegExp(r"\p{XID_Start}", unicode: true), "A"); - f(RegExp(r"\p{XIDS}", unicode: true), " "); -} diff --git a/tests/corelib_2/regexp/unicode-property-char-class_test.dart b/tests/corelib_2/regexp/unicode-property-char-class_test.dart deleted file mode 100644 index db87fada0ad..00000000000 --- a/tests/corelib_2/regexp/unicode-property-char-class_test.dart +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2011 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - assertThrows(() => RegExp("[\\p]", unicode: true)); - assertThrows(() => RegExp("[\\p{garbage}]", unicode: true)); - assertThrows(() => RegExp("[\\p{}]", unicode: true)); - assertThrows(() => RegExp("[\\p{]", unicode: true)); - assertThrows(() => RegExp("[\\p}]", unicode: true)); - assertThrows(() => RegExp("^[\\p{Lu}-\\p{Ll}]+\$", unicode: true)); - - assertTrue(RegExp(r"^[\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABCabc")); - assertTrue(RegExp(r"^[\p{Lu}-]+$", unicode: true).hasMatch("ABC-")); - assertFalse(RegExp(r"^[\P{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABCabc")); - assertTrue(RegExp(r"^[\P{Lu}\p{Ll}]+$", unicode: true).hasMatch("abc")); - assertTrue(RegExp(r"^[\P{Lu}]+$", unicode: true).hasMatch("abc123")); - assertFalse(RegExp(r"^[\P{Lu}]+$", unicode: true).hasMatch("XYZ")); - assertTrue(RegExp(r"[\p{Math}]", unicode: true).hasMatch("+")); - assertTrue(RegExp(r"[\P{Bidi_M}]", unicode: true).hasMatch(" ")); - assertTrue(RegExp(r"[\p{Hex}]", unicode: true).hasMatch("A")); - - assertTrue(RegExp(r"^[^\P{Lu}]+$", unicode: true).hasMatch("XYZ")); - assertFalse(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("abc")); - assertFalse(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("ABC")); - assertTrue(RegExp(r"^[^\p{Lu}\p{Ll}]+$", unicode: true).hasMatch("123")); - assertTrue(RegExp(r"^[^\p{Lu}\P{Ll}]+$", unicode: 
true).hasMatch("abc")); -} diff --git a/tests/corelib_2/regexp/unicode-property-enumerated_test.dart b/tests/corelib_2/regexp/unicode-property-enumerated_test.dart deleted file mode 100644 index cfb3e029e15..00000000000 --- a/tests/corelib_2/regexp/unicode-property-enumerated_test.dart +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - assertThrows(() => RegExp("\\p{Bidi_Class=L}+", unicode: true)); - assertThrows(() => RegExp("\\p{bc=Left_To_Right}+", unicode: true)); - assertThrows(() => RegExp("\\p{bc=AL}+", unicode: true)); - assertThrows(() => RegExp("\\p{bc=Arabic_Letter}+", unicode: true)); - - assertThrows(() => RegExp("\\p{Line_Break=Glue}", unicode: true)); - assertThrows(() => RegExp("\\p{lb=AL}", unicode: true)); - - assertThrows(() => RegExp("\\p{Block=}", unicode: true)); - assertThrows(() => RegExp("\\p{=}", unicode: true)); - assertThrows(() => RegExp("\\p{=L}", unicode: true)); - assertThrows(() => RegExp("\\p{=Hiragana}", unicode: true)); - assertThrows(() => RegExp("\\p{Block=CJK=}", unicode: true)); - - assertThrows(() => RegExp("\\p{Age=V8_0}", unicode: true)); - assertDoesNotThrow( - () => RegExp("\\p{General_Category=Letter}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{gc=L}", unicode: true)); - assertThrows( - () => RegExp("\\p{General_Category_Mask=Letter}", unicode: true)); - assertThrows(() => RegExp("\\p{gcm=L}", unicode: true)); -} diff --git a/tests/corelib_2/regexp/unicode-property-exact-match_test.dart b/tests/corelib_2/regexp/unicode-property-exact-match_test.dart deleted file mode 100644 index bbf99d0ab76..00000000000 --- a/tests/corelib_2/regexp/unicode-property-exact-match_test.dart +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. 
-// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - assertThrows(() => RegExp("\\p{In CJK}", unicode: true)); - assertThrows(() => RegExp("\\p{InCJKUnifiedIdeographs}", unicode: true)); - assertThrows(() => RegExp("\\p{InCJK}", unicode: true)); - assertThrows(() => RegExp("\\p{InCJK_Unified_Ideographs}", unicode: true)); - - assertThrows(() => RegExp("\\p{InCyrillic_Sup}", unicode: true)); - assertThrows(() => RegExp("\\p{InCyrillic_Supplement}", unicode: true)); - assertThrows(() => RegExp("\\p{InCyrillic_Supplementary}", unicode: true)); - assertThrows(() => RegExp("\\p{InCyrillicSupplementary}", unicode: true)); - assertThrows(() => RegExp("\\p{InCyrillic_supplementary}", unicode: true)); - - assertDoesNotThrow(() => RegExp("\\p{C}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{Other}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{Cc}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{Control}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{cntrl}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{M}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{Mark}", unicode: true)); - assertDoesNotThrow(() => RegExp("\\p{Combining_Mark}", unicode: true)); - assertThrows(() => RegExp("\\p{Combining Mark}", unicode: true)); - - assertDoesNotThrow(() => RegExp("\\p{Script=Copt}", unicode: true)); - assertThrows(() => RegExp("\\p{Coptic}", unicode: true)); - assertThrows(() => RegExp("\\p{Qaac}", unicode: true)); - assertThrows(() => RegExp("\\p{Egyp}", unicode: true)); - assertDoesNotThrow( - () => RegExp("\\p{Script=Egyptian_Hieroglyphs}", unicode: true)); - assertThrows(() => RegExp("\\p{EgyptianHieroglyphs}", unicode: true)); - - assertThrows(() => RegExp("\\p{BidiClass=LeftToRight}", unicode: true)); - assertThrows(() => RegExp("\\p{BidiC=LeftToRight}", unicode: true)); - assertThrows(() => RegExp("\\p{bidi_c=Left_To_Right}", unicode: true)); - - assertThrows(() => 
RegExp("\\p{Block=CJK}", unicode: true)); - assertThrows(() => RegExp("\\p{Block = CJK}", unicode: true)); - assertThrows(() => RegExp("\\p{Block=cjk}", unicode: true)); - assertThrows(() => RegExp("\\p{BLK=CJK}", unicode: true)); -} diff --git a/tests/corelib_2/regexp/unicode-property-general-category_test.dart b/tests/corelib_2/regexp/unicode-property-general-category_test.dart deleted file mode 100644 index cb94cacd43e..00000000000 --- a/tests/corelib_2/regexp/unicode-property-general-category_test.dart +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - assertThrows(() => RegExp("\\p", unicode: true)); - assertThrows(() => RegExp("\\p{garbage}", unicode: true)); - assertThrows(() => RegExp("\\p{}", unicode: true)); - assertThrows(() => RegExp("\\p{", unicode: true)); - assertThrows(() => RegExp("\\p}", unicode: true)); - assertThrows(() => RegExp("\\pL", unicode: true)); - assertThrows(() => RegExp("\\P", unicode: true)); - assertThrows(() => RegExp("\\P{garbage}", unicode: true)); - assertThrows(() => RegExp("\\P{}", unicode: true)); - assertThrows(() => RegExp("\\P{", unicode: true)); - assertThrows(() => RegExp("\\P}", unicode: true)); - assertThrows(() => RegExp("\\PL", unicode: true)); - - assertTrue(RegExp(r"\p{Ll}", unicode: true).hasMatch("a")); - assertFalse(RegExp(r"\P{Ll}", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\P{Ll}", unicode: true).hasMatch("A")); - assertFalse(RegExp(r"\p{Ll}", unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\p{Ll}", unicode: true).hasMatch("\u{1D7BE}")); - assertFalse(RegExp(r"\P{Ll}", unicode: true).hasMatch("\u{1D7BE}")); - assertFalse(RegExp(r"\p{Ll}", unicode: true).hasMatch("\u{1D5E3}")); - assertTrue(RegExp(r"\P{Ll}", unicode: true).hasMatch("\u{1D5E3}")); - - assertTrue( - RegExp(r"\p{Ll}", caseSensitive: false, unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\p{Ll}", caseSensitive: false, unicode: true) - 
.hasMatch("\u{118D4}")); - assertTrue( - RegExp(r"\p{Ll}", caseSensitive: false, unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\p{Ll}", caseSensitive: false, unicode: true) - .hasMatch("\u{118B4}")); - assertTrue( - RegExp(r"\P{Ll}", caseSensitive: false, unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\P{Ll}", caseSensitive: false, unicode: true) - .hasMatch("\u{118D4}")); - assertTrue( - RegExp(r"\P{Ll}", caseSensitive: false, unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\P{Ll}", caseSensitive: false, unicode: true) - .hasMatch("\u{118B4}")); - - assertTrue(RegExp(r"\p{Lu}", unicode: true).hasMatch("A")); - assertFalse(RegExp(r"\P{Lu}", unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\P{Lu}", unicode: true).hasMatch("a")); - assertFalse(RegExp(r"\p{Lu}", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\p{Lu}", unicode: true).hasMatch("\u{1D5E3}")); - assertFalse(RegExp(r"\P{Lu}", unicode: true).hasMatch("\u{1D5E3}")); - assertFalse(RegExp(r"\p{Lu}", unicode: true).hasMatch("\u{1D7BE}")); - assertTrue(RegExp(r"\P{Lu}", unicode: true).hasMatch("\u{1D7BE}")); - - assertTrue( - RegExp(r"\p{Lu}", caseSensitive: false, unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\p{Lu}", caseSensitive: false, unicode: true) - .hasMatch("\u{118D4}")); - assertTrue( - RegExp(r"\p{Lu}", caseSensitive: false, unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\p{Lu}", caseSensitive: false, unicode: true) - .hasMatch("\u{118B4}")); - assertTrue( - RegExp(r"\P{Lu}", caseSensitive: false, unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\P{Lu}", caseSensitive: false, unicode: true) - .hasMatch("\u{118D4}")); - assertTrue( - RegExp(r"\P{Lu}", caseSensitive: false, unicode: true).hasMatch("A")); - assertTrue(RegExp(r"\P{Lu}", caseSensitive: false, unicode: true) - .hasMatch("\u{118B4}")); - - assertTrue(RegExp(r"\p{Sm}", unicode: true).hasMatch("+")); - assertFalse(RegExp(r"\P{Sm}", unicode: true).hasMatch("+")); - assertTrue(RegExp(r"\p{Sm}", 
unicode: true).hasMatch("\u{1D6C1}")); - assertFalse(RegExp(r"\P{Sm}", unicode: true).hasMatch("\u{1D6C1}")); - - assertFalse(RegExp(r"\p{L}", unicode: true).hasMatch("\uA6EE")); - assertTrue(RegExp(r"\P{L}", unicode: true).hasMatch("\uA6EE")); - - assertTrue(RegExp(r"\p{Lowercase_Letter}", unicode: true).hasMatch("a")); - assertTrue(RegExp(r"\p{Math_Symbol}", unicode: true).hasMatch("+")); - - assertTrue(RegExp(r"\p{gc=Ll}", unicode: true).hasMatch("a")); - assertTrue( - RegExp(r"\p{General_Category=Math_Symbol}", unicode: true).hasMatch("+")); - assertTrue(RegExp(r"\p{General_Category=L}", unicode: true).hasMatch("X")); -} diff --git a/tests/corelib_2/regexp/unicode-property-invalid_test.dart b/tests/corelib_2/regexp/unicode-property-invalid_test.dart deleted file mode 100644 index 8dddbc1d5a3..00000000000 --- a/tests/corelib_2/regexp/unicode-property-invalid_test.dart +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - assertThrows(() => RegExp("\p{Block=ASCII}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=ASCII}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=Basic_Latin}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=Basic_Latin}+", unicode: true)); - - assertThrows(() => RegExp("\p{blk=CJK}+", unicode: true)); - assertThrows(() => RegExp("\p{blk=CJK_Unified_Ideographs}+", unicode: true)); - assertThrows(() => RegExp("\p{blk=CJK}+", unicode: true)); - assertThrows(() => RegExp("\p{blk=CJK_Unified_Ideographs}+", unicode: true)); - - assertThrows(() => RegExp("\p{Block=ASCII}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=ASCII}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=Basic_Latin}+", unicode: true)); - assertThrows(() => RegExp("\p{Block=Basic_Latin}+", unicode: true)); - - assertThrows(() => RegExp("\p{NFKD_Quick_Check=Y}+", unicode: true)); - assertThrows(() => RegExp("\p{NFKD_QC=Yes}+", unicode: true)); - - assertThrows(() => RegExp("\p{Numeric_Type=Decimal}+", unicode: true)); - assertThrows(() => RegExp("\p{nt=De}+", unicode: true)); - 
- assertThrows(() => RegExp("\p{Bidi_Class=Arabic_Letter}+", unicode: true)); - assertThrows(() => RegExp("\p{Bidi_Class=AN}+", unicode: true)); - - assertThrows(() => RegExp("\p{ccc=OV}+", unicode: true)); - - assertThrows(() => RegExp("\p{Sentence_Break=Format}+", unicode: true)); - - assertThrows(() => RegExp("\\p{In}", unicode: true)); - assertThrows(() => RegExp("\\pI", unicode: true)); - assertThrows(() => RegExp("\\p{I}", unicode: true)); - assertThrows(() => RegExp("\\p{CJK}", unicode: true)); - - assertThrows(() => RegExp("\\p{}", unicode: true)); -} diff --git a/tests/corelib_2/regexp/unicode-property-scripts_test.dart b/tests/corelib_2/regexp/unicode-property-scripts_test.dart deleted file mode 100644 index 5adec29ae97..00000000000 --- a/tests/corelib_2/regexp/unicode-property-scripts_test.dart +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - void t(RegExp re, String s) { - assertTrue(re.hasMatch(s)); - } - - void f(RegExp re, String s) { - assertFalse(re.hasMatch(s)); - } - - t(RegExp(r"\p{Script=Common}+", unicode: true), "."); - f(RegExp(r"\p{Script=Common}+", unicode: true), - "supercalifragilisticexpialidocious"); - - t(RegExp(r"\p{Script=Han}+", unicode: true), "话说天下大势,分久必合,合久必分"); - t(RegExp(r"\p{Script=Hani}+", unicode: true), "吾庄后有一桃园,花开正盛"); - f(RegExp(r"\p{Script=Han}+", unicode: true), "おはようございます"); - f(RegExp(r"\p{Script=Hani}+", unicode: true), - "Something is rotten in the state of Denmark"); - - t(RegExp(r"\p{Script=Latin}+", unicode: true), - "Wie froh bin ich, daß ich weg bin!"); - t(RegExp(r"\p{Script=Latn}+", unicode: true), - "It was a bright day in April, and the clocks were striking thirteen"); - f(RegExp(r"\p{Script=Latin}+", unicode: true), "奔腾千里荡尘埃,渡水登山紫雾开"); - f(RegExp(r"\p{Script=Latn}+", unicode: true), "いただきます"); - - t(RegExp(r"\p{sc=Hiragana}", unicode: true), "いただきます"); - t(RegExp(r"\p{sc=Hira}", unicode: true), "ありがとうございました"); - f(RegExp(r"\p{sc=Hiragana}", unicode: true), - "Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte"); - f(RegExp(r"\p{sc=Hira}", unicode: true), "Call me Ishmael"); - - t(RegExp(r"\p{sc=Phoenician}", unicode: true), "\u{10900}\u{1091a}"); - t(RegExp(r"\p{sc=Phnx}", unicode: true), "\u{1091f}\u{10916}"); - 
f(RegExp(r"\p{sc=Phoenician}", unicode: true), "Arthur est un perroquet"); - f(RegExp(r"\p{sc=Phnx}", unicode: true), "设心狠毒非良士,操卓原来一路人"); - - t(RegExp(r"\p{sc=Grek}", unicode: true), - "ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃς μάλα πολλὰ"); - t(RegExp(r"\p{sc=Greek}", unicode: true), - "μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος"); - f(RegExp(r"\p{sc=Greek}", unicode: true), "高贤未服英雄志,屈节偏生杰士疑"); - f(RegExp(r"\p{sc=Greek}", unicode: true), - "Mr. Jones, of the Manor Farm, had locked the hen-houses for the night"); -} diff --git a/tests/corelib_2/regexp/unicode-property-special_test.dart b/tests/corelib_2/regexp/unicode-property-special_test.dart deleted file mode 100644 index 1e0d2c18a33..00000000000 --- a/tests/corelib_2/regexp/unicode-property-special_test.dart +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - void t(RegExp re, String s) { - assertTrue(re.hasMatch(s)); - } - - void f(RegExp re, String s) { - assertFalse(re.hasMatch(s)); - } - - t(RegExp(r"\p{ASCII}+", unicode: true), "abc123"); - f(RegExp(r"\p{ASCII}+", unicode: true), "ⓐⓑⓒ①②③"); - f(RegExp(r"\p{ASCII}+", unicode: true), "🄰🄱🄲①②③"); - f(RegExp(r"\P{ASCII}+", unicode: true), "abcd123"); - t(RegExp(r"\P{ASCII}+", unicode: true), "ⓐⓑⓒ①②③"); - t(RegExp(r"\P{ASCII}+", unicode: true), "🄰🄱🄲①②③"); - - f(RegExp(r"[^\p{ASCII}]+", unicode: true), "abc123"); - f(RegExp(r"[\p{ASCII}]+", unicode: true), "ⓐⓑⓒ①②③"); - f(RegExp(r"[\p{ASCII}]+", unicode: true), "🄰🄱🄲①②③"); - t(RegExp(r"[^\P{ASCII}]+", unicode: true), "abcd123"); - t(RegExp(r"[\P{ASCII}]+", unicode: true), "ⓐⓑⓒ①②③"); - f(RegExp(r"[^\P{ASCII}]+", unicode: true), "🄰🄱🄲①②③"); - - t(RegExp(r"\p{Any}+", unicode: true), "🄰🄱🄲①②③"); - - shouldBe( - RegExp(r"\p{Any}", unicode: true).firstMatch("\ud800\ud801"), ["\ud800"]); - shouldBe( - RegExp(r"\p{Any}", unicode: true).firstMatch("\udc00\udc01"), ["\udc00"]); - shouldBe(RegExp(r"\p{Any}", unicode: true).firstMatch("\ud800\udc01"), - ["\ud800\udc01"]); - shouldBe(RegExp(r"\p{Any}", unicode: true).firstMatch("\udc01"), ["\udc01"]); - - f(RegExp(r"\P{Any}+", unicode: true), "123"); - f(RegExp(r"[\P{Any}]+", unicode: true), "123"); - t(RegExp(r"[\P{Any}\d]+", unicode: true), "123"); - 
t(RegExp(r"[^\P{Any}]+", unicode: true), "123"); - - t(RegExp(r"\p{Assigned}+", unicode: true), "123"); - t(RegExp(r"\p{Assigned}+", unicode: true), "🄰🄱🄲"); - f(RegExp(r"\p{Assigned}+", unicode: true), "\ufdd0"); - f(RegExp(r"\p{Assigned}+", unicode: true), "\u{fffff}"); - - f(RegExp(r"\P{Assigned}+", unicode: true), "123"); - f(RegExp(r"\P{Assigned}+", unicode: true), "🄰🄱🄲"); - t(RegExp(r"\P{Assigned}+", unicode: true), "\ufdd0"); - t(RegExp(r"\P{Assigned}+", unicode: true), "\u{fffff}"); - f(RegExp(r"\P{Assigned}", unicode: true), ""); - - t(RegExp(r"[^\P{Assigned}]+", unicode: true), "123"); - f(RegExp(r"[\P{Assigned}]+", unicode: true), "🄰🄱🄲"); - f(RegExp(r"[^\P{Assigned}]+", unicode: true), "\ufdd0"); - t(RegExp(r"[\P{Assigned}]+", unicode: true), "\u{fffff}"); - f(RegExp(r"[\P{Assigned}]", unicode: true), ""); - - f(RegExp(r"[^\u1234\p{ASCII}]+", unicode: true), "\u1234"); - t(RegExp(r"[x\P{ASCII}]+", unicode: true), "x"); - t(RegExp(r"[\u1234\p{ASCII}]+", unicode: true), "\u1234"); - -// Contributory binary properties are not supported. 
- assertThrows(() => RegExp("\\p{Other_Alphabetic}", unicode: true)); - assertThrows(() => RegExp("\\P{OAlpha}", unicode: true)); - assertThrows( - () => RegExp("\\p{Other_Default_Ignorable_Code_Point}", unicode: true)); - assertThrows(() => RegExp("\\P{ODI}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_Grapheme_Extend}", unicode: true)); - assertThrows(() => RegExp("\\P{OGr_Ext}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_ID_Continue}", unicode: true)); - assertThrows(() => RegExp("\\P{OIDC}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_ID_Start}", unicode: true)); - assertThrows(() => RegExp("\\P{OIDS}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_Lowercase}", unicode: true)); - assertThrows(() => RegExp("\\P{OLower}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_Math}", unicode: true)); - assertThrows(() => RegExp("\\P{OMath}", unicode: true)); - assertThrows(() => RegExp("\\p{Other_Uppercase}", unicode: true)); - assertThrows(() => RegExp("\\P{OUpper}", unicode: true)); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart b/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart deleted file mode 100644 index 2fcb7c1d580..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-backrefs_test.dart +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. 
-// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -String replace(String string) { - return string - .replaceAll("L", "\ud800") - .replaceAll("l", "\ud801") - .replaceAll("T", "\udc00") - .replaceAll(".", "[^]"); -} - -void test(List expectation, String regexp_source, String subject) { - if (expectation != null) expectation = expectation.map(replace).toList(); - subject = replace(subject); - regexp_source = replace(regexp_source); - shouldBe(new RegExp(regexp_source, unicode: true).firstMatch(subject), - expectation); -} - -void main() { - // Back reference does not end in the middle of a surrogate pair. - test(null, "(L)\\1", "LLT"); - test(["LLTLl", "L", "l"], "(L).*\\1(.)", "LLTLl"); - test(null, "(aL).*\\1", "aLaLT"); - test(["aLaLTaLl", "aL", "l"], "(aL).*\\1(.)", "aLaLTaLl"); - - var s = "TabcLxLTabcLxTabcLTyTabcLz"; - test([s, "TabcL", "z"], "([^x]+).*\\1(.)", s); - - // Back reference does not start in the middle of a surrogate pair. 
- test(["TLTabTc", "T", "c"], "(T).*\\1(.)", "TLTabTc"); - - // Lookbehinds. - test(null, "(?<=\\1(T)x)", "LTTx"); - test(["", "b", "T"], "(?<=(.)\\2.*(T)x)", "bTaLTTx"); - test(null, "(?<=\\1.*(L)x)", "LTLx"); - test(["", "b", "L"], "(?<=(.)\\2.*(L)x)", "bLaLTLx"); - - test(null, "([^x]+)x*\\1", "LxLT"); - test(null, "([^x]+)x*\\1", "TxLT"); - test(null, "([^x]+)x*\\1", "LTxL"); - test(null, "([^x]+)x*\\1", "LTxT"); - test(null, "([^x]+)x*\\1", "xLxLT"); - test(null, "([^x]+)x*\\1", "xTxLT"); - test(null, "([^x]+)x*\\1", "xLTxL"); - test(null, "([^x]+)x*\\1", "xLTxT"); - test(null, "([^x]+)x*\\1", "xxxLxxLTxx"); - test(null, "([^x]+)x*\\1", "xxxTxxLTxx"); - test(null, "([^x]+)x*\\1", "xxxLTxxLxx"); - test(null, "([^x]+)x*\\1", "xxxLTxxTxx"); - test(["LTTxxLTT", "LTT"], "([^x]+)x*\\1", "xxxLTTxxLTTxx"); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart b/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart deleted file mode 100644 index c87047c5c06..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-ignore-case_test.dart +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - // Non-unicode use toUpperCase mappings. - assertFalse(RegExp(r"[\u00e5]", caseSensitive: false).hasMatch("\u212b")); - assertFalse( - RegExp(r"[\u212b]", caseSensitive: false).hasMatch("\u00e5\u1234")); - assertFalse(RegExp(r"[\u212b]", caseSensitive: false).hasMatch("\u00e5")); - - assertTrue("\u212b".toLowerCase() == "\u00e5"); - assertTrue("\u00c5".toLowerCase() == "\u00e5"); - assertTrue("\u00e5".toUpperCase() == "\u00c5"); - - // Unicode uses case folding mappings. 
- assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) - .hasMatch("\u212b")); - assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) - .hasMatch("\u00c5")); - assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) - .hasMatch("\u00e5")); - assertTrue(RegExp(r"\u00e5", caseSensitive: false, unicode: true) - .hasMatch("\u212b")); - assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) - .hasMatch("\u00e5")); - assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) - .hasMatch("\u212b")); - assertTrue(RegExp(r"\u00c5", caseSensitive: false, unicode: true) - .hasMatch("\u00c5")); - assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) - .hasMatch("\u00c5")); - assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) - .hasMatch("\u00e5")); - assertTrue(RegExp(r"\u212b", caseSensitive: false, unicode: true) - .hasMatch("\u212b")); - - // Non-BMP. - assertFalse(RegExp(r"\u{10400}", caseSensitive: false).hasMatch("\u{10428}")); - assertTrue(RegExp(r"\u{10400}", caseSensitive: false, unicode: true) - .hasMatch("\u{10428}")); - assertTrue(RegExp(r"\ud801\udc00", caseSensitive: false, unicode: true) - .hasMatch("\u{10428}")); - assertTrue(RegExp(r"[\u{10428}]", caseSensitive: false, unicode: true) - .hasMatch("\u{10400}")); - assertTrue(RegExp(r"[\ud801\udc28]", caseSensitive: false, unicode: true) - .hasMatch("\u{10400}")); - shouldBe( - RegExp(r"[\uff40-\u{10428}]+", caseSensitive: false, unicode: true) - .firstMatch("\uff21\u{10400}abc"), - ["\uff21\u{10400}"]); - shouldBe( - RegExp(r"[^\uff40-\u{10428}]+", caseSensitive: false, unicode: true) - .firstMatch("\uff21\u{10400}abc\uff23"), - ["abc"]); - shouldBe( - RegExp(r"[\u24d5-\uff33]+", caseSensitive: false, unicode: true) - .firstMatch("\uff54\uff53\u24bb\u24ba"), - ["\uff53\u24bb"]); - - // Full mappings are ignored. 
- assertFalse( - RegExp(r"\u00df", caseSensitive: false, unicode: true).hasMatch("SS")); - assertFalse(RegExp(r"\u1f8d", caseSensitive: false, unicode: true) - .hasMatch("\u1f05\u03b9")); - - // Simple mappings work. - assertTrue(RegExp(r"\u1f8d", caseSensitive: false, unicode: true) - .hasMatch("\u1f85")); - - // Common mappings work. - assertTrue(RegExp(r"\u1f6b", caseSensitive: false, unicode: true) - .hasMatch("\u1f63")); - - // Back references. - shouldBe( - RegExp(r"(.)\1\1", caseSensitive: false, unicode: true) - .firstMatch("\u00e5\u212b\u00c5"), - ["\u00e5\u212b\u00c5", "\u00e5"]); - shouldBe( - RegExp(r"(.)\1", caseSensitive: false, unicode: true) - .firstMatch("\u{118aa}\u{118ca}"), - ["\u{118aa}\u{118ca}", "\u{118aa}"]); - - // Misc. - assertTrue(RegExp(r"\u00e5\u00e5\u00e5", caseSensitive: false, unicode: true) - .hasMatch("\u212b\u00e5\u00c5")); - assertTrue(RegExp(r"AB\u{10400}", caseSensitive: false, unicode: true) - .hasMatch("ab\u{10428}")); - - // Non-Latin1 maps to Latin1. - shouldBe( - RegExp(r"^\u017F", caseSensitive: false, unicode: true).firstMatch("s"), - ["s"]); - shouldBe( - RegExp(r"^\u017F", caseSensitive: false, unicode: true) - .firstMatch("s\u1234"), - ["s"]); - shouldBe( - RegExp(r"^a[\u017F]", caseSensitive: false, unicode: true) - .firstMatch("as"), - ["as"]); - shouldBe( - RegExp(r"^a[\u017F]", caseSensitive: false, unicode: true) - .firstMatch("as\u1234"), - ["as"]); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart b/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart deleted file mode 100644 index 3e696584ac1..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-match-index_test.dart +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. 
-// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - // Testing handling of paired and non-paired surrogates in unicode mode - var r = new RegExp(r".", unicode: true); - - var m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800\udc00"]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); - shouldBe(m, ["\ud800\udc00"]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); - shouldBe(m, ["\ud801\udc01"]); - assertEquals(m.end, 4); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); - shouldBe(m, ["\ud801\udc01"]); - assertEquals(m.end, 4); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); - - m = r.matchAsPrefix("\ud800\udc00\ud801\ud802", 3); - shouldBe(m, ["\ud802"]); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\ud802", 4)); - - // Testing handling of paired and non-paired surrogates in non-unicode mode - r = new RegExp(r"."); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800"]); - assertEquals(m.end, 1); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); - shouldBe(m, ["\udc00"]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); - shouldBe(m, ["\ud801"]); - assertEquals(m.end, 3); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); - shouldBe(m, ["\udc01"]); - assertEquals(m.end, 4); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); - - // Testing same with start anchor, unicode mode. 
- r = new RegExp("^.", unicode: true); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800\udc00"]); - assertEquals(2, m.end); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); - shouldBe(m, ["\ud800\udc00"]); - assertEquals(2, m.end); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2)); - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3)); - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); - - // Testing same with start anchor, non-unicode mode. - r = new RegExp("^."); - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800"]); - assertEquals(1, m.end); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1)); - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2)); - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3)); - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); - - // Now with both anchored and not as alternatives (with the anchored - // version as a captured group), unicode mode. - r = new RegExp(r"(?:(^.)|.)", unicode: true); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800\udc00", "\ud800\udc00"]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); - shouldBe(m, ["\ud800\udc00", "\ud800\udc00"]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); - shouldBe(m, ["\ud801\udc01", null]); - assertEquals(m.end, 4); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); - shouldBe(m, ["\ud801\udc01", null]); - assertEquals(m.end, 4); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); - - m = r.matchAsPrefix("\ud800\udc00\ud801\ud802", 3); - shouldBe(m, ["\ud802", null]); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\ud802", 4)); - - // Now with both anchored and not as alternatives (with the anchored - // version as a captured group), non-unicode mode. 
- r = new RegExp(r"(?:(^.)|.)"); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01"); - shouldBe(m, ["\ud800", "\ud800"]); - assertEquals(m.end, 1); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 1); - shouldBe(m, ["\udc00", null]); - assertEquals(m.end, 2); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 2); - shouldBe(m, ["\ud801", null]); - assertEquals(m.end, 3); - - m = r.matchAsPrefix("\ud800\udc00\ud801\udc01", 3); - shouldBe(m, ["\udc01", null]); - assertEquals(m.end, 4); - - assertNull(r.matchAsPrefix("\ud800\udc00\ud801\udc01", 4)); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart b/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart deleted file mode 100644 index d894c7bb859..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-restricted-syntax_test.dart +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - // test262/data/test/language/literals/regexp/u-dec-esc - assertThrows(() => RegExp(r"\1", unicode: true)); - // test262/language/literals/regexp/u-invalid-char-range-a - assertThrows(() => RegExp(r"[\w-a]", unicode: true)); - // test262/language/literals/regexp/u-invalid-char-range-b - assertThrows(() => RegExp(r"[a-\w]", unicode: true)); - // test262/language/literals/regexp/u-invalid-char-esc - assertThrows(() => RegExp(r"\c", unicode: true)); - assertThrows(() => RegExp(r"\c0", unicode: true)); - // test262/built-ins/RegExp/unicode_restricted_quantifiable_assertion - assertThrows(() => RegExp(r"(?=.)*", unicode: true)); - assertThrows(() => RegExp(r"(?=.){1,2}", unicode: true)); - // test262/built-ins/RegExp/unicode_restricted_octal_escape - assertThrows(() => RegExp(r"[\1]", unicode: true)); - assertThrows(() => RegExp(r"\00", unicode: true)); - assertThrows(() => RegExp(r"\09", unicode: true)); - // test262/built-ins/RegExp/unicode_restricted_identity_escape_alpha - assertThrows(() => RegExp(r"[\c]", unicode: true)); - // test262/built-ins/RegExp/unicode_restricted_identity_escape_c - assertThrows(() => RegExp(r"[\c0]", unicode: true)); - // test262/built-ins/RegExp/unicode_restricted_incomple_quantifier - assertThrows(() => RegExp(r"a{", unicode: true)); - assertThrows(() => RegExp(r"a{1,", unicode: true)); - assertThrows(() => 
RegExp(r"{", unicode: true)); - assertThrows(() => RegExp(r"}", unicode: true)); - // test262/data/test/built-ins/RegExp/unicode_restricted_brackets - assertThrows(() => RegExp(r"]", unicode: true)); - // test262/built-ins/RegExp/unicode_identity_escape - assertDoesNotThrow(() => RegExp(r"\/", unicode: true)); - - // escaped \0 (as NUL) is allowed inside a character class. - shouldBe(RegExp(r"[\0]", unicode: true).firstMatch("\u0000"), ["\u0000"]); - // unless it is followed by another digit. - assertThrows(() => RegExp(r"[\00]", unicode: true)); - assertThrows(() => RegExp(r"[\01]", unicode: true)); - assertThrows(() => RegExp(r"[\09]", unicode: true)); - shouldBe(RegExp(r"[1\0a]+", unicode: true).firstMatch("b\u{0}1\u{0}a\u{0}2"), - ["\u{0}1\u{0}a\u{0}"]); - // escaped \- is allowed inside a character class. - shouldBe(RegExp(r"[a\-z]", unicode: true).firstMatch("12-34"), ["-"]); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart b/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart deleted file mode 100644 index e57d810d16f..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-unanchored-advance_test.dart +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2013 the V8 project authors. All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - final s = "a" * 10000000 + "\u1234"; - shouldBe( - RegExp(r"(\u1234)", unicode: true).firstMatch(s), ["\u1234", "\u1234"]); -} diff --git a/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart b/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart deleted file mode 100644 index 1d07628b74a..00000000000 --- a/tests/corelib_2/regexp/unicode-regexp-zero-length_test.dart +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019, the Dart project authors. All rights reserved. -// Copyright 2016 the V8 project authors. All rights reserved. 
-// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following -// disclaimer in the documentation and/or other materials provided -// with the distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived -// from this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import 'package:expect/expect.dart'; - -import 'v8_regexp_utils.dart'; - -void main() { - const L = "\ud800"; - const T = "\udc00"; - const x = "x"; - - var r = RegExp(r"()"); // Not unicode. - // Zero-length matches do not advance lastIndex. 
- var m = r.matchAsPrefix(L + T + L + T); - shouldBe(m, ["", ""]); - assertEquals(0, m.end); - - m = r.matchAsPrefix(L + T + L + T, 1); - shouldBe(m, ["", ""]); - assertEquals(1, m.end); - - var u = RegExp(r"()", unicode: true); - - // Zero-length matches do not advance lastIndex (but do respect paired - // surrogates). - m = u.matchAsPrefix(L + T + L + T); - shouldBe(m, ["", ""]); - assertEquals(0, m.end); - - m = u.matchAsPrefix(L + T + L + T, 1); - shouldBe(m, ["", ""]); - assertEquals(0, m.end); - - // However, with repeating matches, we do advance from match to match. - var ms = r.allMatches(L + T + L + T); - assertEquals(5, ms.length); - for (var i = 0; i < ms.length; i++) { - shouldBe(ms.elementAt(i), ["", ""]); - } - - // With unicode flag, we advance code point by code point. - ms = u.allMatches(L + T + L + T); - assertEquals(3, ms.length); - for (var i = 0; i < ms.length; i++) { - shouldBe(ms.elementAt(i), ["", ""]); - } - - // Test with a lot of copies. - const c = 1000; - ms = u.allMatches((L + T) * c); - assertEquals(c + 1, ms.length); - for (var i = 0; i < ms.length; i++) { - shouldBe(ms.elementAt(i), ["", ""]); - } - - // Same with replaceAll(). - assertEquals( - x + L + x + T + x + L + x + T + x, (L + T + L + T).replaceAll(r, "x")); - - assertEquals(x + L + T + x + L + T + x, (L + T + L + T).replaceAll(u, "x")); - - assertEquals((x + L + T) * c + x, ((L + T) * c).replaceAll(u, "x")); - - // Also test String#split. 
- Expect.deepEquals( - ["\u{12345}"], "\u{12345}".split(RegExp(r"(?:)", unicode: true))); -} diff --git a/tests/corelib_2/regexp/v8_regexp_utils.dart b/tests/corelib_2/regexp/v8_regexp_utils.dart index 111ab90f365..cde750a7294 100644 --- a/tests/corelib_2/regexp/v8_regexp_utils.dart +++ b/tests/corelib_2/regexp/v8_regexp_utils.dart @@ -22,10 +22,6 @@ void assertThrows(fn, [num testid = null]) { Expect.throws(fn, null, "Test $testid"); } -void assertDoesNotThrow(fn, [num testid = null]) { - fn(); -} - void assertNull(actual, [num testid = null]) { Expect.isNull(actual, "Test $testid"); }