LibRegex: Make '.' reject matching CR / LS / PS as per the ECMA262 spec

Previously '.' only refused to match LF, so it could still match CR, LS (U+2028) and PS (U+2029); the ECMA262 spec disallows '.' from matching any of CR/LF/LS/PS (except in DotAll mode).
2024-10-06 16:09:30 +00:00 · 2023-02-15 09:55:43 +03:30 · 2023-02-15 09:55:43 +03:30 · 936a9fd759
parent 1e022295c4
commit 936a9fd759
4 changed files with 59 additions and 43 deletions
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -697,6 +697,7 @@ TEST_CASE(ECMA262_match)
        { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
        { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive },
        { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive },
+        { "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll.
    };
    // clang-format on

--- a/Userland/Libraries/LibC/bits/regex_defs.h
+++ b/Userland/Libraries/LibC/bits/regex_defs.h
@ -31,24 +31,25 @@ enum __Regex_Error {
 };

 enum __RegexAllFlags {
-    __Regex_Global = 1,                                      // All matches (don't return after first match)
-    __Regex_Insensitive = __Regex_Global << 1,               // Case insensitive match (ignores case of [a-zA-Z])
-    __Regex_Ungreedy = __Regex_Global << 2,                  // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
-    __Regex_Unicode = __Regex_Global << 3,                   // Enable all unicode features and interpret all unicode escape sequences as such
-    __Regex_Extended = __Regex_Global << 4,                  // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
-    __Regex_Extra = __Regex_Global << 5,                     // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
-    __Regex_MatchNotBeginOfLine = __Regex_Global << 6,       // Pattern is not forced to ^ -> search in whole string!
-    __Regex_MatchNotEndOfLine = __Regex_Global << 7,         // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
-    __Regex_SkipSubExprResults = __Regex_Global << 8,        // Do not return sub expressions in the result
-    __Regex_StringCopyMatches = __Regex_Global << 9,         // Do explicitly copy results into new allocated string instead of StringView to original string.
-    __Regex_SingleLine = __Regex_Global << 10,               // Dot matches newline characters
-    __Regex_Sticky = __Regex_Global << 11,                   // Force the pattern to only match consecutive matches from where the previous match ended.
-    __Regex_Multiline = __Regex_Global << 12,                // Handle newline characters. Match each line, one by one.
-    __Regex_SkipTrimEmptyMatches = __Regex_Global << 13,     // Do not remove empty capture group results.
-    __Regex_SingleMatch = __Regex_Global << 14,              // Stop after acquiring a single match.
-    __Regex_UnicodeSets = __Regex_Global << 15,              // ECMA262 Parser specific: Allow set operations in char classes.
-    __Regex_Internal_Stateful = __Regex_Global << 16,        // Internal flag; enables stateful matches.
-    __Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
-    __Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
-    __Regex_Last = __Regex_UnicodeSets,
+    __Regex_Global = 1,                                          // All matches (don't return after first match)
+    __Regex_Insensitive = __Regex_Global << 1,                   // Case insensitive match (ignores case of [a-zA-Z])
+    __Regex_Ungreedy = __Regex_Global << 2,                      // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
+    __Regex_Unicode = __Regex_Global << 3,                       // Enable all unicode features and interpret all unicode escape sequences as such
+    __Regex_Extended = __Regex_Global << 4,                      // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
+    __Regex_Extra = __Regex_Global << 5,                         // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
+    __Regex_MatchNotBeginOfLine = __Regex_Global << 6,           // Pattern is not forced to ^ -> search in whole string!
+    __Regex_MatchNotEndOfLine = __Regex_Global << 7,             // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
+    __Regex_SkipSubExprResults = __Regex_Global << 8,            // Do not return sub expressions in the result
+    __Regex_StringCopyMatches = __Regex_Global << 9,             // Do explicitly copy results into new allocated string instead of StringView to original string.
+    __Regex_SingleLine = __Regex_Global << 10,                   // Dot matches newline characters
+    __Regex_Sticky = __Regex_Global << 11,                       // Force the pattern to only match consecutive matches from where the previous match ended.
+    __Regex_Multiline = __Regex_Global << 12,                    // Handle newline characters. Match each line, one by one.
+    __Regex_SkipTrimEmptyMatches = __Regex_Global << 13,         // Do not remove empty capture group results.
+    __Regex_SingleMatch = __Regex_Global << 14,                  // Stop after acquiring a single match.
+    __Regex_UnicodeSets = __Regex_Global << 15,                  // ECMA262 Parser specific: Allow set operations in char classes.
+    __Regex_Internal_Stateful = __Regex_Global << 16,            // Internal flag; enables stateful matches.
+    __Regex_Internal_BrowserExtended = __Regex_Global << 17,     // Internal flag; enable browser-specific ECMA262 extensions.
+    __Regex_Internal_ConsiderNewline = __Regex_Global << 18,     // Internal flag; allow matchers to consider newlines as line separators.
+    __Regex_Internal_ECMA262DotSemantics = __Regex_Global << 19, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just LF.
+    __Regex_Last = __Regex_Internal_ECMA262DotSemantics,
 };
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@ -499,8 +499,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
            if (input.view.length() <= state.string_position)
                return ExecutionResult::Failed_ExecuteLowPrioForks;

+            // U+2028 LINE SEPARATOR
+            constexpr static u32 const LineSeparator { 0x2028 };
+            // U+2029 PARAGRAPH SEPARATOR
+            constexpr static u32 const ParagraphSeparator { 0x2029 };
+
            auto input_view = input.view.substring_view(state.string_position, 1)[0];
-            if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
+            auto is_equivalent_to_newline = input_view == '\n'
+                || (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics)
+                        ? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator)
+                        : false);
+
+            if (!is_equivalent_to_newline || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
                if (current_inversion_state())
                    inverse_matched = true;
                else
--- a/Userland/Libraries/LibRegex/RegexOptions.h
+++ b/Userland/Libraries/LibRegex/RegexOptions.h
@ -19,29 +19,32 @@ namespace regex {
 using FlagsUnderlyingType = u32;

 enum class AllFlags {
-    Global = __Regex_Global,                                     // All matches (don't return after first match)
-    Insensitive = __Regex_Insensitive,                           // Case insensitive match (ignores case of [a-zA-Z])
-    Ungreedy = __Regex_Ungreedy,                                 // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
-    Unicode = __Regex_Unicode,                                   // Enable all unicode features and interpret all unicode escape sequences as such
-    Extended = __Regex_Extended,                                 // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
-    Extra = __Regex_Extra,                                       // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
-    MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine,           // Pattern is not forced to ^ -> search in whole string!
-    MatchNotEndOfLine = __Regex_MatchNotEndOfLine,               // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
-    SkipSubExprResults = __Regex_SkipSubExprResults,             // Do not return sub expressions in the result
-    StringCopyMatches = __Regex_StringCopyMatches,               // Do explicitly copy results into new allocated string instead of StringView to original string.
-    SingleLine = __Regex_SingleLine,                             // Dot matches newline characters
-    Sticky = __Regex_Sticky,                                     // Force the pattern to only match consecutive matches from where the previous match ended.
-    Multiline = __Regex_Multiline,                               // Handle newline characters. Match each line, one by one.
-    SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches,         // Do not remove empty capture group results.
-    SingleMatch = __Regex_SingleMatch,                           // Stop after acquiring a single match.
-    UnicodeSets = __Regex_UnicodeSets,                           // Only for ECMA262, Allow set operations in character classes.
-    Internal_Stateful = __Regex_Internal_Stateful,               // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
-    Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
-    Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
+    Default = 0,
+    Global = __Regex_Global,                                             // All matches (don't return after first match)
+    Insensitive = __Regex_Insensitive,                                   // Case insensitive match (ignores case of [a-zA-Z])
+    Ungreedy = __Regex_Ungreedy,                                         // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
+    Unicode = __Regex_Unicode,                                           // Enable all unicode features and interpret all unicode escape sequences as such
+    Extended = __Regex_Extended,                                         // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
+    Extra = __Regex_Extra,                                               // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
+    MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine,                   // Pattern is not forced to ^ -> search in whole string!
+    MatchNotEndOfLine = __Regex_MatchNotEndOfLine,                       // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
+    SkipSubExprResults = __Regex_SkipSubExprResults,                     // Do not return sub expressions in the result
+    StringCopyMatches = __Regex_StringCopyMatches,                       // Do explicitly copy results into new allocated string instead of StringView to original string.
+    SingleLine = __Regex_SingleLine,                                     // Dot matches newline characters
+    Sticky = __Regex_Sticky,                                             // Force the pattern to only match consecutive matches from where the previous match ended.
+    Multiline = __Regex_Multiline,                                       // Handle newline characters. Match each line, one by one.
+    SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches,                 // Do not remove empty capture group results.
+    SingleMatch = __Regex_SingleMatch,                                   // Stop after acquiring a single match.
+    UnicodeSets = __Regex_UnicodeSets,                                   // Only for ECMA262, Allow set operations in character classes.
+    Internal_Stateful = __Regex_Internal_Stateful,                       // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
+    Internal_BrowserExtended = __Regex_Internal_BrowserExtended,         // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
+    Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline,         // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
+    Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just LF.
    Last = Internal_BrowserExtended,
 };

 enum class PosixFlags : FlagsUnderlyingType {
+    Default = 0,
    Global = (FlagsUnderlyingType)AllFlags::Global,
    Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
    Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
@ -58,6 +61,7 @@ enum class PosixFlags : FlagsUnderlyingType {
 };

 enum class ECMAScriptFlags : FlagsUnderlyingType {
+    Default = (FlagsUnderlyingType)AllFlags::Internal_ECMA262DotSemantics,
    Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex.
    Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
    Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
@ -80,13 +84,13 @@ public:
    RegexOptions() = default;

    constexpr RegexOptions(T flags)
-        : m_flags(flags)
+        : m_flags(static_cast<T>(to_underlying(flags) | to_underlying(T::Default)))
    {
    }

    template<class U>
    constexpr RegexOptions(RegexOptions<U> other)
-        : m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
+        : RegexOptions(static_cast<T>(to_underlying(other.value())))
    {
    }

@ -115,7 +119,7 @@ public:
    T value() const { return m_flags; }

 private:
-    T m_flags { 0 };
+    T m_flags { T::Default };
 };

 template<class T>