mirror of
https://github.com/SerenityOS/serenity
synced 2024-10-06 16:09:30 +00:00
LibRegex: Make '.' reject matching CR / LS / PS as per the ECMA262 spec
Previously '.' was allowed to match those (only LF was rejected), but the ECMA262 spec disallows '.' from matching any line terminator (CR, LF, LS, PS) unless DotAll is set.
This commit is contained in:
parent
1e022295c4
commit
936a9fd759
|
@ -697,6 +697,7 @@ TEST_CASE(ECMA262_match)
|
|||
{ "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
|
||||
{ "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive },
|
||||
{ "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive },
|
||||
{ "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll.
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
|
|
|
@ -31,24 +31,25 @@ enum __Regex_Error {
|
|||
};
|
||||
|
||||
enum __RegexAllFlags {
|
||||
__Regex_Global = 1, // All matches (don't return after first match)
|
||||
__Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
__Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
__Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
__Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
|
||||
__Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
__Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string!
|
||||
__Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
|
||||
__Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result
|
||||
__Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string.
|
||||
__Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
|
||||
__Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
|
||||
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
|
||||
__Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match.
|
||||
__Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes.
|
||||
__Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches.
|
||||
__Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
|
||||
__Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
|
||||
__Regex_Last = __Regex_UnicodeSets,
|
||||
__Regex_Global = 1, // All matches (don't return after first match)
|
||||
__Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
__Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
__Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
__Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
|
||||
__Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
__Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string!
|
||||
__Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
|
||||
__Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result
|
||||
__Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string.
|
||||
__Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
|
||||
__Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
|
||||
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
|
||||
__Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match.
|
||||
__Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes.
|
||||
__Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches.
|
||||
__Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
|
||||
__Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
|
||||
__Regex_Internal_ECMA262DotSemantics = __Regex_Global << 19, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just LF.
|
||||
__Regex_Last = __Regex_Internal_ECMA262DotSemantics,
|
||||
};
|
||||
|
|
|
@ -499,8 +499,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
|||
if (input.view.length() <= state.string_position)
|
||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||
|
||||
// U+2028 LINE SEPARATOR
|
||||
constexpr static u32 const LineSeparator { 0x2028 };
|
||||
// U+2029 PARAGRAPH SEPARATOR
|
||||
constexpr static u32 const ParagraphSeparator { 0x2029 };
|
||||
|
||||
auto input_view = input.view.substring_view(state.string_position, 1)[0];
|
||||
if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
|
||||
auto is_equivalent_to_newline = input_view == '\n'
|
||||
|| (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics)
|
||||
? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator)
|
||||
: false);
|
||||
|
||||
if (!is_equivalent_to_newline || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
|
||||
if (current_inversion_state())
|
||||
inverse_matched = true;
|
||||
else
|
||||
|
|
|
@ -19,29 +19,32 @@ namespace regex {
|
|||
using FlagsUnderlyingType = u32;
|
||||
|
||||
enum class AllFlags {
|
||||
Global = __Regex_Global, // All matches (don't return after first match)
|
||||
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
|
||||
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
|
||||
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
|
||||
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
|
||||
StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string.
|
||||
SingleLine = __Regex_SingleLine, // Dot matches newline characters
|
||||
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
|
||||
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
|
||||
SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match.
|
||||
UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes.
|
||||
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
||||
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
|
||||
Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
|
||||
Default = 0,
|
||||
Global = __Regex_Global, // All matches (don't return after first match)
|
||||
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
|
||||
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
|
||||
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
|
||||
Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
|
||||
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
|
||||
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
|
||||
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
|
||||
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
|
||||
StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string.
|
||||
SingleLine = __Regex_SingleLine, // Dot matches newline characters
|
||||
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
|
||||
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
|
||||
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
|
||||
SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match.
|
||||
UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes.
|
||||
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
||||
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
|
||||
Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
|
||||
Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just LF.
|
||||
Last = Internal_BrowserExtended,
|
||||
};
|
||||
|
||||
enum class PosixFlags : FlagsUnderlyingType {
|
||||
Default = 0,
|
||||
Global = (FlagsUnderlyingType)AllFlags::Global,
|
||||
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
|
||||
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
|
||||
|
@ -58,6 +61,7 @@ enum class PosixFlags : FlagsUnderlyingType {
|
|||
};
|
||||
|
||||
enum class ECMAScriptFlags : FlagsUnderlyingType {
|
||||
Default = (FlagsUnderlyingType)AllFlags::Internal_ECMA262DotSemantics,
|
||||
Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex.
|
||||
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
|
||||
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
|
||||
|
@ -80,13 +84,13 @@ public:
|
|||
RegexOptions() = default;
|
||||
|
||||
constexpr RegexOptions(T flags)
|
||||
: m_flags(flags)
|
||||
: m_flags(static_cast<T>(to_underlying(flags) | to_underlying(T::Default)))
|
||||
{
|
||||
}
|
||||
|
||||
template<class U>
|
||||
constexpr RegexOptions(RegexOptions<U> other)
|
||||
: m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
|
||||
: RegexOptions(static_cast<T>(to_underlying(other.value())))
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -115,7 +119,7 @@ public:
|
|||
T value() const { return m_flags; }
|
||||
|
||||
private:
|
||||
T m_flags { 0 };
|
||||
T m_flags { T::Default };
|
||||
};
|
||||
|
||||
template<class T>
|
||||
|
|
Loading…
Reference in a new issue