From fd8ccedf2b21a49571100b5a56d9a1f5d28b834c Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Tue, 17 Aug 2021 22:20:04 -0400
Subject: [PATCH] AK: Add GenericLexer API to consume an escaped Unicode code point

This parsing is already duplicated between LibJS and LibRegex, and will
shortly be needed in more places in those libraries. Move it to AK to
prevent further duplication.

This API will consume escaped Unicode code points of the form:
    \\u{code point}
    \\unnnn (where each n is a hexadecimal digit)
    \\unnnn\\unnnn (where the two escaped values are a surrogate pair)
---
Reviewer notes (everything between the "---" line and the first "diff --git"
line is ignored when the patch is applied):

NOTE(review): this copy of the patch had lost its line breaks and every
angle-bracketed token. The #include targets and the Result/Optional template
arguments below were reconstructed from the identifiers the code uses;
confirm them against the upstream commit before applying. The author e-mail
address is also missing from the From: header and was not restored.

Behaviour of the new API, as implemented below:
  * consume_escaped_code_point() requires the two-character escape prefix and
    reports MalformedUnicodeEscape without it. A following '{' selects the
    braced form; anything else selects the four-digit form.
  * The braced form needs at least one hex digit before '}'. A value that
    wraps the u32 accumulator, or for which is_unicode() is false, reports
    UnicodeEscapeOverflow.
  * The four-digit form reads exactly four hex digits. A high surrogate is
    combined with a directly following escaped low surrogate only when
    combine_surrogate_pairs is true. If the second escape is well formed but
    is not a low surrogate, the lexer retreats 6 characters (prefix plus four
    digits) and returns the high surrogate alone.
  * Characters consumed before a MalformedUnicodeEscape is reported are not
    given back to the lexer.

 AK/GenericLexer.cpp           | 72 +++++++++++++++++++++++++++++++++++
 AK/GenericLexer.h             | 12 ++++++
 Tests/AK/TestGenericLexer.cpp | 48 +++++++++++++++++++++++
 3 files changed, 132 insertions(+)

diff --git a/AK/GenericLexer.cpp b/AK/GenericLexer.cpp
index 2cbabf3d3e..7a0372d6a0 100644
--- a/AK/GenericLexer.cpp
+++ b/AK/GenericLexer.cpp
@@ -5,9 +5,11 @@
  */
 
 #include <AK/Assertions.h>
+#include <AK/CharacterTypes.h>
 #include <AK/GenericLexer.h>
 #include <AK/String.h>
 #include <AK/StringBuilder.h>
+#include <AK/Utf16View.h>
 
 namespace AK {
 // Consume a number of characters
@@ -128,4 +130,74 @@ String GenericLexer::consume_and_unescape_string(char escape_char)
     return builder.to_string();
 }
 
+auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
+{
+    if (!consume_specific("\\u"sv))
+        return UnicodeEscapeError::MalformedUnicodeEscape;
+
+    if (next_is('{'))
+        return decode_code_point();
+    return decode_single_or_paired_surrogate(combine_surrogate_pairs);
+}
+
+auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
+{
+    bool starts_with_open_bracket = consume_specific('{');
+    VERIFY(starts_with_open_bracket);
+
+    u32 code_point = 0;
+
+    while (true) {
+        if (!next_is(is_ascii_hex_digit))
+            return UnicodeEscapeError::MalformedUnicodeEscape;
+
+        auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
+        if (new_code_point < code_point)
+            return UnicodeEscapeError::UnicodeEscapeOverflow;
+
+        code_point = new_code_point;
+        if (consume_specific('}'))
+            break;
+    }
+
+    if (is_unicode(code_point))
+        return code_point;
+    return UnicodeEscapeError::UnicodeEscapeOverflow;
+}
+
+auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
+{
+    constexpr size_t surrogate_length = 4;
+
+    auto decode_one_surrogate = [&]() -> Optional<u16> {
+        u16 surrogate = 0;
+
+        for (size_t i = 0; i < surrogate_length; ++i) {
+            if (!next_is(is_ascii_hex_digit))
+                return {};
+
+            surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
+        }
+
+        return surrogate;
+    };
+
+    auto high_surrogate = decode_one_surrogate();
+    if (!high_surrogate.has_value())
+        return UnicodeEscapeError::MalformedUnicodeEscape;
+    if (!Utf16View::is_high_surrogate(*high_surrogate))
+        return *high_surrogate;
+    if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
+        return *high_surrogate;
+
+    auto low_surrogate = decode_one_surrogate();
+    if (!low_surrogate.has_value())
+        return UnicodeEscapeError::MalformedUnicodeEscape;
+    if (Utf16View::is_low_surrogate(*low_surrogate))
+        return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
+
+    retreat(6);
+    return *high_surrogate;
+}
+
 }
diff --git a/AK/GenericLexer.h b/AK/GenericLexer.h
index e824f473a8..7a01060068 100644
--- a/AK/GenericLexer.h
+++ b/AK/GenericLexer.h
@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <AK/Result.h>
 #include <AK/StringView.h>
 
 namespace AK {
@@ -115,6 +116,13 @@ public:
     StringView consume_quoted_string(char escape_char = 0);
     String consume_and_unescape_string(char escape_char = '\\');
 
+    enum class UnicodeEscapeError {
+        MalformedUnicodeEscape,
+        UnicodeEscapeOverflow,
+    };
+
+    Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true);
+
     constexpr void ignore(size_t count = 1)
     {
         count = min(count, m_input.length() - m_index);
@@ -201,6 +209,10 @@ public:
 protected:
     StringView m_input;
     size_t m_index { 0 };
+
+private:
+    Result<u32, UnicodeEscapeError> decode_code_point();
+    Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs);
 };
 
 constexpr auto is_any_of(const StringView& values)
diff --git a/Tests/AK/TestGenericLexer.cpp b/Tests/AK/TestGenericLexer.cpp
index 29089ec5d4..86e2383ed5 100644
--- a/Tests/AK/TestGenericLexer.cpp
+++ b/Tests/AK/TestGenericLexer.cpp
@@ -156,3 +156,51 @@ TEST_CASE(should_constexpr_ignore_until_pred)
     }();
     static_assert(sut.peek() == 'c');
 }
+
+TEST_CASE(consume_escaped_code_point)
+{
+    auto test = [](StringView test, Result<u32, GenericLexer::UnicodeEscapeError> expected, bool combine_surrogate_pairs = true) {
+        GenericLexer lexer(test);
+
+        auto actual = lexer.consume_escaped_code_point(combine_surrogate_pairs);
+        EXPECT_EQ(actual.is_error(), expected.is_error());
+
+        if (actual.is_error() && expected.is_error())
+            EXPECT_EQ(actual.error(), expected.error());
+        else
+            EXPECT_EQ(actual.value(), expected.value());
+    };
+
+    test("\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u{"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u{1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u{}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u{x}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+
+    test("\\u{110000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);
+    test("\\u{f00000000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);
+
+    test("\\u{0}"sv, 0);
+    test("\\u{41}"sv, 0x41);
+    test("\\u{ffff}"sv, 0xffff);
+    test("\\u{10ffff}"sv, 0x10ffff);
+
+    test("\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\ud800\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\ud800\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\ud800\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\ud800\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+    test("\\ud800\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
+
+    test("\\u0000"sv, 0x0);
+    test("\\u0041"sv, 0x41);
+    test("\\uffff"sv, 0xffff);
+
+    test("\\ud83d"sv, 0xd83d);
+    test("\\ud83d\\u1111"sv, 0xd83d);
+    test("\\ud83d\\ude00"sv, 0x1f600);
+    test("\\ud83d\\ude00"sv, 0xd83d, false);
+}