AK: Add AllowSurrogates to UTF-8 validator

The [UTF-8](https://datatracker.ietf.org/doc/html/rfc3629#page-5) standard says to reject strings containing high or low surrogates. However, many standards, ECMAScript included, allow unpaired surrogates (and therefore UTF-8-encoded surrogates) in strings. So, this commit extends the UTF-8 validation API with an `AllowSurrogates` option: passing `AllowSurrogates::No` makes validation reject high and low surrogate code points (U+D800 through U+DFFF), while the default, `AllowSurrogates::Yes`, continues to accept them.
2024-10-01 13:44:21 +00:00 · 2024-06-07 07:25:39 -07:00 · 2024-06-07 07:25:39 -07:00 · a8639245bf
parent 27905af9ec
commit a8639245bf
3 changed files with 21 additions and 8 deletions
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@ -105,10 +105,11 @@ ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit

            TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
        }
-    } else {
+        return builder.to_string_without_validation();
+    }
+
    for (auto code_point : *this)
        TRY(builder.try_append_code_point(code_point));
-    }

    return builder.to_string();
 }
--- a/AK/Utf8View.h
+++ b/AK/Utf8View.h
@ -80,6 +80,11 @@ public:
    explicit Utf8View(ByteString&&) = delete;
 #endif

+    enum class AllowSurrogates {
+        Yes,
+        No,
+    };
+
    ~Utf8View() = default;

    StringView as_string() const { return m_string; }
@ -121,13 +126,13 @@ public:
        return m_length;
    }

-    constexpr bool validate() const
+    constexpr bool validate(AllowSurrogates surrogates = AllowSurrogates::Yes) const
    {
        size_t valid_bytes = 0;
-        return validate(valid_bytes);
+        return validate(valid_bytes, surrogates);
    }

-    constexpr bool validate(size_t& valid_bytes) const
+    constexpr bool validate(size_t& valid_bytes, AllowSurrogates surrogates = AllowSurrogates::Yes) const
    {
        valid_bytes = 0;

@ -148,7 +153,7 @@ public:
                code_point |= code_point_bits;
            }

-            if (!is_valid_code_point(code_point, byte_length))
+            if (!is_valid_code_point(code_point, byte_length, surrogates))
                return false;

            valid_bytes += byte_length;
@ -216,8 +221,10 @@ private:
        return { .is_valid = false };
    }

-    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length)
+    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length, AllowSurrogates surrogates = AllowSurrogates::Yes)
    {
+        if (surrogates == AllowSurrogates::No && byte_length == 3 && code_point >= 0xD800 && code_point <= 0xDFFF)
+            return false;
        for (auto const& data : utf8_encoded_byte_data) {
            if (code_point >= data.first_code_point && code_point <= data.last_code_point)
                return byte_length == data.byte_length;
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@ -80,6 +80,11 @@ TEST_CASE(validate_invalid_ut8)
    Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
    EXPECT(!utf8_6.validate(valid_bytes));
    EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
+    Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
+    EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
+    EXPECT(valid_bytes == 0);
 }

 TEST_CASE(validate_overlong_utf8)