mirror of
https://github.com/SerenityOS/serenity
synced 2024-10-01 13:44:21 +00:00
AK: Add AllowSurrogates
to UTF-8 validator
The [UTF-8](https://datatracker.ietf.org/doc/html/rfc3629#page-5) standard says to reject strings that contain high or low surrogates. However, many standards, ECMAScript included, allow unpaired surrogates (and therefore UTF-8-encoded surrogates) in strings. So, this commit extends the UTF-8 validation API with an `AllowSurrogates` option: passing `AllowSurrogates::No` makes validation reject high and low surrogate code points (U+D800 through U+DFFF), while the default, `AllowSurrogates::Yes`, continues to accept them.
This commit is contained in:
parent
27905af9ec
commit
a8639245bf
|
@ -105,11 +105,12 @@ ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit
|
||||||
|
|
||||||
TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
|
TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
|
||||||
}
|
}
|
||||||
} else {
|
return builder.to_string_without_validation();
|
||||||
for (auto code_point : *this)
|
|
||||||
TRY(builder.try_append_code_point(code_point));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (auto code_point : *this)
|
||||||
|
TRY(builder.try_append_code_point(code_point));
|
||||||
|
|
||||||
return builder.to_string();
|
return builder.to_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -80,6 +80,11 @@ public:
|
||||||
explicit Utf8View(ByteString&&) = delete;
|
explicit Utf8View(ByteString&&) = delete;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
enum class AllowSurrogates {
|
||||||
|
Yes,
|
||||||
|
No,
|
||||||
|
};
|
||||||
|
|
||||||
~Utf8View() = default;
|
~Utf8View() = default;
|
||||||
|
|
||||||
StringView as_string() const { return m_string; }
|
StringView as_string() const { return m_string; }
|
||||||
|
@ -121,13 +126,13 @@ public:
|
||||||
return m_length;
|
return m_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr bool validate() const
|
constexpr bool validate(AllowSurrogates surrogates = AllowSurrogates::Yes) const
|
||||||
{
|
{
|
||||||
size_t valid_bytes = 0;
|
size_t valid_bytes = 0;
|
||||||
return validate(valid_bytes);
|
return validate(valid_bytes, surrogates);
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr bool validate(size_t& valid_bytes) const
|
constexpr bool validate(size_t& valid_bytes, AllowSurrogates surrogates = AllowSurrogates::Yes) const
|
||||||
{
|
{
|
||||||
valid_bytes = 0;
|
valid_bytes = 0;
|
||||||
|
|
||||||
|
@ -148,7 +153,7 @@ public:
|
||||||
code_point |= code_point_bits;
|
code_point |= code_point_bits;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_valid_code_point(code_point, byte_length))
|
if (!is_valid_code_point(code_point, byte_length, surrogates))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
valid_bytes += byte_length;
|
valid_bytes += byte_length;
|
||||||
|
@ -216,8 +221,10 @@ private:
|
||||||
return { .is_valid = false };
|
return { .is_valid = false };
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length)
|
static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length, AllowSurrogates surrogates = AllowSurrogates::Yes)
|
||||||
{
|
{
|
||||||
|
if (surrogates == AllowSurrogates::No && byte_length == 3 && code_point >= 0xD800 && code_point <= 0xDFFF)
|
||||||
|
return false;
|
||||||
for (auto const& data : utf8_encoded_byte_data) {
|
for (auto const& data : utf8_encoded_byte_data) {
|
||||||
if (code_point >= data.first_code_point && code_point <= data.last_code_point)
|
if (code_point >= data.first_code_point && code_point <= data.last_code_point)
|
||||||
return byte_length == data.byte_length;
|
return byte_length == data.byte_length;
|
||||||
|
|
|
@ -80,6 +80,11 @@ TEST_CASE(validate_invalid_ut8)
|
||||||
Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
|
Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
|
||||||
EXPECT(!utf8_6.validate(valid_bytes));
|
EXPECT(!utf8_6.validate(valid_bytes));
|
||||||
EXPECT(valid_bytes == 0);
|
EXPECT(valid_bytes == 0);
|
||||||
|
|
||||||
|
char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
|
||||||
|
Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
|
||||||
|
EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
|
||||||
|
EXPECT(valid_bytes == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(validate_overlong_utf8)
|
TEST_CASE(validate_overlong_utf8)
|
||||||
|
|
Loading…
Reference in a new issue