mirror of
https://github.com/SerenityOS/serenity
synced 2024-07-22 18:46:18 +00:00
AK: Add a way to get the number of valid bytes in a Utf8View
This commit is contained in:
parent
07c765e258
commit
a4e0b585fe
|
@@ -48,7 +48,9 @@ TEST_CASE(decode_ascii)
|
||||||
TEST_CASE(decode_utf8)
|
TEST_CASE(decode_utf8)
|
||||||
{
|
{
|
||||||
Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
|
Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
|
||||||
EXPECT(utf8.validate());
|
size_t valid_bytes;
|
||||||
|
EXPECT(utf8.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == (size_t)utf8.byte_length());
|
||||||
|
|
||||||
u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
|
u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
|
||||||
size_t expected_size = sizeof(expected) / sizeof(expected[0]);
|
size_t expected_size = sizeof(expected) / sizeof(expected[0]);
|
||||||
|
@@ -64,21 +66,26 @@ TEST_CASE(decode_utf8)
|
||||||
|
|
||||||
TEST_CASE(validate_invalid_ut8)
|
TEST_CASE(validate_invalid_ut8)
|
||||||
{
|
{
|
||||||
|
size_t valid_bytes;
|
||||||
char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 };
|
char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 };
|
||||||
Utf8View utf8_1 { invalid_utf8_1 };
|
Utf8View utf8_1 { invalid_utf8_1 };
|
||||||
EXPECT(!utf8_1.validate());
|
EXPECT(!utf8_1.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 };
|
char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 };
|
||||||
Utf8View utf8_2 { invalid_utf8_2 };
|
Utf8View utf8_2 { invalid_utf8_2 };
|
||||||
EXPECT(!utf8_2.validate());
|
EXPECT(!utf8_2.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
char invalid_utf8_3[] = { (char)208, 0 };
|
char invalid_utf8_3[] = { (char)208, 0 };
|
||||||
Utf8View utf8_3 { invalid_utf8_3 };
|
Utf8View utf8_3 { invalid_utf8_3 };
|
||||||
EXPECT(!utf8_3.validate());
|
EXPECT(!utf8_3.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 0);
|
||||||
|
|
||||||
char invalid_utf8_4[] = { (char)208, 35, 0 };
|
char invalid_utf8_4[] = { (char)208, 35, 0 };
|
||||||
Utf8View utf8_4 { invalid_utf8_4 };
|
Utf8View utf8_4 { invalid_utf8_4 };
|
||||||
EXPECT(!utf8_4.validate());
|
EXPECT(!utf8_4.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_MAIN(UTF8)
|
TEST_MAIN(UTF8)
|
||||||
|
|
|
@@ -111,8 +111,9 @@ static inline bool decode_first_byte(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf8View::validate() const
|
bool Utf8View::validate(size_t& valid_bytes) const
|
||||||
{
|
{
|
||||||
|
valid_bytes = 0;
|
||||||
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
|
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
|
||||||
int codepoint_length_in_bytes;
|
int codepoint_length_in_bytes;
|
||||||
u32 value;
|
u32 value;
|
||||||
|
@@ -127,6 +128,8 @@ bool Utf8View::validate() const
|
||||||
if (*ptr >> 6 != 2)
|
if (*ptr >> 6 != 2)
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
valid_bytes += codepoint_length_in_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@@ -37,7 +37,7 @@ class Utf8CodepointIterator {
|
||||||
friend class Utf8View;
|
friend class Utf8View;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
~Utf8CodepointIterator() {}
|
~Utf8CodepointIterator() { }
|
||||||
|
|
||||||
bool operator==(const Utf8CodepointIterator&) const;
|
bool operator==(const Utf8CodepointIterator&) const;
|
||||||
bool operator!=(const Utf8CodepointIterator&) const;
|
bool operator!=(const Utf8CodepointIterator&) const;
|
||||||
|
@@ -57,7 +57,7 @@ public:
|
||||||
explicit Utf8View(const String&);
|
explicit Utf8View(const String&);
|
||||||
explicit Utf8View(const StringView&);
|
explicit Utf8View(const StringView&);
|
||||||
explicit Utf8View(const char*);
|
explicit Utf8View(const char*);
|
||||||
~Utf8View() {}
|
~Utf8View() { }
|
||||||
|
|
||||||
const StringView& as_string() const { return m_string; }
|
const StringView& as_string() const { return m_string; }
|
||||||
|
|
||||||
|
@@ -70,7 +70,12 @@ public:
|
||||||
Utf8View substring_view(int byte_offset, int byte_length) const;
|
Utf8View substring_view(int byte_offset, int byte_length) const;
|
||||||
bool is_empty() const { return m_string.is_empty(); }
|
bool is_empty() const { return m_string.is_empty(); }
|
||||||
|
|
||||||
bool validate() const;
|
bool validate(size_t& valid_bytes) const;
|
||||||
|
bool validate() const
|
||||||
|
{
|
||||||
|
size_t valid_bytes;
|
||||||
|
return validate(valid_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
size_t length_in_codepoints() const;
|
size_t length_in_codepoints() const;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue