diff --git a/AK/Tests/TestUtf8.cpp b/AK/Tests/TestUtf8.cpp index e9c437d6ff..d61209e63b 100644 --- a/AK/Tests/TestUtf8.cpp +++ b/AK/Tests/TestUtf8.cpp @@ -48,7 +48,9 @@ TEST_CASE(decode_ascii) TEST_CASE(decode_utf8) { Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" }; - EXPECT(utf8.validate()); + size_t valid_bytes; + EXPECT(utf8.validate(valid_bytes)); + EXPECT(valid_bytes == (size_t)utf8.byte_length()); u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; size_t expected_size = sizeof(expected) / sizeof(expected[0]); @@ -64,21 +66,26 @@ TEST_CASE(decode_utf8) TEST_CASE(validate_invalid_ut8) { + size_t valid_bytes; char invalid_utf8_1[] = { 42, 35, (char)182, 9, 0 }; Utf8View utf8_1 { invalid_utf8_1 }; - EXPECT(!utf8_1.validate()); + EXPECT(!utf8_1.validate(valid_bytes)); + EXPECT(valid_bytes == 2); char invalid_utf8_2[] = { 42, 35, (char)208, (char)208, 0 }; Utf8View utf8_2 { invalid_utf8_2 }; - EXPECT(!utf8_2.validate()); + EXPECT(!utf8_2.validate(valid_bytes)); + EXPECT(valid_bytes == 2); char invalid_utf8_3[] = { (char)208, 0 }; Utf8View utf8_3 { invalid_utf8_3 }; - EXPECT(!utf8_3.validate()); + EXPECT(!utf8_3.validate(valid_bytes)); + EXPECT(valid_bytes == 0); char invalid_utf8_4[] = { (char)208, 35, 0 }; Utf8View utf8_4 { invalid_utf8_4 }; - EXPECT(!utf8_4.validate()); + EXPECT(!utf8_4.validate(valid_bytes)); + EXPECT(valid_bytes == 0); } TEST_MAIN(UTF8) diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp index b7bdbd43bc..a2cd2fa2a7 100644 --- a/AK/Utf8View.cpp +++ b/AK/Utf8View.cpp @@ -111,8 +111,9 @@ static inline bool decode_first_byte( return false; } -bool Utf8View::validate() const +bool Utf8View::validate(size_t& valid_bytes) const { + valid_bytes = 0; for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) { int codepoint_length_in_bytes; u32 value; @@ -127,6 +128,8 @@ bool Utf8View::validate() const if (*ptr >> 6 != 2) return false; } + + valid_bytes += codepoint_length_in_bytes; } return true; diff --git a/AK/Utf8View.h b/AK/Utf8View.h index 5946a6f5e5..d9d43a273f 100644 --- a/AK/Utf8View.h +++ b/AK/Utf8View.h @@ -37,7 +37,7 @@ class Utf8CodepointIterator { friend class Utf8View; public: - ~Utf8CodepointIterator() {} + ~Utf8CodepointIterator() { } bool operator==(const Utf8CodepointIterator&) const; bool operator!=(const Utf8CodepointIterator&) const; @@ -57,7 +57,7 @@ public: explicit Utf8View(const String&); explicit Utf8View(const StringView&); explicit Utf8View(const char*); - ~Utf8View() {} + ~Utf8View() { } const StringView& as_string() const { return m_string; } @@ -70,7 +70,12 @@ public: Utf8View substring_view(int byte_offset, int byte_length) const; bool is_empty() const { return m_string.is_empty(); } - bool validate() const; + bool validate(size_t& valid_bytes) const; + bool validate() const + { + size_t valid_bytes; + return validate(valid_bytes); + } size_t length_in_codepoints() const;