diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp index ad74dd3f3d..be02312cf0 100644 --- a/AK/URLParser.cpp +++ b/AK/URLParser.cpp @@ -215,7 +215,7 @@ URL URLParser::parse(Badge, const StringView& raw_input, URL const* base_ur Utf8CodePointIterator iterator = input.begin(); auto get_remaining = [&input, &iterator] { - return input.substring_view(iterator - input.begin() + iterator.code_point_length_in_bytes()).as_string(); + return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string(); }; // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop. diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp index 16d4e39633..f967781644 100644 --- a/AK/Utf8View.cpp +++ b/AK/Utf8View.cpp @@ -52,7 +52,7 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) cons for (auto iterator = begin(); !iterator.done(); ++iterator) { if (current_offset >= byte_offset) return iterator; - current_offset += iterator.code_point_length_in_bytes(); + current_offset += iterator.underlying_code_point_length_in_bytes(); } return end(); } @@ -193,29 +193,48 @@ Utf8CodePointIterator& Utf8CodePointIterator::operator++() { VERIFY(m_length > 0); - size_t code_point_length_in_bytes = 0; - u32 value; - bool first_byte_makes_sense = decode_first_byte(*m_ptr, code_point_length_in_bytes, value); + size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes(); + if (code_point_length_in_bytes > m_length) { + // We don't have enough data for the next code point. Skip one character and try again. + // The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards. + dbgln("Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length); + m_ptr += 1; + m_length -= 1; + return *this; + } - VERIFY(first_byte_makes_sense); - - VERIFY(code_point_length_in_bytes <= m_length); m_ptr += code_point_length_in_bytes; m_length -= code_point_length_in_bytes; - return *this; } -size_t Utf8CodePointIterator::code_point_length_in_bytes() const +size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const { VERIFY(m_length > 0); size_t code_point_length_in_bytes = 0; u32 value; bool first_byte_makes_sense = decode_first_byte(*m_ptr, code_point_length_in_bytes, value); - VERIFY(first_byte_makes_sense); + + // If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1. + if (!first_byte_makes_sense) + return 1; + + if (code_point_length_in_bytes > m_length) + return 1; + + for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { + if (m_ptr[offset] >> 6 != 2) + return 1; + } + return code_point_length_in_bytes; } +ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const +{ + return { m_ptr, underlying_code_point_length_in_bytes() }; +} + u32 Utf8CodePointIterator::operator*() const { VERIFY(m_length > 0); @@ -224,15 +243,26 @@ u32 Utf8CodePointIterator::operator*() const size_t code_point_length_in_bytes = 0; bool first_byte_makes_sense = decode_first_byte(m_ptr[0], code_point_length_in_bytes, code_point_value_so_far); - if (!first_byte_makes_sense) + + if (!first_byte_makes_sense) { + // The first byte of the code point doesn't make sense: output a replacement character dbgln("First byte doesn't make sense, bytes: {}", StringView { (const char*)m_ptr, m_length }); - VERIFY(first_byte_makes_sense); - if (code_point_length_in_bytes > m_length) + return 0xFFFD; + } + + if (code_point_length_in_bytes > m_length) { + // There is not enough data left for the full code point: output a replacement character dbgln("Not enough bytes (need {}, have {}), first byte is: {:#02x}, '{}'", code_point_length_in_bytes, m_length, m_ptr[0], (const char*)m_ptr); - VERIFY(code_point_length_in_bytes <= m_length); + return 0xFFFD; + } for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { - VERIFY(m_ptr[offset] >> 6 == 2); + if (m_ptr[offset] >> 6 != 2) { + // One of the extension bytes of the code point doesn't make sense: output a replacement character + dbgln("Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]); + return 0xFFFD; + } + code_point_value_so_far <<= 6; code_point_value_so_far |= m_ptr[offset] & 63; } diff --git a/AK/Utf8View.h b/AK/Utf8View.h index 0e12af7a61..9e96e1e000 100644 --- a/AK/Utf8View.h +++ b/AK/Utf8View.h @@ -33,7 +33,13 @@ public: return m_ptr - other.m_ptr; } - size_t code_point_length_in_bytes() const; + // Note : These methods return the information about the underlying UTF-8 bytes. + // If the UTF-8 string encoding is not valid at the iterator's position, then the underlying bytes might be different from the + // decoded character's re-encoded bytes (which will be an `0xFFFD REPLACEMENT CHARACTER` with an UTF-8 length of three bytes). + // If your code relies on the decoded character being equivalent to the re-encoded character, use the `UTF8View::validate()` + // method on the view prior to using its iterator. + size_t underlying_code_point_length_in_bytes() const; + ReadonlyBytes underlying_code_point_bytes() const; bool done() const { return m_length == 0; } private: diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp index 6d0187fab9..a458d5fa66 100644 --- a/Tests/AK/TestUtf8.cpp +++ b/Tests/AK/TestUtf8.cpp @@ -6,6 +6,7 @@ #include +#include #include TEST_CASE(decode_ascii) @@ -33,12 +34,15 @@ TEST_CASE(decode_utf8) EXPECT(valid_bytes == (size_t)utf8.byte_length()); u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 }; + String expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "こ", "ん", "に", "ち", "は", "世", "界" }; size_t expected_size = sizeof(expected) / sizeof(expected[0]); size_t i = 0; - for (u32 code_point : utf8) { + for (auto it = utf8.begin(); it != utf8.end(); ++it) { + u32 code_point = *it; VERIFY(i < expected_size); EXPECT_EQ(code_point, expected[i]); + EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); i++; } EXPECT_EQ(i, expected_size); @@ -103,3 +107,78 @@ TEST_CASE(iterate_utf8) return Test::Crash::Failure::DidNotCrash; }); } + +TEST_CASE(decode_invalid_ut8) +{ + // Test case 1 : Getting an extension byte as first byte of the code point + { + char raw_data[] = { 'a', 'b', (char)0xA0, 'd', 0 }; + Utf8View view { raw_data }; + u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' }; + String expected_underlying_bytes[] = { "a", "b", "\xA0", "d" }; + size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); + size_t i = 0; + for (auto it = view.begin(); it != view.end(); ++it) { + u32 code_point = *it; + VERIFY(i < expected_size); + EXPECT_EQ(code_point, expected_characters[i]); + EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); + i++; + } + VERIFY(i == expected_size); + } + + // Test case 2 : Getting a non-extension byte when an extension byte is expected + { + char raw_data[] = { 'a', 'b', (char)0xC0, 'd', 'e', 0 }; + Utf8View view { raw_data }; + u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' }; + String expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" }; + size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); + size_t i = 0; + for (auto it = view.begin(); it != view.end(); ++it) { + u32 code_point = *it; + VERIFY(i < expected_size); + EXPECT_EQ(code_point, expected_characters[i]); + EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); + i++; + } + VERIFY(i == expected_size); + } + + // Test case 3 : Not enough bytes before the end of the string + { + char raw_data[] = { 'a', 'b', (char)0x90, 'd', 0 }; + Utf8View view { raw_data }; + u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' }; + String expected_underlying_bytes[] = { "a", "b", "\x90", "d" }; + size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); + size_t i = 0; + for (auto it = view.begin(); it != view.end(); ++it) { + u32 code_point = *it; + VERIFY(i < expected_size); + EXPECT_EQ(code_point, expected_characters[i]); + EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); + i++; + } + VERIFY(i == expected_size); + } + + // Test case 4 : Not enough bytes at the end of the string + { + char raw_data[] = { 'a', 'b', 'c', (char)0x90, 0 }; + Utf8View view { raw_data }; + u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD }; + String expected_underlying_bytes[] = { "a", "b", "c", "\x90" }; + size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]); + size_t i = 0; + for (auto it = view.begin(); it != view.end(); ++it) { + u32 code_point = *it; + VERIFY(i < expected_size); + EXPECT_EQ(code_point, expected_characters[i]); + EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes()); + i++; + } + VERIFY(i == expected_size); + } +} diff --git a/Userland/Libraries/LibWeb/Layout/TextNode.cpp b/Userland/Libraries/LibWeb/Layout/TextNode.cpp index 6f7818c55f..1520788d0c 100644 --- a/Userland/Libraries/LibWeb/Layout/TextNode.cpp +++ b/Userland/Libraries/LibWeb/Layout/TextNode.cpp @@ -126,7 +126,7 @@ void TextNode::compute_text_for_rendering(bool collapse, bool previous_is_empty_ skip_over_whitespace(); for (; it != utf8_view.end(); ++it) { if (!is_ascii_space(*it)) { - builder.append(utf8_view.as_string().characters_without_null_termination() + utf8_view.byte_offset_of(it), it.code_point_length_in_bytes()); + builder.append(StringView { it.underlying_code_point_bytes() }); } else { builder.append(' '); skip_over_whitespace(); diff --git a/Userland/Libraries/LibWeb/Page/EditEventHandler.cpp b/Userland/Libraries/LibWeb/Page/EditEventHandler.cpp index c1f96546aa..a5d262ffd3 100644 --- a/Userland/Libraries/LibWeb/Page/EditEventHandler.cpp +++ b/Userland/Libraries/LibWeb/Page/EditEventHandler.cpp @@ -26,7 +26,7 @@ void EditEventHandler::handle_delete_character_after(const DOM::Position& cursor auto& node = *static_cast(const_cast(cursor_position.node())); auto& text = node.data(); - auto code_point_length = Utf8View(text).iterator_at_byte_offset(cursor_position.offset()).code_point_length_in_bytes(); + auto code_point_length = Utf8View(text).iterator_at_byte_offset(cursor_position.offset()).underlying_code_point_length_in_bytes(); StringBuilder builder; builder.append(text.substring_view(0, cursor_position.offset()));