From 1d5b45f7d938b15db9da9b12dc4d8b373abd6c7c Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Mon, 13 Mar 2023 09:39:45 -0400
Subject: [PATCH] AK: Compute UTF-8 code point lengths using only leading bytes

We don't need to decode the entire code point to know its length. This
reduces the runtime of decoding a string containing 5 million instances
of U+10FFFF from over 4 seconds to 0.9 seconds.
---
 AK/Utf8View.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index bcc6171dfd..f3f2a7af30 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -73,9 +73,14 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
 size_t Utf8View::calculate_length() const
 {
     size_t length = 0;
-    for ([[maybe_unused]] auto code_point : *this) {
-        ++length;
+
+    for (size_t i = 0; i < m_string.length(); ++length) {
+        auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i]));
+
+        // Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte.
+        i += is_valid ? byte_length : 1;
     }
+
     return length;
 }
 