LibTextCodec+AK: Don't validate UTF-8 strings twice

UTF8Decoder was already converting invalid data into replacement
characters while converting, so we know for sure we have valid UTF-8
by the time conversion is finished.

This patch adds a new StringBuilder::to_string_without_validation()
and uses it to make UTF8Decoder avoid half the work it was doing.
This commit is contained in:
Andreas Kling 2023-12-29 15:30:15 +01:00
parent a285e36041
commit 3c039903fb
5 changed files with 22 additions and 1 deletions

View file

@ -233,6 +233,19 @@ void String::destroy_string()
m_data->unref();
}
// Builds a String straight from `bytes` without running any UTF-8 validation
// in this function. Inputs that fit in MAX_SHORT_STRING_BYTE_COUNT bytes are
// stored inline as a ShortString; anything longer goes through
// Detail::StringData::from_utf8().
// NOTE(review): callers are presumably expected to pass bytes that are
// already known to be valid UTF-8 — confirm at each call site.
String String::from_utf8_without_validation(ReadonlyBytes bytes)
{
    auto const byte_count = bytes.size();

    // Too long for inline storage: hand the bytes to StringData.
    if (byte_count > MAX_SHORT_STRING_BYTE_COUNT) {
        auto const* characters = reinterpret_cast<char const*>(bytes.data());
        // This function returns a plain String, so a failure cannot be
        // propagated; the result is unwrapped with MUST() instead.
        auto string_data = MUST(Detail::StringData::from_utf8(characters, byte_count));
        return String { move(string_data) };
    }

    ShortString inline_string;
    // Skip the copy entirely for empty input rather than calling memcpy()
    // with a zero length.
    if (byte_count != 0)
        memcpy(inline_string.storage, bytes.data(), byte_count);
    // The byte count is stored shifted left by one, with SHORT_STRING_FLAG
    // OR-ed into the same field.
    inline_string.byte_count_and_short_string_flag = (byte_count << 1) | SHORT_STRING_FLAG;
    return String { inline_string };
}
ErrorOr<String> String::from_utf8(StringView view)
{
if (!Utf8View { view }.validate())

View file

@ -70,6 +70,8 @@ public:
requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
static ErrorOr<String> from_utf8(T&&) = delete;
// Creates a new String from the given bytes without validating them as UTF-8.
// NOTE(review): the caller presumably must guarantee the bytes are already valid UTF-8 — confirm.
[[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes);
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
static ErrorOr<String> from_stream(Stream&, size_t byte_count);

View file

@ -156,6 +156,11 @@ ErrorOr<String> StringBuilder::to_string() const
return String::from_utf8(string_view());
}
// Converts the builder's current contents into a String via
// String::from_utf8_without_validation(), i.e. without the UTF-8 validation
// that to_string() goes through.
String StringBuilder::to_string_without_validation() const
{
    auto const contents = string_view();
    return String::from_utf8_without_validation(contents.bytes());
}
ErrorOr<FlyString> StringBuilder::to_fly_string() const
{
return FlyString::from_utf8(string_view());

View file

@ -73,6 +73,7 @@ public:
[[nodiscard]] ByteString to_byte_string() const;
#endif
// Returns the builder's contents as a String without validating them as UTF-8.
// NOTE(review): callers presumably must ensure the contents are already valid UTF-8 — confirm.
[[nodiscard]] String to_string_without_validation() const;
// Returns the builder's contents as a String via String::from_utf8(), which validates the bytes as UTF-8.
ErrorOr<String> to_string() const;
// Returns the builder's contents as a FlyString via FlyString::from_utf8().
ErrorOr<FlyString> to_fly_string() const;

View file

@ -247,7 +247,7 @@ ErrorOr<String> Decoder::to_utf8(StringView input)
{
StringBuilder builder(input.length());
TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
return builder.to_string();
return builder.to_string_without_validation();
}
ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)