LibWeb: Port CSS Tokenizer to new Strings

Specifically, this uses FlyString, because the data gets held long-term
as a FlyString anyway.
This commit is contained in:
Sam Atkins 2023-02-15 11:24:38 +00:00 committed by Tim Flynn
parent a419039bb2
commit 3685a8813a
2 changed files with 27 additions and 27 deletions

View file

@ -12,7 +12,7 @@
#include <AK/Vector.h> #include <AK/Vector.h>
#include <LibTextCodec/Decoder.h> #include <LibTextCodec/Decoder.h>
#include <LibWeb/CSS/Parser/Tokenizer.h> #include <LibWeb/CSS/Parser/Tokenizer.h>
#include <math.h> #include <LibWeb/Infra/Strings.h>
namespace Web::CSS::Parser { namespace Web::CSS::Parser {
@ -198,7 +198,7 @@ static inline bool is_E(u32 code_point)
Tokenizer::Tokenizer(StringView input, StringView encoding) Tokenizer::Tokenizer(StringView input, StringView encoding)
{ {
// https://www.w3.org/TR/css-syntax-3/#css-filter-code-points // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
auto filter_code_points = [](StringView input, auto encoding) -> DeprecatedString { auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
auto* decoder = TextCodec::decoder_for(encoding); auto* decoder = TextCodec::decoder_for(encoding);
VERIFY(decoder); VERIFY(decoder);
@ -237,10 +237,10 @@ Tokenizer::Tokenizer(StringView input, StringView encoding)
last_was_carriage_return = false; last_was_carriage_return = false;
} }
}); });
return builder.to_deprecated_string(); return builder.to_string();
}; };
m_decoded_input = filter_code_points(input, encoding); m_decoded_input = filter_code_points(input, encoding).release_value_but_fixme_should_propagate_errors();
m_utf8_view = Utf8View(m_decoded_input); m_utf8_view = Utf8View(m_decoded_input);
m_utf8_iterator = m_utf8_view.begin(); m_utf8_iterator = m_utf8_view.begin();
} }
@ -352,11 +352,11 @@ Token Tokenizer::create_eof_token()
return create_new_token(Token::Type::EndOfFile); return create_new_token(Token::Type::EndOfFile);
} }
Token Tokenizer::create_value_token(Token::Type type, DeprecatedString value) Token Tokenizer::create_value_token(Token::Type type, FlyString&& value)
{ {
Token token; Token token;
token.m_type = type; token.m_type = type;
token.m_value = FlyString::from_utf8(value.view()).release_value_but_fixme_should_propagate_errors(); token.m_value = move(value);
return token; return token;
} }
@ -364,7 +364,7 @@ Token Tokenizer::create_value_token(Token::Type type, u32 value)
{ {
Token token = {}; Token token = {};
token.m_type = type; token.m_type = type;
token.m_value = FlyString(String::from_code_point(value)); token.m_value = String::from_code_point(value);
return token; return token;
} }
@ -397,7 +397,7 @@ u32 Tokenizer::consume_escaped_code_point()
} }
// Interpret the hex digits as a hexadecimal number. // Interpret the hex digits as a hexadecimal number.
auto unhexed = strtoul(builder.to_deprecated_string().characters(), nullptr, 16); auto unhexed = AK::StringUtils::convert_to_uint_from_hex<u32>(builder.string_view()).value_or(0);
// If this number is zero, or is for a surrogate, or is greater than the maximum allowed // If this number is zero, or is for a surrogate, or is greater than the maximum allowed
// code point, return U+FFFD REPLACEMENT CHARACTER (<28>). // code point, return U+FFFD REPLACEMENT CHARACTER (<28>).
if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) { if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
@ -427,11 +427,11 @@ Token Tokenizer::consume_an_ident_like_token()
// It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>. // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>.
// Consume an ident sequence, and let string be the result. // Consume an ident sequence, and let string be the result.
auto string = consume_an_ident_sequence(); auto string = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
// If strings value is an ASCII case-insensitive match for "url", and the next input code // If strings value is an ASCII case-insensitive match for "url", and the next input code
// point is U+0028 LEFT PARENTHESIS ((), consume it. // point is U+0028 LEFT PARENTHESIS ((), consume it.
if (string.equals_ignoring_case("url"sv) && is_left_paren(peek_code_point())) { if (Infra::is_ascii_case_insensitive_match(string, "url"sv) && is_left_paren(peek_code_point())) {
(void)next_code_point(); (void)next_code_point();
// While the next two input code points are whitespace, consume the next input code point. // While the next two input code points are whitespace, consume the next input code point.
@ -449,7 +449,7 @@ Token Tokenizer::consume_an_ident_like_token()
// <function-token> with its value set to string and return it. // <function-token> with its value set to string and return it.
auto next_two = peek_twin(); auto next_two = peek_twin();
if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) { if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
return create_value_token(Token::Type::Function, string); return create_value_token(Token::Type::Function, move(string));
} }
// Otherwise, consume a url token, and return it. // Otherwise, consume a url token, and return it.
@ -461,11 +461,11 @@ Token Tokenizer::consume_an_ident_like_token()
(void)next_code_point(); (void)next_code_point();
// Create a <function-token> with its value set to string and return it. // Create a <function-token> with its value set to string and return it.
return create_value_token(Token::Type::Function, string); return create_value_token(Token::Type::Function, move(string));
} }
// Otherwise, create an <ident-token> with its value set to string and return it. // Otherwise, create an <ident-token> with its value set to string and return it.
return create_value_token(Token::Type::Ident, string); return create_value_token(Token::Type::Ident, move(string));
} }
// https://www.w3.org/TR/css-syntax-3/#consume-number // https://www.w3.org/TR/css-syntax-3/#consume-number
@ -573,7 +573,7 @@ float Tokenizer::convert_a_string_to_a_number(StringView string)
} }
// https://www.w3.org/TR/css-syntax-3/#consume-name // https://www.w3.org/TR/css-syntax-3/#consume-name
DeprecatedString Tokenizer::consume_an_ident_sequence() ErrorOr<FlyString> Tokenizer::consume_an_ident_sequence()
{ {
// This section describes how to consume an ident sequence from a stream of code points. // This section describes how to consume an ident sequence from a stream of code points.
// It returns a string containing the largest name that can be formed from adjacent // It returns a string containing the largest name that can be formed from adjacent
@ -597,14 +597,14 @@ DeprecatedString Tokenizer::consume_an_ident_sequence()
// name code point // name code point
if (is_ident_code_point(input)) { if (is_ident_code_point(input)) {
// Append the code point to result. // Append the code point to result.
result.append_code_point(input); TRY(result.try_append_code_point(input));
continue; continue;
} }
// the stream starts with a valid escape // the stream starts with a valid escape
if (is_valid_escape_sequence(start_of_input_stream_twin())) { if (is_valid_escape_sequence(start_of_input_stream_twin())) {
// Consume an escaped code point. Append the returned code point to result. // Consume an escaped code point. Append the returned code point to result.
result.append_code_point(consume_escaped_code_point()); TRY(result.try_append_code_point(consume_escaped_code_point()));
continue; continue;
} }
@ -614,7 +614,7 @@ DeprecatedString Tokenizer::consume_an_ident_sequence()
break; break;
} }
return result.to_deprecated_string(); return result.to_fly_string();
} }
// https://www.w3.org/TR/css-syntax-3/#consume-url-token // https://www.w3.org/TR/css-syntax-3/#consume-url-token
@ -778,10 +778,10 @@ Token Tokenizer::consume_a_numeric_token()
token.m_number_value = number; token.m_number_value = number;
// 2. Consume an ident sequence. Set the <dimension-token>s unit to the returned value. // 2. Consume an ident sequence. Set the <dimension-token>s unit to the returned value.
auto unit = consume_an_ident_sequence(); auto unit = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
VERIFY(!unit.is_empty()); VERIFY(!unit.is_empty());
// NOTE: We intentionally store this in the `value`, to save space. // NOTE: We intentionally store this in the `value`, to save space.
token.m_value = FlyString::from_utf8(unit.view()).release_value_but_fixme_should_propagate_errors(); token.m_value = move(unit);
// 3. Return the <dimension-token>. // 3. Return the <dimension-token>.
return token; return token;
@ -1065,8 +1065,8 @@ Token Tokenizer::consume_a_token()
token.m_hash_type = Token::HashType::Id; token.m_hash_type = Token::HashType::Id;
// 3. Consume an ident sequence, and set the <hash-token>s value to the returned string. // 3. Consume an ident sequence, and set the <hash-token>s value to the returned string.
auto name = consume_an_ident_sequence(); auto name = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
token.m_value = FlyString::from_utf8(name.view()).release_value_but_fixme_should_propagate_errors(); token.m_value = move(name);
// 4. Return the <hash-token>. // 4. Return the <hash-token>.
return token; return token;
@ -1201,8 +1201,8 @@ Token Tokenizer::consume_a_token()
// If the next 3 input code points would start an ident sequence, consume an ident sequence, create // If the next 3 input code points would start an ident sequence, consume an ident sequence, create
// an <at-keyword-token> with its value set to the returned value, and return it. // an <at-keyword-token> with its value set to the returned value, and return it.
if (would_start_an_ident_sequence(peek_triplet())) { if (would_start_an_ident_sequence(peek_triplet())) {
auto name = consume_an_ident_sequence(); auto name = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
return create_value_token(Token::Type::AtKeyword, name); return create_value_token(Token::Type::AtKeyword, move(name));
} }
// Otherwise, return a <delim-token> with its value set to the current input code point. // Otherwise, return a <delim-token> with its value set to the current input code point.

View file

@ -1,12 +1,12 @@
/* /*
* Copyright (c) 2020-2021, the SerenityOS developers. * Copyright (c) 2020-2021, the SerenityOS developers.
* Copyright (c) 2021-2023, Sam Atkins <atkinssj@serenityos.org>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
#pragma once #pragma once
#include <AK/DeprecatedString.h>
#include <AK/Optional.h> #include <AK/Optional.h>
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Types.h> #include <AK/Types.h>
@ -76,7 +76,7 @@ private:
[[nodiscard]] U32Triplet start_of_input_stream_triplet(); [[nodiscard]] U32Triplet start_of_input_stream_triplet();
[[nodiscard]] static Token create_new_token(Token::Type); [[nodiscard]] static Token create_new_token(Token::Type);
[[nodiscard]] static Token create_value_token(Token::Type, DeprecatedString value); [[nodiscard]] static Token create_value_token(Token::Type, FlyString&& value);
[[nodiscard]] static Token create_value_token(Token::Type, u32 value); [[nodiscard]] static Token create_value_token(Token::Type, u32 value);
[[nodiscard]] Token consume_a_token(); [[nodiscard]] Token consume_a_token();
[[nodiscard]] Token consume_string_token(u32 ending_code_point); [[nodiscard]] Token consume_string_token(u32 ending_code_point);
@ -84,7 +84,7 @@ private:
[[nodiscard]] Token consume_an_ident_like_token(); [[nodiscard]] Token consume_an_ident_like_token();
[[nodiscard]] Number consume_a_number(); [[nodiscard]] Number consume_a_number();
[[nodiscard]] float convert_a_string_to_a_number(StringView); [[nodiscard]] float convert_a_string_to_a_number(StringView);
[[nodiscard]] DeprecatedString consume_an_ident_sequence(); [[nodiscard]] ErrorOr<FlyString> consume_an_ident_sequence();
[[nodiscard]] u32 consume_escaped_code_point(); [[nodiscard]] u32 consume_escaped_code_point();
[[nodiscard]] Token consume_a_url_token(); [[nodiscard]] Token consume_a_url_token();
void consume_the_remnants_of_a_bad_url(); void consume_the_remnants_of_a_bad_url();
@ -95,7 +95,7 @@ private:
[[nodiscard]] static bool would_start_an_ident_sequence(U32Triplet); [[nodiscard]] static bool would_start_an_ident_sequence(U32Triplet);
[[nodiscard]] static bool would_start_a_number(U32Triplet); [[nodiscard]] static bool would_start_a_number(U32Triplet);
DeprecatedString m_decoded_input; String m_decoded_input;
Utf8View m_utf8_view; Utf8View m_utf8_view;
AK::Utf8CodePointIterator m_utf8_iterator; AK::Utf8CodePointIterator m_utf8_iterator;
AK::Utf8CodePointIterator m_prev_utf8_iterator; AK::Utf8CodePointIterator m_prev_utf8_iterator;