From 9812031a02e539f08a6936e9c17d919a44c912b8 Mon Sep 17 00:00:00 2001 From: Jonatan Klemets Date: Sun, 23 Jul 2023 19:38:04 +0300 Subject: [PATCH] LibWeb: Implement spec-compliant integer parsing We have code inside LibWeb that uses the `AK::StringUtils::convert_to_uint`and `AK::StringUtils::convert_to_int` methods for parsing integers. This works well for the most part, but according to the spec, trailing characters are allowed and should be ignored, but this is not how the `StringUtil` methods are implemented. This patch adds two new methods named `parse_integer` and `parse_non_negative_integer` inside the `Web::HTML` namespace that uses `StringUtils` under the hood but adds a bit more logic to make it spec compliant. --- Tests/LibWeb/CMakeLists.txt | 1 + Tests/LibWeb/TestNumbers.cpp | 106 +++++++++++++++++++++ Userland/Libraries/LibWeb/CMakeLists.txt | 1 + Userland/Libraries/LibWeb/HTML/Numbers.cpp | 82 ++++++++++++++++ Userland/Libraries/LibWeb/HTML/Numbers.h | 18 ++++ 5 files changed, 208 insertions(+) create mode 100644 Tests/LibWeb/TestNumbers.cpp create mode 100644 Userland/Libraries/LibWeb/HTML/Numbers.cpp create mode 100644 Userland/Libraries/LibWeb/HTML/Numbers.h diff --git a/Tests/LibWeb/CMakeLists.txt b/Tests/LibWeb/CMakeLists.txt index 9f7f5359e0..0bba6cf7f4 100644 --- a/Tests/LibWeb/CMakeLists.txt +++ b/Tests/LibWeb/CMakeLists.txt @@ -2,6 +2,7 @@ set(TEST_SOURCES TestCSSIDSpeed.cpp TestCSSPixels.cpp TestHTMLTokenizer.cpp + TestNumbers.cpp ) foreach(source IN LISTS TEST_SOURCES) diff --git a/Tests/LibWeb/TestNumbers.cpp b/Tests/LibWeb/TestNumbers.cpp new file mode 100644 index 0000000000..56021212cc --- /dev/null +++ b/Tests/LibWeb/TestNumbers.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, Jonatan Klemets + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include + +TEST_CASE(parse_integer) +{ + auto optional_value = Web::HTML::parse_integer(""sv); + EXPECT(!optional_value.has_value()); + + optional_value = Web::HTML::parse_integer("123"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 123); + + optional_value = Web::HTML::parse_integer(" 456"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 456); + + optional_value = Web::HTML::parse_integer("789 "sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 789); + + optional_value = Web::HTML::parse_integer(" 22 "sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 22); + + optional_value = Web::HTML::parse_integer(" \n\t31\t\t\n\n"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 31); + + optional_value = Web::HTML::parse_integer("765foo"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 765); + + optional_value = Web::HTML::parse_integer("3;"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 3); + + optional_value = Web::HTML::parse_integer("foo765"sv); + EXPECT(!optional_value.has_value()); + + optional_value = Web::HTML::parse_integer("1"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 1); + + optional_value = Web::HTML::parse_integer("+2"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 2); + + optional_value = Web::HTML::parse_integer("-3"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), -3); +} + +TEST_CASE(parse_non_negative_integer) +{ + auto optional_value = Web::HTML::parse_non_negative_integer(""sv); + EXPECT(!optional_value.has_value()); + + optional_value = Web::HTML::parse_non_negative_integer("123"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 123u); + + optional_value = Web::HTML::parse_non_negative_integer(" 456"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 456u); + + optional_value = Web::HTML::parse_non_negative_integer("789 "sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 789u); + + optional_value = Web::HTML::parse_non_negative_integer(" 22 "sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 22u); + + optional_value = Web::HTML::parse_non_negative_integer(" \n\t31\t\t\n\n"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 31u); + + optional_value = Web::HTML::parse_non_negative_integer("765foo"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 765u); + + optional_value = Web::HTML::parse_non_negative_integer("3;"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 3u); + + optional_value = Web::HTML::parse_non_negative_integer("foo765"sv); + EXPECT(!optional_value.has_value()); + + optional_value = Web::HTML::parse_non_negative_integer("1"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 1u); + + optional_value = Web::HTML::parse_non_negative_integer("+2"sv); + EXPECT(optional_value.has_value()); + EXPECT_EQ(optional_value.value(), 2u); + + optional_value = Web::HTML::parse_non_negative_integer("-3"sv); + EXPECT(!optional_value.has_value()); +} diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt index 32b6293aca..969256524e 100644 --- a/Userland/Libraries/LibWeb/CMakeLists.txt +++ b/Userland/Libraries/LibWeb/CMakeLists.txt @@ -358,6 +358,7 @@ set(SOURCES HTML/NavigationTransition.cpp HTML/Navigator.cpp HTML/NavigatorID.cpp + HTML/Numbers.cpp HTML/PageTransitionEvent.cpp HTML/Parser/Entities.cpp HTML/Parser/HTMLEncodingDetection.cpp diff --git a/Userland/Libraries/LibWeb/HTML/Numbers.cpp b/Userland/Libraries/LibWeb/HTML/Numbers.cpp new file mode 100644 index 0000000000..30f8a0a7fe --- /dev/null +++ b/Userland/Libraries/LibWeb/HTML/Numbers.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2023, Jonatan Klemets + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +namespace Web::HTML { + +// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-integers +Optional parse_integer(StringView string) +{ + // 1. Let input be the string being parsed. + // 2. Let position be a pointer into input, initially pointing at the start of the string. + GenericLexer lexer { string }; + + // 3. Let sign have the value "positive". + // NOTE: Skipped, see comment on step 6. + + // 4. Skip ASCII whitespace within input given position. + lexer.ignore_while(Web::Infra::is_ascii_whitespace); + + // 5. If position is past the end of input, return an error. + if (lexer.is_eof()) { + return {}; + } + + // 6. If the character indicated by position (the first character) is a U+002D HYPHEN-MINUS character (-): + // + // If we parse a signed integer, then we include the sign character (if present) in the collect step + // (step 8) and lean on `AK::StringUtils::convert_to_int` to handle it for us. + size_t start_index = lexer.tell(); + if (lexer.peek() == '-' || lexer.peek() == '+') { + lexer.consume(); + } + + // 7. If the character indicated by position is not an ASCII digit, then return an error. + if (!lexer.next_is(is_ascii_digit)) { + return {}; + } + + // 8. Collect a sequence of code points that are ASCII digits from input given position, and interpret the resulting sequence as a base-ten integer. Let value be that integer. + lexer.consume_while(is_ascii_digit); + size_t end_index = lexer.tell(); + auto digits = lexer.input().substring_view(start_index, end_index - start_index); + auto optional_value = AK::StringUtils::convert_to_int(digits); + + // 9. If sign is "positive", return value, otherwise return the result of subtracting value from zero. + // NOTE: Skipped, see comment on step 6. + + return optional_value; +} + +// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-negative-integers +Optional parse_non_negative_integer(StringView string) +{ + // 1. Let input be the string being parsed. + // 2. Let value be the result of parsing input using the rules for parsing integers. + // + // NOTE: Because we call `parse_integer`, we parse all integers as signed. If we need the extra + // size that an unsigned integer offers, then this would need to be improved. That said, + // I don't think we need to support such large integers at the moment. + auto optional_value = parse_integer(string); + + // 3. If value is an error, return an error. + if (!optional_value.has_value()) { + return {}; + } + + // 4. If value is less than zero, return an error. + if (optional_value.value() < 0) { + return {}; + } + + // 5. Return value. + return static_cast(optional_value.value()); +} + +} diff --git a/Userland/Libraries/LibWeb/HTML/Numbers.h b/Userland/Libraries/LibWeb/HTML/Numbers.h new file mode 100644 index 0000000000..d819f36c13 --- /dev/null +++ b/Userland/Libraries/LibWeb/HTML/Numbers.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023, Jonatan Klemets + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include + +namespace Web::HTML { + +Optional parse_integer(StringView string); + +Optional parse_non_negative_integer(StringView string); + +}