From 8283e8b88c5ae78d72894cafa765dc5e0289161d Mon Sep 17 00:00:00 2001 From: MacDue Date: Sun, 9 Apr 2023 14:21:00 +0100 Subject: [PATCH] AK: Don't store parts of URLs percent decoded As noted in serval comments doing this goes against the WC3 spec, and breaks parsing then re-serializing URLs that contain percent encoded data, that was not encoded using the same character set as the serializer. For example, previously if you had a URL like: https:://foo.com/what%2F%2F (the path is what + '//' percent encoded) Creating URL("https:://foo.com/what%2F%2F").serialize() would return: https://foo.com/what// Which is incorrect and not the same as the URL we passed. This is because the re-serializing uses the PercentEncodeSet::Path which does not include '/'. Only doing the percent encoding in the setters fixes this, which is required to navigate to Google Street View (which includes a percent encoded URL in its URL). Seems to fix #13477 too --- AK/URL.cpp | 69 ++++++++++++++++++++--------- AK/URL.h | 23 ++++++---- AK/URLParser.cpp | 31 +++++-------- Tests/AK/TestURL.cpp | 11 ++++- Userland/Libraries/LibWeb/URL/URL.h | 2 +- 5 files changed, 85 insertions(+), 51 deletions(-) diff --git a/AK/URL.cpp b/AK/URL.cpp index 16cee01228..2346ae22b6 100644 --- a/AK/URL.cpp +++ b/AK/URL.cpp @@ -47,20 +47,32 @@ URL URL::complete_url(StringView relative_url) const return URLParser::parse(relative_url, *this); } +// NOTE: This only exists for compatibility with the existing URL tests which check for both .is_null() and .is_empty(). +static DeprecatedString deprecated_string_percent_encode(DeprecatedString const& input, URL::PercentEncodeSet set = URL::PercentEncodeSet::Userinfo, URL::SpaceAsPlus space_as_plus = URL::SpaceAsPlus::No) +{ + if (input.is_null() || input.is_empty()) + return input; + return URL::percent_encode(input.view(), set, space_as_plus); +} + void URL::set_scheme(DeprecatedString scheme) { m_scheme = move(scheme); m_valid = compute_validity(); } -void URL::set_username(DeprecatedString username) +void URL::set_username(DeprecatedString username, ApplyPercentEncoding apply_percent_encoding) { + if (apply_percent_encoding == ApplyPercentEncoding::Yes) + username = deprecated_string_percent_encode(username, PercentEncodeSet::Userinfo); m_username = move(username); m_valid = compute_validity(); } -void URL::set_password(DeprecatedString password) +void URL::set_password(DeprecatedString password, ApplyPercentEncoding apply_percent_encoding) { + if (apply_percent_encoding == ApplyPercentEncoding::Yes) + password = deprecated_string_percent_encode(password, PercentEncodeSet::Userinfo); m_password = move(password); m_valid = compute_validity(); } @@ -81,19 +93,38 @@ void URL::set_port(Optional port) m_valid = compute_validity(); } -void URL::set_paths(Vector paths) +void URL::set_paths(Vector paths, ApplyPercentEncoding apply_percent_encoding) { - m_paths = move(paths); + if (apply_percent_encoding == ApplyPercentEncoding::Yes) { + Vector encoded_paths; + encoded_paths.ensure_capacity(paths.size()); + for (auto& segment : paths) + encoded_paths.unchecked_append(deprecated_string_percent_encode(segment, PercentEncodeSet::Path)); + m_paths = move(encoded_paths); + } else { + m_paths = move(paths); + } m_valid = compute_validity(); } -void URL::set_query(DeprecatedString query) +void URL::append_path(DeprecatedString path, ApplyPercentEncoding apply_percent_encoding) { + if (apply_percent_encoding == ApplyPercentEncoding::Yes) + path = deprecated_string_percent_encode(path, PercentEncodeSet::Path); + m_paths.append(path); +} + +void URL::set_query(DeprecatedString query, ApplyPercentEncoding apply_percent_encoding) +{ + if (apply_percent_encoding == ApplyPercentEncoding::Yes) + query = deprecated_string_percent_encode(query, is_special() ? PercentEncodeSet::SpecialQuery : PercentEncodeSet::Query); m_query = move(query); } -void URL::set_fragment(DeprecatedString fragment) +void URL::set_fragment(DeprecatedString fragment, ApplyPercentEncoding apply_percent_encoding) { + if (apply_percent_encoding == ApplyPercentEncoding::Yes) + fragment = deprecated_string_percent_encode(fragment, PercentEncodeSet::Fragment); m_fragment = move(fragment); } @@ -171,9 +202,8 @@ URL URL::create_with_file_scheme(DeprecatedString const& path, DeprecatedString // This is because a file URL always needs a non-null hostname. url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname); url.set_paths(lexical_path.parts()); - // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. if (path.ends_with('/')) - url.append_path(""); + url.append_slash(); url.set_fragment(fragment); return url; } @@ -188,9 +218,8 @@ URL URL::create_with_help_scheme(DeprecatedString const& path, DeprecatedString // This is because a file URL always needs a non-null hostname. url.set_host(hostname.is_null() || hostname == "localhost" ? DeprecatedString::empty() : hostname); url.set_paths(lexical_path.parts()); - // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. if (path.ends_with('/')) - url.append_path(""); + url.append_slash(); url.set_fragment(fragment); return url; } @@ -242,10 +271,10 @@ DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const builder.append("//"sv); if (includes_credentials()) { - builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo)); + builder.append(m_username); if (!m_password.is_empty()) { builder.append(':'); - builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo)); + builder.append(m_password); } builder.append('@'); } @@ -256,24 +285,24 @@ DeprecatedString URL::serialize(ExcludeFragment exclude_fragment) const } if (cannot_be_a_base_url()) { - builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); + builder.append(m_paths[0]); } else { if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) builder.append("/."sv); for (auto& segment : m_paths) { builder.append('/'); - builder.append(percent_encode(segment, PercentEncodeSet::Path)); + builder.append(segment); } } if (!m_query.is_null()) { builder.append('?'); - builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); + builder.append(m_query); } if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) { builder.append('#'); - builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); + builder.append(m_fragment); } return builder.to_deprecated_string(); @@ -300,24 +329,24 @@ DeprecatedString URL::serialize_for_display() const } if (cannot_be_a_base_url()) { - builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path)); + builder.append(m_paths[0]); } else { if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty()) builder.append("/."sv); for (auto& segment : m_paths) { builder.append('/'); - builder.append(percent_encode(segment, PercentEncodeSet::Path)); + builder.append(segment); } } if (!m_query.is_null()) { builder.append('?'); - builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query)); + builder.append(m_query); } if (!m_fragment.is_null()) { builder.append('#'); - builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment)); + builder.append(m_fragment); } return builder.to_deprecated_string(); diff --git a/AK/URL.h b/AK/URL.h index 5a0f7bb65c..a5cd5fa2e3 100644 --- a/AK/URL.h +++ b/AK/URL.h @@ -19,8 +19,6 @@ namespace AK { -// NOTE: The member variables cannot contain any percent encoded sequences. -// The URL parser automatically decodes those sequences and the serialize() method will re-encode them as necessary. class URL { friend class URLParser; @@ -70,16 +68,25 @@ public: bool includes_credentials() const { return !m_username.is_empty() || !m_password.is_empty(); } bool is_special() const { return is_special_scheme(m_scheme); } + enum class ApplyPercentEncoding { + Yes, + No + }; void set_scheme(DeprecatedString); - void set_username(DeprecatedString); - void set_password(DeprecatedString); + void set_username(DeprecatedString username, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); + void set_password(DeprecatedString password, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); void set_host(DeprecatedString); void set_port(Optional); - void set_paths(Vector); - void set_query(DeprecatedString); - void set_fragment(DeprecatedString); + void set_paths(Vector password, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); + void set_query(DeprecatedString query, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); + void set_fragment(DeprecatedString fragment, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); void set_cannot_be_a_base_url(bool value) { m_cannot_be_a_base_url = value; } - void append_path(DeprecatedString path) { m_paths.append(move(path)); } + void append_path(DeprecatedString path, ApplyPercentEncoding apply_percent_encoding = ApplyPercentEncoding::Yes); + void append_slash() + { + // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment. + append_path("", ApplyPercentEncoding::No); + } DeprecatedString path() const; DeprecatedString basename() const; diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp index 7b570f6257..842f7f4a0a 100644 --- a/AK/URLParser.cpp +++ b/AK/URLParser.cpp @@ -194,9 +194,6 @@ Optional URLParser::parse_data_url(StringView raw_input) // future for validation of URLs, which would then lead to infinite recursion. // The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member // variables' values here, not what the URL class presents to its users. -// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting -// some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode -// everything before setting the member variables. URL URLParser::parse(StringView raw_input, Optional const& base_url, Optional url, Optional state_override) { dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input); @@ -310,7 +307,7 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option ++iterator; } else { url->m_cannot_be_a_base_url = true; - url->append_path(""); + url->append_slash(); state = State::CannotBeABaseUrlPath; } } else { @@ -441,13 +438,11 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option if (password_token_seen) { builder.append(url->password()); URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); - // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. - url->m_password = URL::percent_decode(builder.string_view()); + url->m_password = builder.string_view(); } else { builder.append(url->username()); URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); - // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. - url->m_username = URL::percent_decode(builder.string_view()); + url->m_username = builder.string_view(); } } buffer.clear(); @@ -561,7 +556,7 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option url->m_host = base_url->m_host; auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string(); if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0])) - url->append_path(base_url->m_paths[0]); + url->append_path(base_url->m_paths[0], URL::ApplyPercentEncoding::No); state = State::Path; continue; } @@ -616,9 +611,9 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0]))) url->m_paths.remove(url->m_paths.size() - 1); if (code_point != '/' && !(url->is_special() && code_point == '\\')) - url->append_path(""); + url->append_slash(); } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) { - url->append_path(""); + url->append_slash(); } else if (!is_single_dot_path_segment(buffer.string_view())) { if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) { auto drive_letter = buffer.string_view()[0]; @@ -626,8 +621,7 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option buffer.append(drive_letter); buffer.append(':'); } - // NOTE: This needs to be percent decoded since the member variables contain decoded data. - url->append_path(URL::percent_decode(buffer.string_view())); + url->append_path(buffer.string_view(), URL::ApplyPercentEncoding::No); } buffer.clear(); if (code_point == '?') { @@ -649,13 +643,12 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option // NOTE: Verify that the assumptions required for this simplification are correct. VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty()); if (code_point == '?') { - // NOTE: This needs to be percent decoded since the member variables contain decoded data. - url->m_paths[0] = URL::percent_decode(buffer.string_view()); + url->m_paths[0] = buffer.string_view(); url->m_query = ""; state = State::Query; } else if (code_point == '#') { // NOTE: This needs to be percent decoded since the member variables contain decoded data. - url->m_paths[0] = URL::percent_decode(buffer.string_view()); + url->m_paths[0] = buffer.string_view(); url->m_fragment = ""; state = State::Fragment; } else { @@ -665,8 +658,7 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option if (code_point != end_of_file) { URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control); } else { - // NOTE: This needs to be percent decoded since the member variables contain decoded data. - url->m_paths[0] = URL::percent_decode(buffer.string_view()); + url->m_paths[0] = buffer.string_view(); } } break; @@ -696,8 +688,7 @@ URL URLParser::parse(StringView raw_input, Optional const& base_url, Option // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. buffer.append_code_point(code_point); } else { - // NOTE: This needs to be percent decoded since the member variables contain decoded data. - url->m_fragment = URL::percent_decode(buffer.string_view()); + url->m_fragment = buffer.string_view(); buffer.clear(); } break; diff --git a/Tests/AK/TestURL.cpp b/Tests/AK/TestURL.cpp index 49ea621640..b0e5c453fb 100644 --- a/Tests/AK/TestURL.cpp +++ b/Tests/AK/TestURL.cpp @@ -151,7 +151,7 @@ TEST_CASE(file_url_with_encoded_characters) URL url("file:///my/file/test%23file.txt"sv); EXPECT(url.is_valid()); EXPECT_EQ(url.scheme(), "file"); - EXPECT_EQ(url.path(), "/my/file/test#file.txt"); + EXPECT_EQ(url.path(), "/my/file/test%23file.txt"); EXPECT(url.query().is_null()); EXPECT(url.fragment().is_null()); } @@ -389,7 +389,7 @@ TEST_CASE(unicode) { URL url { "http://example.com/_ünicöde_téxt_©"sv }; EXPECT(url.is_valid()); - EXPECT_EQ(url.path(), "/_ünicöde_téxt_©"); + EXPECT_EQ(url.path(), "/_%C3%BCnic%C3%B6de_t%C3%A9xt_%C2%A9"); EXPECT(url.query().is_null()); EXPECT(url.fragment().is_null()); } @@ -415,3 +415,10 @@ TEST_CASE(empty_url_with_base_url) EXPECT_EQ(parsed_url.is_valid(), true); EXPECT(base_url.equals(parsed_url)); } + +TEST_CASE(google_street_view) +{ + constexpr auto streetview_url = "https://www.google.co.uk/maps/@53.3354159,-1.9573545,3a,75y,121.1h,75.67t/data=!3m7!1e1!3m5!1sSY8xCv17jAX4S7SRdV38hg!2e0!6shttps:%2F%2Fstreetviewpixels-pa.googleapis.com%2Fv1%2Fthumbnail%3Fpanoid%3DSY8xCv17jAX4S7SRdV38hg%26cb_client%3Dmaps_sv.tactile.gps%26w%3D203%26h%3D100%26yaw%3D188.13148%26pitch%3D0%26thumbfov%3D100!7i13312!8i6656"; + URL url(streetview_url); + EXPECT_EQ(url.serialize(), streetview_url); +} diff --git a/Userland/Libraries/LibWeb/URL/URL.h b/Userland/Libraries/LibWeb/URL/URL.h index f723308eb2..e17d556f7a 100644 --- a/Userland/Libraries/LibWeb/URL/URL.h +++ b/Userland/Libraries/LibWeb/URL/URL.h @@ -62,7 +62,7 @@ public: WebIDL::ExceptionOr to_json() const; - void set_query(Badge, String query) { m_url.set_query(query.to_deprecated_string()); } + void set_query(Badge, String query) { m_url.set_query(query.to_deprecated_string(), AK::URL::ApplyPercentEncoding::Yes); } private: URL(JS::Realm&, AK::URL, JS::NonnullGCPtr query);