From c90963161e7992c9ca685f21b6fde96c1c989e9a Mon Sep 17 00:00:00 2001 From: Simon Wanner Date: Sun, 2 Jun 2024 15:56:36 +0200 Subject: [PATCH] LibTextCodec: Fix ISO-8859-1 vs. windows-1252 handling in web contexts The Encoding specification maps ISO-8859-1 to windows-1252 and expects the windows-1252 translation table to be used, which differs from ISO-8859-1 for 0x80-0x9F. Other contexts expect to get the actual ISO-8859-1 encoding, with 1-to-1 mapping to U+0000-U+00FF, when requesting it. `decoder_for_exact_name` is introduced, which skips the mapping from aliases to the encoding name done by `get_standardized_encoding`. (cherry picked from commit 6b2c4599017f512279cb26c0d3c48aa5a9453007) --- .../LibTextCodec/GenerateEncodingIndexes.cpp | 1 + .../LibGfx/ImageFormats/JBIG2Loader.cpp | 4 +- .../LibGfx/ImageFormats/JPEG2000Loader.cpp | 2 +- Userland/Libraries/LibTextCodec/Decoder.cpp | 182 +++++++++--------- Userland/Libraries/LibTextCodec/Decoder.h | 9 +- .../Libraries/LibWeb/Encoding/TextDecoder.cpp | 2 +- .../LibWeb/HTML/WindowOrWorkerGlobalScope.cpp | 2 +- 7 files changed, 107 insertions(+), 95 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp index 6377e2e8e1..f18a4692b2 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp @@ -211,6 +211,7 @@ ErrorOr serenity_main(Main::Arguments arguments) { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array()) }, { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array()) }, { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array()) }, + { "windows_1252"sv, prepare_table(data.get("windows-1252"sv)->as_array()) }, }, }; diff --git a/Userland/Libraries/LibGfx/ImageFormats/JBIG2Loader.cpp b/Userland/Libraries/LibGfx/ImageFormats/JBIG2Loader.cpp index 8dcb4022ac..14c45ab867 100644 --- a/Userland/Libraries/LibGfx/ImageFormats/JBIG2Loader.cpp +++ b/Userland/Libraries/LibGfx/ImageFormats/JBIG2Loader.cpp @@ -2497,8 +2497,8 @@ static ErrorOr decode_extension(JBIG2LoadingContext&, SegmentData const& s auto second_bytes = TRY(read_string.template operator()()); - auto first = TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { first_bytes })); - auto second = TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { second_bytes })); + auto first = TRY(TextCodec::decoder_for_exact_name("ISO-8859-1"sv)->to_utf8(StringView { first_bytes })); + auto second = TRY(TextCodec::decoder_for_exact_name("ISO-8859-1"sv)->to_utf8(StringView { second_bytes })); dbgln("JBIG2ImageDecoderPlugin: key '{}', value '{}'", first, second); } if (!stream.is_eof()) diff --git a/Userland/Libraries/LibGfx/ImageFormats/JPEG2000Loader.cpp b/Userland/Libraries/LibGfx/ImageFormats/JPEG2000Loader.cpp index 28fdcf2a68..752b37c936 100644 --- a/Userland/Libraries/LibGfx/ImageFormats/JPEG2000Loader.cpp +++ b/Userland/Libraries/LibGfx/ImageFormats/JPEG2000Loader.cpp @@ -496,7 +496,7 @@ static ErrorOr read_comment(ReadonlyBytes data) dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment: comment_type={}, size()={}", (int)com.type, com.data.size()); if (com.type == Comment::ISO_IEC_8859_15) - dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment, ISO/IEC 8859-15 text: '{}'", TRY(TextCodec::decoder_for("ISO-8859-1"sv)->to_utf8(StringView { com.data }))); + dbgln_if(JPEG2000_DEBUG, "JPEG2000ImageDecoderPlugin: COM marker segment, ISO/IEC 8859-15 text: '{}'", TRY(TextCodec::decoder_for("ISO-8859-15"sv)->to_utf8(StringView { com.data }))); return com; } diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp index e0e795c7c2..268fe67e9a 100644 --- a/Userland/Libraries/LibTextCodec/Decoder.cpp +++ b/Userland/Libraries/LibTextCodec/Decoder.cpp @@ -235,6 +235,8 @@ SingleByteDecoder s_windows874_decoder {{ 0x0E40, 0x0E41, 0x0E42, 0x0E43, 0x0E44, 0x0E45, 0x0E46, 0x0E47, 0x0E48, 0x0E49, 0x0E4A, 0x0E4B, 0x0E4C, 0x0E4D, 0x0E4E, 0x0E4F, 0x0E50, 0x0E51, 0x0E52, 0x0E53, 0x0E54, 0x0E55, 0x0E56, 0x0E57, 0x0E58, 0x0E59, 0x0E5A, 0x0E5B, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, }}; +// https://encoding.spec.whatwg.org/index-windows-1252.txt +SingleByteDecoder s_windows1252_decoder { s_windows_1252_index }; // https://encoding.spec.whatwg.org/index-windows-1253.txt SingleByteDecoder s_windows1253_decoder {{ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x0088, 0x2030, 0x008A, 0x2039, 0x008C, 0x008D, 0x008E, 0x008F, @@ -294,94 +296,97 @@ SingleByteDecoder s_mac_cyrillic_decoder {{ } -Optional decoder_for(StringView a_encoding) +Optional decoder_for(StringView label) { - auto encoding = get_standardized_encoding(a_encoding); - if (encoding.has_value()) { - if (encoding.value().equals_ignoring_ascii_case("windows-1252"sv)) - return s_latin1_decoder; - if (encoding.value().equals_ignoring_ascii_case("utf-8"sv)) - return s_utf8_decoder; - if (encoding.value().equals_ignoring_ascii_case("utf-16be"sv)) - return s_utf16be_decoder; - if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv)) - return s_utf16le_decoder; - if (encoding.value().equals_ignoring_ascii_case("big5"sv)) - return s_big5_decoder; - if (encoding.value().equals_ignoring_ascii_case("euc-jp"sv)) - return s_euc_jp_decoder; - if (encoding.value().equals_ignoring_ascii_case("euc-kr"sv)) - return s_euc_kr_decoder; - if (encoding.value().equals_ignoring_ascii_case("gbk"sv)) - return s_gb18030_decoder; - if (encoding.value().equals_ignoring_ascii_case("gb18030"sv)) - return s_gb18030_decoder; - if (encoding.value().equals_ignoring_ascii_case("ibm866"sv)) - return s_ibm866_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-2022-jp"sv)) - return s_iso_2022_jp_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv)) - return s_latin2_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-3"sv)) - return s_latin3_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-4"sv)) - return s_latin4_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-5"sv)) - return s_latin_cyrillic_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-6"sv)) - return s_latin_arabic_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-7"sv)) - return s_latin_greek_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-8"sv)) - return s_latin_hebrew_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-8-i"sv)) - return s_latin_hebrew_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-10"sv)) - return s_latin6_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-13"sv)) - return s_latin7_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-14"sv)) - return s_latin8_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-15"sv)) - return s_latin9_decoder; - if (encoding.value().equals_ignoring_ascii_case("iso-8859-16"sv)) - return s_latin10_decoder; - if (encoding.value().equals_ignoring_ascii_case("koi8-r"sv)) - return s_koi8r_decoder; - if (encoding.value().equals_ignoring_ascii_case("koi8-u"sv)) - return s_koi8u_decoder; - if (encoding.value().equals_ignoring_ascii_case("macintosh"sv)) - return s_mac_roman_decoder; - if (encoding.value().equals_ignoring_ascii_case("PDFDocEncoding"sv)) - return s_pdf_doc_encoding_decoder; - if (encoding.value().equals_ignoring_ascii_case("replacement"sv)) - return s_replacement_decoder; - if (encoding.value().equals_ignoring_ascii_case("shift_jis"sv)) - return s_shift_jis_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-874"sv)) - return s_windows874_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1250"sv)) - return s_centraleurope_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1251"sv)) - return s_cyrillic_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1253"sv)) - return s_windows1253_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1254"sv)) - return s_turkish_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1255"sv)) - return s_hebrew_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1256"sv)) - return s_windows1256_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1257"sv)) - return s_windows1257_decoder; - if (encoding.value().equals_ignoring_ascii_case("windows-1258"sv)) - return s_windows1258_decoder; - if (encoding.value().equals_ignoring_ascii_case("x-mac-cyrillic"sv)) - return s_mac_cyrillic_decoder; - if (encoding.value().equals_ignoring_ascii_case("x-user-defined"sv)) - return s_x_user_defined_decoder; - } - dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding); + auto encoding = get_standardized_encoding(label); + return encoding.has_value() ? decoder_for_exact_name(encoding.value()) : Optional {}; +} + +Optional decoder_for_exact_name(StringView encoding) +{ + if (encoding.equals_ignoring_ascii_case("iso-8859-1"sv)) + return s_latin1_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1252"sv)) + return s_windows1252_decoder; + if (encoding.equals_ignoring_ascii_case("utf-8"sv)) + return s_utf8_decoder; + if (encoding.equals_ignoring_ascii_case("utf-16be"sv)) + return s_utf16be_decoder; + if (encoding.equals_ignoring_ascii_case("utf-16le"sv)) + return s_utf16le_decoder; + if (encoding.equals_ignoring_ascii_case("big5"sv)) + return s_big5_decoder; + if (encoding.equals_ignoring_ascii_case("euc-jp"sv)) + return s_euc_jp_decoder; + if (encoding.equals_ignoring_ascii_case("euc-kr"sv)) + return s_euc_kr_decoder; + if (encoding.equals_ignoring_ascii_case("gbk"sv)) + return s_gb18030_decoder; + if (encoding.equals_ignoring_ascii_case("gb18030"sv)) + return s_gb18030_decoder; + if (encoding.equals_ignoring_ascii_case("ibm866"sv)) + return s_ibm866_decoder; + if (encoding.equals_ignoring_ascii_case("iso-2022-jp"sv)) + return s_iso_2022_jp_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv)) + return s_latin2_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv)) + return s_latin3_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv)) + return s_latin4_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv)) + return s_latin_cyrillic_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv)) + return s_latin_arabic_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv)) + return s_latin_greek_decoder; + if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv)) + return s_latin_hebrew_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv)) + return s_latin6_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv)) + return s_latin7_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv)) + return s_latin8_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv)) + return s_latin9_decoder; + if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv)) + return s_latin10_decoder; + if (encoding.equals_ignoring_ascii_case("koi8-r"sv)) + return s_koi8r_decoder; + if (encoding.equals_ignoring_ascii_case("koi8-u"sv)) + return s_koi8u_decoder; + if (encoding.equals_ignoring_ascii_case("macintosh"sv)) + return s_mac_roman_decoder; + if (encoding.equals_ignoring_ascii_case("PDFDocEncoding"sv)) + return s_pdf_doc_encoding_decoder; + if (encoding.equals_ignoring_ascii_case("replacement"sv)) + return s_replacement_decoder; + if (encoding.equals_ignoring_ascii_case("shift_jis"sv)) + return s_shift_jis_decoder; + if (encoding.equals_ignoring_ascii_case("windows-874"sv)) + return s_windows874_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1250"sv)) + return s_centraleurope_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1251"sv)) + return s_cyrillic_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1253"sv)) + return s_windows1253_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1254"sv)) + return s_turkish_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1255"sv)) + return s_hebrew_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1256"sv)) + return s_windows1256_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1257"sv)) + return s_windows1257_decoder; + if (encoding.equals_ignoring_ascii_case("windows-1258"sv)) + return s_windows1258_decoder; + if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv)) + return s_mac_cyrillic_decoder; + if (encoding.equals_ignoring_ascii_case("x-user-defined"sv)) + return s_x_user_defined_decoder; + dbgln("TextCodec: No decoder implemented for encoding '{}'", encoding); return {}; } @@ -979,7 +984,8 @@ ErrorOr XUserDefinedDecoder::process(StringView input, Function SingleByteDecoder::process(StringView input, Function(u32)> on_code_point) +template +ErrorOr SingleByteDecoder::process(StringView input, Function(u32)> on_code_point) { for (u8 const byte : input) { if (byte < 0x80) { diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h index 7853d4046e..c1dcd66fe1 100644 --- a/Userland/Libraries/LibTextCodec/Decoder.h +++ b/Userland/Libraries/LibTextCodec/Decoder.h @@ -46,9 +46,10 @@ public: virtual ErrorOr to_utf8(StringView) override; }; +template class SingleByteDecoder final : public Decoder { public: - SingleByteDecoder(Array translation_table) + SingleByteDecoder(Array translation_table) : m_translation_table(translation_table) { } @@ -56,7 +57,7 @@ public: virtual ErrorOr process(StringView, Function(u32)> on_code_point) override; private: - Array m_translation_table; + Array m_translation_table; }; class Latin1Decoder final : public Decoder { @@ -131,6 +132,10 @@ public: virtual bool validate(StringView input) override { return input.is_empty(); } }; +// This will return a decoder for the exact name specified, skipping get_standardized_encoding. +// Use this when you want ISO-8859-1 instead of windows-1252. +Optional decoder_for_exact_name(StringView encoding); + Optional decoder_for(StringView encoding); Optional get_standardized_encoding(StringView encoding); diff --git a/Userland/Libraries/LibWeb/Encoding/TextDecoder.cpp b/Userland/Libraries/LibWeb/Encoding/TextDecoder.cpp index 3fa46ff918..0a42948086 100644 --- a/Userland/Libraries/LibWeb/Encoding/TextDecoder.cpp +++ b/Userland/Libraries/LibWeb/Encoding/TextDecoder.cpp @@ -41,7 +41,7 @@ WebIDL::ExceptionOr> TextDecoder::construct_impl(J auto ignore_bom = options.value_or({}).ignore_bom; // NOTE: This should happen in decode(), but we don't support streaming yet and share decoders across calls. - auto decoder = TextCodec::decoder_for(encoding.value()); + auto decoder = TextCodec::decoder_for_exact_name(encoding.value()); VERIFY(decoder.has_value()); return realm.heap().allocate(realm, realm, *decoder, lowercase_encoding_name, fatal, ignore_bom); diff --git a/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp b/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp index 486ee97987..128fcfa6a6 100644 --- a/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp +++ b/Userland/Libraries/LibWeb/HTML/WindowOrWorkerGlobalScope.cpp @@ -137,7 +137,7 @@ WebIDL::ExceptionOr WindowOrWorkerGlobalScopeMixin::atob(String const& d // 3. Return decodedData. // decode_base64() returns a byte string. LibJS uses UTF-8 for strings. Use Latin1Decoder to convert bytes 128-255 to UTF-8. - auto decoder = TextCodec::decoder_for("windows-1252"sv); + auto decoder = TextCodec::decoder_for_exact_name("ISO-8859-1"sv); VERIFY(decoder.has_value()); return TRY_OR_THROW_OOM(vm, decoder->to_utf8(decoded_data.value())); }