diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt
index e1020b2afe..dc024db158 100644
--- a/Meta/Lagom/CMakeLists.txt
+++ b/Meta/Lagom/CMakeLists.txt
@@ -658,6 +658,7 @@ if (BUILD_LAGOM)
         LibCompress
         LibGL
         LibGfx
+        LibIMAP
         LibLocale
         LibMarkdown
         LibPDF
diff --git a/Tests/LibIMAP/CMakeLists.txt b/Tests/LibIMAP/CMakeLists.txt
index 9ca8e420e0..621e396db9 100644
--- a/Tests/LibIMAP/CMakeLists.txt
+++ b/Tests/LibIMAP/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TEST_SOURCES
     TestQuotedPrintable.cpp
+    TestMessageHeaderEncoding.cpp
 )
 
 foreach(source IN LISTS TEST_SOURCES)
diff --git a/Tests/LibIMAP/TestMessageHeaderEncoding.cpp b/Tests/LibIMAP/TestMessageHeaderEncoding.cpp
new file mode 100644
index 0000000000..8bb8b8dff7
--- /dev/null
+++ b/Tests/LibIMAP/TestMessageHeaderEncoding.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2023, Valtteri Koskivuori
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include
+#include
+#include
+
+TEST_CASE(test_decode)
+{
+    auto decode_equal = [](StringView input, StringView expected) {
+        auto decoded = MUST(IMAP::decode_rfc2047_encoded_words(input));
+        EXPECT_EQ(StringView(decoded), StringView(expected));
+    };
+
+    // Underscores should end up as spaces
+    decode_equal("=?utf-8?Q?Spaces_should_be_spaces_!?="sv, "Spaces should be spaces !"sv);
+
+    // An underscore encoded as =5F is a real underscore; only a literal '_' is shorthand for SPACE.
+    decode_equal("=?utf-8?Q?snake=5Fcase_name?="sv, "snake_case name"sv);
+
+    // A truncated 'encoded-word' is passed through verbatim instead of running past the end of the input.
+    decode_equal("=?utf-8?Q?unterminated"sv, "=?utf-8?Q?unterminated"sv);
+
+    // RFC 2047 Section 8 "Examples", https://datatracker.ietf.org/doc/html/rfc2047#section-8
+
+    decode_equal("=?ISO-8859-1?Q?a?="sv, "a"sv);
+
+    decode_equal("=?ISO-8859-1?Q?a?= b"sv, "a b"sv);
+
+    // White space between adjacent 'encoded-word's is not displayed.
+    decode_equal("=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?="sv, "ab"sv);
+
+    // Even multiple SPACEs between 'encoded-word's are ignored for the purpose of display.
+    decode_equal("=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?="sv, "ab"sv);
+    decode_equal("=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?= =?ISO-8859-1?Q?c?==?ISO-8859-1?Q?d?="sv, "abcd"sv);
+
+    // Any amount of linear-space-white between 'encoded-word's, even if it includes a CRLF followed by one or more SPACEs, is ignored for the purposes of display.
+    decode_equal("=?utf-8?Q?a?=\r\n=?utf-8?Q?b?= \r\n=?utf-8?Q?c?=\r\n =?utf-8?Q?d?="sv, "abcd"sv);
+
+    // In order to cause a SPACE to be displayed within a portion of encoded text, the SPACE MUST be encoded as part of the 'encoded-word'.
+    decode_equal("=?ISO-8859-1?Q?a_b?="sv, "a b"sv);
+
+    // In order to cause a SPACE to be displayed between two strings of encoded text, the SPACE MAY be encoded as part of one of the 'encoded-word's.
+    decode_equal("=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?="sv, "a b"sv);
+
+    // More examples from the RFC document, a nice mix of different charsets & encodings.
+    auto long_input = "From: =?US-ASCII?Q?Keith_Moore?= "
+                      "To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= "
+                      "CC: =?ISO-8859-1?Q?Andr=E9?= Pirard "
+                      "Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?="
+                      "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?="sv;
+
+    auto long_expected = "From: Keith Moore "
+                         "To: Keld Jørn Simonsen "
+                         "CC: André Pirard "
+                         "Subject: If you can read this you understand the example."sv;
+    decode_equal(long_input, long_expected);
+}
diff --git a/Userland/Applications/Mail/MailWidget.cpp b/Userland/Applications/Mail/MailWidget.cpp
index 788cde4c35..9ff75e240f 100644
--- a/Userland/Applications/Mail/MailWidget.cpp
+++ b/Userland/Applications/Mail/MailWidget.cpp
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 #include
 
 MailWidget::MailWidget()
@@ -396,6 +397,10 @@ void MailWidget::selected_mailbox()
         if (subject.is_empty())
             subject = "(No subject)";
 
+        if (subject.contains("=?"sv) && subject.contains("?="sv)) {
+            subject = MUST(IMAP::decode_rfc2047_encoded_words(subject));
+        }
+
         auto& from_iterator_value = from_iterator->get<1>().value();
         auto from_index = from_iterator_value.find("From:"sv);
         if (!from_index.has_value())
diff --git a/Userland/Libraries/LibIMAP/CMakeLists.txt b/Userland/Libraries/LibIMAP/CMakeLists.txt
index 13a13984a9..b738e2d322 100644
--- a/Userland/Libraries/LibIMAP/CMakeLists.txt
+++ b/Userland/Libraries/LibIMAP/CMakeLists.txt
@@ -3,9 +3,10 @@ set(SOURCES
     Objects.cpp
     Parser.cpp
     QuotedPrintable.cpp
+    MessageHeaderEncoding.cpp
 )
 
 set(GENERATED_SOURCES)
 
 serenity_lib(LibIMAP imap)
-target_link_libraries(LibIMAP PRIVATE LibCore LibCrypto LibTLS)
+target_link_libraries(LibIMAP PRIVATE LibCore LibCrypto LibTextCodec LibTLS)
diff --git a/Userland/Libraries/LibIMAP/MessageHeaderEncoding.cpp b/Userland/Libraries/LibIMAP/MessageHeaderEncoding.cpp
new file mode 100644
index 0000000000..58ce45f6d3
--- /dev/null
+++ b/Userland/Libraries/LibIMAP/MessageHeaderEncoding.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2023, Valtteri Koskivuori
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include "QuotedPrintable.h"
+#include
+#include
+#include
+#include
+#include
+
+namespace IMAP {
+
+// Decodes every RFC 2047 'encoded-word' ("=?charset?encoding?encoded-text?=") found in `input`
+// and returns the header text re-encoded as UTF-8. Text outside of encoded-words is copied
+// through with CR and LF replaced by spaces. Encoded-words that cannot be decoded (unknown
+// encoding or charset, invalid payload) are dropped and reported via dbgln().
+ErrorOr decode_rfc2047_encoded_words(StringView input)
+{
+    GenericLexer lexer(input);
+    StringBuilder output;
+
+    while (!lexer.is_eof()) {
+        auto ascii_view = lexer.consume_until("=?"sv);
+        DeprecatedString ascii = ascii_view.replace("\r"sv, " "sv, ReplaceMode::All);
+        ascii = ascii.replace("\n"sv, " "sv, ReplaceMode::All);
+        TRY(output.try_append(ascii));
+        if (lexer.is_eof())
+            break;
+
+        // Headers come straight from the network, so an encoded-word may be cut short. Remember
+        // where it starts so that a truncated one can be emitted verbatim instead of running the
+        // lexer past the end of the input.
+        auto const encoded_word_start = lexer.tell();
+        lexer.consume_specific("=?"sv);
+        auto charset = lexer.consume_until('?');
+        bool is_well_formed = !lexer.is_eof();
+        StringView encoding;
+        StringView encoded_text;
+        if (is_well_formed) {
+            lexer.consume();
+            encoding = lexer.consume_until('?');
+            is_well_formed = !lexer.is_eof();
+        }
+        if (is_well_formed) {
+            lexer.consume();
+            encoded_text = lexer.consume_until("?=");
+            is_well_formed = !lexer.is_eof();
+        }
+        if (!is_well_formed) {
+            TRY(output.try_append(input.substring_view(encoded_word_start)));
+            break;
+        }
+        lexer.consume_specific("?="sv);
+
+        // RFC 2047 Section 6.2, "...any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored."
+        // https://datatracker.ietf.org/doc/html/rfc2047#section-6.2
+        bool found_next_start = false;
+        size_t spaces = 0;
+        for (size_t i = 0; i < lexer.tell_remaining(); ++i) {
+            if (lexer.peek(i) == ' ' || lexer.peek(i) == '\r' || lexer.peek(i) == '\n') {
+                spaces++;
+                if (lexer.peek(i + 1) == '=' && lexer.peek(i + 2) == '?') {
+                    found_next_start = true;
+                    break;
+                }
+            } else {
+                break;
+            }
+        }
+        if (found_next_start) {
+            for (size_t i = 0; i < spaces; i++) {
+                lexer.consume();
+            }
+        }
+
+        ByteBuffer first_pass_decoded;
+        if (encoding == 'Q' || encoding == 'q') {
+            // RFC 2047 Section 4.2, https://datatracker.ietf.org/doc/html/rfc2047#section-4.2
+            // A literal "_" is shorthand for SPACE, whereas "=5F" is a real underscore. Rewrite the
+            // shorthand to "=20" before decoding so the two cannot be confused afterwards.
+            DeprecatedString const quoted_printable_text = encoded_text.replace("_"sv, "=20"sv, ReplaceMode::All);
+            auto maybe_decoded_data = decode_quoted_printable(quoted_printable_text.view());
+            if (maybe_decoded_data.is_error()) {
+                dbgln("Failed to decode quoted-printable rfc2047 text, skipping.");
+                continue;
+            }
+            first_pass_decoded = maybe_decoded_data.release_value();
+        } else if (encoding == 'B' || encoding == 'b') {
+            auto maybe_decoded_data = AK::decode_base64(encoded_text);
+            if (maybe_decoded_data.is_error()) {
+                dbgln("Failed to decode base64-encoded rfc2047 text, skipping.");
+                continue;
+            }
+            first_pass_decoded = maybe_decoded_data.release_value();
+        } else {
+            dbgln("Unknown encoding \"{}\" found, skipping, original string: \"{}\"", encoding, input);
+            continue;
+        }
+        if (first_pass_decoded.is_empty())
+            continue;
+        auto maybe_decoder = TextCodec::decoder_for(charset);
+        if (!maybe_decoder.has_value()) {
+            dbgln("No decoder found for charset \"{}\", skipping.", charset);
+            continue;
+        }
+        auto decoded_text = TRY(maybe_decoder->to_utf8(first_pass_decoded));
+        TRY(output.try_append(decoded_text));
+    }
+
+    return output.to_byte_buffer();
+}
+
+}
diff --git a/Userland/Libraries/LibIMAP/MessageHeaderEncoding.h b/Userland/Libraries/LibIMAP/MessageHeaderEncoding.h
new file mode 100644
index 0000000000..f0a4b7437e
--- /dev/null
+++ b/Userland/Libraries/LibIMAP/MessageHeaderEncoding.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2023, Valtteri Koskivuori
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include
+
+namespace IMAP {
+
+ErrorOr decode_rfc2047_encoded_words(StringView input);
+
+}