LibWeb/MimeSniff: Add rules for identifying an unknown mime type

This also implements the pattern matching algorithm, since the rules
for identifying an unknown MIME type depend on it.
This commit is contained in:
Kemal Zebari 2023-11-28 13:46:20 -08:00 committed by Andrew Kaster
parent 04e19df06a
commit 2babc08c17
2 changed files with 290 additions and 4 deletions

View file

@ -30,3 +30,62 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset)
EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized()));
}
TEST_CASE(compute_unknown_mime_type)
{
    // Each key is the essence we expect Resource::sniff() to compute when it is handed one of the
    // associated resource headers without any sniffing configuration.
    HashMap<StringView, Vector<StringView>> expected_essence_to_headers;

    // A lone NUL byte.
    expected_essence_to_headers.set("application/octet-stream"sv, { "\x00"sv });

    // HTML tags; the first five headers additionally start with whitespace bytes.
    expected_essence_to_headers.set("text/html"sv, {
                                                       "\x09\x09<!DOCTYPE HTML\x20"sv,
                                                       "\x0A<HTML\x3E"sv,
                                                       "\x0C<HEAD\x20"sv,
                                                       "\x0D<SCRIPT>"sv,
                                                       "\x20<IFRAME>"sv,
                                                       "<H1>"sv,
                                                       "<DIV>"sv,
                                                       "<FONT>"sv,
                                                       "<TABLE>"sv,
                                                       "<A>"sv,
                                                       "<STYLE>"sv,
                                                       "<TITLE>"sv,
                                                       "<B>"sv,
                                                       "<BODY>"sv,
                                                       "<BR>"sv,
                                                       "<P>"sv,
                                                       "<!-->"sv,
                                                   });

    expected_essence_to_headers.set("text/xml"sv, { "<?xml"sv });
    expected_essence_to_headers.set("application/pdf"sv, { "%PDF-"sv });
    expected_essence_to_headers.set("application/postscript"sv, { "%!PS-Adobe-"sv });

    // Byte order marks (padded to four bytes) and plain text.
    expected_essence_to_headers.set("text/plain"sv, {
                                                        "\xFE\xFF\x00\x00"sv,
                                                        "\xFF\xFE\x00\x00"sv,
                                                        "\xEF\xBB\xBF\x00"sv,
                                                        "Hello world!"sv,
                                                    });

    for (auto const& entry : expected_essence_to_headers) {
        StringView const expected_essence = entry.key;
        for (auto const& resource_header : entry.value) {
            auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(resource_header.bytes()));
            EXPECT_EQ(expected_essence, sniffed_type.essence());
        }
    }
}
TEST_CASE(compute_mime_type_given_unknown_supplied_type)
{
    // Supplied types for which the header is expected to be sniffed rather than the supplied type
    // being taken at face value.
    Array<Web::MimeSniff::MimeType, 3> supplied_types_to_try = {
        MUST(Web::MimeSniff::MimeType::create("unknown"_string, "unknown"_string)),
        MUST(Web::MimeSniff::MimeType::create("application"_string, "unknown"_string)),
        MUST(Web::MimeSniff::MimeType::create("*"_string, "*"_string))
    };

    ReadonlyBytes const html_header = "<HTML>"sv.bytes();

    for (auto const& supplied_type : supplied_types_to_try) {
        auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(
            html_header,
            Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type }));
        EXPECT_EQ("text/html"sv, sniffed_type.essence());
    }
}

View file

@ -7,6 +7,236 @@
#include <LibWeb/Fetch/Infrastructure/URL.h>
#include <LibWeb/MimeSniff/Resource.h>
namespace {
using namespace Web::MimeSniff;
// One row of a byte-pattern table consulted by rules_for_identifying_an_unknown_mime_type().
// Rows are aggregate-initialized, so the member order below must not change.
struct BytePatternTableRow {
    // Bytes the masked input is compared against.
    StringView byte_pattern;

    // Mask that is bitwise-ANDed with each input byte before the comparison; must be as long as byte_pattern.
    StringView pattern_mask;

    // Bytes that may be skipped at the start of the input before matching begins.
    ReadonlyBytes ignored_leading_bytes;

    // MIME type (in string form) to report when this row matches.
    StringView mime_type;

    // NOTE: When set, the last byte of byte_pattern is only a placeholder that marks where a tag-terminating
    //       byte has to appear. pattern_matching_algorithm() does not compare against the placeholder's value;
    //       it accepts any tag-terminating byte at that position instead.
    bool is_tag_terminated = false;
};
// https://mimesniff.spec.whatwg.org/#tag-terminating-byte
// Returns true if `byte` is a tag-terminating byte (abbreviated 0xTT in the specification).
bool is_tag_terminating_byte(u8 byte)
{
    // A tag-terminating byte is any one of the following bytes:
    switch (byte) {
    case 0x20: // SP
    case 0x3E: // ">"
        return true;
    default:
        return false;
    }
}
// https://mimesniff.spec.whatwg.org/#binary-data-byte
// Returns true if `byte` is a binary data byte as defined by the specification.
bool is_binary_data_byte(u8 byte)
{
    // A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), ...
    if (byte <= 0x08)
        return true;

    // ... the byte 0x0B (VT), ...
    if (byte == 0x0B)
        return true;

    // ... a byte in the range 0x0E to 0x1A (SO to SUB), ...
    if (byte >= 0x0E && byte <= 0x1A)
        return true;

    // ... or a byte in the range 0x1C to 0x1F (FS to US).
    return byte >= 0x1C && byte <= 0x1F;
}
// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
//
// Determines whether `input` matches the byte pattern `pattern`.
//
// - input:             The byte sequence to inspect.
// - pattern:           The bytes the masked input has to equal.
// - mask:              Bitwise-ANDed with each input byte before it is compared; must be as long as `pattern`.
// - ignored:           Bytes that are skipped at the start of `input` before matching begins.
// - is_tag_terminated: Non-standard. When true, the last byte of `pattern` is a placeholder and any
//                      tag-terminating byte is accepted at that position.
//
// Returns true if, after skipping leading ignored bytes, the next pattern.size() bytes of `input` match.
bool pattern_matching_algorithm(ReadonlyBytes input, ReadonlyBytes pattern, ReadonlyBytes mask, ReadonlyBytes ignored, bool is_tag_terminated = false)
{
    // 1. Assert: pattern's length is equal to mask's length.
    VERIFY(pattern.size() == mask.size());

    // 2. If input's length is less than pattern's length, return false.
    if (input.size() < pattern.size())
        return false;

    // 3. Let s be 0.
    size_t s = 0;

    // 4. While s < input's length:
    while (s < input.size()) {
        // 1. If ignored does not contain input[s], break.
        if (!ignored.contains_slow(input[s]))
            break;

        // 2. Set s to s + 1.
        s++;
    }

    // NOTE: This non-standard check prevents an out-of-bounds read. Step 2 only compares the total input
    //       length against the pattern length, but step 4 may have consumed leading ignored bytes, leaving
    //       fewer than pattern's length bytes for step 6 to read. Such an input can never match.
    //       (s never exceeds input's length here, so the subtraction cannot wrap.)
    if (input.size() - s < pattern.size())
        return false;

    // 5. Let p be 0.
    size_t p = 0;

    // 6. While p < pattern's length:
    while (p < pattern.size()) {
        // 1. Let maskedData be the result of applying the bitwise AND operator to input[s] and mask[p].
        u8 masked_data = input[s] & mask[p];

        // NOTE: This non-standard branch exists to avoid having to create 2 byte patterns just so that
        //       they can only differ by their tag-terminating byte (which could be a 0x20 or 0x3E byte).
        if (is_tag_terminated && p + 1 == pattern.size())
            return is_tag_terminating_byte(masked_data);

        // 2. If maskedData is not equal to pattern[p], return false.
        if (masked_data != pattern[p])
            return false;

        // 3. Set s to s + 1.
        s++;

        // 4. Set p to p + 1.
        p++;
    }

    // 7. Return true.
    return true;
}
// Used as the `ignored_leading_bytes` column of pattern-table rows that do not skip any leading bytes.
// NOTE(review): This is default-constructed, which presumably yields an empty span - confirm against ReadonlyBytes.
ReadonlyBytes constexpr no_ignored_bytes;
// https://mimesniff.spec.whatwg.org/#rules-for-identifying-an-unknown-mime-type
//
// Computes a MIME type for `resource` purely from its resource header. The scriptable patterns (HTML, XML
// and PDF) are only consulted when `sniff_scriptable` is true. Errors from MimeType::parse() and
// MimeType::create() are propagated to the caller.
ErrorOr<MimeType> rules_for_identifying_an_unknown_mime_type(Resource const& resource, bool sniff_scriptable = false)
{
    // 1. If the sniff-scriptable flag is set, execute the following steps for each row row in the following table:
    if (sniff_scriptable) {
        static auto constexpr html_type = "text/html"sv;

        // https://mimesniff.spec.whatwg.org/#whitespace-byte
        // A whitespace byte (abbreviated 0xWS) is any one of the following bytes: 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).
        static Array<u8, 5> constexpr whitespace_bytes { 0x09, 0x0A, 0x0C, 0x0D, 0x20 };

        // NOTE: For the tag-terminated rows, the trailing 0x00 of the byte pattern is only a placeholder for the
        //       tag-terminating byte.
        static Array<BytePatternTableRow, 19> constexpr scriptable_rows {
            // The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C\x00"sv,
                "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<HTML" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x48\x54\x4D\x4C\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<HEAD" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x48\x45\x41\x44\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<SCRIPT" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x53\x43\x52\x49\x50\x54\x00"sv,
                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<IFRAME" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x49\x46\x52\x41\x4D\x45\x00"sv,
                "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<H1" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x48\x31\x00"sv, "\xFF\xDF\xFF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<DIV" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x44\x49\x56\x00"sv, "\xFF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<FONT" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x46\x4F\x4E\x54\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<TABLE" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x54\x41\x42\x4C\x45\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<A" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x41\x00"sv, "\xFF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<STYLE" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x53\x54\x59\x4C\x45\x00"sv,
                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<TITLE" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x54\x49\x54\x4C\x45\x00"sv,
                "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<B" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x42\x00"sv, "\xFF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<BODY" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x42\x4F\x44\x59\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<BR" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x42\x52\x00"sv, "\xFF\xDF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The case-insensitive string "<P" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x50\x00"sv, "\xFF\xDF\xFF"sv, whitespace_bytes, html_type, true },

            // The string "<!--" followed by a tag-terminating byte.
            BytePatternTableRow { "\x3C\x21\x2D\x2D\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, whitespace_bytes, html_type, true },

            // The string "<?xml".
            BytePatternTableRow { "\x3C\x3F\x78\x6D\x6C"sv, "\xFF\xFF\xFF\xFF\xFF"sv, whitespace_bytes, "text/xml"sv },

            // The string "%PDF-", the PDF signature.
            BytePatternTableRow { "\x25\x50\x44\x46\x2D"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/pdf"sv },
        };

        for (auto const& candidate : scriptable_rows) {
            // 1. Let patternMatched be the result of the pattern matching algorithm given resource's resource header,
            //    the value in the first column of row, the value in the second column of row, and the value in the
            //    third column of row.
            bool const is_match = pattern_matching_algorithm(
                resource.resource_header(),
                candidate.byte_pattern.bytes(),
                candidate.pattern_mask.bytes(),
                candidate.ignored_leading_bytes,
                candidate.is_tag_terminated);
            if (!is_match)
                continue;

            // 2. If patternMatched is true, return the value in the fourth column of row.
            // NOTE: If the row's MIME type does not parse, we move on to the next row.
            auto parsed_type = TRY(MimeType::parse(candidate.mime_type));
            if (parsed_type.has_value())
                return parsed_type.release_value();
        }
    }

    // 2. Execute the following steps for each row row in the following table:
    static auto constexpr plain_text_type = "text/plain"sv;
    static Array<BytePatternTableRow, 4> constexpr non_scriptable_rows {
        // The string "%!PS-Adobe-", the PostScript signature.
        BytePatternTableRow { "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D"sv,
            "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/postscript"sv },

        // UTF-16BE BOM
        BytePatternTableRow { "\xFE\xFF\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, plain_text_type },

        // UTF-16LE BOM
        BytePatternTableRow { "\xFF\xFE\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, plain_text_type },

        // UTF-8 BOM
        BytePatternTableRow { "\xEF\xBB\xBF\x00"sv, "\xFF\xFF\xFF\x00"sv, no_ignored_bytes, plain_text_type },
    };

    for (auto const& candidate : non_scriptable_rows) {
        // 1. Let patternMatched be the result of the pattern matching algorithm given resource's resource header,
        //    the value in the first column of row, the value in the second column of row, and the value in the
        //    third column of row.
        bool const is_match = pattern_matching_algorithm(
            resource.resource_header(),
            candidate.byte_pattern.bytes(),
            candidate.pattern_mask.bytes(),
            candidate.ignored_leading_bytes);
        if (!is_match)
            continue;

        // 2. If patternMatched is true, return the value in the fourth column of row.
        // NOTE: If the row's MIME type does not parse, we move on to the next row.
        auto parsed_type = TRY(MimeType::parse(candidate.mime_type));
        if (parsed_type.has_value())
            return parsed_type.release_value();
    }

    // FIXME: 3. Let matchedType be the result of executing the image type pattern matching algorithm given resource's resource header.
    // NOTE: Until the FIXME steps below are implemented, matched_type is never assigned, so the checks in
    //       steps 4, 6 and 8 never return.
    Optional<MimeType> matched_type;

    // 4. If matchedType is not undefined, return matchedType.
    if (matched_type.has_value())
        return matched_type.release_value();

    // FIXME: 5. Set matchedType to the result of executing the audio or video type pattern matching algorithm given resource's resource header.

    // 6. If matchedType is not undefined, return matchedType.
    if (matched_type.has_value())
        return matched_type.release_value();

    // FIXME: 7. Set matchedType to the result of executing the archive type pattern matching algorithm given resource's resource header.

    // 8. If matchedType is not undefined, return matchedType.
    if (matched_type.has_value())
        return matched_type.release_value();

    // 9. If resource's resource header contains no binary data bytes, return "text/plain".
    bool const has_binary_data = any_of(resource.resource_header(), is_binary_data_byte);
    if (!has_binary_data)
        return MimeType::create("text"_string, "plain"_string);

    // 10. Return "application/octet-stream".
    return MimeType::create("application"_string, "octet-stream"_string);
}
}
namespace Web::MimeSniff {
ErrorOr<Resource> Resource::create(ReadonlyBytes data, SniffingConfiguration configuration)
@ -105,10 +335,7 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
// identifying an unknown MIME type with the sniff-scriptable flag equal to the
// inverse of the no-sniff flag and abort these steps.
if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) {
// FIXME: Execute the rules for identifying an unknown MIME type with the
// sniff-scriptable flag equal to the inverse of the no-sniff flag and abort
// these steps.
m_computed_mime_type = TRY(rules_for_identifying_an_unknown_mime_type(*this, !m_no_sniff));
return {};
}