mirror of
https://github.com/SerenityOS/serenity
synced 2024-09-06 08:56:40 +00:00
LibWeb/MimeSniff: Add rules for identifying an unknown mime type
This also implements the pattern matching algorithm since it's needed.
This commit is contained in:
parent
04e19df06a
commit
2babc08c17
|
@ -30,3 +30,62 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset)
|
|||
|
||||
EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized()));
|
||||
}
|
||||
|
||||
TEST_CASE(compute_unknown_mime_type)
{
    // Each key is the MIME type essence we expect the sniffer to compute; each value lists resource
    // headers that must be sniffed as that type when using the default sniffing configuration.
    HashMap<StringView, Vector<StringView>> mime_type_to_headers_map;

    // A header containing a binary data byte that matches no known pattern.
    mime_type_to_headers_map.set("application/octet-stream"sv, { "\x00"sv });

    // HTML tags, some of them preceded by whitespace bytes, all ending in a tag-terminating byte.
    mime_type_to_headers_map.set("text/html"sv, {
                                                    "\x09\x09<!DOCTYPE HTML\x20"sv,
                                                    "\x0A<HTML\x3E"sv,
                                                    "\x0C<HEAD\x20"sv,
                                                    "\x0D<SCRIPT>"sv,
                                                    "\x20<IFRAME>"sv,
                                                    "<H1>"sv,
                                                    "<DIV>"sv,
                                                    "<FONT>"sv,
                                                    "<TABLE>"sv,
                                                    "<A>"sv,
                                                    "<STYLE>"sv,
                                                    "<TITLE>"sv,
                                                    "<B>"sv,
                                                    "<BODY>"sv,
                                                    "<BR>"sv,
                                                    "<P>"sv,
                                                    "<!-->"sv,
                                                });
    mime_type_to_headers_map.set("text/xml"sv, { "<?xml"sv });
    mime_type_to_headers_map.set("application/pdf"sv, { "%PDF-"sv });
    mime_type_to_headers_map.set("application/postscript"sv, { "%!PS-Adobe-"sv });

    // Byte order marks, plus a plain string that contains no binary data bytes.
    mime_type_to_headers_map.set("text/plain"sv, {
                                                     "\xFE\xFF\x00\x00"sv,
                                                     "\xFF\xFE\x00\x00"sv,
                                                     "\xEF\xBB\xBF\x00"sv,
                                                     "Hello world!"sv,
                                                 });

    for (auto const& entry : mime_type_to_headers_map) {
        StringView expected_essence = entry.key;

        for (auto const& resource_header : entry.value) {
            auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(resource_header.bytes()));
            EXPECT_EQ(expected_essence, sniffed_type.essence());
        }
    }
}
|
||||
|
||||
TEST_CASE(compute_mime_type_given_unknown_supplied_type)
{
    // Supplied types for which the sniffer is expected to fall back to inspecting the resource header.
    Array<Web::MimeSniff::MimeType, 3> unknown_supplied_types = {
        MUST(Web::MimeSniff::MimeType::create("unknown"_string, "unknown"_string)),
        MUST(Web::MimeSniff::MimeType::create("application"_string, "unknown"_string)),
        MUST(Web::MimeSniff::MimeType::create("*"_string, "*"_string))
    };

    // The same HTML-looking header is sniffed once per supplied type.
    auto header_bytes = "<HTML>"sv.bytes();

    for (auto const& supplied_type : unknown_supplied_types) {
        auto sniffed_type = MUST(Web::MimeSniff::Resource::sniff(
            header_bytes,
            Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type }));

        EXPECT_EQ("text/html"sv, sniffed_type.essence());
    }
}
|
||||
|
|
|
@ -7,6 +7,236 @@
|
|||
#include <LibWeb/Fetch/Infrastructure/URL.h>
|
||||
#include <LibWeb/MimeSniff/Resource.h>
|
||||
|
||||
namespace {
|
||||
|
||||
using namespace Web::MimeSniff;
|
||||
|
||||
struct BytePatternTableRow {
|
||||
StringView byte_pattern;
|
||||
StringView pattern_mask;
|
||||
ReadonlyBytes ignored_leading_bytes;
|
||||
StringView mime_type;
|
||||
|
||||
// NOTE: If the byte pattern has a tag-terminating byte, add a byte where this byte should be. The value itself is ignored in
|
||||
// the pattern_matching_algorithm() (see the NOTE in this algorithm for more details).
|
||||
bool is_tag_terminated { false };
|
||||
};
|
||||
|
||||
// https://mimesniff.spec.whatwg.org/#tag-terminating-byte
// Returns true if `byte` is a tag-terminating byte (abbreviated 0xTT in the specification).
bool is_tag_terminating_byte(u8 byte)
{
    // A tag-terminating byte is any one of the following bytes: 0x20 (SP), 0x3E (">").
    switch (byte) {
    case 0x20: // SP
    case 0x3E: // ">"
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
// https://mimesniff.spec.whatwg.org/#binary-data-byte
// Returns true if `byte` is a binary data byte as defined by the specification.
bool is_binary_data_byte(u8 byte)
{
    // A binary data byte is a byte in the range 0x00 to 0x08 (NUL to BS), ...
    if (byte <= 0x08)
        return true;

    // ... the byte 0x0B (VT), ...
    if (byte == 0x0B)
        return true;

    // ... a byte in the range 0x0E to 0x1A (SO to SUB), ...
    if (byte >= 0x0E && byte <= 0x1A)
        return true;

    // ... or a byte in the range 0x1C to 0x1F (FS to US).
    return byte >= 0x1C && byte <= 0x1F;
}
|
||||
|
||||
// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
|
||||
bool pattern_matching_algorithm(ReadonlyBytes input, ReadonlyBytes pattern, ReadonlyBytes mask, ReadonlyBytes ignored, bool is_tag_terminated = false)
|
||||
{
|
||||
// 1. Assert: pattern’s length is equal to mask’s length.
|
||||
VERIFY(pattern.size() == mask.size());
|
||||
|
||||
// 2. If input’s length is less than pattern’s length, return false.
|
||||
if (input.size() < pattern.size())
|
||||
return false;
|
||||
|
||||
// 3. Let s be 0.
|
||||
size_t s = 0;
|
||||
|
||||
// 4. While s < input’s length:
|
||||
while (s < input.size()) {
|
||||
// 1. If ignored does not contain input[s], break.
|
||||
if (!ignored.contains_slow(input[s]))
|
||||
break;
|
||||
|
||||
// 2. Set s to s + 1.
|
||||
s++;
|
||||
}
|
||||
|
||||
// 5. Let p be 0.
|
||||
size_t p = 0;
|
||||
|
||||
// 6. While p < pattern’s length:
|
||||
while (p < pattern.size()) {
|
||||
// 1. Let maskedData be the result of applying the bitwise AND operator to input[s] and mask[p].
|
||||
u8 masked_data = input[s] & mask[p];
|
||||
|
||||
// NOTE: This non-standard branch exists to avoid having to create 2 byte patterns just so that
|
||||
// they can only differ by their tag-terminating byte (which could be a 0x20 or 0x3E byte).
|
||||
if (is_tag_terminated && p + 1 == pattern.size())
|
||||
return is_tag_terminating_byte(masked_data);
|
||||
|
||||
// 2. If maskedData is not equal to pattern[p], return false.
|
||||
if (masked_data != pattern[p])
|
||||
return false;
|
||||
|
||||
// 3. Set s to s + 1.
|
||||
s++;
|
||||
|
||||
// 4. Set p to p + 1.
|
||||
p++;
|
||||
}
|
||||
|
||||
// 7. Return true.
|
||||
return true;
|
||||
}
|
||||
|
||||
ReadonlyBytes constexpr no_ignored_bytes;
|
||||
|
||||
// https://mimesniff.spec.whatwg.org/#rules-for-identifying-an-unknown-mime-type
|
||||
ErrorOr<MimeType> rules_for_identifying_an_unknown_mime_type(Resource const& resource, bool sniff_scriptable = false)
|
||||
{
|
||||
// 1. If the sniff-scriptable flag is set, execute the following steps for each row row in the following table:
|
||||
if (sniff_scriptable) {
|
||||
static auto constexpr text_html_mime_type = "text/html"sv;
|
||||
|
||||
// https://mimesniff.spec.whatwg.org/#whitespace-byte
|
||||
// A whitespace byte (abbreviated 0xWS) is any one of the following bytes: 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).
|
||||
static Array<u8, 5> constexpr ignored_whitespace_bytes { 0x09, 0x0A, 0x0C, 0x0D, 0x20 };
|
||||
static Array<BytePatternTableRow, 19> constexpr pattern_table {
|
||||
// The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C\x00"sv,
|
||||
"\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<HTML" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x48\x54\x4D\x4C\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<HEAD" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x48\x45\x41\x44\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<SCRIPT" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x53\x43\x52\x49\x50\x54\x00"sv,
|
||||
"\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<IFRAME" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x49\x46\x52\x41\x4D\x45\x00"sv,
|
||||
"\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<H1" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x48\x31\x00"sv, "\xFF\xDF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<DIV" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x44\x49\x56\x00"sv, "\xFF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<FONT" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x46\x4F\x4E\x54\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<TABLE" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x54\x41\x42\x4C\x45\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<A" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x41\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<STYLE" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x53\x54\x59\x4C\x45\x00"sv,
|
||||
"\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<TITLE" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x54\x49\x54\x4C\x45\x00"sv,
|
||||
"\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<B" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x42\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<BODY" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x42\x4F\x44\x59\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<BR" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x42\x52\x00"sv, "\xFF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The case-insensitive string "<P" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x50\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The string "<!--" followed by a tag-terminating byte.
|
||||
BytePatternTableRow { "\x3C\x21\x2D\x2D\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
|
||||
|
||||
// The string "<?xml".
|
||||
BytePatternTableRow { "\x3C\x3F\x78\x6D\x6C"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, "text/xml"sv },
|
||||
|
||||
// The string "%PDF-", the PDF signature.
|
||||
BytePatternTableRow { "\x25\x50\x44\x46\x2D"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/pdf"sv },
|
||||
};
|
||||
|
||||
for (auto const& row : pattern_table) {
|
||||
// 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
|
||||
// the value in the first column of row, the value in the second column of row, and the value in the
|
||||
// third column of row.
|
||||
auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes, row.is_tag_terminated);
|
||||
|
||||
// 2. If patternMatched is true, return the value in the fourth column of row.
|
||||
if (pattern_matched) {
|
||||
if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
|
||||
return maybe_type.release_value();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Execute the following steps for each row row in the following table:
|
||||
static auto constexpr text_plain_mime_type = "text/plain"sv;
|
||||
static Array<BytePatternTableRow, 4> constexpr pattern_table {
|
||||
// The string "%!PS-Adobe-", the PostScript signature.
|
||||
BytePatternTableRow { "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D"sv,
|
||||
"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/postscript"sv },
|
||||
|
||||
// UTF-16BE BOM
|
||||
BytePatternTableRow { "\xFE\xFF\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
|
||||
|
||||
// UTF-16LE BOM
|
||||
BytePatternTableRow { "\xFF\xFE\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
|
||||
|
||||
// UTF-8 BOM
|
||||
BytePatternTableRow { "\xEF\xBB\xBF\x00"sv, "\xFF\xFF\xFF\x00"sv, no_ignored_bytes, text_plain_mime_type },
|
||||
};
|
||||
|
||||
for (auto const& row : pattern_table) {
|
||||
// 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
|
||||
// the value in the first column of row, the value in the second column of row, and the value in the
|
||||
// third column of row.
|
||||
auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
|
||||
|
||||
// 2. If patternMatched is true, return the value in the fourth column of row.
|
||||
if (pattern_matched) {
|
||||
if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
|
||||
return maybe_type.release_value();
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: 3. Let matchedType be the result of executing the image type pattern matching algorithm given resource’s resource header.
|
||||
Optional<MimeType> matched_type;
|
||||
|
||||
// 4. If matchedType is not undefined, return matchedType.
|
||||
if (matched_type.has_value())
|
||||
return matched_type.release_value();
|
||||
|
||||
// FIXME: 5. Set matchedType to the result of executing the audio or video type pattern matching algorithm given resource’s resource header.
|
||||
|
||||
// 6. If matchedType is not undefined, return matchedType.
|
||||
if (matched_type.has_value())
|
||||
return matched_type.release_value();
|
||||
|
||||
// FIXME: 7. Set matchedType to the result of executing the archive type pattern matching algorithm given resource’s resource header.
|
||||
|
||||
// 8. If matchedType is not undefined, return matchedType.
|
||||
if (matched_type.has_value())
|
||||
return matched_type.release_value();
|
||||
|
||||
// 9. If resource’s resource header contains no binary data bytes, return "text/plain".
|
||||
if (!any_of(resource.resource_header(), is_binary_data_byte))
|
||||
return MimeType::create("text"_string, "plain"_string);
|
||||
|
||||
// 10. Return "application/octet-stream".
|
||||
return MimeType::create("application"_string, "octet-stream"_string);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace Web::MimeSniff {
|
||||
|
||||
ErrorOr<Resource> Resource::create(ReadonlyBytes data, SniffingConfiguration configuration)
|
||||
|
@ -105,10 +335,7 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
|
|||
// identifying an unknown MIME type with the sniff-scriptable flag equal to the
|
||||
// inverse of the no-sniff flag and abort these steps.
|
||||
if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) {
|
||||
|
||||
// FIXME: Execute the rules for identifying an unknown MIME type with the
|
||||
// sniff-scriptable flag equal to the inverse of the no-sniff flag and abort
|
||||
// these steps.
|
||||
m_computed_mime_type = TRY(rules_for_identifying_an_unknown_mime_type(*this, !m_no_sniff));
|
||||
return {};
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue