From 9495f64f913107ef746588986edb8de5e4c39db7 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 1 Jan 2024 19:31:27 -0500 Subject: [PATCH] LibPDF: Improve hex string parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A local (non-public) PDF I have lying around contains this in a page's operator stream: ``` [<00b4003e> 3 <002600480051> 3 <005700550044004f0003> -29 <00330044> 3 <0055> -3 <004e0040> 4 <0003> -29 <004c00560003> -31 <0057004b> 4 <00480003> -37 <0050 >] TJ ``` That is, there's a newline in a hexstring after a character. This led to `Parser error at offset 5184: Unexpected character`. The spec says in 3.2.3 String Objects, Hexadecimal Strings: """Each pair of hexadecimal digits defines one byte of the string. White-space characters (such as space, tab, carriage return, line feed, and form feed) are ignored.""" But we didn't ignore whitespace before or after a character, only in between the bytes. The spec also says: """If the final digit of a hexadecimal string is missing—that is, if there is an odd number of digits—the final digit is assumed to be 0.""" In that case, we were skipping the closing `>` twice -- or, more accurately, we ignored the character after it too. This has been wrong all the way back in #6974. Add a test that fails if either of the two changes isn't present. --- Tests/LibPDF/TestPDF.cpp | 21 +++++++++++++++++++++ Userland/Libraries/LibPDF/Parser.cpp | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Tests/LibPDF/TestPDF.cpp b/Tests/LibPDF/TestPDF.cpp index 319aa37488..14c643d112 100644 --- a/Tests/LibPDF/TestPDF.cpp +++ b/Tests/LibPDF/TestPDF.cpp @@ -15,6 +15,27 @@ #include #include +TEST_CASE(parse_value) +{ + // document isn't really used for anything, only to check there's no security_handler. + auto file = MUST(Core::MappedFile::map("linearized.pdf"sv)); + auto document = MUST(PDF::Document::create(file->bytes())); + + auto contents = "<50607><10\n>"sv; + PDF::Parser parser(contents.bytes()); + parser.set_document(document->make_weak_ptr()); + + auto value1 = MUST(parser.parse_value(PDF::Parser::CanBeIndirectValue::No)); + auto string1 = value1.get>()->cast(); + EXPECT(string1->is_binary()); + EXPECT_EQ(string1->string(), "\x50\x60\x70"sv); + + auto value2 = MUST(parser.parse_value(PDF::Parser::CanBeIndirectValue::No)); + auto string2 = value2.get>()->cast(); + EXPECT(string2->is_binary()); + EXPECT_EQ(string2->string(), "\x10"sv); +} + TEST_CASE(linearized_pdf) { auto file = MUST(Core::MappedFile::map("linearized.pdf"sv)); diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index a6d60dabbc..30cc6d95dc 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -352,6 +352,7 @@ PDFErrorOr Parser::parse_hex_string() StringBuilder builder; while (true) { + m_reader.consume_whitespace(); if (m_reader.matches('>')) { m_reader.consume(); return builder.to_byte_string(); @@ -364,7 +365,6 @@ PDFErrorOr Parser::parse_hex_string() if (ch == '>') { // The hex string contains an odd number of characters, and the last character // is assumed to be '0' - m_reader.consume(); hex_value *= 16; builder.append(static_cast(hex_value)); return builder.to_byte_string();