LibPDF: Parse page structures

This commit introduces the ability to parse the document catalog dict,
as well as the page tree and individual pages. Pages obviously aren't
fully parsed, as we won't care about most of the fields until we
start actually rendering PDFs.

One of the primary benefits of the PDF format is laziness. PDFs are
not meant to be parsed all at once, and the same is true for pages.
When a Document is constructed, it builds a map of page number to
object index, but it does not fetch and parse any of the pages. A page
is only parsed when a caller requests that particular page (and is
cached going forwards).

This commit also adds an object_cast function which logs bad casts
if PDF_DEBUG is set. In addition, utility functions were added to
ArrayObject and DictObject to fetch each type of object from the
collections without having to cast manually.
This commit is contained in:
Matthew Olsson 2021-05-08 14:57:49 -07:00 committed by Andreas Kling
parent 72f693e9ed
commit 8c745ad0d9
11 changed files with 320 additions and 6 deletions

View file

@ -286,6 +286,10 @@
#cmakedefine01 PATH_DEBUG
#endif
#ifndef PDF_DEBUG
#cmakedefine01 PDF_DEBUG
#endif
#ifndef PNG_DEBUG
#cmakedefine01 PNG_DEBUG
#endif

View file

@ -176,6 +176,7 @@ set(LINE_EDITOR_DEBUG ON)
set(LANGUAGE_SERVER_DEBUG ON)
set(GL_DEBUG ON)
set(WASM_BINPARSER_DEBUG ON)
set(PDF_DEBUG ON)
# False positive: DEBUG is a flag but it works differently.
# set(DEBUG ON)

View file

@ -1,6 +1,7 @@
set(SOURCES
Object.cpp
Document.cpp
Object.cpp
Parser.cpp
Value.cpp
)

View file

@ -17,6 +17,118 @@ Document::Document(const ReadonlyBytes& bytes)
m_xref_table = xref_table;
m_trailer = trailer;
m_catalog = m_trailer->get_dict(this, "Root");
build_page_tree();
}
// Returns the value wrapped by the indirect object with the given index.
// A previously loaded value is served from m_values; otherwise the object is
// located through the xref table, parsed, cached and then returned.
Value Document::get_or_load_value(u32 index)
{
    if (auto cached_value = get_value(index))
        return cached_value;

    // Not cached yet: the xref table must know where the object lives.
    VERIFY(m_xref_table.has_object(index));
    auto object_offset = m_xref_table.byte_offset_for_object(index);

    auto indirect_value = m_parser.parse_indirect_value_at_offset(object_offset);
    // The object found at that offset has to be the one that was asked for.
    VERIFY(indirect_value->index() == index);

    Value loaded_value = indirect_value->value();
    m_values.set(index, loaded_value);
    return loaded_value;
}
// Returns the index of the page that should be displayed first.
u32 Document::get_first_page_index() const
{
    // FIXME: A PDF can have a different default first page, which
    //        should be fetched and returned here
    constexpr u32 default_first_page_index = 0;
    return default_first_page_index;
}
// Returns the number of pages recorded while building the page tree.
u32 Document::get_page_count() const
{
    auto known_pages = m_page_object_indices.size();
    return known_pages;
}
// Returns the page with the given page index, parsing it on first request.
// Parsed pages are stored in m_pages, so later requests for the same index
// are answered from that cache.
Page Document::get_page(u32 index)
{
    VERIFY(index < m_page_object_indices.size());

    if (auto already_parsed = m_pages.get(index); already_parsed.has_value())
        return already_parsed.value();

    // Load the page dictionary through its object index.
    auto object_index = m_page_object_indices[index];
    auto page_dict = resolve_to<DictObject>(get_or_load_value(object_index));

    auto resources = page_dict->get_dict(this, "Resources");

    // The first four MediaBox entries fill the Rectangle fields in order.
    auto box_values = page_dict->get_array(this, "MediaBox");
    Rectangle media_box {
        box_values->at(0).to_float(),
        box_values->at(1).to_float(),
        box_values->at(2).to_float(),
        box_values->at(3).to_float(),
    };

    auto contents = page_dict->get_object(this, "Contents");

    Page parsed_page { resources, media_box, contents };
    m_pages.set(index, parsed_page);
    return parsed_page;
}
// Removes one layer of indirection from a value:
//  - an indirect value reference becomes the (possibly freshly loaded) value
//    it refers to,
//  - an indirect value becomes the value it wraps,
//  - anything else is returned as-is.
Value Document::resolve(const Value& value)
{
    if (value.is_object()) {
        auto object = value.as_object();

        // FIXME: Surely indirect PDF objects can't contain another indirect PDF object,
        //        right? Unsure from the spec, but if they can, these return values would
        //        have to be wrapped with another resolve() call.

        if (object->is_indirect_value_ref()) {
            auto reference = static_cast<NonnullRefPtr<IndirectValueRef>>(object);
            return get_or_load_value(reference->index());
        }

        if (object->is_indirect_value()) {
            auto wrapper = static_cast<NonnullRefPtr<IndirectValue>>(object);
            return wrapper->value();
        }

        return object;
    }

    // Primitive values carry no indirection.
    return value;
}
// Resolves the value (see resolve()) and unwraps it as T: bool, int and float
// are returned by value, object types are returned as NonnullRefPtr<T> via
// object_cast.
template<IsValueType T>
UnwrappedValueType<T> Document::resolve_to(const Value& value)
{
    auto unwrapped = resolve(value);

    if constexpr (IsSame<T, bool>) {
        return unwrapped.as_bool();
    } else if constexpr (IsSame<T, int>) {
        return unwrapped.as_int();
    } else if constexpr (IsSame<T, float>) {
        return unwrapped.as_float();
    } else if constexpr (IsObject<T>) {
        return object_cast<T>(unwrapped.as_object());
    } else {
        // IsValueType only admits the cases handled above.
        VERIFY_NOT_REACHED();
    }
}
void Document::build_page_tree()
{
auto page_tree = m_catalog->get_dict(this, "Pages");
auto kids_array = page_tree->get_array(this, "Kids");
auto page_count = page_tree->get("Count").value().as_int();
if (static_cast<size_t>(page_count) != kids_array->elements().size()) {
// FIXME: Support recursive PDF page tree structures
VERIFY_NOT_REACHED();
}
for (auto& value : *kids_array) {
auto reference = resolve_to<IndirectValueRef>(value);
m_page_object_indices.append(reference->index());
}
}
}

View file

@ -6,6 +6,8 @@
#pragma once
#include <AK/Format.h>
#include <AK/HashMap.h>
#include <AK/RefCounted.h>
#include <LibPDF/Object.h>
#include <LibPDF/Parser.h>
@ -13,6 +15,19 @@
namespace PDF {
// An axis-aligned rectangle described by its lower-left and upper-right
// corners. Document::get_page() fills the fields, in declaration order, from
// the four entries of a page's MediaBox array.
struct Rectangle {
float lower_left_x;
float lower_left_y;
float upper_right_x;
float upper_right_y;
};
// The subset of a page object that is currently parsed.
struct Page {
// The page dictionary's "Resources" entry.
NonnullRefPtr<DictObject> resources;
// The page dictionary's "MediaBox" entry.
Rectangle media_box;
// The page dictionary's "Contents" entry, kept as a generic object.
NonnullRefPtr<Object> contents;
};
class Document final : public RefCounted<Document> {
public:
explicit Document(const ReadonlyBytes& bytes);
@ -21,6 +36,14 @@ public:
ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; }
// Returns the value of the indirect object with the given index, parsing it
// from the file on first use and caching it for later calls.
[[nodiscard]] Value get_or_load_value(u32 index);
// Returns the index of the page to show first (currently always 0).
[[nodiscard]] u32 get_first_page_index() const;
// Returns the number of pages found while building the page tree.
[[nodiscard]] u32 get_page_count() const;
// Returns the page with the given page index; the page is parsed on first
// request and cached afterwards. The index must be below get_page_count().
[[nodiscard]] Page get_page(u32 index);
ALWAYS_INLINE Value get_value(u32 index) const
{
return m_values.get(index).value_or({});
@ -32,11 +55,53 @@ public:
m_values.set(index, value);
}
// Strips away the layer of indirection by turning indirect value
// refs into the value they reference, and indirect values into
// the value being wrapped.
Value resolve(const Value& value);
// Like resolve, but unwraps the Value into the given type. Accepts
// any object type, and the three primitive Value types.
template<IsValueType T>
UnwrappedValueType<T> resolve_to(const Value& value);
private:
// Fills m_page_object_indices from the catalog's page tree.
void build_page_tree();
Parser m_parser;
XRefTable m_xref_table;
RefPtr<DictObject> m_trailer;
// The dictionary referenced by the trailer's "Root" entry.
RefPtr<DictObject> m_catalog;
// Maps a page index (position in the vector) to the object index of that page.
Vector<u32> m_page_object_indices;
// Pages that have already been parsed, keyed by page index.
HashMap<u32, Page> m_pages;
// Indirect object values that have already been loaded, keyed by object index.
HashMap<u32, Value> m_values;
};
};
}
namespace AK {
// Formats a PDF::Rectangle as "Rectangle { ll=(x, y), ur=(x, y) }".
template<>
struct Formatter<PDF::Rectangle> : Formatter<StringView> {
    void format(FormatBuilder& builder, const PDF::Rectangle& rectangle)
    {
        auto description = String::formatted("Rectangle {{ ll=({}, {}), ur=({}, {}) }}",
            rectangle.lower_left_x,
            rectangle.lower_left_y,
            rectangle.upper_right_x,
            rectangle.upper_right_y);
        Formatter<StringView>::format(builder, description);
    }
};
// Formats a PDF::Page as a multi-line block listing its resources, contents
// and media box.
template<>
struct Formatter<PDF::Page> : Formatter<StringView> {
    void format(FormatBuilder& builder, const PDF::Page& page)
    {
        // Nested objects are rendered one indentation level deep.
        auto description = String::formatted("Page {{\n resources={}\n contents={}\n media_box={}\n}}",
            page.resources->to_string(1),
            page.contents->to_string(1),
            page.media_box);
        Formatter<StringView>::format(builder, description);
    }
};
}

View file

@ -30,4 +30,13 @@ ENUMERATE_OBJECT_TYPES(FORWARD_DECL)
// Satisfied by Object and every type derived from it.
template<typename T>
concept IsObject = IsBaseOf<Object, T>;
// Satisfied by the three primitive types: bool, int and float.
template<typename T>
concept IsValuePrimitive = IsSame<T, bool> || IsSame<T, int> || IsSame<T, float>;
// Satisfied by anything that is either a value primitive or an object type.
template<typename T>
concept IsValueType = IsValuePrimitive<T> || IsObject<T>;
// The type handed back when unwrapping as T: object types come back as
// NonnullRefPtr<T>, primitives come back as T itself.
template<IsValueType T>
using UnwrappedValueType = Conditional<IsObject<T>, NonnullRefPtr<T>, T>;
}

View file

@ -5,10 +5,34 @@
*/
#include <AK/Hex.h>
#include <LibPDF/Document.h>
#include <LibPDF/Object.h>
namespace PDF {
// Returns the element at `index` as a generic Object, after letting the
// document resolve any indirection.
NonnullRefPtr<Object> ArrayObject::get_object_at(Document* document, size_t index) const
{
    const auto& element = m_elements[index];
    return document->resolve_to<Object>(element);
}
// Returns the entry stored under `key` as a generic Object, after letting the
// document resolve any indirection.
NonnullRefPtr<Object> DictObject::get_object(Document* document, const FlyString& key) const
{
    auto entry = get_value(key);
    return document->resolve_to<Object>(entry);
}
// Generates, for every direct object type, the typed accessors
// ArrayObject::get_<snake_name>_at() and DictObject::get_<snake_name>().
// Both let the document resolve any indirection and cast the result to the
// requested object type.
#define DEFINE_ACCESSORS(class_name, snake_name)                                                           \
    NonnullRefPtr<class_name> ArrayObject::get_##snake_name##_at(Document* document, size_t index) const   \
    {                                                                                                      \
        return document->resolve_to<class_name>(m_elements[index]);                                        \
    }                                                                                                      \
                                                                                                           \
    NonnullRefPtr<class_name> DictObject::get_##snake_name(Document* document, const FlyString& key) const \
    {                                                                                                      \
        return document->resolve_to<class_name>(get(key).value());                                         \
    }
ENUMERATE_DIRECT_OBJECT_TYPES(DEFINE_ACCESSORS)
// Undefine the macro defined above so that it does not leak into the rest of
// this translation unit.
#undef DEFINE_ACCESSORS
static void append_indent(StringBuilder& builder, int indent)
{
for (int i = 0; i < indent; i++)

View file

@ -6,10 +6,12 @@
#pragma once
#include <AK/Debug.h>
#include <AK/FlyString.h>
#include <AK/Format.h>
#include <AK/HashMap.h>
#include <AK/RefCounted.h>
#include <AK/SourceLocation.h>
#include <LibPDF/Forward.h>
#include <LibPDF/Value.h>
@ -27,6 +29,10 @@ public:
ENUMERATE_OBJECT_TYPES(DEFINE_ID)
#undef DEFINE_ID
template<typename T>
NonnullRefPtr<T> resolved_to(Document*) const;
virtual const char* type_name() const = 0;
virtual String to_string(int indent) const = 0;
private:
@ -47,6 +53,7 @@ public:
[[nodiscard]] ALWAYS_INLINE bool is_binary() const { return m_is_binary; }
ALWAYS_INLINE bool is_string() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "string"; }
String to_string(int indent) const override;
private:
@ -66,6 +73,7 @@ public:
[[nodiscard]] ALWAYS_INLINE FlyString name() const { return m_name; }
ALWAYS_INLINE bool is_name() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "name"; }
String to_string(int indent) const override;
private:
@ -83,7 +91,24 @@ public:
[[nodiscard]] ALWAYS_INLINE Vector<Value> elements() const { return m_elements; }
ALWAYS_INLINE bool is_array() const override { return true; }
// Iteration and indexing over the stored elements; these hand back the raw
// Values without going through a Document.
ALWAYS_INLINE auto begin() const { return m_elements.begin(); }
ALWAYS_INLINE auto end() const { return m_elements.end(); }
ALWAYS_INLINE const Value& operator[](size_t index) const { return at(index); }
ALWAYS_INLINE const Value& at(size_t index) const { return m_elements[index]; }
// Returns the element at `index` as an Object, resolved through the document.
NonnullRefPtr<Object> get_object_at(Document*, size_t index) const;
// Declares one typed get_<snake_name>_at() accessor per object type.
#define DEFINE_INDEXER(class_name, snake_name) \
NonnullRefPtr<class_name> get_##snake_name##_at(Document*, size_t index) const;
ENUMERATE_OBJECT_TYPES(DEFINE_INDEXER)
#undef DEFINE_INDEXER
ALWAYS_INLINE bool is_array() const override
{
return true;
}
ALWAYS_INLINE const char* type_name() const override { return "array"; }
String to_string(int indent) const override;
private:
@ -99,9 +124,26 @@ public:
~DictObject() override = default;
[[nodiscard]] ALWAYS_INLINE HashMap<FlyString, Value> map() const { return m_map; }
[[nodiscard]] ALWAYS_INLINE const HashMap<FlyString, Value>& map() const { return m_map; }
ALWAYS_INLINE bool is_dict() const override { return true; }
// Whether the dictionary has an entry for `key`.
ALWAYS_INLINE bool contains(const FlyString& key) const { return m_map.contains(key); }
// Looks up `key`; the Optional is empty when there is no such entry.
ALWAYS_INLINE Optional<Value> get(const FlyString& key) const { return m_map.get(key); }
// Like get(), but calls value() on the result without checking it first, so
// the key is expected to be present.
Value get_value(const FlyString& key) const { return get(key).value(); }
// Returns the entry for `key` as an Object, resolved through the document.
NonnullRefPtr<Object> get_object(Document*, const FlyString& key) const;
// Declares one typed get_<snake_name>() accessor per object type.
#define DEFINE_GETTER(class_name, snake_name) \
NonnullRefPtr<class_name> get_##snake_name(Document*, const FlyString& key) const;
ENUMERATE_OBJECT_TYPES(DEFINE_GETTER)
#undef DEFINE_GETTER
ALWAYS_INLINE bool is_dict() const override
{
return true;
}
ALWAYS_INLINE const char* type_name() const override { return "dict"; }
String to_string(int indent) const override;
private:
@ -122,6 +164,7 @@ public:
[[nodiscard]] ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
ALWAYS_INLINE bool is_stream() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "stream"; }
String to_string(int indent) const override;
private:
@ -144,6 +187,7 @@ public:
[[nodiscard]] ALWAYS_INLINE const Value& value() const { return m_value; }
ALWAYS_INLINE bool is_indirect_value() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "indirect_object"; }
String to_string(int indent) const override;
private:
@ -164,12 +208,35 @@ public:
[[nodiscard]] ALWAYS_INLINE u32 index() const { return m_index; }
ALWAYS_INLINE bool is_indirect_value_ref() const override { return true; }
ALWAYS_INLINE const char* type_name() const override { return "indirect_object_ref"; }
String to_string(int indent) const override;
private:
u32 m_index;
};
// Casts an object pointer to NonnullRefPtr<To> with a plain static_cast.
//
// When PDF_DEBUG is enabled, the cast is additionally checked against the
// object's is_<type>() predicate and a mismatch is logged together with the
// caller's source location. The cast itself is still performed either way.
//
// PDF_DEBUG is always defined (as 0 or 1), so it has to be tested with #if
// rather than #ifdef; otherwise the debug path is compiled in unconditionally.
template<IsObject To, IsObject From>
[[nodiscard]] ALWAYS_INLINE static NonnullRefPtr<To> object_cast(NonnullRefPtr<From> obj
#if PDF_DEBUG
    ,
    SourceLocation loc = SourceLocation::current()
#endif
)
{
#if PDF_DEBUG
#    define ENUMERATE_TYPES(class_name, snake_name)                                               \
        if constexpr (IsSame<To, class_name>) {                                                   \
            if (!obj->is_##snake_name()) {                                                        \
                dbgln("{} invalid cast from type {} to type " #snake_name, loc, obj->type_name()); \
            }                                                                                     \
        }
    ENUMERATE_OBJECT_TYPES(ENUMERATE_TYPES)
#    undef ENUMERATE_TYPES
#endif

    return static_cast<NonnullRefPtr<To>>(obj);
}
}
namespace AK {

View file

@ -48,6 +48,13 @@ Parser::XRefTableAndTrailer Parser::parse_last_xref_table_and_trailer()
return { xref_table, trailer };
}
// Parses the indirect value that starts at the given byte offset.
NonnullRefPtr<IndirectValue> Parser::parse_indirect_value_at_offset(size_t offset)
{
    // Point the reader at the object, reading forwards, before parsing.
    m_reader.set_reading_forwards();
    m_reader.move_to(offset);

    auto indirect_value = parse_indirect_value();
    return indirect_value;
}
bool Parser::parse_header()
{
// FIXME: Do something with the version?
@ -323,11 +330,18 @@ NonnullRefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generat
auto value = parse_value();
VERIFY(value.is_object());
VERIFY(m_reader.matches("endobj"));
VERIFY(consume_whitespace());
return make_object<IndirectValue>(index, generation, value.as_object());
}
// Parses an indirect value whose two leading numbers have not been read yet:
// both numbers are parsed here and handed to the (index, generation) overload.
NonnullRefPtr<IndirectValue> Parser::parse_indirect_value()
{
    auto first_number = parse_number();
    auto second_number = parse_number();
    VERIFY(first_number.is_int() && second_number.is_int());

    auto index = first_number.as_int();
    auto generation = second_number.as_int();
    return parse_indirect_value(index, generation);
}
Value Parser::parse_number()
{
size_t start_offset = m_reader.offset();
@ -366,7 +380,7 @@ NonnullRefPtr<NameObject> Parser::parse_name()
StringBuilder builder;
while (true) {
if (matches_whitespace())
if (!matches_regular_character())
break;
if (m_reader.matches('#')) {
@ -587,6 +601,16 @@ bool Parser::matches_number() const
return isdigit(ch) || ch == '-' || ch == '+';
}
// Whether the reader is currently at one of the delimiter characters
// ( ) < > [ ] { } / %.
bool Parser::matches_delimiter() const
{
    bool at_delimiter = m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
    return at_delimiter;
}
// Whether the reader is currently at a regular character, i.e. one that is
// neither a delimiter nor whitespace.
bool Parser::matches_regular_character() const
{
    return !(matches_delimiter() || matches_whitespace());
}
void Parser::consume_eol()
{
if (m_reader.matches("\r\n")) {

View file

@ -27,6 +27,8 @@ public:
};
XRefTableAndTrailer parse_last_xref_table_and_trailer();
// Moves the reader to the given byte offset and parses the indirect value
// found there.
NonnullRefPtr<IndirectValue> parse_indirect_value_at_offset(size_t offset);
private:
bool parse_header();
XRefTable parse_xref_table();
@ -48,6 +50,7 @@ private:
Value parse_value();
Value parse_possible_indirect_value_or_ref();
NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation);
NonnullRefPtr<IndirectValue> parse_indirect_value();
Value parse_number();
NonnullRefPtr<NameObject> parse_name();
NonnullRefPtr<StringObject> parse_string();
@ -60,6 +63,8 @@ private:
bool matches_eol() const;
bool matches_whitespace() const;
bool matches_number() const;
// Whether the reader is at one of the delimiter characters ( ) < > [ ] { } / %.
bool matches_delimiter() const;
// Whether the reader is at a character that is neither a delimiter nor whitespace.
bool matches_regular_character() const;
void consume_eol();
bool consume_whitespace();

View file

@ -123,7 +123,8 @@ public:
ALWAYS_INLINE void load() { m_offset = m_saved_offsets.take_last(); }
ALWAYS_INLINE void discard() { m_saved_offsets.take_last(); }
void dump_state()
#ifdef PDF_DEBUG
void dump_state() const
{
StringBuilder builder;
builder.append("Reader State Dump\n\n");
@ -143,6 +144,7 @@ public:
auto str = builder.to_string();
dbgputstr(str.characters(), str.length());
}
#endif
private:
ReadonlyBytes m_bytes;