From 99daf3ce03f4091c74400f895f9c82a1c046e645 Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Sat, 23 Oct 2021 22:36:47 +0100 Subject: [PATCH] journal: Use 32-bit entry array offsets in compact mode Before: OBJECT TYPE ENTRIES SIZE Unused 0 0B Data 3610336 595.7M Field 5310 285.2K Entry 3498326 1.2G Data Hash Table 29 103.1M Field Hash Table 29 151.3K Entry Array 605991 1011.6M Tag 0 0B Total 7720021 2.9G After: OBJECT TYPE ENTRIES SIZE Unused 0 0B Data 3562667 591.0M Field 3971 213.6K Entry 3498566 1.2G Data Hash Table 20 71.1M Field Hash Table 20 104.3K Entry Array 582647 505.0M Tag 0 0B Total 7647891 2.4G --- docs/JOURNAL_FILE_FORMAT.md | 10 +++- src/journal/managed-journal-file.c | 4 +- src/libsystemd/sd-journal/journal-def.h | 5 +- src/libsystemd/sd-journal/journal-file.c | 57 +++++++++++++--------- src/libsystemd/sd-journal/journal-file.h | 15 +++++- src/libsystemd/sd-journal/journal-verify.c | 22 ++++----- 6 files changed, 73 insertions(+), 40 deletions(-) diff --git a/docs/JOURNAL_FILE_FORMAT.md b/docs/JOURNAL_FILE_FORMAT.md index d40688c440b..c4484693af5 100644 --- a/docs/JOURNAL_FILE_FORMAT.md +++ b/docs/JOURNAL_FILE_FORMAT.md @@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054 ## Basics -* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format. +* All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format. * Offsets are always relative to the beginning of the file. * The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one uses as lower 32bit part. 
* All structures are aligned to 64bit boundaries and padded to multiples of 64bit @@ -552,7 +552,10 @@ creativity rather than runtime parameters. _packed_ struct EntryArrayObject { ObjectHeader object; le64_t next_entry_array_offset; - le64_t items[]; + union { + le64_t regular[]; + le32_t compact[]; + } items; }; ``` @@ -560,6 +563,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry arrays are strictly sorted by offsets on disk, and hence by their timestamps and sequence numbers (with some restrictions, see above). +If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, the entry offsets in **items** are stored as +32-bit integers instead of 64-bit ones; **next_entry_array_offset** remains 64-bit. + Entry Arrays are chained up. If one entry array is full another one is allocated and the **next_entry_array_offset** field of the old one pointed to it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the diff --git a/src/journal/managed-journal-file.c b/src/journal/managed-journal-file.c index c22aac32715..c8522126f37 100644 --- a/src/journal/managed-journal-file.c +++ b/src/journal/managed-journal-file.c @@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t if (r < 0) return r; - n_items += journal_file_entry_array_n_items(&o); + n_items += journal_file_entry_array_n_items(f, &o); p = q; } @@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t return 0; offset = p + offsetof(Object, entry_array.items) + - (journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t); + (journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f); sz = p + le64toh(o.object.size) - offset; if (sz < MINIMUM_HOLE_SIZE) diff --git a/src/libsystemd/sd-journal/journal-def.h b/src/libsystemd/sd-journal/journal-def.h index e0919a7faa7..c35e438518a 100644 --- a/src/libsystemd/sd-journal/journal-def.h +++ b/src/libsystemd/sd-journal/journal-def.h @@ -117,7 +117,10 @@ struct HashTableObject { struct 
EntryArrayObject { ObjectHeader object; le64_t next_entry_array_offset; - le64_t items[]; + union { + le64_t regular[0]; + le32_t compact[0]; + } items; } _packed_; #define TAG_LENGTH (256/8) diff --git a/src/libsystemd/sd-journal/journal-file.c b/src/libsystemd/sd-journal/journal-file.c index edec27610f6..d9aa9a3806a 100644 --- a/src/libsystemd/sd-journal/journal-file.c +++ b/src/libsystemd/sd-journal/journal-file.c @@ -716,7 +716,7 @@ static int check_object_header(Object *o, ObjectType type, uint64_t offset) { /* Lightweight object checks. We want this to be fast, so that we won't * slowdown every journal_file_move_to_object() call too much. */ -static int check_object(Object *o, uint64_t offset) { +static int check_object(JournalFile *f, Object *o, uint64_t offset) { assert(o); switch (o->object.type) { @@ -827,8 +827,8 @@ static int check_object(Object *o, uint64_t offset) { sz = le64toh(READ_NOW(o->object.size)); if (sz < offsetof(Object, entry_array.items) || - (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 || - (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) + (sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 || + (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Invalid object entry array size: %" PRIu64 ": %" PRIu64, sz, @@ -895,7 +895,7 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset if (r < 0) return r; - r = check_object(o, offset); + r = check_object(f, o, offset); if (r < 0) return r; @@ -944,7 +944,7 @@ int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t of "Short read while reading object: %" PRIu64, offset); - r = check_object(&o, offset); + r = check_object(f, &o, offset); if (r < 0) return r; @@ -1672,7 +1672,7 @@ uint64_t journal_file_entry_n_items(Object *o) { return (sz - offsetof(Object, entry.items)) / 
sizeof(EntryItem); } -uint64_t journal_file_entry_array_n_items(Object *o) { +uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) { uint64_t sz; assert(o); @@ -1684,7 +1684,7 @@ uint64_t journal_file_entry_array_n_items(Object *o) { if (sz < offsetof(Object, entry_array.items)) return 0; - return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t); + return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f); } uint64_t journal_file_hash_table_n_items(Object *o) { @@ -1702,6 +1702,17 @@ uint64_t journal_file_hash_table_n_items(Object *o) { return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem); } +static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) { + assert(f); + assert(o); + + if (JOURNAL_HEADER_COMPACT(f->header)) { + assert(p <= UINT32_MAX); + o->entry_array.items.compact[i] = htole32(p); + } else + o->entry_array.items.regular[i] = htole64(p); +} + static int link_entry_into_array(JournalFile *f, le64_t *first, le64_t *idx, @@ -1724,9 +1735,9 @@ static int link_entry_into_array(JournalFile *f, if (r < 0) return r; - n = journal_file_entry_array_n_items(o); + n = journal_file_entry_array_n_items(f, o); if (i < n) { - o->entry_array.items[i] = htole64(p); + write_entry_array_item(f, o, i, p); *idx = htole64(hidx + 1); return 0; } @@ -1745,7 +1756,7 @@ static int link_entry_into_array(JournalFile *f, n = 4; r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY, - offsetof(Object, entry_array.items) + n * sizeof(uint64_t), + offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f), &o, &q); if (r < 0) return r; @@ -1756,7 +1767,7 @@ static int link_entry_into_array(JournalFile *f, return r; #endif - o->entry_array.items[i] = htole64(p); + write_entry_array_item(f, o, i, p); if (ap == 0) *first = htole64(q); @@ -2277,7 +2288,7 @@ static int generic_array_get( if (r < 0) return r; - k = journal_file_entry_array_n_items(o); + k = 
journal_file_entry_array_n_items(f, o); if (i < k) break; @@ -2297,7 +2308,7 @@ static int generic_array_get( if (r < 0) return r; - k = journal_file_entry_array_n_items(o); + k = journal_file_entry_array_n_items(f, o); if (k == 0) break; @@ -2305,12 +2316,12 @@ static int generic_array_get( } do { - p = le64toh(o->entry_array.items[i]); + p = journal_file_entry_array_item(f, o, i); r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret); if (r >= 0) { /* Let's cache this item for the next invocation */ - chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i); + chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i); if (ret_offset) *ret_offset = p; @@ -2438,13 +2449,13 @@ static int generic_array_bisect( if (r < 0) return r; - k = journal_file_entry_array_n_items(array); + k = journal_file_entry_array_n_items(f, array); right = MIN(k, n); if (right <= 0) return 0; i = right - 1; - lp = p = le64toh(array->entry_array.items[i]); + lp = p = journal_file_entry_array_item(f, array, i); if (p <= 0) r = -EBADMSG; else @@ -2477,7 +2488,7 @@ static int generic_array_bisect( if (last_index > 0) { uint64_t x = last_index - 1; - p = le64toh(array->entry_array.items[x]); + p = journal_file_entry_array_item(f, array, x); if (p <= 0) return -EBADMSG; @@ -2497,7 +2508,7 @@ static int generic_array_bisect( if (last_index < right) { uint64_t y = last_index + 1; - p = le64toh(array->entry_array.items[y]); + p = journal_file_entry_array_item(f, array, y); if (p <= 0) return -EBADMSG; @@ -2527,7 +2538,7 @@ static int generic_array_bisect( assert(left < right); i = (left + right) / 2; - p = le64toh(array->entry_array.items[i]); + p = journal_file_entry_array_item(f, array, i); if (p <= 0) r = -EBADMSG; else @@ -2575,14 +2586,14 @@ found: return 0; /* Let's cache this item for the next invocation */ - chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? 
i-1 : UINT64_MAX) : i); + chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i); if (subtract_one && i == 0) p = last_p; else if (subtract_one) - p = le64toh(array->entry_array.items[i-1]); + p = journal_file_entry_array_item(f, array, i - 1); else - p = le64toh(array->entry_array.items[i]); + p = journal_file_entry_array_item(f, array, i); if (ret) { r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret); diff --git a/src/libsystemd/sd-journal/journal-file.h b/src/libsystemd/sd-journal/journal-file.h index 01942ec72bc..9b5bd1ff363 100644 --- a/src/libsystemd/sd-journal/journal-file.h +++ b/src/libsystemd/sd-journal/journal-file.h @@ -194,7 +194,20 @@ int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset); int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset); uint64_t journal_file_entry_n_items(Object *o) _pure_; -uint64_t journal_file_entry_array_n_items(Object *o) _pure_; +uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_; + +static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) { + assert(f); + assert(o); + return JOURNAL_HEADER_COMPACT(f->header) ? le32toh(o->entry_array.items.compact[i]) : + le64toh(o->entry_array.items.regular[i]); +} + +static inline size_t journal_file_entry_array_item_size(JournalFile *f) { + assert(f); + return JOURNAL_HEADER_COMPACT(f->header) ? 
sizeof(le32_t) : sizeof(le64_t); +} + uint64_t journal_file_hash_table_n_items(Object *o) _pure_; int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset); diff --git a/src/libsystemd/sd-journal/journal-verify.c b/src/libsystemd/sd-journal/journal-verify.c index b9f6a161edc..d0da9bf8064 100644 --- a/src/libsystemd/sd-journal/journal-verify.c +++ b/src/libsystemd/sd-journal/journal-verify.c @@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o break; case OBJECT_ENTRY_ARRAY: - if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 || - (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) { + if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 || + (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) { error(offset, "Invalid object entry array size: %"PRIu64, le64toh(o->object.size)); @@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o return -EBADMSG; } - for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++) - if (le64toh(o->entry_array.items[i]) != 0 && - !VALID64(le64toh(o->entry_array.items[i]))) { + for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) { + uint64_t q = journal_file_entry_array_item(f, o, i); + if (q != 0 && !VALID64(q)) { error(offset, "Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt, - i, journal_file_entry_array_n_items(o), - le64toh(o->entry_array.items[i])); + i, journal_file_entry_array_n_items(f, o), q); return -EBADMSG; } + } break; @@ -490,10 +490,10 @@ static int verify_data( return -EBADMSG; } - m = journal_file_entry_array_n_items(o); + m = journal_file_entry_array_n_items(f, o); for (j = 0; i < n && j < m; i++, j++) { - q = 
le64toh(o->entry_array.items[j]); + q = journal_file_entry_array_item(f, o, j); if (q <= last) { error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last); return -EBADMSG; @@ -737,11 +737,11 @@ static int verify_entry_array( return -EBADMSG; } - m = journal_file_entry_array_n_items(o); + m = journal_file_entry_array_n_items(f, o); for (j = 0; i < n && j < m; i++, j++) { uint64_t p; - p = le64toh(o->entry_array.items[j]); + p = journal_file_entry_array_item(f, o, j); if (p <= last) { error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n); return -EBADMSG;