journal: Use 32-bit entry array offsets in compact mode

Before:

OBJECT TYPE      ENTRIES SIZE
Unused           0       0B
Data             3610336 595.7M
Field            5310    285.2K
Entry            3498326 1.2G
Data Hash Table  29	 103.1M
Field Hash Table 29      151.3K
Entry Array      605991  1011.6M
Tag              0	 0B
Total            7720021 2.9G

After:

OBJECT TYPE      ENTRIES SIZE
Unused           0	 0B
Data             3562667 591.0M
Field            3971    213.6K
Entry            3498566 1.2G
Data Hash Table  20	 71.1M
Field Hash Table 20	 104.3K
Entry Array	 582647  505.0M
Tag              0	 0B
Total            7647891 2.4G
This commit is contained in:
Daan De Meyer 2021-10-23 22:36:47 +01:00
parent d06727aec2
commit 99daf3ce03
6 changed files with 73 additions and 40 deletions

View file

@ -71,7 +71,7 @@ thread](https://lists.freedesktop.org/archives/systemd-devel/2012-October/007054
## Basics
* All offsets, sizes, time values, hashes (and most other numeric values) are 64bit unsigned integers in LE format.
* All offsets, sizes, time values, hashes (and most other numeric values) are 32bit/64bit unsigned integers in LE format.
* Offsets are always relative to the beginning of the file.
* The 64bit hash function siphash24 is used for newer journal files. For older files [Jenkins lookup3](https://en.wikipedia.org/wiki/Jenkins_hash_function) is used, more specifically `jenkins_hashlittle2()` with the first 32bit integer it returns as higher 32bit part of the 64bit value, and the second one used as lower 32bit part.
* All structures are aligned to 64bit boundaries and padded to multiples of 64bit
@ -552,7 +552,10 @@ creativity rather than runtime parameters.
_packed_ struct EntryArrayObject {
ObjectHeader object;
le64_t next_entry_array_offset;
le64_t items[];
union {
le64_t regular[];
le32_t compact[];
} items;
};
```
@ -560,6 +563,9 @@ Entry Arrays are used to store a sorted array of offsets to entries. Entry
arrays are strictly sorted by offsets on disk, and hence by their timestamps
and sequence numbers (with some restrictions, see above).
If the `HEADER_INCOMPATIBLE_COMPACT` flag is set, offsets are stored as 32bit
integers instead of 64bit ones.
Entry Arrays are chained up. If one entry array is full another one is
allocated and the **next_entry_array_offset** field of the old one pointed to
it. An Entry Array with **next_entry_array_offset** set to 0 is the last in the

View file

@ -50,7 +50,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
if (r < 0)
return r;
n_items += journal_file_entry_array_n_items(&o);
n_items += journal_file_entry_array_n_items(f, &o);
p = q;
}
@ -67,7 +67,7 @@ static int managed_journal_file_entry_array_punch_hole(JournalFile *f, uint64_t
return 0;
offset = p + offsetof(Object, entry_array.items) +
(journal_file_entry_array_n_items(&o) - n_unused) * sizeof(le64_t);
(journal_file_entry_array_n_items(f, &o) - n_unused) * journal_file_entry_array_item_size(f);
sz = p + le64toh(o.object.size) - offset;
if (sz < MINIMUM_HOLE_SIZE)

View file

@ -117,7 +117,10 @@ struct HashTableObject {
struct EntryArrayObject {
ObjectHeader object;
le64_t next_entry_array_offset;
le64_t items[];
union {
le64_t regular[0];
le32_t compact[0];
} items;
} _packed_;
#define TAG_LENGTH (256/8)

View file

@ -716,7 +716,7 @@ static int check_object_header(Object *o, ObjectType type, uint64_t offset) {
/* Lightweight object checks. We want this to be fast, so that we won't
* slow down every journal_file_move_to_object() call too much. */
static int check_object(Object *o, uint64_t offset) {
static int check_object(JournalFile *f, Object *o, uint64_t offset) {
assert(o);
switch (o->object.type) {
@ -827,8 +827,8 @@ static int check_object(Object *o, uint64_t offset) {
sz = le64toh(READ_NOW(o->object.size));
if (sz < offsetof(Object, entry_array.items) ||
(sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
(sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
(sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
(sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"Invalid object entry array size: %" PRIu64 ": %" PRIu64,
sz,
@ -895,7 +895,7 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
if (r < 0)
return r;
r = check_object(o, offset);
r = check_object(f, o, offset);
if (r < 0)
return r;
@ -944,7 +944,7 @@ int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t of
"Short read while reading object: %" PRIu64,
offset);
r = check_object(&o, offset);
r = check_object(f, &o, offset);
if (r < 0)
return r;
@ -1672,7 +1672,7 @@ uint64_t journal_file_entry_n_items(Object *o) {
return (sz - offsetof(Object, entry.items)) / sizeof(EntryItem);
}
uint64_t journal_file_entry_array_n_items(Object *o) {
uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) {
uint64_t sz;
assert(o);
@ -1684,7 +1684,7 @@ uint64_t journal_file_entry_array_n_items(Object *o) {
if (sz < offsetof(Object, entry_array.items))
return 0;
return (sz - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f);
}
uint64_t journal_file_hash_table_n_items(Object *o) {
@ -1702,6 +1702,17 @@ uint64_t journal_file_hash_table_n_items(Object *o) {
return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem);
}
/* Stores the entry offset p in slot i of the entry array object o. The slot width follows the
 * file header: a 32-bit little-endian item when the header is flagged compact, a 64-bit
 * little-endian item otherwise. */
static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) {
        assert(f);
        assert(o);

        if (JOURNAL_HEADER_COMPACT(f->header)) {
                /* A compact slot holds only 32 bits, so the offset must fit into that range. */
                assert(p <= UINT32_MAX);
                o->entry_array.items.compact[i] = htole32(p);
                return;
        }

        o->entry_array.items.regular[i] = htole64(p);
}
static int link_entry_into_array(JournalFile *f,
le64_t *first,
le64_t *idx,
@ -1724,9 +1735,9 @@ static int link_entry_into_array(JournalFile *f,
if (r < 0)
return r;
n = journal_file_entry_array_n_items(o);
n = journal_file_entry_array_n_items(f, o);
if (i < n) {
o->entry_array.items[i] = htole64(p);
write_entry_array_item(f, o, i, p);
*idx = htole64(hidx + 1);
return 0;
}
@ -1745,7 +1756,7 @@ static int link_entry_into_array(JournalFile *f,
n = 4;
r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f),
&o, &q);
if (r < 0)
return r;
@ -1756,7 +1767,7 @@ static int link_entry_into_array(JournalFile *f,
return r;
#endif
o->entry_array.items[i] = htole64(p);
write_entry_array_item(f, o, i, p);
if (ap == 0)
*first = htole64(q);
@ -2277,7 +2288,7 @@ static int generic_array_get(
if (r < 0)
return r;
k = journal_file_entry_array_n_items(o);
k = journal_file_entry_array_n_items(f, o);
if (i < k)
break;
@ -2297,7 +2308,7 @@ static int generic_array_get(
if (r < 0)
return r;
k = journal_file_entry_array_n_items(o);
k = journal_file_entry_array_n_items(f, o);
if (k == 0)
break;
@ -2305,12 +2316,12 @@ static int generic_array_get(
}
do {
p = le64toh(o->entry_array.items[i]);
p = journal_file_entry_array_item(f, o, i);
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
if (r >= 0) {
/* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i);
if (ret_offset)
*ret_offset = p;
@ -2438,13 +2449,13 @@ static int generic_array_bisect(
if (r < 0)
return r;
k = journal_file_entry_array_n_items(array);
k = journal_file_entry_array_n_items(f, array);
right = MIN(k, n);
if (right <= 0)
return 0;
i = right - 1;
lp = p = le64toh(array->entry_array.items[i]);
lp = p = journal_file_entry_array_item(f, array, i);
if (p <= 0)
r = -EBADMSG;
else
@ -2477,7 +2488,7 @@ static int generic_array_bisect(
if (last_index > 0) {
uint64_t x = last_index - 1;
p = le64toh(array->entry_array.items[x]);
p = journal_file_entry_array_item(f, array, x);
if (p <= 0)
return -EBADMSG;
@ -2497,7 +2508,7 @@ static int generic_array_bisect(
if (last_index < right) {
uint64_t y = last_index + 1;
p = le64toh(array->entry_array.items[y]);
p = journal_file_entry_array_item(f, array, y);
if (p <= 0)
return -EBADMSG;
@ -2527,7 +2538,7 @@ static int generic_array_bisect(
assert(left < right);
i = (left + right) / 2;
p = le64toh(array->entry_array.items[i]);
p = journal_file_entry_array_item(f, array, i);
if (p <= 0)
r = -EBADMSG;
else
@ -2575,14 +2586,14 @@ found:
return 0;
/* Let's cache this item for the next invocation */
chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, array, 0), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);
if (subtract_one && i == 0)
p = last_p;
else if (subtract_one)
p = le64toh(array->entry_array.items[i-1]);
p = journal_file_entry_array_item(f, array, i - 1);
else
p = le64toh(array->entry_array.items[i]);
p = journal_file_entry_array_item(f, array, i);
if (ret) {
r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);

View file

@ -194,7 +194,20 @@ int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset);
uint64_t journal_file_entry_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(Object *o) _pure_;
uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) _pure_;
/* Returns item i of the entry array object o, converted from little-endian to host byte order.
 * A 32-bit item is read when the file header is flagged compact, a 64-bit item otherwise. */
static inline uint64_t journal_file_entry_array_item(JournalFile *f, Object *o, size_t i) {
        assert(f);
        assert(o);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return le32toh(o->entry_array.items.compact[i]);

        return le64toh(o->entry_array.items.regular[i]);
}
/* Returns the size in bytes of a single entry array item for this file: sizeof(le32_t) when the
 * file header is flagged compact, sizeof(le64_t) otherwise. */
static inline size_t journal_file_entry_array_item_size(JournalFile *f) {
        assert(f);

        if (JOURNAL_HEADER_COMPACT(f->header))
                return sizeof(le32_t);

        return sizeof(le64_t);
}
uint64_t journal_file_hash_table_n_items(Object *o) _pure_;
int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *ret_offset);

View file

@ -335,8 +335,8 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
break;
case OBJECT_ENTRY_ARRAY:
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0) {
if ((le64toh(o->object.size) - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 ||
(le64toh(o->object.size) - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) {
error(offset,
"Invalid object entry array size: %"PRIu64,
le64toh(o->object.size));
@ -350,15 +350,15 @@ static int journal_file_object_verify(JournalFile *f, uint64_t offset, Object *o
return -EBADMSG;
}
for (uint64_t i = 0; i < journal_file_entry_array_n_items(o); i++)
if (le64toh(o->entry_array.items[i]) != 0 &&
!VALID64(le64toh(o->entry_array.items[i]))) {
for (uint64_t i = 0; i < journal_file_entry_array_n_items(f, o); i++) {
uint64_t q = journal_file_entry_array_item(f, o, i);
if (q != 0 && !VALID64(q)) {
error(offset,
"Invalid object entry array item (%"PRIu64"/%"PRIu64"): "OFSfmt,
i, journal_file_entry_array_n_items(o),
le64toh(o->entry_array.items[i]));
i, journal_file_entry_array_n_items(f, o), q);
return -EBADMSG;
}
}
break;
@ -490,10 +490,10 @@ static int verify_data(
return -EBADMSG;
}
m = journal_file_entry_array_n_items(o);
m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) {
q = le64toh(o->entry_array.items[j]);
q = journal_file_entry_array_item(f, o, j);
if (q <= last) {
error(p, "Data object's entry array not sorted (%"PRIu64" <= %"PRIu64")", q, last);
return -EBADMSG;
@ -737,11 +737,11 @@ static int verify_entry_array(
return -EBADMSG;
}
m = journal_file_entry_array_n_items(o);
m = journal_file_entry_array_n_items(f, o);
for (j = 0; i < n && j < m; i++, j++) {
uint64_t p;
p = le64toh(o->entry_array.items[j]);
p = journal_file_entry_array_item(f, o, j);
if (p <= last) {
error(a, "Entry array not sorted at %"PRIu64" of %"PRIu64, i, n);
return -EBADMSG;