sd-journal: track newest open journal file per boot ID

This is useful to later order boot IDs by time, addressing #662.

Basically, this determines the most recently written journal file for each boot ID
from all currently open journal files. This is then stored in a hash
table (which maps the boot ID to a prioq of journal files, ordered by
their timestamp).

Why is this useful? If systems lack a battery-buffered RTC they will
initially have a system clock basically starting at zero. Later they
might acquire an NTP fix, or at least roughly monotonic time via a
stored timestamp. Thus, log entries written early during boot tend to be
badly timestamped, and those written most recently are likely to have
most accurate timestamps. Thus, if we track the newest entry for each
boot ID we likely can order the boot IDs via their timestamps.

This commit only adds the logic to maintain the hash table/prioq. It
doesn't actually make use of this information for ordering yet. A later
patch adds that.
This commit is contained in:
Lennart Poettering 2023-02-08 11:10:32 +01:00
parent 206f0f397e
commit 34af74946e
4 changed files with 195 additions and 0 deletions

View file

@ -28,6 +28,7 @@
#include "lookup3.h"
#include "memory-util.h"
#include "path-util.h"
#include "prioq.h"
#include "random-util.h"
#include "set.h"
#include "sort-util.h"
@ -2059,6 +2060,7 @@ static int journal_file_link_entry(
f->header->tail_entry_realtime = o->entry.realtime;
f->header->tail_entry_monotonic = o->entry.monotonic;
f->header->tail_entry_offset = offset;
f->newest_mtime = 0; /* we have a new tail entry now, explicitly invalidate newest boot id/timestamp info */
/* Link up the items */
for (uint64_t i = 0; i < n_items; i++) {
@ -3787,6 +3789,7 @@ int journal_file_open(
DEFAULT_COMPRESS_THRESHOLD :
MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
.strict_order = FLAGS_SET(file_flags, JOURNAL_STRICT_ORDER),
.newest_boot_id_prioq_idx = PRIOQ_IDX_NULL,
};
if (fname) {

View file

@ -121,6 +121,14 @@ typedef struct JournalFile {
void *fsprg_seed;
size_t fsprg_seed_size;
#endif
/* When we insert this file into the per-boot priority queue 'newest_by_boot_id' in sd_journal, then by these keys */
sd_id128_t newest_boot_id;
sd_id128_t newest_machine_id;
uint64_t newest_monotonic_usec;
uint64_t newest_realtime_usec;
unsigned newest_boot_id_prioq_idx;
usec_t newest_mtime;
} JournalFile;
typedef enum JournalFileFlags {

View file

@ -78,6 +78,7 @@ struct sd_journal {
OrderedHashmap *files;
IteratedCache *files_cache;
MMapCache *mmap;
Hashmap *newest_by_boot_id; /* key: boot_id, value: prioq, ordered by monotonic timestamp of last update */
Location current_location;

View file

@ -34,6 +34,7 @@
#include "lookup3.h"
#include "nulstr-util.h"
#include "path-util.h"
#include "prioq.h"
#include "process-util.h"
#include "replace-var.h"
#include "stat-util.h"
@ -51,6 +52,8 @@
#define DEFAULT_DATA_THRESHOLD (64*1024)
static void remove_file_real(sd_journal *j, JournalFile *f);
static int journal_file_read_tail_timestamp(sd_journal *j, JournalFile *f);
static void journal_file_unlink_newest_by_bood_id(sd_journal *j, JournalFile *f);
static bool journal_pid_changed(sd_journal *j) {
assert(j);
@ -741,6 +744,8 @@ static int next_beyond_location(sd_journal *j, JournalFile *f, direction_t direc
assert(j);
assert(f);
(void) journal_file_read_tail_timestamp(j, f);
n_entries = le64toh(f->header->n_entries);
/* If we hit EOF before, we don't need to look into this file again
@ -1351,6 +1356,7 @@ static int add_any_file(
* which are gone. */
f->last_seen_generation = j->generation;
(void) journal_file_read_tail_timestamp(j, f);
return 0;
}
@ -1391,6 +1397,7 @@ static int add_any_file(
track_file_disposition(j, f);
check_network(j, f->fd);
(void) journal_file_read_tail_timestamp(j, f);
j->current_invalidate_counter++;
@ -1479,6 +1486,7 @@ static void remove_file_real(sd_journal *j, JournalFile *f) {
j->fields_file_lost = true;
}
journal_file_unlink_newest_by_bood_id(j, f);
(void) journal_file_close(f);
j->current_invalidate_counter++;
@ -2173,10 +2181,15 @@ fail:
_public_ void sd_journal_close(sd_journal *j) {
Directory *d;
Prioq *p;
if (!j)
return;
while ((p = hashmap_first(j->newest_by_boot_id)))
journal_file_unlink_newest_by_bood_id(j, prioq_peek(p));
hashmap_free(j->newest_by_boot_id);
sd_journal_flush_matches(j);
ordered_hashmap_free_with_destructor(j->files, journal_file_close);
@ -2208,6 +2221,176 @@ _public_ void sd_journal_close(sd_journal *j) {
free(j);
}
/* Removes 'f' from the per-boot-ID priority queue stored in j->newest_by_boot_id, if it is currently a
 * member of one. If other files remain in that queue the hashmap key is re-pointed at one of them; if the
 * queue becomes empty it is removed from the hashmap and freed. On return f->newest_boot_id_prioq_idx is
 * PRIOQ_IDX_NULL. */
static void journal_file_unlink_newest_by_bood_id(sd_journal *j, JournalFile *f) {
        JournalFile *nf;
        Prioq *p;

        assert(j);
        assert(f);

        if (f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL) /* not linked currently, hence this is a NOP */
                return;

        assert_se(p = hashmap_get(j->newest_by_boot_id, &f->newest_boot_id));
        assert_se(prioq_remove(p, f, &f->newest_boot_id_prioq_idx) > 0);

        nf = prioq_peek(p);
        if (nf)
                /* There's still a member in the prioq? Then make sure the hashmap key now points to its
                 * .newest_boot_id field (and not ours!). Note that we only replace the memory of the key
                 * here, the value of the key (and the data associated with it) remain the same. This has
                 * to be hashmap_replace() rather than hashmap_update(): the latter only swaps the value and
                 * would leave the key pointing into 'f', which dangles once 'f' is freed. */
                assert_se(hashmap_replace(j->newest_by_boot_id, &nf->newest_boot_id, p) >= 0);
        else {
                /* We were the last member: drop the now empty prioq altogether. */
                assert_se(hashmap_remove(j->newest_by_boot_id, &f->newest_boot_id) == p);
                prioq_free(p);
        }

        f->newest_boot_id_prioq_idx = PRIOQ_IDX_NULL;
}
/* Prioq comparison callback for JournalFile objects: sorts by .newest_monotonic_usec in descending order,
 * so that the file with the largest (i.e. most recent) monotonic timestamp ends up first. */
static int journal_file_newest_monotonic_compare(const void *a, const void *b) {
        const JournalFile *lhs = a;
        const JournalFile *rhs = b;

        /* Operands are swapped on purpose: that yields the inverted order, we want newest first! */
        return CMP(rhs->newest_monotonic_usec, lhs->newest_monotonic_usec);
}
/* Makes sure 'f' is a member of the priority queue that j->newest_by_boot_id keeps for f->newest_boot_id,
 * and that it sits at the right position according to its current .newest_monotonic_usec. Allocates the
 * priority queue (and the hashmap) on demand.
 *
 * Returns 0 on success, or a negative errno-style error if inserting into the priority queue or hashmap
 * fails (-ENOMEM if the priority queue cannot be allocated). On failure to register a freshly allocated
 * queue, 'f' is left unlinked. */
static int journal_file_reshuffle_newest_by_boot_id(sd_journal *j, JournalFile *f) {
        Prioq *p;
        int r;

        assert(j);
        assert(f);

        p = hashmap_get(j->newest_by_boot_id, &f->newest_boot_id);
        if (p) {
                /* There's already a priority queue for this boot ID */

                if (f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL) {
                        r = prioq_put(p, f, &f->newest_boot_id_prioq_idx); /* Insert if we aren't in there yet */
                        if (r < 0)
                                return r;
                } else
                        prioq_reshuffle(p, f, &f->newest_boot_id_prioq_idx); /* Reshuffle otherwise */

        } else {
                _cleanup_(prioq_freep) Prioq *q = NULL;

                /* No priority queue yet, then allocate one */

                assert(f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL); /* we can't be a member either */

                q = prioq_new(journal_file_newest_monotonic_compare);
                if (!q)
                        return -ENOMEM;

                r = prioq_put(q, f, &f->newest_boot_id_prioq_idx);
                if (r < 0)
                        return r;

                r = hashmap_ensure_put(&j->newest_by_boot_id, &id128_hash_ops, &f->newest_boot_id, q);
                if (r < 0) {
                        /* The prioq is released by the cleanup handler when we leave this scope, hence
                         * make sure the file doesn't claim to still be a member of it. Otherwise a later
                         * unlink would look for a queue that isn't registered in the hashmap. */
                        f->newest_boot_id_prioq_idx = PRIOQ_IDX_NULL;
                        return r;
                }

                TAKE_PTR(q);
        }

        return 0;
}
/* Tries to read boot ID and timestamps of the most recently written entry of 'f', stores them in the
 * f->newest_* fields and (re-)links the file into the per-boot-ID priority queue of 'j'.
 *
 * Returns 0 on success, and also if the file's mtime is unchanged since the last successful call (in which
 * case nothing is done). Returns -ENODATA if the file has no entry or the timestamps found look invalid,
 * and other negative errno-style errors if a helper fails. */
static int journal_file_read_tail_timestamp(sd_journal *j, JournalFile *f) {
        uint64_t offset, mo, rt;
        sd_id128_t id;
        ObjectType type;
        Object *o;
        int r;

        assert(j);
        assert(f);
        assert(f->header);

        /* Tries to read the timestamp of the most recently written entry. */

        r = journal_file_fstat(f);
        if (r < 0)
                return r;
        if (f->newest_mtime == timespec_load(&f->last_stat.st_mtim))
                return 0; /* mtime didn't change since last time, don't bother */

        if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) {
                offset = le64toh(READ_NOW(f->header->tail_entry_offset));
                type = OBJECT_ENTRY;
        } else {
                offset = le64toh(READ_NOW(f->header->tail_object_offset));
                type = OBJECT_UNUSED;
        }
        if (offset == 0)
                return -ENODATA; /* not a single object/entry, hence no tail timestamp */

        /* Move to the last object in the journal file, in the hope it is an entry (which it usually will
         * be). If we lack the "tail_entry_offset" field in the header, we specify the type as OBJECT_UNUSED
         * here, since we cannot be sure what the last object will be, and want no noisy logging if it isn't
         * an entry. We instead check after figuring out the pointer. */
        r = journal_file_move_to_object(f, type, offset, &o);
        if (r < 0) {
                log_debug_errno(r, "Failed to move to last object in journal file, ignoring: %m");
                o = NULL;
        }
        if (o && o->object.type == OBJECT_ENTRY) {
                /* Yay, last object is an entry, let's use the data. */
                id = o->entry.boot_id;
                mo = le64toh(o->entry.monotonic);
                rt = le64toh(o->entry.realtime);
        } else {
                /* So the object is not an entry or we couldn't access it? In that case, let's read the most
                 * recent entry timestamps from the header. It's equally good. Unfortunately though, in old
                 * versions of the journal the boot ID in the header doesn't have to match the monotonic
                 * timestamp of the header. Let's check the header flag that indicates whether this strictly
                 * matches first hence, before using the data. */

                if (JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) && f->header->state == STATE_ARCHIVED) {
                        mo = le64toh(f->header->tail_entry_monotonic);
                        rt = le64toh(f->header->tail_entry_realtime);
                        id = f->header->tail_entry_boot_id;

                        /* Some superficial checking if what we read makes sense. Note that we only do this
                         * when reading the timestamps from the Header object, but not when reading them from
                         * the most recent entry object, because in that case journal_file_move_to_object()
                         * already validated them. */
                        if (!VALID_MONOTONIC(mo) || !VALID_REALTIME(rt))
                                return -ENODATA;

                } else {
                        /* Otherwise let's find the last entry manually (this possibly means traversing the
                         * chain of entry arrays, till the end) */
                        r = journal_file_next_entry(f, 0, DIRECTION_UP, &o, NULL);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                /* No entry found: 'o' still is NULL or refers to the non-entry object
                                 * from above, hence must not be accessed as an entry. */
                                return -ENODATA;

                        id = o->entry.boot_id;
                        mo = le64toh(o->entry.monotonic);
                        rt = le64toh(o->entry.realtime);
                }
        }

        if (mo > rt) /* monotonic clock is further ahead than realtime? that's weird, refuse to use the data */
                return -ENODATA;

        /* The boot ID is the hashmap key the file is linked under, hence if it changes we have to unlink
         * first, while the old key is still in place. */
        if (!sd_id128_equal(f->newest_boot_id, id))
                journal_file_unlink_newest_by_bood_id(j, f);

        f->newest_boot_id = id;
        f->newest_monotonic_usec = mo;
        f->newest_realtime_usec = rt;
        f->newest_machine_id = f->header->machine_id;
        f->newest_mtime = timespec_load(&f->last_stat.st_mtim);

        r = journal_file_reshuffle_newest_by_boot_id(j, f);
        if (r < 0)
                return r;

        return 0;
}
_public_ int sd_journal_get_realtime_usec(sd_journal *j, uint64_t *ret) {
JournalFile *f;
Object *o;