diff --git a/src/libsystemd/sd-journal/journal-def.h b/src/libsystemd/sd-journal/journal-def.h index e52a307c9fb..b773f26719e 100644 --- a/src/libsystemd/sd-journal/journal-def.h +++ b/src/libsystemd/sd-journal/journal-def.h @@ -30,7 +30,7 @@ typedef struct FSSHeader FSSHeader; /* Object types */ typedef enum ObjectType { - OBJECT_UNUSED, /* also serves as "any type" or "additional context" */ + OBJECT_UNUSED, /* also serves as "any type" or "additional category" */ OBJECT_DATA, OBJECT_FIELD, OBJECT_ENTRY, diff --git a/src/libsystemd/sd-journal/journal-file.c b/src/libsystemd/sd-journal/journal-file.c index 334a28f9486..25347b28a8e 100644 --- a/src/libsystemd/sd-journal/journal-file.c +++ b/src/libsystemd/sd-journal/journal-file.c @@ -88,9 +88,6 @@ /* Reread fstat() of the file for detecting deletions at least this often */ #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC) -/* The mmap context to use for the header we pick as one above the last defined typed */ -#define CONTEXT_HEADER _OBJECT_TYPE_MAX - /* Longest hash chain to rotate after */ #define HASH_CHAIN_DEPTH_MAX 100 @@ -821,13 +818,6 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) return journal_file_fstat(f); } -static unsigned type_to_context(ObjectType type) { - /* One context for each type, plus one catch-all for the rest */ - assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS); - assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS); - return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? 
type : 0; -} - static int journal_file_move_to( JournalFile *f, ObjectType type, @@ -864,7 +854,7 @@ static int journal_file_move_to( return -EADDRNOTAVAIL; } - return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret); + return mmap_cache_fd_get(f->cache_fd, type_to_category(type), keep_always, offset, size, &f->last_stat, ret); } static uint64_t minimum_header_size(JournalFile *f, Object *o) { @@ -1135,6 +1125,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset return 0; } +int journal_file_pin_object(JournalFile *f, Object *o) { + assert(f); + assert(o); + + /* This attaches the mmap window that provides the object to the 'pinning' category. So, reading + * another object with the same type will not invalidate the object, until this function is called + * for another object. */ + return mmap_cache_fd_pin(f->cache_fd, type_to_category(o->object.type), o, le64toh(o->object.size)); +} + int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) { ssize_t n; Object o; @@ -3098,24 +3098,30 @@ found: return 1; } -static int generic_array_bisect_plus_one( +static int generic_array_bisect_for_data( JournalFile *f, - uint64_t extra, - uint64_t first, - uint64_t n, + Object *d, uint64_t needle, int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), direction_t direction, Object **ret_object, uint64_t *ret_offset) { + uint64_t extra, first, n; int r; assert(f); + assert(d); + assert(d->object.type == OBJECT_DATA); assert(test_object); + n = le64toh(d->data.n_entries); if (n <= 0) return 0; + n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */ + + extra = le64toh(d->data.entry_offset); + first = le64toh(d->data.entry_array_offset); /* This bisects the array in object 'first', but first checks an extra. 
*/ r = test_object(f, extra, needle); @@ -3151,7 +3157,7 @@ static int generic_array_bisect_plus_one( * object. */ } - r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret_object, ret_offset, NULL); + r = generic_array_bisect(f, first, n, needle, test_object, direction, ret_object, ret_offset, NULL); if (r != 0) return r; /* When > 0, the found object is the first (or last, when DIRECTION_UP) object. * Hence, return the found object now. */ @@ -3341,11 +3347,9 @@ int journal_file_move_to_entry_by_monotonic( if (r <= 0) return r; - return generic_array_bisect_plus_one( + return generic_array_bisect_for_data( f, - le64toh(o->data.entry_offset), - le64toh(o->data.entry_array_offset), - le64toh(o->data.n_entries), + o, monotonic, test_object_monotonic, direction, @@ -3540,11 +3544,9 @@ int journal_file_move_to_entry_by_offset_for_data( assert(d); assert(d->object.type == OBJECT_DATA); - return generic_array_bisect_plus_one( + return generic_array_bisect_for_data( f, - le64toh(d->data.entry_offset), - le64toh(d->data.entry_array_offset), - le64toh(d->data.n_entries), + d, p, test_object_offset, direction, @@ -3560,28 +3562,26 @@ int journal_file_move_to_entry_by_monotonic_for_data( Object **ret_object, uint64_t *ret_offset) { - uint64_t z, entry_offset, entry_array_offset, n_entries; Object *o, *entry; + uint64_t z; int r; assert(f); assert(d); assert(d->object.type == OBJECT_DATA); - /* Save all the required data before the data object gets invalidated. */ - entry_offset = le64toh(READ_NOW(d->data.entry_offset)); - entry_array_offset = le64toh(READ_NOW(d->data.entry_array_offset)); - n_entries = le64toh(READ_NOW(d->data.n_entries)); + /* First, pin the given data object, before reading the _BOOT_ID= data object below. */ + r = journal_file_pin_object(f, d); + if (r < 0) + return r; - /* First, seek by time */ + /* Then, read a data object for _BOOT_ID= and seek by time. 
*/ r = find_data_object_by_boot_id(f, boot_id, &o, NULL); if (r <= 0) return r; - r = generic_array_bisect_plus_one(f, - le64toh(o->data.entry_offset), - le64toh(o->data.entry_array_offset), - le64toh(o->data.n_entries), + r = generic_array_bisect_for_data(f, + o, monotonic, test_object_monotonic, direction, @@ -3596,14 +3596,8 @@ int journal_file_move_to_entry_by_monotonic_for_data( /* The journal entry found by the above bisect_plus_one() may not have the specified data, * that is, it may not be linked in the data object. So, we need to check that. */ - r = generic_array_bisect_plus_one(f, - entry_offset, - entry_array_offset, - n_entries, - z, - test_object_offset, - direction, - ret_object ? &entry : NULL, &p); + r = journal_file_move_to_entry_by_offset_for_data( + f, d, z, direction, ret_object ? &entry : NULL, &p); if (r <= 0) return r; if (p == z) @@ -3613,14 +3607,8 @@ int journal_file_move_to_entry_by_monotonic_for_data( * 'direction') entry linked to the data object. But, the next entry may be in another boot. * So, we need to check that the entry has the matching boot ID. */ - r = generic_array_bisect_plus_one(f, - le64toh(o->data.entry_offset), - le64toh(o->data.entry_array_offset), - le64toh(o->data.n_entries), - p, - test_object_offset, - direction, - ret_object ? &entry : NULL, &z); + r = journal_file_move_to_entry_by_offset_for_data( + f, o, p, direction, ret_object ? 
&entry : NULL, &z); if (r <= 0) return r; if (p == z) @@ -3648,11 +3636,9 @@ int journal_file_move_to_entry_by_seqnum_for_data( assert(d); assert(d->object.type == OBJECT_DATA); - return generic_array_bisect_plus_one( + return generic_array_bisect_for_data( f, - le64toh(d->data.entry_offset), - le64toh(d->data.entry_array_offset), - le64toh(d->data.n_entries), + d, seqnum, test_object_seqnum, direction, @@ -3670,11 +3656,9 @@ int journal_file_move_to_entry_by_realtime_for_data( assert(d); assert(d->object.type == OBJECT_DATA); - return generic_array_bisect_plus_one( + return generic_array_bisect_for_data( f, - le64toh(d->data.entry_offset), - le64toh(d->data.entry_array_offset), - le64toh(d->data.n_entries), + d, realtime, test_object_realtime, direction, @@ -4086,7 +4070,7 @@ int journal_file_open( goto fail; } - r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h); + r = mmap_cache_fd_get(f->cache_fd, MMAP_CACHE_CATEGORY_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h); if (r == -EINVAL) { /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only * mmap()), and return EINVAL in that case. 
Let's propagate that as a more recognizable error diff --git a/src/libsystemd/sd-journal/journal-file.h b/src/libsystemd/sd-journal/journal-file.h index 6c46dff4c45..183a5e43bb4 100644 --- a/src/libsystemd/sd-journal/journal-file.h +++ b/src/libsystemd/sd-journal/journal-file.h @@ -208,6 +208,7 @@ static inline bool VALID_EPOCH(uint64_t u) { FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPACT) int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret); +int journal_file_pin_object(JournalFile *f, Object *o); int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret); int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset); diff --git a/src/libsystemd/sd-journal/mmap-cache.c b/src/libsystemd/sd-journal/mmap-cache.c index e5e03bf83ca..973ade64c0f 100644 --- a/src/libsystemd/sd-journal/mmap-cache.c +++ b/src/libsystemd/sd-journal/mmap-cache.c @@ -16,14 +16,23 @@ #include "sigbus.h" typedef struct Window Window; -typedef struct Context Context; + +typedef enum WindowFlags { + WINDOW_KEEP_ALWAYS = 1u << (_MMAP_CACHE_CATEGORY_MAX + 0), + WINDOW_IN_UNUSED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 1), + WINDOW_INVALIDATED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 2), + + _WINDOW_USED_MASK = WINDOW_IN_UNUSED - 1, /* The mask contains all bits that indicate the window * is currently in use. Covers all the object types * and the additional WINDOW_KEEP_ALWAYS flag. 
*/ +} WindowFlags; + +#define WINDOW_IS_UNUSED(w) (((w)->flags & _WINDOW_USED_MASK) == 0) struct Window { MMapFileDescriptor *fd; - bool invalidated:1; - bool keep_always:1; - bool in_unused:1; + WindowFlags flags; void *ptr; uint64_t offset; @@ -31,21 +40,15 @@ struct Window { LIST_FIELDS(Window, windows); LIST_FIELDS(Window, unused); - - LIST_HEAD(Context, contexts); -}; - -struct Context { - Window *window; - - LIST_FIELDS(Context, by_window); }; struct MMapFileDescriptor { MMapCache *cache; + int fd; int prot; bool sigbus; + LIST_HEAD(Window, windows); }; @@ -53,7 +56,7 @@ struct MMapCache { unsigned n_ref; unsigned n_windows; - unsigned n_context_cache_hit; + unsigned n_category_cache_hit; unsigned n_window_list_hit; unsigned n_missed; @@ -62,7 +65,7 @@ struct MMapCache { LIST_HEAD(Window, unused); Window *last_unused; - Context contexts[MMAP_CACHE_MAX_CONTEXTS]; + Window *windows_by_category[_MMAP_CACHE_CATEGORY_MAX]; }; #define WINDOWS_MIN 64 @@ -96,17 +99,15 @@ static Window* window_unlink(Window *w) { if (w->ptr) munmap(w->ptr, w->size); - if (w->in_unused) { + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { if (m->last_unused == w) m->last_unused = w->unused_prev; - LIST_REMOVE(unused, m->unused, w); } - LIST_FOREACH(by_window, c, w->contexts) { - assert(c->window == w); - c->window = NULL; - } + for (unsigned i = 0; i < _MMAP_CACHE_CATEGORY_MAX; i++) + if (FLAGS_SET(w->flags, 1u << i)) + assert_se(TAKE_PTR(m->windows_by_category[i]) == w); return LIST_REMOVE(windows, w->fd->windows, w); } @@ -115,14 +116,14 @@ static void window_invalidate(Window *w) { assert(w); assert(w->fd); - if (w->invalidated) + if (FLAGS_SET(w->flags, WINDOW_INVALIDATED)) return; /* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure * the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. 
*/ assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr); - w->invalidated = true; + w->flags |= WINDOW_INVALIDATED; } static Window* window_free(Window *w) { @@ -145,6 +146,16 @@ static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, si offset + size <= w->offset + w->size; } +static bool window_matches_by_addr(Window *w, MMapFileDescriptor *f, void *addr, size_t size) { + assert(size > 0); + + return + w && + f == w->fd && + (uint8_t*) addr >= (uint8_t*) w->ptr && + (uint8_t*) addr + size <= (uint8_t*) w->ptr + w->size; +} + static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) { MMapCache *m = mmap_cache_fd_cache(f); Window *w; @@ -169,19 +180,20 @@ static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, v return LIST_PREPEND(windows, f->windows, w); } -static void context_detach_window(MMapCache *m, Context *c) { +static void category_detach_window(MMapCache *m, MMapCacheCategory c) { Window *w; assert(m); - assert(c); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); - if (!c->window) - return; + w = TAKE_PTR(m->windows_by_category[c]); + if (!w) + return; /* Nothing attached. */ - w = TAKE_PTR(c->window); - LIST_REMOVE(by_window, w->contexts, c); + assert(FLAGS_SET(w->flags, 1u << c)); + w->flags &= ~(1u << c); - if (!w->contexts && !w->keep_always) { + if (WINDOW_IS_UNUSED(w)) { /* Not used anymore? */ #if ENABLE_DEBUG_MMAP_CACHE /* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. 
*/ @@ -190,33 +202,31 @@ static void context_detach_window(MMapCache *m, Context *c) { LIST_PREPEND(unused, m->unused, w); if (!m->last_unused) m->last_unused = w; - - w->in_unused = true; + w->flags |= WINDOW_IN_UNUSED; #endif } } -static void context_attach_window(MMapCache *m, Context *c, Window *w) { +static void category_attach_window(MMapCache *m, MMapCacheCategory c, Window *w) { assert(m); - assert(c); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); assert(w); - if (c->window == w) - return; + if (m->windows_by_category[c] == w) + return; /* Already attached. */ - context_detach_window(m, c); + category_detach_window(m, c); - if (w->in_unused) { + if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) { /* Used again? */ if (m->last_unused == w) m->last_unused = w->unused_prev; LIST_REMOVE(unused, m->unused, w); - - w->in_unused = false; + w->flags &= ~WINDOW_IN_UNUSED; } - c->window = w; - LIST_PREPEND(by_window, w->contexts, c); + m->windows_by_category[c] = w; + w->flags |= (1u << c); } static MMapCache* mmap_cache_free(MMapCache *m) { @@ -322,7 +332,7 @@ static int add_mmap( int mmap_cache_fd_get( MMapFileDescriptor *f, - unsigned context, + MMapCacheCategory c, bool keep_always, uint64_t offset, size_t size, @@ -330,28 +340,25 @@ int mmap_cache_fd_get( void **ret) { MMapCache *m = mmap_cache_fd_cache(f); - Context *c; Window *w; int r; - assert(context < MMAP_CACHE_MAX_CONTEXTS); assert(size > 0); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); assert(ret); if (f->sigbus) return -EIO; - c = &f->cache->contexts[context]; - - /* Check whether the current context is the right one already */ - if (window_matches(c->window, f, offset, size)) { - m->n_context_cache_hit++; - w = c->window; + /* Check whether the current category is the right one already */ + if (window_matches(m->windows_by_category[c], f, offset, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; goto found; } /* Drop the reference to the window, since it's unnecessary now */ - 
context_detach_window(m, c); + category_detach_window(m, c); /* Search for a matching mmap */ LIST_FOREACH(windows, i, f->windows) @@ -369,17 +376,62 @@ int mmap_cache_fd_get( return r; found: - w->keep_always = w->keep_always || keep_always; - context_attach_window(m, c, w); + if (keep_always) + w->flags |= WINDOW_KEEP_ALWAYS; + + category_attach_window(m, c, w); *ret = (uint8_t*) w->ptr + (offset - w->offset); return 0; } +int mmap_cache_fd_pin( + MMapFileDescriptor *f, + MMapCacheCategory c, + void *addr, + size_t size) { + + MMapCache *m = mmap_cache_fd_cache(f); + Window *w; + + assert(addr); + assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX); + assert(size > 0); + + if (f->sigbus) + return -EIO; + + /* Check if the current category is the right one. */ + if (window_matches_by_addr(m->windows_by_category[c], f, addr, size)) { + m->n_category_cache_hit++; + w = m->windows_by_category[c]; + goto found; + } + + /* Search for a matching mmap. */ + LIST_FOREACH(windows, i, f->windows) + if (window_matches_by_addr(i, f, addr, size)) { + m->n_window_list_hit++; + w = i; + goto found; + } + + m->n_missed++; + return -EADDRNOTAVAIL; /* Not found. */ + +found: + if (FLAGS_SET(w->flags, WINDOW_KEEP_ALWAYS)) + return 0; /* The window will never be unmapped. */ + + /* Attach the window to the 'pinning' category. 
category_attach_window(m, MMAP_CACHE_CATEGORY_PIN, w); + return 1; +} + void mmap_cache_stats_log_debug(MMapCache *m) { assert(m); - log_debug("mmap cache statistics: %u context cache hit, %u window list hit, %u miss", - m->n_context_cache_hit, m->n_window_list_hit, m->n_missed); + log_debug("mmap cache statistics: %u category cache hit, %u window list hit, %u miss", + m->n_category_cache_hit, m->n_window_list_hit, m->n_missed); } static void mmap_cache_process_sigbus(MMapCache *m) { @@ -404,13 +456,11 @@ static void mmap_cache_process_sigbus(MMapCache *m) { ours = false; HASHMAP_FOREACH(f, m->fds) { - LIST_FOREACH(windows, w, f->windows) { - if ((uint8_t*) addr >= (uint8_t*) w->ptr && - (uint8_t*) addr < (uint8_t*) w->ptr + w->size) { + LIST_FOREACH(windows, w, f->windows) + if (window_matches_by_addr(w, f, addr, 1)) { found = ours = f->sigbus = true; break; } - } if (ours) break; diff --git a/src/libsystemd/sd-journal/mmap-cache.h b/src/libsystemd/sd-journal/mmap-cache.h index 1279337cdd0..1fbc236bda9 100644 --- a/src/libsystemd/sd-journal/mmap-cache.h +++ b/src/libsystemd/sd-journal/mmap-cache.h @@ -1,15 +1,36 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #pragma once +#include <errno.h> #include <stdbool.h> #include <sys/stat.h> -/* One context per object type, plus one of the header, plus one "additional" one */ -#define MMAP_CACHE_MAX_CONTEXTS 9 +#include "journal-def.h" typedef struct MMapCache MMapCache; typedef struct MMapFileDescriptor MMapFileDescriptor; +typedef enum MMapCacheCategory { + MMAP_CACHE_CATEGORY_ANY = OBJECT_UNUSED, + MMAP_CACHE_CATEGORY_DATA = OBJECT_DATA, + MMAP_CACHE_CATEGORY_FIELD = OBJECT_FIELD, + MMAP_CACHE_CATEGORY_ENTRY = OBJECT_ENTRY, + MMAP_CACHE_CATEGORY_DATA_HASH_TABLE = OBJECT_DATA_HASH_TABLE, + MMAP_CACHE_CATEGORY_FIELD_HASH_TABLE = OBJECT_FIELD_HASH_TABLE, + MMAP_CACHE_CATEGORY_ENTRY_ARRAY = OBJECT_ENTRY_ARRAY, + MMAP_CACHE_CATEGORY_TAG = OBJECT_TAG, + MMAP_CACHE_CATEGORY_HEADER, /* for reading file header */ + MMAP_CACHE_CATEGORY_PIN, /* for 
temporarily pinning an object */ + _MMAP_CACHE_CATEGORY_MAX, + _MMAP_CACHE_CATEGORY_INVALID = -EINVAL, +} MMapCacheCategory; + +assert_cc((int) _OBJECT_TYPE_MAX < (int) _MMAP_CACHE_CATEGORY_MAX); + +static inline MMapCacheCategory type_to_category(ObjectType type) { + return type >= 0 && type < _OBJECT_TYPE_MAX ? (MMapCacheCategory) type : MMAP_CACHE_CATEGORY_ANY; +} + MMapCache* mmap_cache_new(void); MMapCache* mmap_cache_ref(MMapCache *m); MMapCache* mmap_cache_unref(MMapCache *m); @@ -17,12 +38,19 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(MMapCache*, mmap_cache_unref); int mmap_cache_fd_get( MMapFileDescriptor *f, - unsigned context, + MMapCacheCategory c, bool keep_always, uint64_t offset, size_t size, struct stat *st, void **ret); + +int mmap_cache_fd_pin( + MMapFileDescriptor *f, + MMapCacheCategory c, + void *addr, + size_t size); + int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret); MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f); MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f); diff --git a/src/libsystemd/sd-journal/sd-journal.c b/src/libsystemd/sd-journal/sd-journal.c index c61573f8606..73a65da7502 100644 --- a/src/libsystemd/sd-journal/sd-journal.c +++ b/src/libsystemd/sd-journal/sd-journal.c @@ -3190,20 +3190,14 @@ _public_ int sd_journal_enumerate_unique( continue; } - /* We do not use OBJECT_DATA context here, but OBJECT_UNUSED - * instead, so that we can look at this data object at the same - * time as one on another file */ - r = journal_file_move_to_object(j->unique_file, OBJECT_UNUSED, j->unique_offset, &o); + r = journal_file_move_to_object(j->unique_file, OBJECT_DATA, j->unique_offset, &o); if (r < 0) return r; - /* Let's do the type check by hand, since we used 0 context above. 
*/ - if (o->object.type != OBJECT_DATA) - return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), - "%s:offset " OFSfmt ": object has type %d, expected %d", - j->unique_file->path, - j->unique_offset, - o->object.type, OBJECT_DATA); + /* Let's pin the data object, so we can look at it at the same time as one on another file. */ + r = journal_file_pin_object(j->unique_file, o); + if (r < 0) + return r; r = journal_file_data_payload(j->unique_file, o, j->unique_offset, NULL, 0, j->data_threshold, &odata, &ol);