Merge pull request #29469 from yuwata/sd-journal-pin-object

sd-journal: add a way to 'pin' object to protect from reading another object with the same type
This commit is contained in:
Luca Boccassi 2023-11-04 10:37:43 +00:00 committed by GitHub
commit c7cc6d5859
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 190 additions and 133 deletions

View file

@ -30,7 +30,7 @@ typedef struct FSSHeader FSSHeader;
/* Object types */
typedef enum ObjectType {
OBJECT_UNUSED, /* also serves as "any type" or "additional context" */
OBJECT_UNUSED, /* also serves as "any type" or "additional category" */
OBJECT_DATA,
OBJECT_FIELD,
OBJECT_ENTRY,

View file

@ -88,9 +88,6 @@
/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
/* The mmap context to use for the header we pick as one above the last defined typed */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX
/* Longest hash chain to rotate after */
#define HASH_CHAIN_DEPTH_MAX 100
@ -821,13 +818,6 @@ static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size)
return journal_file_fstat(f);
}
static unsigned type_to_context(ObjectType type) {
/* One context for each type, plus one catch-all for the rest */
assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
}
static int journal_file_move_to(
JournalFile *f,
ObjectType type,
@ -864,7 +854,7 @@ static int journal_file_move_to(
return -EADDRNOTAVAIL;
}
return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
return mmap_cache_fd_get(f->cache_fd, type_to_category(type), keep_always, offset, size, &f->last_stat, ret);
}
static uint64_t minimum_header_size(JournalFile *f, Object *o) {
@ -1135,6 +1125,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
return 0;
}
/* Pins the mmap window that backs the already mapped object 'o'.
 *
 * The window is handed to mmap_cache_fd_pin(), which attaches it to the dedicated 'pinning' category.
 * Hence, reading another object of the same type does not invalidate 'o'. The pin lasts until this
 * function is called again for another object.
 *
 * The return value of mmap_cache_fd_pin() is passed through unchanged (negative errno on failure). */
int journal_file_pin_object(JournalFile *f, Object *o) {
        MMapCacheCategory category;
        uint64_t size;

        assert(f);
        assert(o);

        /* The object header tells us both which category currently holds the window and how many
         * bytes of it need to stay mapped. */
        category = type_to_category(o->object.type);
        size = le64toh(o->object.size);

        return mmap_cache_fd_pin(f->cache_fd, category, o, size);
}
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) {
ssize_t n;
Object o;
@ -3098,24 +3098,30 @@ found:
return 1;
}
static int generic_array_bisect_plus_one(
static int generic_array_bisect_for_data(
JournalFile *f,
uint64_t extra,
uint64_t first,
uint64_t n,
Object *d,
uint64_t needle,
int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
direction_t direction,
Object **ret_object,
uint64_t *ret_offset) {
uint64_t extra, first, n;
int r;
assert(f);
assert(d);
assert(d->object.type == OBJECT_DATA);
assert(test_object);
n = le64toh(d->data.n_entries);
if (n <= 0)
return 0;
n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */
extra = le64toh(d->data.entry_offset);
first = le64toh(d->data.entry_array_offset);
/* This bisects the array in object 'first', but first checks an extra. */
r = test_object(f, extra, needle);
@ -3151,7 +3157,7 @@ static int generic_array_bisect_plus_one(
* object. */
}
r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret_object, ret_offset, NULL);
r = generic_array_bisect(f, first, n, needle, test_object, direction, ret_object, ret_offset, NULL);
if (r != 0)
return r; /* When > 0, the found object is the first (or last, when DIRECTION_UP) object.
* Hence, return the found object now. */
@ -3341,11 +3347,9 @@ int journal_file_move_to_entry_by_monotonic(
if (r <= 0)
return r;
return generic_array_bisect_plus_one(
return generic_array_bisect_for_data(
f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
o,
monotonic,
test_object_monotonic,
direction,
@ -3540,11 +3544,9 @@ int journal_file_move_to_entry_by_offset_for_data(
assert(d);
assert(d->object.type == OBJECT_DATA);
return generic_array_bisect_plus_one(
return generic_array_bisect_for_data(
f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
d,
p,
test_object_offset,
direction,
@ -3560,28 +3562,26 @@ int journal_file_move_to_entry_by_monotonic_for_data(
Object **ret_object,
uint64_t *ret_offset) {
uint64_t z, entry_offset, entry_array_offset, n_entries;
Object *o, *entry;
uint64_t z;
int r;
assert(f);
assert(d);
assert(d->object.type == OBJECT_DATA);
/* Save all the required data before the data object gets invalidated. */
entry_offset = le64toh(READ_NOW(d->data.entry_offset));
entry_array_offset = le64toh(READ_NOW(d->data.entry_array_offset));
n_entries = le64toh(READ_NOW(d->data.n_entries));
/* First, pin the given data object, before reading the _BOOT_ID= data object below. */
r = journal_file_pin_object(f, d);
if (r < 0)
return r;
/* First, seek by time */
/* Then, read a data object for _BOOT_ID= and seek by time. */
r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
if (r <= 0)
return r;
r = generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
r = generic_array_bisect_for_data(f,
o,
monotonic,
test_object_monotonic,
direction,
@ -3596,14 +3596,8 @@ int journal_file_move_to_entry_by_monotonic_for_data(
/* The journal entry found by the above generic_array_bisect_for_data() may not have the specified data,
* that is, it may not be linked in the data object. So, we need to check that. */
r = generic_array_bisect_plus_one(f,
entry_offset,
entry_array_offset,
n_entries,
z,
test_object_offset,
direction,
ret_object ? &entry : NULL, &p);
r = journal_file_move_to_entry_by_offset_for_data(
f, d, z, direction, ret_object ? &entry : NULL, &p);
if (r <= 0)
return r;
if (p == z)
@ -3613,14 +3607,8 @@ int journal_file_move_to_entry_by_monotonic_for_data(
* 'direction') entry linked to the data object. But, the next entry may be in another boot.
* So, we need to check that the entry has the matching boot ID. */
r = generic_array_bisect_plus_one(f,
le64toh(o->data.entry_offset),
le64toh(o->data.entry_array_offset),
le64toh(o->data.n_entries),
p,
test_object_offset,
direction,
ret_object ? &entry : NULL, &z);
r = journal_file_move_to_entry_by_offset_for_data(
f, o, p, direction, ret_object ? &entry : NULL, &z);
if (r <= 0)
return r;
if (p == z)
@ -3648,11 +3636,9 @@ int journal_file_move_to_entry_by_seqnum_for_data(
assert(d);
assert(d->object.type == OBJECT_DATA);
return generic_array_bisect_plus_one(
return generic_array_bisect_for_data(
f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
d,
seqnum,
test_object_seqnum,
direction,
@ -3670,11 +3656,9 @@ int journal_file_move_to_entry_by_realtime_for_data(
assert(d);
assert(d->object.type == OBJECT_DATA);
return generic_array_bisect_plus_one(
return generic_array_bisect_for_data(
f,
le64toh(d->data.entry_offset),
le64toh(d->data.entry_array_offset),
le64toh(d->data.n_entries),
d,
realtime,
test_object_realtime,
direction,
@ -4086,7 +4070,7 @@ int journal_file_open(
goto fail;
}
r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
r = mmap_cache_fd_get(f->cache_fd, MMAP_CACHE_CATEGORY_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
if (r == -EINVAL) {
/* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
* mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error

View file

@ -208,6 +208,7 @@ static inline bool VALID_EPOCH(uint64_t u) {
FLAGS_SET(le32toh((h)->incompatible_flags), HEADER_INCOMPATIBLE_COMPACT)
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret);
int journal_file_pin_object(JournalFile *f, Object *o);
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret);
int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset);

View file

@ -16,14 +16,23 @@
#include "sigbus.h"
typedef struct Window Window;
typedef struct Context Context;
/* Per-window state bits. Bits 0 … _MMAP_CACHE_CATEGORY_MAX-1 are not named here: bit 'c' is set while the
 * window is attached to category 'c' (see the "1u << c" uses in category_attach_window() and
 * category_detach_window()). The named flags below occupy the bits right above the category bits. */
typedef enum WindowFlags {
/* Set by mmap_cache_fd_get() when the caller passes keep_always=true. */
WINDOW_KEEP_ALWAYS = 1u << (_MMAP_CACHE_CATEGORY_MAX + 0),
/* Set while the window is linked in the cache's list of unused windows. */
WINDOW_IN_UNUSED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 1),
/* Set by window_invalidate() once the mapping has been replaced by anonymous pages. */
WINDOW_INVALIDATED = 1u << (_MMAP_CACHE_CATEGORY_MAX + 2),
_WINDOW_USED_MASK = WINDOW_IN_UNUSED - 1, /* The mask contains all bits that indicate the window
* is currently in use. It covers all the category bits
* and the additional WINDOW_KEEP_ALWAYS flag. */
} WindowFlags;
/* True if the window is neither attached to any category nor marked WINDOW_KEEP_ALWAYS. */
#define WINDOW_IS_UNUSED(w) (((w)->flags & _WINDOW_USED_MASK) == 0)
struct Window {
MMapFileDescriptor *fd;
bool invalidated:1;
bool keep_always:1;
bool in_unused:1;
WindowFlags flags;
void *ptr;
uint64_t offset;
@ -31,21 +40,15 @@ struct Window {
LIST_FIELDS(Window, windows);
LIST_FIELDS(Window, unused);
LIST_HEAD(Context, contexts);
};
struct Context {
Window *window;
LIST_FIELDS(Context, by_window);
};
/* Per-file state of the mmap cache: one of these exists for each file descriptor added to a cache. */
struct MMapFileDescriptor {
/* The cache this descriptor belongs to (presumably what mmap_cache_fd_cache() returns — confirm). */
MMapCache *cache;
/* The underlying file descriptor (presumably the one passed to mmap_cache_add_fd() — confirm). */
int fd;
/* Protection flags used when mapping this file; window_invalidate() reuses them for the replacement
 * anonymous mapping. */
int prot;
/* Set when a SIGBUS fault address was found inside one of this file's windows. Once set,
 * mmap_cache_fd_get() and mmap_cache_fd_pin() fail with -EIO. */
bool sigbus;
/* All windows currently mapped for this file. */
LIST_HEAD(Window, windows);
};
@ -53,7 +56,7 @@ struct MMapCache {
unsigned n_ref;
unsigned n_windows;
unsigned n_context_cache_hit;
unsigned n_category_cache_hit;
unsigned n_window_list_hit;
unsigned n_missed;
@ -62,7 +65,7 @@ struct MMapCache {
LIST_HEAD(Window, unused);
Window *last_unused;
Context contexts[MMAP_CACHE_MAX_CONTEXTS];
Window *windows_by_category[_MMAP_CACHE_CATEGORY_MAX];
};
#define WINDOWS_MIN 64
@ -96,17 +99,15 @@ static Window* window_unlink(Window *w) {
if (w->ptr)
munmap(w->ptr, w->size);
if (w->in_unused) {
if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) {
if (m->last_unused == w)
m->last_unused = w->unused_prev;
LIST_REMOVE(unused, m->unused, w);
}
LIST_FOREACH(by_window, c, w->contexts) {
assert(c->window == w);
c->window = NULL;
}
for (unsigned i = 0; i < _MMAP_CACHE_CATEGORY_MAX; i++)
if (FLAGS_SET(w->flags, 1u << i))
assert_se(TAKE_PTR(m->windows_by_category[i]) == w);
return LIST_REMOVE(windows, w->fd->windows, w);
}
@ -115,14 +116,14 @@ static void window_invalidate(Window *w) {
assert(w);
assert(w->fd);
if (w->invalidated)
if (FLAGS_SET(w->flags, WINDOW_INVALIDATED))
return;
/* Replace the window with anonymous pages. This is useful when we hit a SIGBUS and want to make sure
* the file cannot trigger any further SIGBUS, possibly overrunning the sigbus queue. */
assert_se(mmap(w->ptr, w->size, w->fd->prot, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == w->ptr);
w->invalidated = true;
w->flags |= WINDOW_INVALIDATED;
}
static Window* window_free(Window *w) {
@ -145,6 +146,16 @@ static bool window_matches(Window *w, MMapFileDescriptor *f, uint64_t offset, si
offset + size <= w->offset + w->size;
}
/* Checks whether the memory range [addr, addr + size) lies entirely inside the mapping of window 'w',
 * and 'w' belongs to file 'f'.
 *
 * 'w' may be NULL, in which case false is returned. 'size' must be non-zero.
 *
 * The comparison is done on offsets relative to the start of the window rather than by computing
 * "addr + size": that pointer sum may point beyond the mapped object or even wrap around when 'size'
 * is large, which is undefined behaviour and could make an out-of-window range appear to match. */
static bool window_matches_by_addr(Window *w, MMapFileDescriptor *f, void *addr, size_t size) {
        uintptr_t start, p, off;

        assert(size > 0);

        if (!w || f != w->fd)
                return false;

        start = (uintptr_t) w->ptr;
        p = (uintptr_t) addr;

        /* The range begins before the window. */
        if (p < start)
                return false;

        /* Offset of 'addr' inside the window; cannot underflow thanks to the check above. */
        off = p - start;

        /* Equivalent to "off + size <= w->size", but without the possibly overflowing addition. */
        return off <= w->size && size <= w->size - off;
}
static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, void *ptr) {
MMapCache *m = mmap_cache_fd_cache(f);
Window *w;
@ -169,19 +180,20 @@ static Window* window_add(MMapFileDescriptor *f, uint64_t offset, size_t size, v
return LIST_PREPEND(windows, f->windows, w);
}
static void context_detach_window(MMapCache *m, Context *c) {
static void category_detach_window(MMapCache *m, MMapCacheCategory c) {
Window *w;
assert(m);
assert(c);
assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
if (!c->window)
return;
w = TAKE_PTR(m->windows_by_category[c]);
if (!w)
return; /* Nothing attached. */
w = TAKE_PTR(c->window);
LIST_REMOVE(by_window, w->contexts, c);
assert(FLAGS_SET(w->flags, 1u << c));
w->flags &= ~(1u << c);
if (!w->contexts && !w->keep_always) {
if (WINDOW_IS_UNUSED(w)) {
/* Not used anymore? */
#if ENABLE_DEBUG_MMAP_CACHE
/* Unmap unused windows immediately to expose use-after-unmap by SIGSEGV. */
@ -190,33 +202,31 @@ static void context_detach_window(MMapCache *m, Context *c) {
LIST_PREPEND(unused, m->unused, w);
if (!m->last_unused)
m->last_unused = w;
w->in_unused = true;
w->flags |= WINDOW_IN_UNUSED;
#endif
}
}
static void context_attach_window(MMapCache *m, Context *c, Window *w) {
static void category_attach_window(MMapCache *m, MMapCacheCategory c, Window *w) {
assert(m);
assert(c);
assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
assert(w);
if (c->window == w)
return;
if (m->windows_by_category[c] == w)
return; /* Already attached. */
context_detach_window(m, c);
category_detach_window(m, c);
if (w->in_unused) {
if (FLAGS_SET(w->flags, WINDOW_IN_UNUSED)) {
/* Used again? */
if (m->last_unused == w)
m->last_unused = w->unused_prev;
LIST_REMOVE(unused, m->unused, w);
w->in_unused = false;
w->flags &= ~WINDOW_IN_UNUSED;
}
c->window = w;
LIST_PREPEND(by_window, w->contexts, c);
m->windows_by_category[c] = w;
w->flags |= (1u << c);
}
static MMapCache* mmap_cache_free(MMapCache *m) {
@ -322,7 +332,7 @@ static int add_mmap(
int mmap_cache_fd_get(
MMapFileDescriptor *f,
unsigned context,
MMapCacheCategory c,
bool keep_always,
uint64_t offset,
size_t size,
@ -330,28 +340,25 @@ int mmap_cache_fd_get(
void **ret) {
MMapCache *m = mmap_cache_fd_cache(f);
Context *c;
Window *w;
int r;
assert(context < MMAP_CACHE_MAX_CONTEXTS);
assert(size > 0);
assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
assert(ret);
if (f->sigbus)
return -EIO;
c = &f->cache->contexts[context];
/* Check whether the current context is the right one already */
if (window_matches(c->window, f, offset, size)) {
m->n_context_cache_hit++;
w = c->window;
/* Check whether the current category is the right one already */
if (window_matches(m->windows_by_category[c], f, offset, size)) {
m->n_category_cache_hit++;
w = m->windows_by_category[c];
goto found;
}
/* Drop the reference to the window, since it's unnecessary now */
context_detach_window(m, c);
category_detach_window(m, c);
/* Search for a matching mmap */
LIST_FOREACH(windows, i, f->windows)
@ -369,17 +376,62 @@ int mmap_cache_fd_get(
return r;
found:
w->keep_always = w->keep_always || keep_always;
context_attach_window(m, c, w);
if (keep_always)
w->flags |= WINDOW_KEEP_ALWAYS;
category_attach_window(m, c, w);
*ret = (uint8_t*) w->ptr + (offset - w->offset);
return 0;
}
/* Pins the window of 'f' that maps the memory range [addr, addr + size), by attaching it to
 * MMAP_CACHE_CATEGORY_PIN. 'c' is the category the range was most likely obtained through; it is only
 * used as a fast path for the lookup.
 *
 * Returns:
 *    1               the window is now attached to the 'pinning' category,
 *    0               the window carries WINDOW_KEEP_ALWAYS, so there is nothing to do,
 *   -EIO             a SIGBUS was recorded for 'f',
 *   -EADDRNOTAVAIL   no window of 'f' covers the requested range. */
int mmap_cache_fd_pin(
                MMapFileDescriptor *f,
                MMapCacheCategory c,
                void *addr,
                size_t size) {

        MMapCache *m = mmap_cache_fd_cache(f);
        Window *w = NULL;

        assert(addr);
        assert(c >= 0 && c < _MMAP_CACHE_CATEGORY_MAX);
        assert(size > 0);

        if (f->sigbus)
                return -EIO;

        if (window_matches_by_addr(m->windows_by_category[c], f, addr, size)) {
                /* Fast path: the window attached to the given category is the right one. */
                m->n_category_cache_hit++;
                w = m->windows_by_category[c];
        } else {
                /* Slow path: walk all windows of the file. */
                LIST_FOREACH(windows, i, f->windows)
                        if (window_matches_by_addr(i, f, addr, size)) {
                                m->n_window_list_hit++;
                                w = i;
                                break;
                        }
        }

        if (!w) {
                /* Not found. */
                m->n_missed++;
                return -EADDRNOTAVAIL;
        }

        /* The window will never be unmapped, hence there is no need to pin it. */
        if (FLAGS_SET(w->flags, WINDOW_KEEP_ALWAYS))
                return 0;

        /* Attach the window to the 'pinning' category. */
        category_attach_window(m, MMAP_CACHE_CATEGORY_PIN, w);
        return 1;
}
void mmap_cache_stats_log_debug(MMapCache *m) {
assert(m);
log_debug("mmap cache statistics: %u context cache hit, %u window list hit, %u miss",
m->n_context_cache_hit, m->n_window_list_hit, m->n_missed);
log_debug("mmap cache statistics: %u category cache hit, %u window list hit, %u miss",
m->n_category_cache_hit, m->n_window_list_hit, m->n_missed);
}
static void mmap_cache_process_sigbus(MMapCache *m) {
@ -404,13 +456,11 @@ static void mmap_cache_process_sigbus(MMapCache *m) {
ours = false;
HASHMAP_FOREACH(f, m->fds) {
LIST_FOREACH(windows, w, f->windows) {
if ((uint8_t*) addr >= (uint8_t*) w->ptr &&
(uint8_t*) addr < (uint8_t*) w->ptr + w->size) {
LIST_FOREACH(windows, w, f->windows)
if (window_matches_by_addr(w, f, addr, 1)) {
found = ours = f->sigbus = true;
break;
}
}
if (ours)
break;

View file

@ -1,15 +1,36 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include <errno.h>
#include <stdbool.h>
#include <sys/stat.h>
/* One context per object type, plus one of the header, plus one "additional" one */
#define MMAP_CACHE_MAX_CONTEXTS 9
#include "journal-def.h"
typedef struct MMapCache MMapCache;
typedef struct MMapFileDescriptor MMapFileDescriptor;
/* Categories a mmap window can be attached to. The first eight entries are defined in terms of the
 * journal ObjectType constants so that an object type can be used directly as a category (see
 * type_to_category()); HEADER and PIN are additional categories numbered after them. */
typedef enum MMapCacheCategory {
MMAP_CACHE_CATEGORY_ANY = OBJECT_UNUSED,
MMAP_CACHE_CATEGORY_DATA = OBJECT_DATA,
MMAP_CACHE_CATEGORY_FIELD = OBJECT_FIELD,
MMAP_CACHE_CATEGORY_ENTRY = OBJECT_ENTRY,
MMAP_CACHE_CATEGORY_DATA_HASH_TABLE = OBJECT_DATA_HASH_TABLE,
MMAP_CACHE_CATEGORY_FIELD_HASH_TABLE = OBJECT_FIELD_HASH_TABLE,
MMAP_CACHE_CATEGORY_ENTRY_ARRAY = OBJECT_ENTRY_ARRAY,
MMAP_CACHE_CATEGORY_TAG = OBJECT_TAG,
MMAP_CACHE_CATEGORY_HEADER, /* for reading the file header */
MMAP_CACHE_CATEGORY_PIN, /* for temporarily pinning an object */
_MMAP_CACHE_CATEGORY_MAX,
_MMAP_CACHE_CATEGORY_INVALID = -EINVAL,
} MMapCacheCategory;
assert_cc((int) _OBJECT_TYPE_MAX < (int) _MMAP_CACHE_CATEGORY_MAX);
/* Maps a journal object type to the mmap cache category of the same numeric value. Anything outside
 * the range [0, _OBJECT_TYPE_MAX) falls back to the catch-all MMAP_CACHE_CATEGORY_ANY. */
static inline MMapCacheCategory type_to_category(ObjectType type) {
        if (type < 0 || type >= _OBJECT_TYPE_MAX)
                return MMAP_CACHE_CATEGORY_ANY;

        return (MMapCacheCategory) type;
}
MMapCache* mmap_cache_new(void);
MMapCache* mmap_cache_ref(MMapCache *m);
MMapCache* mmap_cache_unref(MMapCache *m);
@ -17,12 +38,19 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(MMapCache*, mmap_cache_unref);
int mmap_cache_fd_get(
MMapFileDescriptor *f,
unsigned context,
MMapCacheCategory c,
bool keep_always,
uint64_t offset,
size_t size,
struct stat *st,
void **ret);
int mmap_cache_fd_pin(
MMapFileDescriptor *f,
MMapCacheCategory c,
void *addr,
size_t size);
int mmap_cache_add_fd(MMapCache *m, int fd, int prot, MMapFileDescriptor **ret);
MMapCache* mmap_cache_fd_cache(MMapFileDescriptor *f);
MMapFileDescriptor* mmap_cache_fd_free(MMapFileDescriptor *f);

View file

@ -3190,20 +3190,14 @@ _public_ int sd_journal_enumerate_unique(
continue;
}
/* We do not use OBJECT_DATA context here, but OBJECT_UNUSED
* instead, so that we can look at this data object at the same
* time as one on another file */
r = journal_file_move_to_object(j->unique_file, OBJECT_UNUSED, j->unique_offset, &o);
r = journal_file_move_to_object(j->unique_file, OBJECT_DATA, j->unique_offset, &o);
if (r < 0)
return r;
/* Let's do the type check by hand, since we used 0 context above. */
if (o->object.type != OBJECT_DATA)
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
"%s:offset " OFSfmt ": object has type %d, expected %d",
j->unique_file->path,
j->unique_offset,
o->object.type, OBJECT_DATA);
/* Let's pin the data object, so we can look at it at the same time as one on another file. */
r = journal_file_pin_object(j->unique_file, o);
if (r < 0)
return r;
r = journal_file_data_payload(j->unique_file, o, j->unique_offset, NULL, 0,
j->data_threshold, &odata, &ol);