Merge pull request #22992 from poettering/loop-dissect-tweaks

loop-util/image dissect fixes
This commit is contained in:
Yu Watanabe 2022-04-11 23:08:46 +09:00 committed by GitHub
commit 29d902f03a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 242 additions and 551 deletions

6
TODO
View file

@ -169,12 +169,6 @@ Features:
* bootctl: show whether UEFI audit mode is available
* dissect: rework how we access partitions: instead of letting the kernel probe
partition tables asynchronously, just pass the stuff we parsed in userspace
to the kernel via BLKPG_ADD_PARTITION. Benefit: we don't have to wait for
kernel/netlink/udev, but can run this synchronously without chance of losing
events or similar.
* sd-event: optionally, if per-event source rate limit is hit, downgrade
priority, but leave enabled, and once ratelimit window is over, upgrade
priority again. That way we can combat event source starvation without

View file

@ -2055,6 +2055,12 @@ int setup_namespace(
if (r < 0)
return log_debug_errno(r, "Failed to create loop device for root image: %m");
/* Make sure udevd won't issue BLKRRPART (which might flush out the loaded partition table)
* while we are still trying to mount things */
r = loop_device_flock(loop_device, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to lock loopback device with LOCK_SH: %m");
r = dissect_image(
loop_device->fd,
&verity,
@ -2403,6 +2409,14 @@ int setup_namespace(
goto finish;
}
/* Now release the block device lock, so that udevd is free to call BLKRRPART on the device
* if it likes. */
r = loop_device_flock(loop_device, LOCK_UN);
if (r < 0) {
log_debug_errno(r, "Failed to release lock on loopback block device: %m");
goto finish;
}
if (decrypted_image) {
r = decrypted_image_relinquish(decrypted_image);
if (r < 0) {

View file

@ -639,6 +639,10 @@ static int action_mount(DissectedImage *m, LoopDevice *d) {
if (r < 0)
return r;
r = loop_device_flock(d, LOCK_UN);
if (r < 0)
return log_error_errno(r, "Failed to unlock loopback block device: %m");
if (di) {
r = decrypted_image_relinquish(di);
if (r < 0)
@ -687,6 +691,10 @@ static int action_copy(DissectedImage *m, LoopDevice *d) {
mounted_dir = TAKE_PTR(created_dir);
r = loop_device_flock(d, LOCK_UN);
if (r < 0)
return log_error_errno(r, "Failed to unlock loopback block device: %m");
if (di) {
r = decrypted_image_relinquish(di);
if (r < 0)
@ -845,6 +853,12 @@ static int run(int argc, char *argv[]) {
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image);
/* Make sure udevd doesn't issue BLKRRPART underneath us thus making devices disappear in the middle,
* that we assume already are there. */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
r = dissect_image_and_warn(
d->fd,
arg_image,

View file

@ -1,6 +1,7 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <stdlib.h>
#include <sys/file.h>
#include <unistd.h>
#include "sd-device.h"
@ -696,6 +697,12 @@ static int enumerate_partitions(dev_t devnum) {
if (r <= 0)
return r;
/* Let's take a LOCK_SH lock on the block device, in case udevd is already running. If we don't take
* the lock, udevd might end up issuing BLKRRPART in the middle, and we don't want that, since that
* might remove all partitions while we are operating on them. */
if (flock(fd, LOCK_SH) < 0)
return log_error_errno(errno, "Failed to lock root block device: %m");
r = dissect_image(
fd,
NULL, NULL,
@ -703,7 +710,6 @@ static int enumerate_partitions(dev_t devnum) {
UINT64_MAX,
USEC_INFINITY,
DISSECT_IMAGE_GPT_ONLY|
DISSECT_IMAGE_NO_UDEV|
DISSECT_IMAGE_USR_NO_ROOT,
&m);
if (r == -ENOPKG) {

View file

@ -5737,6 +5737,13 @@ static int run(int argc, char *argv[]) {
goto finish;
}
/* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART behind our back */
r = loop_device_flock(loop, LOCK_SH);
if (r < 0) {
log_error_errno(r, "Failed to take lock on loopback block device: %m");
goto finish;
}
r = dissect_image_and_warn(
loop->fd,
arg_image,

View file

@ -12,6 +12,7 @@
#include <sys/ioctl.h>
#include <sys/stat.h>
#include "sd-device.h"
#include "sd-id128.h"
#include "alloc-util.h"
@ -3643,14 +3644,13 @@ static int resolve_copy_blocks_auto_candidate(
sd_id128_t *ret_uuid) {
_cleanup_(blkid_free_probep) blkid_probe b = NULL;
_cleanup_free_ char *p = NULL;
_cleanup_(sd_device_unrefp) sd_device *dev = NULL;
_cleanup_close_ int fd = -1;
const char *pttype, *t;
const char *pttype, *t, *p;
sd_id128_t pt_parsed, u;
blkid_partition pp;
dev_t whole_devno;
blkid_partlist pl;
struct stat st;
int r;
/* Checks if the specified partition has the specified GPT type UUID, and is located on the specified
@ -3673,21 +3673,19 @@ static int resolve_copy_blocks_auto_candidate(
major(partition_devno), minor(partition_devno),
major(restrict_devno), minor(restrict_devno));
r = device_path_make_major_minor(S_IFBLK, whole_devno, &p);
r = sd_device_new_from_devnum(&dev, 'b', whole_devno);
if (r < 0)
return log_error_errno(r, "Failed to convert block device to device node path: %m");
return log_error_errno(r, "Failed to create sd-device for block device %u:%u: %m",
major(whole_devno), minor(whole_devno));
fd = open(p, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
r = sd_device_get_devname(dev, &p);
if (r < 0)
return log_error_errno(r, "Failed to get name of block device %u:%u: %m",
major(whole_devno), minor(whole_devno));
fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
if (fd < 0)
return log_error_errno(r, "Failed to open '%s': %m", p);
if (fstat(fd, &st) < 0)
return log_error_errno(r, "Failed to stat '%s': %m", p);
if (!S_ISBLK(st.st_mode) || st.st_rdev != whole_devno)
return log_error_errno(
SYNTHETIC_ERRNO(EPERM),
"Opened and determined block device don't match, refusing.");
return log_error_errno(fd, "Failed to open block device %s: %m", p);
b = blkid_new_probe();
if (!b)
@ -3805,7 +3803,7 @@ static int resolve_copy_blocks_auto(
sd_id128_t type_uuid,
const char *root,
dev_t restrict_devno,
char **ret_path,
dev_t *ret_devno,
sd_id128_t *ret_uuid) {
const char *try1 = NULL, *try2 = NULL;
@ -3815,8 +3813,6 @@ static int resolve_copy_blocks_auto(
dev_t devno, found = 0;
int r;
assert(ret_path);
/* Enforce some security restrictions: CopyBlocks=auto should not be an avenue to get outside of the
* --root=/--image= confinement. Specifically, refuse CopyBlocks= in combination with --root= at all,
* and restrict block device references in the --image= case to loopback block device we set up.
@ -3926,9 +3922,8 @@ static int resolve_copy_blocks_auto(
return log_error_errno(SYNTHETIC_ERRNO(ENXIO),
"Unable to automatically discover suitable partition to copy blocks from.");
r = device_path_make_major_minor(S_IFBLK, found, ret_path);
if (r < 0)
return log_error_errno(r, "Failed to convert dev_t to device node path: %m");
if (ret_devno)
*ret_devno = found;
if (ret_uuid)
*ret_uuid = found_uuid;
@ -3972,32 +3967,43 @@ static int context_open_copy_block_paths(
"Copying from block device node is not permitted in --image=/--root= mode, refusing.");
} else if (p->copy_blocks_auto) {
_cleanup_(sd_device_unrefp) sd_device *dev = NULL;
const char *devname;
dev_t devno;
r = resolve_copy_blocks_auto(p->type_uuid, root, restrict_devno, &opened, &uuid);
r = resolve_copy_blocks_auto(p->type_uuid, root, restrict_devno, &devno, &uuid);
if (r < 0)
return r;
source_fd = open(opened, O_RDONLY|O_CLOEXEC|O_NOCTTY);
r = sd_device_new_from_devnum(&dev, 'b', devno);
if (r < 0)
return log_error_errno(r, "Failed to create sd-device object for device %u:%u: %m", major(devno), minor(devno));
r = sd_device_get_devname(dev, &devname);
if (r < 0)
return log_error_errno(r, "Failed to get device name of %u:%u: %m", major(devno), minor(devno));
opened = strdup(devname);
if (!opened)
return log_oom();
source_fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
if (source_fd < 0)
return log_error_errno(errno, "Failed to open automatically determined source block copy device '%s': %m", opened);
return log_error_errno(source_fd, "Failed to open automatically determined source block copy device '%s': %m", opened);
if (fstat(source_fd, &st) < 0)
return log_error_errno(errno, "Failed to stat block copy file '%s': %m", opened);
/* If we found it automatically, it must be a block device, let's enforce that */
if (!S_ISBLK(st.st_mode))
return log_error_errno(SYNTHETIC_ERRNO(EBADF),
"Automatically detected source block copy device '%s' is not a block device, refusing: %m", opened);
} else
} else
continue;
if (S_ISDIR(st.st_mode)) {
_cleanup_free_ char *bdev = NULL;
_cleanup_(sd_device_unrefp) sd_device *dev = NULL;
const char *bdev;
/* If the file is a directory, automatically find the backing block device */
if (major(st.st_dev) != 0)
r = device_path_make_major_minor(S_IFBLK, st.st_dev, &bdev);
r = sd_device_new_from_devnum(&dev, 'b', st.st_dev);
else {
dev_t devt;
@ -4009,22 +4015,23 @@ static int context_open_copy_block_paths(
if (r < 0)
return log_error_errno(r, "Unable to determine backing block device of '%s': %m", opened);
r = device_path_make_major_minor(S_IFBLK, devt, &bdev);
r = sd_device_new_from_devnum(&dev, 'b', devt);
}
if (r < 0)
return log_error_errno(r, "Failed to determine block device path for block device backing '%s': %m", opened);
return log_error_errno(r, "Failed to create sd-device object for block device backing '%s': %m", opened);
r = sd_device_get_devpath(dev, &bdev);
if (r < 0)
return log_error_errno(r, "Failed to get device name for block device backing '%s': %m", opened);
safe_close(source_fd);
source_fd = open(bdev, O_RDONLY|O_CLOEXEC|O_NOCTTY);
source_fd = sd_device_open(dev, O_RDONLY|O_CLOEXEC|O_NONBLOCK);
if (source_fd < 0)
return log_error_errno(errno, "Failed to open block device '%s': %m", bdev);
return log_error_errno(source_fd, "Failed to open block device '%s': %m", bdev);
if (fstat(source_fd, &st) < 0)
return log_error_errno(errno, "Failed to stat block device '%s': %m", bdev);
if (!S_ISBLK(st.st_mode))
return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Block device '%s' is not actually a block device, refusing.", bdev);
}
if (S_ISREG(st.st_mode))

View file

@ -359,6 +359,10 @@ static int portable_extract_by_path(
/* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it
* there, and extract the metadata we need. The metadata is sent from the child back to us. */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to acquire lock on loopback block device: %m");
BLOCK_SIGNALS(SIGCHLD);
r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir);

View file

@ -1196,6 +1196,12 @@ int image_read_metadata(Image *i) {
if (r < 0)
return r;
/* Make sure udevd doesn't issue BLKRRPART in the background which might make our partitions
* disappear temporarily. */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return r;
r = dissect_image(
d->fd,
NULL, NULL,

View file

@ -4,6 +4,7 @@
#include <valgrind/memcheck.h>
#endif
#include <linux/blkpg.h>
#include <linux/dm-ioctl.h>
#include <linux/loop.h>
#include <sys/mount.h>
@ -125,389 +126,6 @@ not_found:
}
#if HAVE_BLKID
/* Builds a device enumerator that matches the block devices below the parent device 'd' which carry a
 * "partition" sysfs attribute. On success the new enumerator is handed to the caller via *ret and 0 is
 * returned; on failure the negative error of the first failing sd-device call is returned and *ret is
 * left untouched. */
static int enumerator_for_parent(sd_device *d, sd_device_enumerator **ret) {
        _cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *enumerator = NULL;
        int r;

        assert(d);
        assert(ret);

        /* Each step only runs if all previous ones succeeded; the first error is what we return. */
        r = sd_device_enumerator_new(&enumerator);
        if (r >= 0)
                r = sd_device_enumerator_add_match_subsystem(enumerator, "block", true);
        if (r >= 0)
                r = sd_device_enumerator_add_match_parent(enumerator, d);
        if (r >= 0)
                r = sd_device_enumerator_add_match_sysattr(enumerator, "partition", NULL, true);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(enumerator);
        return 0;
}
/* Checks whether the device 'd' is the partition block device that corresponds to the blkid partition
 * 'pp' on the block device 'expected_parent'.
 *
 * A device matches if its subsystem is "block", its devtype is "partition", the syspath of its parent
 * equals the syspath of 'expected_parent', and its partition number, start and size all equal the
 * values blkid reports for 'pp'.
 *
 * Returns true (> 0) on a match, false (0) if the device is not the one we are looking for, and a
 * negative errno-style value if one of the lookups or conversions fails. */
static int device_is_partition(
sd_device *d,
sd_device *expected_parent,
blkid_partition pp) {
const char *v, *parent_syspath, *expected_parent_syspath;
blkid_loff_t bsize, bstart;
uint64_t size, start;
int partno, bpartno, r;
sd_device *parent;
assert(d);
assert(expected_parent);
assert(pp);
r = sd_device_get_subsystem(d, &v);
if (r < 0)
return r;
if (!streq(v, "block"))
return false;
/* A failure to read the devtype is treated as "not a partition" rather than as an error */
if (sd_device_get_devtype(d, &v) < 0 || !streq(v, "partition"))
return false;
r = sd_device_get_parent(d, &parent);
if (r < 0)
return false; /* Doesn't have a parent? Not relevant to us */
r = sd_device_get_syspath(parent, &parent_syspath); /* Check parent of device of this action */
if (r < 0)
return r;
r = sd_device_get_syspath(expected_parent, &expected_parent_syspath); /* Check parent of device we are looking for */
if (r < 0)
return r;
if (!path_equal(parent_syspath, expected_parent_syspath))
return false; /* Has a different parent than what we need, not interesting to us */
/* On kernel uevents we may find the partition number in the PARTN= field. Let's use that preferably,
* since it's cheaper and more importantly: the sysfs attribute "partition" appears to become
* available late, hence let's use the property instead, which is available at the moment we see the
* uevent. */
r = sd_device_get_property_value(d, "PARTN", &v);
if (r == -ENOENT)
r = sd_device_get_sysattr_value(d, "partition", &v);
if (r < 0)
return r;
r = safe_atoi(v, &partno);
if (r < 0)
return r;
/* errno is cleared before each blkid getter so that errno_or_else() only picks up an errno that was
* set by that very call, and falls back to EIO otherwise */
errno = 0;
bpartno = blkid_partition_get_partno(pp);
if (bpartno < 0)
return errno_or_else(EIO);
if (partno != bpartno)
return false;
/* Compare the start offset as reported by sysfs with the one reported by blkid */
r = sd_device_get_sysattr_value(d, "start", &v);
if (r < 0)
return r;
r = safe_atou64(v, &start);
if (r < 0)
return r;
errno = 0;
bstart = blkid_partition_get_start(pp);
if (bstart < 0)
return errno_or_else(EIO);
if (start != (uint64_t) bstart)
return false;
/* Compare the size as reported by sysfs with the one reported by blkid */
r = sd_device_get_sysattr_value(d, "size", &v);
if (r < 0)
return r;
r = safe_atou64(v, &size);
if (r < 0)
return r;
errno = 0;
bsize = blkid_partition_get_size(pp);
if (bsize < 0)
return errno_or_else(EIO);
if (size != (uint64_t) bsize)
return false;
return true;
}
/* Enumerates the partition block devices below 'parent' and looks for the one that matches the blkid
 * partition 'pp' (as determined by device_is_partition()).
 *
 * Unless DISSECT_IMAGE_NO_UDEV is set in 'flags', candidates are skipped if
 * sd_device_get_usec_initialized() reports -EBUSY for them, or if 'timestamp_not_before' is not
 * USEC_INFINITY and the reported initialization timestamp is older than it.
 *
 * On success a new reference to the matching device is stored in *ret and 0 is returned. Returns -ENXIO
 * if no candidate matched, or another negative errno-style value on failure. */
static int find_partition(
sd_device *parent,
blkid_partition pp,
usec_t timestamp_not_before,
DissectImageFlags flags,
sd_device **ret) {
_cleanup_(sd_device_enumerator_unrefp) sd_device_enumerator *e = NULL;
sd_device *q;
int r;
assert(parent);
assert(pp);
assert(ret);
r = enumerator_for_parent(parent, &e);
if (r < 0)
return r;
FOREACH_DEVICE(e, q) {
uint64_t usec;
if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
r = sd_device_get_usec_initialized(q, &usec);
if (r == -EBUSY) /* Not initialized yet */
continue;
if (r < 0)
return r;
if (timestamp_not_before != USEC_INFINITY &&
usec < timestamp_not_before) /* udev database entry older than our attachment? Then it's not ours */
continue;
}
r = device_is_partition(q, parent, pp);
if (r < 0)
return r;
if (r > 0) {
/* The caller gets its own reference to the matching device */
*ret = sd_device_ref(q);
return 0;
}
}
return -ENXIO;
}
/* State shared between wait_for_partition_device() and its event handlers while waiting for a
 * partition block device to show up. */
struct wait_data {
sd_device *parent_device; /* Device the partition is expected below; passed as expected parent to device_is_partition() */
blkid_partition blkidp; /* The blkid partition we are waiting for */
sd_device *found; /* Set by the handlers to the matching device once seen; unreferenced by wait_data_done() */
uint64_t diskseq; /* If non-zero, only uevents carrying exactly this diskseq are accepted */
uint64_t uevent_seqnum_not_before; /* If diskseq is zero and this is not UINT64_MAX: uevents with a seqnum <= this are dropped */
usec_t timestamp_not_before; /* Passed through to find_partition() */
DissectImageFlags flags; /* Passed through to find_partition() */
};
/* Cleanup helper for struct wait_data: drops the reference held in 'found'. The other pointer members
 * are not released here. */
static inline void wait_data_done(struct wait_data *d) {
sd_device_unref(d->found);
}
/* Device monitor callback used by wait_for_partition_device(). 'userdata' points to the struct
 * wait_data describing what we wait for.
 *
 * Events that are removals, that are too old judging by diskseq/seqnum, or that do not describe the
 * partition we are looking for are ignored by returning 0, which keeps the event loop running. Once the
 * matching device is seen, a reference to it is stored in w->found and the event loop is asked to exit
 * with 0. On errors (or if a diskseq newer than the expected one is seen, reported as -EBUSY) the event
 * loop is asked to exit with that error code. */
static int device_monitor_handler(sd_device_monitor *monitor, sd_device *device, void *userdata) {
struct wait_data *w = userdata;
int r;
assert(w);
if (device_for_action(device, SD_DEVICE_REMOVE))
return 0;
if (w->diskseq != 0) {
uint64_t diskseq;
/* If w->diskseq is non-zero, then we must have a disk seqnum */
r = sd_device_get_diskseq(device, &diskseq);
if (r < 0) {
log_debug_errno(r, "Dropping event because it has no diskseq, but waiting for %" PRIu64, w->diskseq);
return 0;
}
if (diskseq < w->diskseq) {
log_debug("Dropping event because diskseq too old (%" PRIu64 " < %" PRIu64 ")",
diskseq, w->diskseq);
return 0;
}
if (diskseq > w->diskseq) {
r = -EBUSY;
goto finish; /* Newer than what we were expecting, so we missed it, stop waiting */
}
} else if (w->uevent_seqnum_not_before != UINT64_MAX) {
uint64_t seqnum;
/* No diskseq to match against, hence fall back to filtering by uevent sequence number */
r = sd_device_get_seqnum(device, &seqnum);
if (r < 0)
goto finish;
if (seqnum <= w->uevent_seqnum_not_before) { /* From an older use of this loop device */
log_debug("Dropping event because seqnum too old (%" PRIu64 " <= %" PRIu64 ")",
seqnum, w->uevent_seqnum_not_before);
return 0;
}
}
r = device_is_partition(device, w->parent_device, w->blkidp);
if (r < 0)
goto finish;
if (r == 0) /* Not the one we need */
return 0;
/* It's the one we need! Yay! */
assert(!w->found);
w->found = sd_device_ref(device);
r = 0;
finish:
/* Terminate the event loop, propagating either success (0) or the error collected above */
return sd_event_exit(sd_device_monitor_get_event(monitor), r);
}
/* Timer callback that fires when the overall deadline of wait_for_partition_device() is reached.
 * 'userdata' points to the struct wait_data. Makes one last attempt to find the partition via
 * find_partition(): if that succeeds the event loop is asked to exit with 0, otherwise a negative
 * errno-style value is returned (-ETIMEDOUT if the partition still does not exist). */
static int timeout_handler(sd_event_source *s, uint64_t usec, void *userdata) {
struct wait_data *w = userdata;
int r;
assert(w);
/* Why did the partition not appear within the timeout? We may have lost a uevent, or some
* properties might not have been ready yet when we received the uevent. We can't know for sure, but
* either way, let's try to find the partition once more before giving up. */
r = find_partition(w->parent_device, w->blkidp, w->timestamp_not_before, w->flags, &w->found);
if (r == -ENXIO)
return log_debug_errno(SYNTHETIC_ERRNO(ETIMEDOUT),
"Partition still not appeared after timeout reached.");
if (r < 0)
return log_debug_errno(r, "Failed to find partition: %m");
log_debug("Partition appeared after timeout reached.");
return sd_event_exit(sd_event_source_get_event(s), 0);
}
/* Periodic timer callback used by wait_for_partition_device(): searches for the partition described by
 * the struct wait_data passed in 'userdata'. If the partition still does not exist, the timer is
 * re-armed to fire again in 500ms. If it was found, the event loop is asked to exit with 0. Any other
 * search failure is logged and returned. */
static int retry_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        struct wait_data *data = userdata;
        int r;

        assert(data);

        r = find_partition(data->parent_device, data->blkidp, data->timestamp_not_before, data->flags, &data->found);
        if (r == -ENXIO) {
                /* Still nothing there: schedule the next attempt */
                r = sd_event_source_set_time_relative(s, 500 * USEC_PER_MSEC);
                if (r < 0)
                        return r;

                return sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
        }
        if (r < 0)
                return log_debug_errno(r, "Failed to find partition: %m");

        log_debug("Partition found by a periodic search.");
        return sd_event_exit(sd_event_source_get_event(s), 0);
}
/* Waits until the partition block device matching the blkid partition 'pp' below 'parent' exists, and
 * returns a reference to it in *ret.
 *
 * The partition is first looked up directly. If it is not there yet, a device monitor is set up and
 * the lookup is repeated, then a private event loop runs until the monitor or one of the timers finds
 * the device or fails.
 *
 * 'deadline' is an absolute CLOCK_MONOTONIC timestamp after which we give up (USEC_INFINITY disables
 * the timeout). 'diskseq' and 'uevent_seqnum_not_before' are used by device_monitor_handler() to filter
 * out stale uevents; 'timestamp_not_before' and 'flags' are passed to find_partition().
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int wait_for_partition_device(
sd_device *parent,
blkid_partition pp,
usec_t deadline,
uint64_t diskseq,
uint64_t uevent_seqnum_not_before,
usec_t timestamp_not_before,
DissectImageFlags flags,
sd_device **ret) {
_cleanup_(sd_event_source_unrefp) sd_event_source *timeout_source = NULL, *retry_source = NULL;
_cleanup_(sd_device_monitor_unrefp) sd_device_monitor *monitor = NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
int r;
assert(parent);
assert(pp);
assert(ret);
/* Fast path: maybe the partition device is already there (anything but -ENXIO ends the search) */
r = find_partition(parent, pp, timestamp_not_before, flags, ret);
if (r != -ENXIO)
return r;
r = sd_event_new(&event);
if (r < 0)
return r;
r = sd_device_monitor_new(&monitor);
if (r < 0)
return r;
/* Only listen for partition block devices below our parent device */
r = sd_device_monitor_filter_add_match_subsystem_devtype(monitor, "block", "partition");
if (r < 0)
return r;
r = sd_device_monitor_filter_add_match_parent(monitor, parent, true);
if (r < 0)
return r;
r = sd_device_monitor_filter_add_match_sysattr(monitor, "partition", NULL, true);
if (r < 0)
return r;
r = sd_device_monitor_attach_event(monitor, event);
if (r < 0)
return r;
_cleanup_(wait_data_done) struct wait_data w = {
.parent_device = parent,
.blkidp = pp,
.diskseq = diskseq,
.uevent_seqnum_not_before = uevent_seqnum_not_before,
.timestamp_not_before = timestamp_not_before,
.flags = flags,
};
r = sd_device_monitor_start(monitor, device_monitor_handler, &w);
if (r < 0)
return r;
/* Check again, the partition might have appeared in the meantime */
r = find_partition(parent, pp, timestamp_not_before, flags, ret);
if (r != -ENXIO)
return r;
if (deadline != USEC_INFINITY) {
r = sd_event_add_time(
event, &timeout_source,
CLOCK_MONOTONIC, deadline, 0,
timeout_handler, &w);
if (r < 0)
return r;
r = sd_event_source_set_exit_on_failure(timeout_source, true);
if (r < 0)
return r;
}
/* If we don't have a disk sequence number then we cannot do exact matching,
* and we cannot know if we missed it or if it has not been sent yet, so set
* up additional retries to increase the chances of receiving the event. */
if (diskseq == 0) {
r = sd_event_add_time_relative(
event, &retry_source,
CLOCK_MONOTONIC, 500 * USEC_PER_MSEC, 0,
retry_handler, &w);
if (r < 0)
return r;
r = sd_event_source_set_exit_on_failure(retry_source, true);
if (r < 0)
return r;
}
r = sd_event_loop(event);
if (r < 0)
return r;
/* The handlers only request a successful exit after having stored the device in w.found */
assert(w.found);
*ret = TAKE_PTR(w.found);
return 0;
}
static void check_partition_flags(
const char *node,
unsigned long long pflags,
@ -530,77 +148,8 @@ static void check_partition_flags(
log_debug("Unexpected partition flag %llu set on %s!", bit, node);
}
}
/* Like device_wait_for_initialization(), but does not simply wait until 'deadline': the wait is split
 * into intervals, and whenever an interval times out without the device having been initialized a
 * "change" uevent is triggered on the device (on the assumption that an event was lost) before waiting
 * again.
 *
 * 'deadline' is an absolute CLOCK_MONOTONIC timestamp; USEC_INFINITY means the final attempt is never
 * reached and retriggering continues. Returns whatever device_wait_for_initialization() returns, except
 * that -ETIMEDOUT is only propagated for the final interval; a failure to retrigger the device is
 * returned as is. */
static int device_wait_for_initialization_harder(
sd_device *device,
const char *subsystem,
usec_t deadline,
sd_device **ret) {
usec_t start, left, retrigger_timeout;
int r;
start = now(CLOCK_MONOTONIC);
left = usec_sub_unsigned(deadline, start);
if (DEBUG_LOGGING) {
const char *sn = NULL;
(void) sd_device_get_sysname(device, &sn);
log_device_debug(device,
"Will wait up to %s for '%s' to initialize…", FORMAT_TIMESPAN(left, 0), strna(sn));
}
/* NOTE(review): presumably usec_sub_unsigned() yields USEC_INFINITY when 'deadline' is
* USEC_INFINITY; the helper is not visible here, confirm */
if (left != USEC_INFINITY)
retrigger_timeout = CLAMP(left / 4, 1 * USEC_PER_SEC, 5 * USEC_PER_SEC); /* A fourth of the total timeout, but let's clamp to 1s…5s range */
else
retrigger_timeout = 2 * USEC_PER_SEC;
for (;;) {
usec_t local_deadline, n;
bool last_try;
n = now(CLOCK_MONOTONIC);
assert(n >= start);
/* Find next deadline, when we'll retrigger */
local_deadline = start +
DIV_ROUND_UP(n - start, retrigger_timeout) * retrigger_timeout;
/* Never wait past the overall deadline; if we'd hit it, this is the final attempt */
if (deadline != USEC_INFINITY && deadline <= local_deadline) {
local_deadline = deadline;
last_try = true;
} else
last_try = false;
r = device_wait_for_initialization(device, subsystem, local_deadline, ret);
if (r >= 0 && DEBUG_LOGGING) {
const char *sn = NULL;
(void) sd_device_get_sysname(device, &sn);
log_device_debug(device,
"Successfully waited for device '%s' to initialize for %s.",
strna(sn),
FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0));
}
/* Success, a hard error, or a timeout on the final attempt all end the loop */
if (r != -ETIMEDOUT || last_try)
return r;
if (DEBUG_LOGGING)
log_device_debug(device,
"Device didn't initialize within %s, assuming lost event. Retriggering device.",
FORMAT_TIMESPAN(usec_sub_unsigned(now(CLOCK_MONOTONIC), start), 0));
r = sd_device_trigger(device, SD_DEVICE_CHANGE);
if (r < 0)
return r;
}
}
#endif
#define DEVICE_TIMEOUT_USEC (45 * USEC_PER_SEC)
static void dissected_partition_done(DissectedPartition *p) {
assert(p);
@ -617,6 +166,62 @@ static void dissected_partition_done(DissectedPartition *p) {
};
}
#if HAVE_BLKID
/* Asks the kernel to add a partition to the block device referenced by 'fd', via the
 * BLKPG_ADD_PARTITION operation of the BLKPG ioctl.
 *
 * 'name' is stored in the request's devname field, 'nr' is the partition number (must be > 0), and
 * 'start' and 'size' are passed through as the request's start and length.
 *
 * Returns -EINVAL if 'name' does not fit into the devname field (including the trailing NUL byte),
 * otherwise the result of the ioctl() converted by RET_NERRNO(). */
static int ioctl_partition_add(
                int fd,
                const char *name,
                int nr,
                uint64_t start,
                uint64_t size) {

        assert(fd >= 0);
        assert(name);
        assert(nr > 0);

        struct blkpg_partition partition = {
                .start = start,
                .length = size,
                .pno = nr,
        };

        /* Refuse names that would not fit together with their NUL terminator */
        size_t name_length = strlen(name);
        if (name_length >= sizeof(partition.devname))
                return -EINVAL;

        memcpy(partition.devname, name, name_length + 1);

        struct blkpg_ioctl_arg request = {
                .op = BLKPG_ADD_PARTITION,
                .datalen = sizeof(partition),
                .data = &partition,
        };

        return RET_NERRNO(ioctl(fd, BLKPG, &request));
}
/* Generates the device node name of partition number 'nr' of the whole block device 'whole_devname'.
 *
 * On success a newly allocated string is stored in *ret (to be freed by the caller) and 0 is returned.
 * Returns -EINVAL if 'whole_devname' is empty, and -ENOMEM if the string cannot be allocated. */
static int make_partition_devname(
                const char *whole_devname,
                int nr,
                char **ret) {

        bool need_p;

        assert(whole_devname);
        assert(nr > 0);

        /* Given a whole block device node name (e.g. /dev/sda or /dev/loop7) generate a partition device
         * name (e.g. /dev/sda7 or /dev/loop7p5). The rule the kernel uses is simple: if whole block device
         * node name ends in a digit, then suffix a 'p', followed by the partition number. Otherwise, just
         * suffix the partition number without any 'p'. */

        if (isempty(whole_devname)) /* Make sure there *is* a last char */
                return -EINVAL;

        need_p = strchr(DIGITS, whole_devname[strlen(whole_devname)-1]); /* Last char a digit? */

        /* asprintf() returns the number of bytes printed on success and -1 on failure. Don't pass either
         * through: -1 would be misread as -EPERM, and callers expect 0 on success. */
        if (asprintf(ret, "%s%s%i", whole_devname, need_p ? "p" : "", nr) < 0)
                return -ENOMEM;

        return 0;
}
#endif
int dissect_image(
int fd,
const VeritySettings *verity,
@ -638,11 +243,10 @@ int dissect_image(
_cleanup_(blkid_free_probep) blkid_probe b = NULL;
_cleanup_free_ char *generic_node = NULL;
sd_id128_t generic_uuid = SD_ID128_NULL;
const char *pttype = NULL, *sysname = NULL;
const char *pttype = NULL, *sysname = NULL, *devname = NULL;
blkid_partlist pl;
int r, generic_nr = -1, n_partitions;
struct stat st;
usec_t deadline;
assert(fd >= 0);
assert(ret);
@ -698,23 +302,6 @@ int dissect_image(
if (r < 0)
return r;
if (!FLAGS_SET(flags, DISSECT_IMAGE_NO_UDEV)) {
_cleanup_(sd_device_unrefp) sd_device *initialized = NULL;
/* If udev support is enabled, then let's wait for the device to be initialized before we doing anything. */
r = device_wait_for_initialization_harder(
d,
"block",
usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC),
&initialized);
if (r < 0)
return r;
sd_device_unref(d);
d = TAKE_PTR(initialized);
}
b = blkid_new_probe();
if (!b)
return -ENOMEM;
@ -770,6 +357,9 @@ int dissect_image(
if (r < 0)
return r;
}
r = sd_device_get_devname(d, &devname);
if (r < 0)
return log_debug_errno(r, "Failed to get device devname: %m");
if (!image_name_is_valid(m->image_name)) {
log_debug("Image name %s is not valid, ignoring", strempty(m->image_name));
@ -785,8 +375,8 @@ int dissect_image(
(void) blkid_probe_lookup_value(b, "USAGE", &usage, NULL);
if (STRPTR_IN_SET(usage, "filesystem", "crypto")) {
const char *fstype = NULL, *options = NULL, *devname = NULL;
_cleanup_free_ char *t = NULL, *n = NULL, *o = NULL;
const char *fstype = NULL, *options = NULL;
/* OK, we have found a file system, that's our root partition then. */
(void) blkid_probe_lookup_value(b, "TYPE", &fstype, NULL);
@ -797,10 +387,6 @@ int dissect_image(
return -ENOMEM;
}
r = sd_device_get_devname(d, &devname);
if (r < 0)
return r;
n = strdup(devname);
if (!n)
return -ENOMEM;
@ -873,13 +459,11 @@ int dissect_image(
if (n_partitions < 0)
return errno_or_else(EIO);
deadline = usec_add(now(CLOCK_MONOTONIC), DEVICE_TIMEOUT_USEC);
for (int i = 0; i < n_partitions; i++) {
_cleanup_(sd_device_unrefp) sd_device *q = NULL;
_cleanup_free_ char *node = NULL;
unsigned long long pflags;
blkid_loff_t start, size;
blkid_partition pp;
const char *node;
int nr;
errno = 0;
@ -887,14 +471,6 @@ int dissect_image(
if (!pp)
return errno_or_else(EIO);
r = wait_for_partition_device(d, pp, deadline, diskseq, uevent_seqnum_not_before, timestamp_not_before, flags, &q);
if (r < 0)
return r;
r = sd_device_get_devname(q, &node);
if (r < 0)
return r;
pflags = blkid_partition_get_flags(pp);
errno = 0;
@ -916,6 +492,31 @@ int dissect_image(
assert((uint64_t) size < UINT64_MAX/512);
r = make_partition_devname(devname, nr, &node);
if (r < 0)
return r;
/* So here's the thing: after the main ("whole") block device popped up it might take a while
* before the kernel fully probed the partition table. Waiting for that to finish is icky in
* userspace. So here's what we do instead. We issue the BLKPG_ADD_PARTITION ioctl to add the
* partition ourselves, racing against the kernel. Good thing is: if this call fails with
* EBUSY then the kernel was quicker than us, and that's totally OK, the outcome is good for
* us: the device node will exist. If OTOH our call was successful we won the race. Which is
* also good as the outcome is the same: the partition block device exists, and we can use
* it.
*
* Kernel returns EBUSY if there's already a partition by that number or an overlapping
* partition already existent. */
r = ioctl_partition_add(fd, node, nr, (uint64_t) start * 512, (uint64_t) size * 512);
if (r < 0) {
if (r != -EBUSY)
return log_debug_errno(r, "BLKPG_ADD_PARTITION failed: %m");
log_debug_errno(r, "Kernel was quicker than us in adding partition %i.", nr);
} else
log_debug("We were quicker than kernel in adding partition %i.", nr);
if (is_gpt) {
PartitionDesignator designator = _PARTITION_DESIGNATOR_INVALID;
Architecture architecture = _ARCHITECTURE_INVALID;
@ -1447,7 +1048,7 @@ int dissect_image(
(flags & DISSECT_IMAGE_GENERIC_ROOT) &&
(!verity || !verity->root_hash || verity->designator != PARTITION_USR)) {
/* OK, we found nothing usable, then check if there's a single generic one distro, and use
/* OK, we found nothing usable, then check if there's a single generic partition, and use
* that. If the root hash was set however, then we won't fall back to a generic node, because
* the root hash decides. */
@ -3379,6 +2980,11 @@ int mount_image_privately_interactively(
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", image);
/* Make sure udevd doesn't issue BLKRRPART behind our backs */
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return r;
r = dissect_image_and_warn(d->fd, image, &verity, NULL, d->diskseq, d->uevent_seqnum_not_before, d->timestamp_not_before, flags, &dissected_image);
if (r < 0)
return r;
@ -3405,6 +3011,10 @@ int mount_image_privately_interactively(
if (r < 0)
return r;
r = loop_device_flock(d, LOCK_UN);
if (r < 0)
return r;
if (decrypted_image) {
r = decrypted_image_relinquish(decrypted_image);
if (r < 0)
@ -3485,6 +3095,10 @@ int verity_dissect_and_mount(
if (r < 0)
return log_debug_errno(r, "Failed to create loop device for image: %m");
r = loop_device_flock(loop_device, LOCK_SH);
if (r < 0)
return log_debug_errno(r, "Failed to lock loop device: %m");
r = dissect_image(
loop_device->fd,
&verity,
@ -3532,6 +3146,10 @@ int verity_dissect_and_mount(
if (r < 0)
return log_debug_errno(r, "Failed to mount image: %m");
r = loop_device_flock(loop_device, LOCK_UN);
if (r < 0)
return log_debug_errno(r, "Failed to unlock loopback device: %m");
/* If we got os-release values from the caller, then we need to match them with the image's
* extension-release.d/ content. Return -EINVAL if there's any mismatch.
* First, check the distro ID. If that matches, then check the new SYSEXT_LEVEL value if

View file

@ -188,19 +188,18 @@ typedef enum DissectImageFlags {
DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY = 1 << 7, /* Mount only the non-root and non-/usr partitions */
DISSECT_IMAGE_VALIDATE_OS = 1 << 8, /* Refuse mounting images that aren't identifiable as OS images */
DISSECT_IMAGE_VALIDATE_OS_EXT = 1 << 9, /* Refuse mounting images that aren't identifiable as OS extension images */
DISSECT_IMAGE_NO_UDEV = 1 << 10, /* Don't wait for udev initializing things */
DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 11, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
DISSECT_IMAGE_FSCK = 1 << 12, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 13, /* Only recognize single file system images */
DISSECT_IMAGE_VERITY_SHARE = 1 << 14, /* When activating a verity device, reuse existing one if already open */
DISSECT_IMAGE_MKDIR = 1 << 15, /* Make top-level directory to mount right before mounting, if missing */
DISSECT_IMAGE_USR_NO_ROOT = 1 << 16, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */
DISSECT_IMAGE_REQUIRE_ROOT = 1 << 17, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 18, /* Make mounts read-only */
DISSECT_IMAGE_RELAX_VAR_CHECK = 1 << 10, /* Don't insist that the UUID of /var is hashed from /etc/machine-id */
DISSECT_IMAGE_FSCK = 1 << 11, /* File system check the partition before mounting (no effect when combined with DISSECT_IMAGE_READ_ONLY) */
DISSECT_IMAGE_NO_PARTITION_TABLE = 1 << 12, /* Only recognize single file system images */
DISSECT_IMAGE_VERITY_SHARE = 1 << 13, /* When activating a verity device, reuse existing one if already open */
DISSECT_IMAGE_MKDIR = 1 << 14, /* Make top-level directory to mount right before mounting, if missing */
DISSECT_IMAGE_USR_NO_ROOT = 1 << 15, /* If no root fs is in the image, but /usr is, then allow this (so that we can mount the rootfs as tmpfs or so */
DISSECT_IMAGE_REQUIRE_ROOT = 1 << 16, /* Don't accept disks without root partition (or at least /usr partition if DISSECT_IMAGE_USR_NO_ROOT is set) */
DISSECT_IMAGE_MOUNT_READ_ONLY = 1 << 17, /* Make mounts read-only */
DISSECT_IMAGE_READ_ONLY = DISSECT_IMAGE_DEVICE_READ_ONLY |
DISSECT_IMAGE_MOUNT_READ_ONLY,
DISSECT_IMAGE_GROWFS = 1 << 19, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 20, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
DISSECT_IMAGE_GROWFS = 1 << 18, /* Grow file systems in partitions marked for that to the size of the partitions after mount */
DISSECT_IMAGE_MOUNT_IDMAPPED = 1 << 19, /* Mount mounts with kernel 5.12-style userns ID mapping, if file system type doesn't support uid=/gid= */
} DissectImageFlags;
struct DissectedImage {

View file

@ -533,6 +533,10 @@ static int merge_subprocess(Hashmap *images, const char *workspace) {
if (r < 0)
return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path);
r = loop_device_flock(d, LOCK_SH);
if (r < 0)
return log_error_errno(r, "Failed to lock loopback device: %m");
r = dissect_image_and_warn(
d->fd,
img->path,

View file

@ -55,6 +55,9 @@ static void* thread_func(void *ptr) {
log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted);
/* Let's make sure udev doesn't call BLKRRPART in the background, while we try to mount the device. */
assert_se(loop_device_flock(loop, LOCK_SH) >= 0);
r = dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, DISSECT_IMAGE_READ_ONLY, &dissected);
if (r < 0)
log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node);
@ -85,6 +88,10 @@ static void* thread_func(void *ptr) {
log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted);
assert_se(r >= 0);
/* Now the block device is mounted, we don't need no manual lock anymore, the devices are now
* pinned by the mounts. */
assert_se(loop_device_flock(loop, LOCK_UN) >= 0);
log_notice("Unmounting %s", mounted);
mounted = umount_and_rmdir_and_free(mounted);
@ -158,12 +165,6 @@ static int run(int argc, char *argv[]) {
return EXIT_TEST_SKIP;
}
if (strstr_ptr(ci_environment(), "autopkgtest") || strstr_ptr(ci_environment(), "github-actions")) {
// FIXME: we should reenable this one day
log_tests_skipped("Skipping test on Ubuntu autopkgtest CI/GH Actions, test too slow and installed udev too flakey.");
return EXIT_TEST_SKIP;
}
/* This is a test for the loopback block device setup code and its use by the image dissection
* logic: since the kernel APIs are hard to use and prone to races, let's test this in a heavy duty
* test: we open a bunch of threads and repeatedly allocate and deallocate loopback block devices in
@ -221,6 +222,11 @@ static int run(int argc, char *argv[]) {
pthread_t threads[arg_n_threads];
sd_id128_t id;
/* Take an explicit lock while we format the file systems, in accordance with
* https://systemd.io/BLOCK_DEVICE_LOCKING/. We don't want udev to interfere and probe while we write
* or even issue BLKRRPART or similar while we are working on this. */
assert_se(loop_device_flock(loop, LOCK_EX) >= 0);
assert_se(dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, 0, &dissected) >= 0);
assert_se(dissected->partitions[PARTITION_ESP].found);
@ -249,9 +255,21 @@ static int run(int argc, char *argv[]) {
assert_se(mkdtemp_malloc(NULL, &mounted) >= 0);
/* We are particularly correct here, and now downgrade LOCK_EX → LOCK_SH. That's because we are done
* with formatting the file systems, so we don't need the exclusive lock anymore. From now on a
* shared one is fine. This way udev can now probe the device if it wants, but still won't call
* BLKRRPART on it, and that's good, because that would destroy our partition table while we are at
* it. */
assert_se(loop_device_flock(loop, LOCK_SH) >= 0);
/* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */
assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, UID_INVALID, 0) >= 0);
/* Now we mounted everything, the partitions are pinned. Now it's fine to release the lock
* fully. This means udev could now issue BLKRRPART again, but that's OK given this will fail because
* we now mounted the device. */
assert_se(loop_device_flock(loop, LOCK_UN) >= 0);
assert_se(umount_recursive(mounted, 0) >= 0);
loop = loop_device_unref(loop);