core: reuse credential dir across start and start-post if populated,

fresh otherwise

Currently, exec_setup_credential() always rewrite all credentials
upon exec_invoke(), i.e. invocation of each ExecCommand, and within
a single tmpfs instance. This is problematic though:

* When writing each tmp cred file, we essentially double the size
  of the credential. Therefore, if one cred is bigger than half
  of CREDENTIALS_TOTAL_SIZE_MAX, confusing ENOSPC occurs (see also
  https://github.com/systemd/systemd/pull/24734#issuecomment-1925440546)

* Credential is a unit-wide thing and thus should not change
  during the whole lifetime of main process. However, if e.g.
  a on-disk credential or SetCredential= in unit file
  changes between ExecStart= and ExecStartPost=,
  the credentials are overwritten when the latter gets to run,
  and the already-running main process is suddenly seeing
  completely different creds.

So, let's try to reuse final cred dir if the main process has started
and the tmpfs has been populated, so that the creds used is stable
across all ExecStart= and ExecStartPost=-s. We still want to retain
the ability of updating creds through ExecStartPre= though, therefore
we forcibly use a fresh cred dir for those. 'Fresh' means to actually
unmount the old tmpfs first, so the first problem goes away, too.
This commit is contained in:
Mike Yuan 2024-02-04 23:22:46 +08:00
parent 1221ba0f6f
commit cfbf7538d8
No known key found for this signature in database
GPG key ID: 417471C0A40F58B3
4 changed files with 121 additions and 89 deletions

View file

@ -52,7 +52,7 @@ DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
bool exec_params_need_credentials(const ExecParameters *p) {
assert(p);
return FLAGS_SET(p->flags, EXEC_SETUP_CREDENTIALS);
return p->flags & (EXEC_SETUP_CREDENTIALS|EXEC_SETUP_CREDENTIALS_FRESH);
}
bool exec_context_has_credentials(const ExecContext *c) {
@ -787,10 +787,9 @@ static int setup_credentials_internal(
uid_t uid,
gid_t gid) {
bool final_mounted;
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
bool final_mounted;
const char *where;
assert(context);
assert(params);
@ -798,6 +797,30 @@ static int setup_credentials_internal(
assert(final);
assert(workspace);
r = path_is_mount_point(final);
if (r < 0)
return r;
final_mounted = r > 0;
if (final_mounted) {
if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
final_mounted = false;
} else {
/* We can reuse the previous credential dir */
r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
if (r < 0)
return r;
if (r == 0) {
log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
return 0;
}
}
}
if (reuse_workspace) {
r = path_is_mount_point(workspace);
if (r < 0)
@ -810,40 +833,19 @@ static int setup_credentials_internal(
} else
workspace_mounted = -1; /* ditto */
r = path_is_mount_point(final);
if (r < 0)
return r;
if (r > 0) {
/* If the final place already has something mounted, we use that. If the workspace also has
* something mounted we assume it's actually the same mount (but with MS_RDONLY
* different). */
final_mounted = true;
if (workspace_mounted < 0) {
/* If the final place is mounted, but the workspace isn't, then let's bind mount
* the final version to the workspace, and make it writable, so that we can make
* changes */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0)
return r;
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
if (r < 0)
return r;
workspace_mounted = true;
}
} else
final_mounted = false;
/* If both the final place and the workspace are mounted, we have no mounts to set up, based on
* the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
* If the workspace is not mounted, we just bind the final place over and make it writable. */
must_mount = must_mount || final_mounted;
if (workspace_mounted < 0) {
/* Nothing is mounted on the workspace yet, let's try to mount something now */
r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
if (r < 0) {
/* If that didn't work, try to make a bind mount from the final to the workspace, so
* that we can make it writable there. */
if (!final_mounted)
/* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
* not using the final place. */
r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
if (final_mounted || r < 0) {
/* If using final place or failed to mount new tmpfs, make a bind mount from
* the final to the workspace, so that we can make it writable there. */
r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
if (r < 0) {
if (!ERRNO_IS_PRIVILEGE(r))
@ -856,12 +858,19 @@ static int setup_credentials_internal(
return r;
/* If we lack privileges to bind mount stuff, then let's gracefully proceed
* for compat with container envs, and just use the final dir as is. */
* for compat with container envs, and just use the final dir as is.
* Final place must not be mounted in this case (refused by must_mount
* above) */
workspace_mounted = false;
} else {
/* Make the new bind mount writable (i.e. drop MS_RDONLY) */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
r = mount_nofollow_verbose(LOG_DEBUG,
NULL,
workspace,
NULL,
MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
NULL);
if (r < 0)
return r;
@ -871,34 +880,26 @@ static int setup_credentials_internal(
workspace_mounted = true;
}
assert(!must_mount || workspace_mounted > 0);
where = workspace_mounted ? workspace : final;
assert(workspace_mounted >= 0);
assert(!must_mount || workspace_mounted);
const char *where = workspace_mounted ? workspace : final;
(void) label_fix_full(AT_FDCWD, where, final, 0);
r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
if (r < 0)
if (r < 0) {
/* If we're using final place as workspace, and failed to acquire credentials, we might
* have left half-written creds there. Let's get rid of the whole mount, so future
* calls won't reuse it. */
if (final_mounted)
(void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
return r;
}
if (workspace_mounted) {
bool install;
/* Determine if we should actually install the prepared mount in the final location by bind
* mounting it there. We do so only if the mount is not established there already, and if the
* mount is actually non-empty (i.e. carries at least one credential). Not that in the best
* case we are doing all this in a mount namespace, thus no one else will see that we
* allocated a file system we are getting rid of again here. */
if (final_mounted)
install = false; /* already installed */
else {
r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
if (r < 0)
return r;
install = r == 0; /* install only if non-empty */
}
if (install) {
if (!final_mounted) {
/* Make workspace read-only now, so that any bind mount we make from it defaults to
* read-only too */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
@ -908,7 +909,7 @@ static int setup_credentials_internal(
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
/* Otherwise get rid of it */
/* Otherwise we just get rid of the bind mount of final place */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;

View file

@ -390,22 +390,23 @@ static inline bool exec_context_with_rootfs(const ExecContext *c) {
}
typedef enum ExecFlags {
EXEC_APPLY_SANDBOXING = 1 << 0,
EXEC_APPLY_CHROOT = 1 << 1,
EXEC_APPLY_TTY_STDIN = 1 << 2,
EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
EXEC_CGROUP_DELEGATE = 1 << 6,
EXEC_IS_CONTROL = 1 << 7,
EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
EXEC_APPLY_SANDBOXING = 1 << 0,
EXEC_APPLY_CHROOT = 1 << 1,
EXEC_APPLY_TTY_STDIN = 1 << 2,
EXEC_PASS_LOG_UNIT = 1 << 3, /* Whether to pass the unit name to the service's journal stream connection */
EXEC_CHOWN_DIRECTORIES = 1 << 4, /* chown() the runtime/state/cache/log directories to the user we run as, under all conditions */
EXEC_NSS_DYNAMIC_BYPASS = 1 << 5, /* Set the SYSTEMD_NSS_DYNAMIC_BYPASS environment variable, to disable nss-systemd blocking on PID 1, for use by dbus-daemon */
EXEC_CGROUP_DELEGATE = 1 << 6,
EXEC_IS_CONTROL = 1 << 7,
EXEC_CONTROL_CGROUP = 1 << 8, /* Place the process not in the indicated cgroup but in a subcgroup '/.control', but only EXEC_CGROUP_DELEGATE and EXEC_IS_CONTROL is set, too */
EXEC_SETUP_CREDENTIALS = 1 << 9, /* Set up the credential store logic */
EXEC_SETUP_CREDENTIALS_FRESH = 1 << 10, /* Set up a new credential store (disable reuse) */
/* The following are not used by execute.c, but by consumers internally */
EXEC_PASS_FDS = 1 << 10,
EXEC_SETENV_RESULT = 1 << 11,
EXEC_SET_WATCHDOG = 1 << 12,
EXEC_SETENV_MONITOR_RESULT = 1 << 13, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
EXEC_PASS_FDS = 1 << 11,
EXEC_SETENV_RESULT = 1 << 12,
EXEC_SET_WATCHDOG = 1 << 13,
EXEC_SETENV_MONITOR_RESULT = 1 << 14, /* Pass exit status to OnFailure= and OnSuccess= dependencies. */
} ExecFlags;
/* Parameters for a specific invocation of a command. This structure is put together right before a command is

View file

@ -1595,27 +1595,33 @@ static Service *service_get_triggering_service(Service *s) {
return NULL;
}
static ExecFlags service_exec_flags(ServiceExecCommand command_id) {
static ExecFlags service_exec_flags(ServiceExecCommand command_id, ExecFlags cred_flag) {
/* All service main/control processes honor sandboxing and namespacing options (except those
explicitly excluded in service_spawn()) */
ExecFlags flags = EXEC_APPLY_SANDBOXING|EXEC_APPLY_CHROOT;
assert(command_id >= 0);
assert(command_id < _SERVICE_EXEC_COMMAND_MAX);
assert((cred_flag & ~(EXEC_SETUP_CREDENTIALS_FRESH|EXEC_SETUP_CREDENTIALS)) == 0);
assert((cred_flag != 0) == (command_id == SERVICE_EXEC_START));
/* Control processes spawned before main process also get tty access */
if (IN_SET(command_id, SERVICE_EXEC_CONDITION, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
flags |= EXEC_APPLY_TTY_STDIN;
/* All start phases get access to credentials */
if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START, SERVICE_EXEC_START_POST))
/* All start phases get access to credentials. ExecStartPre= gets a new credential store upon
* every invocation, so that updating credential files through it works. When the first main process
* starts, passed creds become stable. Also see 'cred_flag'. */
if (command_id == SERVICE_EXEC_START_PRE)
flags |= EXEC_SETUP_CREDENTIALS_FRESH;
if (command_id == SERVICE_EXEC_START_POST)
flags |= EXEC_SETUP_CREDENTIALS;
if (IN_SET(command_id, SERVICE_EXEC_START_PRE, SERVICE_EXEC_START))
flags |= EXEC_SETENV_MONITOR_RESULT;
if (command_id == SERVICE_EXEC_START)
return flags|EXEC_PASS_FDS|EXEC_SET_WATCHDOG;
return flags|cred_flag|EXEC_PASS_FDS|EXEC_SET_WATCHDOG;
flags |= EXEC_IS_CONTROL;
@ -1632,13 +1638,12 @@ static ExecFlags service_exec_flags(ServiceExecCommand command_id) {
static int service_spawn_internal(
const char *caller,
Service *s,
ServiceExecCommand command_id,
ExecCommand *c,
ExecFlags flags,
usec_t timeout,
PidRef *ret_pid) {
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params =
EXEC_PARAMETERS_INIT(service_exec_flags(command_id));
_cleanup_(exec_params_shallow_clear) ExecParameters exec_params = EXEC_PARAMETERS_INIT(flags);
_cleanup_(sd_event_source_unrefp) sd_event_source *exec_fd_source = NULL;
_cleanup_strv_free_ char **final_env = NULL, **our_env = NULL;
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
@ -2083,8 +2088,8 @@ static void service_enter_stop_post(Service *s, ServiceResult f) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
&s->control_pid);
if (r < 0) {
@ -2206,8 +2211,8 @@ static void service_enter_stop(Service *s, ServiceResult f) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_stop_usec,
&s->control_pid);
if (r < 0) {
@ -2290,8 +2295,8 @@ static void service_enter_start_post(Service *s) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
@ -2400,8 +2405,8 @@ static void service_enter_start(Service *s) {
timeout = s->timeout_start_usec;
r = service_spawn(s,
SERVICE_EXEC_START,
c,
service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS_FRESH),
timeout,
&pidref);
if (r < 0) {
@ -2460,8 +2465,8 @@ static void service_enter_start_pre(Service *s) {
s->control_command_id = SERVICE_EXEC_START_PRE;
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
@ -2497,8 +2502,8 @@ static void service_enter_condition(Service *s) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
@ -2608,8 +2613,8 @@ static void service_enter_reload(Service *s) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
s->timeout_start_usec,
&s->control_pid);
if (r < 0) {
@ -2664,8 +2669,8 @@ static void service_run_next_control(Service *s) {
pidref_done(&s->control_pid);
r = service_spawn(s,
s->control_command_id,
s->control_command,
service_exec_flags(s->control_command_id, /* cred_flag = */ 0),
timeout,
&s->control_pid);
if (r < 0) {
@ -2696,8 +2701,8 @@ static void service_run_next_main(Service *s) {
service_unwatch_main_pid(s);
r = service_spawn(s,
SERVICE_EXEC_START,
s->main_command,
service_exec_flags(SERVICE_EXEC_START, EXEC_SETUP_CREDENTIALS),
s->timeout_start_usec,
&pidref);
if (r < 0) {

View file

@ -3,6 +3,9 @@
# shellcheck disable=SC2016
set -eux
# shellcheck source=test/units/util.sh
. "$(dirname "$0")"/util.sh
systemd-analyze log-level debug
run_with_cred_compare() {
@ -309,6 +312,28 @@ systemd-run -p DynamicUser=yes -p 'LoadCredential=os:/etc/os-release' \
--service-type=oneshot --wait --pipe \
true | cmp /etc/os-release
# https://github.com/systemd/systemd/pull/24734#issuecomment-1925440546
# Also ExecStartPre= should be able to update creds
dd if=/dev/urandom of=/tmp/cred-huge bs=600K count=1
chmod 777 /tmp/cred-huge
systemd-run -p ProtectSystem=full \
-p 'LoadCredential=huge:/tmp/cred-huge' \
-p 'ExecStartPre=true' \
-p 'ExecStartPre=bash -c "echo fresh >/tmp/cred-huge"' \
--unit=test-54-huge-cred.service \
--wait --pipe \
systemd-creds cat huge | cmp - <(echo "fresh")
rm /tmp/cred-huge
echo stable >/tmp/cred-stable
systemd-run -p 'LoadCredential=stable:/tmp/cred-stable' \
-p 'ExecStartPost=systemd-creds cat stable' \
--unit=test-54-stable.service \
--service-type=oneshot --wait --pipe \
bash -c "echo bogus >/tmp/cred-stable" | cmp - <(echo "stable")
assert_eq "$(cat /tmp/cred-stable)" "bogus"
rm /tmp/cred-stable
if ! systemd-detect-virt -q -c ; then
# Validate that the credential we inserted via the initrd logic arrived
test "$(systemd-creds cat --system myinitrdcred)" = "guatemala"