Revert "core: do not leak mount for credentials directory if mount namespace is enabled"

This reverts commits
- 9ae3624889
  "test-execute: add tests for credentials directory with mount namespace"↲
- 94fe4cf255
  "core: do not leak mount for credentials directory if mount namespace is enabled",
- 7241b9cd72
  "core/credential: make setup_credentials() return path to credentials directory",
- fbaf3b23ae
  "core: set $CREDENTIALS_DIRECTORY only when we set up credentials"

Before the commits, credentials directory set up on ExecStart= was kept
on e.g. ExecStop=. But, with the changes, if a service requests a
private mount namespace, the credentials directory is discarded after
ExecStart= is finished.

Let's revert the change, and find better way later.

Addresses the post-merge comment
https://github.com/systemd/systemd/pull/28787#issuecomment-1690614202.
This commit is contained in:
Yu Watanabe 2023-08-25 15:54:52 +09:00
parent 53c0397b1d
commit 73ff4d48de
12 changed files with 30 additions and 170 deletions

View file

@ -10,7 +10,6 @@
#include "glob-util.h"
#include "io-util.h"
#include "label-util.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
@ -18,7 +17,6 @@
#include "random-util.h"
#include "recurse-dir.h"
#include "rm-rf.h"
#include "socket-util.h"
#include "tmpfile-util.h"
ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
@ -732,8 +730,7 @@ static int setup_credentials_internal(
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
uid_t uid,
gid_t gid,
int *ret_mount_fd) {
gid_t gid) {
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
@ -851,27 +848,6 @@ static int setup_credentials_internal(
if (r < 0)
return r;
if (ret_mount_fd) {
_cleanup_close_ int mount_fd = -EBADF;
r = mount_fd = RET_NERRNO(open_tree(AT_FDCWD, workspace, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC));
if (r >= 0) {
/* The workspace is already cloned in the above, and not necessary
* anymore. Even though the workspace is unmounted when the short-lived
* child process exits, let's explicitly unmount it here for safety. */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
*ret_mount_fd = TAKE_FD(mount_fd);
return 0;
}
/* Old kernel? Unprivileged? */
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
return r;
}
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
@ -892,8 +868,6 @@ static int setup_credentials_internal(
return -errno;
}
if (ret_mount_fd)
*ret_mount_fd = -EBADF;
return 0;
}
@ -902,25 +876,16 @@ int setup_credentials(
const ExecParameters *params,
const char *unit,
uid_t uid,
gid_t gid,
char **ret_path,
int *ret_mount_fd) {
gid_t gid) {
_cleanup_close_pair_ int socket_pair[2] = PIPE_EBADF;
_cleanup_free_ char *p = NULL, *q = NULL;
_cleanup_close_ int mount_fd = -EBADF;
int r;
assert(context);
assert(params);
assert(ret_path);
assert(ret_mount_fd);
if (!exec_context_has_credentials(context)) {
*ret_path = NULL;
*ret_mount_fd = -EBADF;
if (!exec_context_has_credentials(context))
return 0;
}
if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
return -EINVAL;
@ -943,12 +908,7 @@ int setup_credentials(
if (r < 0 && r != -EEXIST)
return r;
if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, socket_pair) < 0)
return -errno;
r = safe_fork_full("(sd-mkdcreds)",
NULL, &socket_pair[1], 1,
FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_REOPEN_LOG, NULL);
r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
if (r < 0) {
_cleanup_free_ char *t = NULL, *u = NULL;
@ -984,15 +944,14 @@ int setup_credentials(
true, /* reuse the workspace if it is already a mount */
false, /* it's OK to fall back to a plain directory if we can't mount anything */
uid,
gid,
NULL);
gid);
(void) rmdir(u); /* remove the workspace again if we can. */
if (r < 0)
return r;
} else if (r == 0) { /* child */
} else if (r == 0) {
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
* we can use the same directory for all cases, after turning off propagation. Question
@ -1011,8 +970,6 @@ int setup_credentials(
* given that we do this in a privately namespaced short-lived single-threaded process that
* no one else sees this should be OK to do. */
_cleanup_close_ int fd = -EBADF;
/* Turn off propagation from our namespace to host */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
if (r < 0)
@ -1027,14 +984,7 @@ int setup_credentials(
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
true, /* insist that something is mounted, do not allow fallback to plain directory */
uid,
gid,
&fd);
if (r < 0)
goto child_fail;
r = send_one_fd_iov(socket_pair[1], fd,
&IOVEC_MAKE((int[]) { fd >= 0 }, sizeof(int)), 1,
MSG_DONTWAIT);
gid);
if (r < 0)
goto child_fail;
@ -1042,17 +992,6 @@ int setup_credentials(
child_fail:
_exit(EXIT_FAILURE);
} else { /* parent */
int ret;
struct iovec iov = IOVEC_MAKE(&ret, sizeof(int));
r = receive_one_fd_iov(socket_pair[0], &iov, 1, MSG_DONTWAIT, &mount_fd);
if (r < 0)
return r;
if (ret > 0 && mount_fd < 0)
return -EIO;
}
/* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
@ -1060,8 +999,5 @@ int setup_credentials(
* actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
* seen by users when trying access this inode. */
(void) rmdir(p);
*ret_path = TAKE_PTR(p);
*ret_mount_fd = TAKE_FD(mount_fd);
return 0;
}

View file

@ -45,6 +45,4 @@ int setup_credentials(
const ExecParameters *params,
const char *unit,
uid_t uid,
gid_t gid,
char **ret_path,
int *ret_mount_fd);
gid_t gid);

View file

@ -7,7 +7,6 @@
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
@ -73,7 +72,6 @@
#include "missing_fs.h"
#include "missing_ioprio.h"
#include "missing_prctl.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
#include "namespace.h"
#include "parse-util.h"
@ -1866,7 +1864,6 @@ static int build_environment(
dev_t journal_stream_dev,
ino_t journal_stream_ino,
const char *memory_pressure_path,
const char *creds_path,
char ***ret) {
_cleanup_strv_free_ char **our_env = NULL;
@ -2044,8 +2041,8 @@ static int build_environment(
our_env[n_env++] = x;
}
if (creds_path) {
x = strjoin("CREDENTIALS_DIRECTORY=", creds_path);
if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
if (!x)
return -ENOMEM;
@ -3113,14 +3110,12 @@ static int apply_mount_namespace(
const ExecParameters *params,
ExecRuntime *runtime,
const char *memory_pressure_path,
const char *creds_path,
int creds_fd,
char **error_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
_cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
**read_write_paths_cleanup = NULL;
_cleanup_free_ char *incoming_dir = NULL, *propagate_dir = NULL,
_cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
*extension_dir = NULL, *host_os_release_stage = NULL;
const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
char **read_write_paths;
@ -3222,6 +3217,14 @@ static int apply_mount_namespace(
if (context->mount_propagation_flag == MS_SHARED)
log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
if (exec_context_has_credentials(context) &&
params->prefix[EXEC_DIRECTORY_RUNTIME] &&
FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
if (!creds_path)
return -ENOMEM;
}
if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
propagate_dir = path_join("/run/systemd/propagate/", u->id);
if (!propagate_dir)
@ -3290,7 +3293,6 @@ static int apply_mount_namespace(
tmp_dir,
var_tmp_dir,
creds_path,
creds_fd,
context->log_namespace,
context->mount_propagation_flag,
&verity,
@ -3944,7 +3946,7 @@ static int exec_child(
int r, ngids = 0, exec_fd;
_cleanup_free_ gid_t *supplementary_gids = NULL;
const char *username = NULL, *groupname = NULL;
_cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *creds_path = NULL;
_cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
const char *home = NULL, *shell = NULL;
char **final_argv = NULL;
dev_t journal_stream_dev = 0;
@ -3975,7 +3977,6 @@ static int exec_child(
int ngids_after_pam = 0;
_cleanup_free_ int *fds = NULL;
_cleanup_strv_free_ char **fdnames = NULL;
_cleanup_close_ int creds_fd = -EBADF;
assert(unit);
assert(command);
@ -4426,7 +4427,7 @@ static int exec_child(
}
if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
r = setup_credentials(context, params, unit->id, uid, gid, &creds_path, &creds_fd);
r = setup_credentials(context, params, unit->id, uid, gid);
if (r < 0) {
*exit_status = EXIT_CREDENTIALS;
return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
@ -4446,7 +4447,6 @@ static int exec_child(
journal_stream_dev,
journal_stream_ino,
memory_pressure_path,
creds_path,
&our_env);
if (r < 0) {
*exit_status = EXIT_MEMORY;
@ -4640,7 +4640,7 @@ static int exec_child(
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, creds_fd, &error_path);
r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
if (r < 0) {
*exit_status = EXIT_NAMESPACE;
return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
@ -4648,19 +4648,6 @@ static int exec_child(
}
}
if (creds_fd >= 0) {
assert(creds_path);
/* When a mount namespace is not requested, then the target directory may not exist yet.
* Here, we ignore the failure, as if it fails, the subsequent move_mount() will fail. */
(void) mkdir_p_label(creds_path, 0755);
if (move_mount(creds_fd, "", AT_FDCWD, creds_path, MOVE_MOUNT_F_EMPTY_PATH) < 0) {
*exit_status = EXIT_CREDENTIALS;
return log_unit_error_errno(unit, errno, "Failed to mount credentials directory on %s: %m", creds_path);
}
}
if (needs_sandboxing) {
r = apply_protect_hostname(unit, context, exit_status);
if (r < 0)

View file

@ -74,8 +74,7 @@ typedef enum MountMode {
EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
MQUEUEFS,
READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */
MKDIR, /* Should have the lowest priority. */
READWRITE_IMPLICIT, /* Should have the lowest priority. */
_MOUNT_MODE_MAX,
} MountMode;
@ -232,7 +231,6 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[EXTENSION_IMAGES] = "extension-images",
[MQUEUEFS] = "mqueuefs",
[READWRITE_IMPLICIT] = "read-write-implicit",
[MKDIR] = "mkdir",
};
/* Helper struct for naming simplicity and reusability */
@ -1549,12 +1547,6 @@ static int apply_one_mount(
case OVERLAY_MOUNT:
return mount_overlay(m);
case MKDIR:
r = mkdir_p_label(mount_entry_path(m), 0755);
if (r < 0)
return r;
return 1;
default:
assert_not_reached();
}
@ -2021,7 +2013,6 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
const char *creds_path,
int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
@ -2344,22 +2335,13 @@ int setup_namespace(
.flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
};
/* If we have mount fd for credentials directory, then it will be mounted after
* namespace is set up. So, here we only create the mount point. */
if (creds_fd < 0)
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = BIND_MOUNT,
.read_only = true,
.source_const = creds_path,
.ignore = true,
};
else
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = MKDIR,
};
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = BIND_MOUNT,
.read_only = true,
.source_const = creds_path,
.ignore = true,
};
} else {
/* If our service has no credentials store configured, then make the whole
* credentials tree inaccessible wholesale. */

View file

@ -122,7 +122,6 @@ int setup_namespace(
const char *tmp_dir,
const char *var_tmp_dir,
const char *creds_path,
int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,

View file

@ -282,11 +282,7 @@ static void test_exec_cpuaffinity(Manager *m) {
static void test_exec_credentials(Manager *m) {
test(m, "exec-set-credential.service", 0, CLD_EXITED);
test(m, "exec-set-credential-with-mount-namespace.service", 0, CLD_EXITED);
test(m, "exec-set-credential-with-seccomp.service", 0, CLD_EXITED);
test(m, "exec-load-credential.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
test(m, "exec-load-credential-with-mount-namespace.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
test(m, "exec-load-credential-with-seccomp.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
test(m, "exec-credentials-dir-specifier.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_CREDENTIALS, CLD_EXITED);
}

View file

@ -194,7 +194,6 @@ TEST(protect_kernel_logs) {
NULL,
NULL,
NULL,
-EBADF,
NULL,
0,
NULL,

View file

@ -96,7 +96,6 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
NULL,
-EBADF,
NULL,
0,
NULL,

View file

@ -1,9 +0,0 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Test for LoadCredential=
[Service]
ExecStart=/bin/sh -x -c 'test "$$(cat %d/test-execute.load-credential)" = "foo"'
Type=oneshot
LoadCredential=test-execute.load-credential
PrivateMounts=yes

View file

@ -1,9 +0,0 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Test for LoadCredential=
[Service]
ExecStart=/bin/sh -x -c 'test "$$(cat %d/test-execute.load-credential)" = "foo"'
Type=oneshot
LoadCredential=test-execute.load-credential
SystemCallFilter=~open_tree move_mount

View file

@ -1,9 +0,0 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Test for SetCredential=
[Service]
ExecStart=/bin/sh -x -c 'test "$$(cat %d/test-execute.set-credential)" = "hoge"'
Type=oneshot
SetCredential=test-execute.set-credential:hoge
PrivateMounts=yes

View file

@ -1,9 +0,0 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
[Unit]
Description=Test for SetCredential=
[Service]
ExecStart=/bin/sh -x -c 'test "$$(cat %d/test-execute.set-credential)" = "hoge"'
Type=oneshot
SetCredential=test-execute.set-credential:hoge
SystemCallFilter=~open_tree move_mount