core: do not leak mount for credentials directory if mount namespace is enabled

Since kernel v5.2, open_tree() and move_mount() are added. If a service
loads or sets credentials, then let's try to clone the mount that contains
credentials with open_tree(), then mount it after a (private) mount
namespace is initialized for the service. Then, we can setup a mount for
credentials directory without leaking it to the main shared mount
namespace.

With this change, the credentials for services that request their own
private mount namespace become much much safer. And, the number of mount
events triggered by setting up credential directories can be decreased.

Unfortunately, this does not 'fix' the original issue #25527, as the
reported service does not requests private mount namespace, but the
situation should be better now.
This commit is contained in:
Yu Watanabe 2023-08-12 15:18:41 +09:00
parent 7241b9cd72
commit 94fe4cf255
7 changed files with 115 additions and 17 deletions

View file

@ -10,6 +10,7 @@
#include "glob-util.h"
#include "io-util.h"
#include "label-util.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
#include "mount-util.h"
#include "mountpoint-util.h"
@ -17,6 +18,7 @@
#include "random-util.h"
#include "recurse-dir.h"
#include "rm-rf.h"
#include "socket-util.h"
#include "tmpfile-util.h"
ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
@ -730,7 +732,8 @@ static int setup_credentials_internal(
bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
uid_t uid,
gid_t gid) {
gid_t gid,
int *ret_mount_fd) {
int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
* if we mounted something; false if we definitely can't mount anything */
@ -848,6 +851,27 @@ static int setup_credentials_internal(
if (r < 0)
return r;
if (ret_mount_fd) {
_cleanup_close_ int mount_fd = -EBADF;
r = mount_fd = RET_NERRNO(open_tree(AT_FDCWD, workspace, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC));
if (r >= 0) {
/* The workspace is already cloned in the above, and not necessary
* anymore. Even though the workspace is unmounted when the short-lived
* child process exits, let's explicitly unmount it here for safety. */
r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
if (r < 0)
return r;
*ret_mount_fd = TAKE_FD(mount_fd);
return 0;
}
/* Old kernel? Unprivileged? */
if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
return r;
}
/* And mount it to the final place, read-only */
r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
} else
@ -868,6 +892,8 @@ static int setup_credentials_internal(
return -errno;
}
if (ret_mount_fd)
*ret_mount_fd = -EBADF;
return 0;
}
@ -877,17 +903,22 @@ int setup_credentials(
const char *unit,
uid_t uid,
gid_t gid,
char **ret_path) {
char **ret_path,
int *ret_mount_fd) {
_cleanup_close_pair_ int socket_pair[2] = PIPE_EBADF;
_cleanup_free_ char *p = NULL, *q = NULL;
_cleanup_close_ int mount_fd = -EBADF;
int r;
assert(context);
assert(params);
assert(ret_path);
assert(ret_mount_fd);
if (!exec_context_has_credentials(context)) {
*ret_path = NULL;
*ret_mount_fd = -EBADF;
return 0;
}
@ -912,7 +943,12 @@ int setup_credentials(
if (r < 0 && r != -EEXIST)
return r;
r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
if (socketpair(AF_UNIX, SOCK_DGRAM | SOCK_CLOEXEC, 0, socket_pair) < 0)
return -errno;
r = safe_fork_full("(sd-mkdcreds)",
NULL, &socket_pair[1], 1,
FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_REOPEN_LOG, NULL);
if (r < 0) {
_cleanup_free_ char *t = NULL, *u = NULL;
@ -948,14 +984,15 @@ int setup_credentials(
true, /* reuse the workspace if it is already a mount */
false, /* it's OK to fall back to a plain directory if we can't mount anything */
uid,
gid);
gid,
NULL);
(void) rmdir(u); /* remove the workspace again if we can. */
if (r < 0)
return r;
} else if (r == 0) {
} else if (r == 0) { /* child */
/* We managed to set up a mount namespace, and are now in a child. That's great. In this case
* we can use the same directory for all cases, after turning off propagation. Question
@ -974,6 +1011,8 @@ int setup_credentials(
* given that we do this in a privately namespaced short-lived single-threaded process that
* no one else sees this should be OK to do. */
_cleanup_close_ int fd = -EBADF;
/* Turn off propagation from our namespace to host */
r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
if (r < 0)
@ -988,7 +1027,14 @@ int setup_credentials(
false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
true, /* insist that something is mounted, do not allow fallback to plain directory */
uid,
gid);
gid,
&fd);
if (r < 0)
goto child_fail;
r = send_one_fd_iov(socket_pair[1], fd,
&IOVEC_MAKE((int[]) { fd >= 0 }, sizeof(int)), 1,
MSG_DONTWAIT);
if (r < 0)
goto child_fail;
@ -996,6 +1042,17 @@ int setup_credentials(
child_fail:
_exit(EXIT_FAILURE);
} else { /* parent */
int ret;
struct iovec iov = IOVEC_MAKE(&ret, sizeof(int));
r = receive_one_fd_iov(socket_pair[0], &iov, 1, MSG_DONTWAIT, &mount_fd);
if (r < 0)
return r;
if (ret > 0 && mount_fd < 0)
return -EIO;
}
/* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
@ -1005,5 +1062,6 @@ int setup_credentials(
(void) rmdir(p);
*ret_path = TAKE_PTR(p);
*ret_mount_fd = TAKE_FD(mount_fd);
return 0;
}

View file

@ -46,4 +46,5 @@ int setup_credentials(
const char *unit,
uid_t uid,
gid_t gid,
char **ret_path);
char **ret_path,
int *ret_mount_fd);

View file

@ -7,6 +7,7 @@
#include <sys/file.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/shm.h>
@ -72,6 +73,7 @@
#include "missing_fs.h"
#include "missing_ioprio.h"
#include "missing_prctl.h"
#include "missing_syscall.h"
#include "mkdir-label.h"
#include "namespace.h"
#include "parse-util.h"
@ -3112,6 +3114,7 @@ static int apply_mount_namespace(
ExecRuntime *runtime,
const char *memory_pressure_path,
const char *creds_path,
int creds_fd,
char **error_path) {
_cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
@ -3287,6 +3290,7 @@ static int apply_mount_namespace(
tmp_dir,
var_tmp_dir,
creds_path,
creds_fd,
context->log_namespace,
context->mount_propagation_flag,
&verity,
@ -3971,6 +3975,7 @@ static int exec_child(
int ngids_after_pam = 0;
_cleanup_free_ int *fds = NULL;
_cleanup_strv_free_ char **fdnames = NULL;
_cleanup_close_ int creds_fd = -EBADF;
assert(unit);
assert(command);
@ -4421,7 +4426,7 @@ static int exec_child(
}
if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
r = setup_credentials(context, params, unit->id, uid, gid, &creds_path);
r = setup_credentials(context, params, unit->id, uid, gid, &creds_path, &creds_fd);
if (r < 0) {
*exit_status = EXIT_CREDENTIALS;
return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
@ -4635,7 +4640,7 @@ static int exec_child(
if (needs_mount_namespace) {
_cleanup_free_ char *error_path = NULL;
r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, &error_path);
r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, creds_path, creds_fd, &error_path);
if (r < 0) {
*exit_status = EXIT_NAMESPACE;
return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
@ -4643,6 +4648,19 @@ static int exec_child(
}
}
if (creds_fd >= 0) {
assert(creds_path);
/* When a mount namespace is not requested, then the target directory may not exist yet.
* Here, we ignore the failure, as if it fails, the subsequent move_mount() will fail. */
(void) mkdir_p_label(creds_path, 0755);
if (move_mount(creds_fd, "", AT_FDCWD, creds_path, MOVE_MOUNT_F_EMPTY_PATH) < 0) {
*exit_status = EXIT_CREDENTIALS;
return log_unit_error_errno(unit, errno, "Failed to mount credentials directory on %s: %m", creds_path);
}
}
if (needs_sandboxing) {
r = apply_protect_hostname(unit, context, exit_status);
if (r < 0)

View file

@ -74,7 +74,8 @@ typedef enum MountMode {
EXTENSION_DIRECTORIES, /* Bind-mounted outside the root directory, and used by subsequent mounts */
EXTENSION_IMAGES, /* Mounted outside the root directory, and used by subsequent mounts */
MQUEUEFS,
READWRITE_IMPLICIT, /* Should have the lowest priority. */
READWRITE_IMPLICIT, /* Should have the 2nd lowest priority. */
MKDIR, /* Should have the lowest priority. */
_MOUNT_MODE_MAX,
} MountMode;
@ -227,6 +228,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = {
[EXEC] = "exec",
[NOEXEC] = "noexec",
[MQUEUEFS] = "mqueuefs",
[MKDIR] = "mkdir",
};
/* Helper struct for naming simplicity and reusability */
@ -1551,6 +1553,12 @@ static int apply_one_mount(
case OVERLAY_MOUNT:
return mount_overlay(m);
case MKDIR:
r = mkdir_p_label(mount_entry_path(m), 0755);
if (r < 0)
return r;
return 1;
default:
assert_not_reached();
}
@ -2017,6 +2025,7 @@ int setup_namespace(
const char* tmp_dir,
const char* var_tmp_dir,
const char *creds_path,
int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,
@ -2339,13 +2348,22 @@ int setup_namespace(
.flags = MS_NODEV|MS_STRICTATIME|MS_NOSUID|MS_NOEXEC,
};
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = BIND_MOUNT,
.read_only = true,
.source_const = creds_path,
.ignore = true,
};
/* If we have mount fd for credentials directory, then it will be mounted after
* namespace is set up. So, here we only create the mount point. */
if (creds_fd < 0)
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = BIND_MOUNT,
.read_only = true,
.source_const = creds_path,
.ignore = true,
};
else
*(m++) = (MountEntry) {
.path_const = creds_path,
.mode = MKDIR,
};
} else {
/* If our service has no credentials store configured, then make the whole
* credentials tree inaccessible wholesale. */

View file

@ -122,6 +122,7 @@ int setup_namespace(
const char *tmp_dir,
const char *var_tmp_dir,
const char *creds_path,
int creds_fd,
const char *log_namespace,
unsigned long mount_propagation_flag,
VeritySettings *verity,

View file

@ -194,6 +194,7 @@ TEST(protect_kernel_logs) {
NULL,
NULL,
NULL,
-EBADF,
NULL,
0,
NULL,

View file

@ -96,6 +96,7 @@ int main(int argc, char *argv[]) {
tmp_dir,
var_tmp_dir,
NULL,
-EBADF,
NULL,
0,
NULL,