From 0e551b04efb911d38b586cca1a6a462c87a2cb1b Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 8 May 2024 20:12:57 +0100 Subject: [PATCH] core: do not imply PrivateTmp with DynamicUser, create a private tmpfs instead DynamicUser= enables PrivateTmp= implicitly to avoid files owned by reusable uids leaking into the host. Change it to instead create a fully private tmpfs instance instead, which also ensures the same result, since it has less impactful semantics with respect to PrivateTmp=yes, which links the mount namespace to the host's /tmp instead. If a user specifies PrivateTmp manually, let the existing behaviour unchanged to ensure backward compatibility is not broken. --- man/systemd.exec.xml | 13 ++- src/core/dbus-execute.c | 4 +- src/core/dbus-util.c | 39 +++++++ src/core/dbus-util.h | 3 + src/core/exec-invoke.c | 16 +-- src/core/execute-serialize.c | 9 +- src/core/execute.c | 11 +- src/core/execute.h | 2 +- src/core/load-fragment-gperf.gperf.in | 2 +- src/core/load-fragment.c | 28 +++++ src/core/load-fragment.h | 1 + src/core/namespace.c | 141 ++++++++++++++++++------ src/core/namespace.h | 14 ++- src/core/unit.c | 8 +- test/units/TEST-07-PID1.exec-context.sh | 13 +++ 15 files changed, 244 insertions(+), 60 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 49099b2ee97..2fd69f6912f 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -671,12 +671,13 @@ part of a unit for which dynamic users/groups are enabled do not leave files or directories owned by these users/groups around, as a different unit might get the same UID/GID assigned later on, and thus gain access to these files or directories. If DynamicUser= is enabled, - RemoveIPC= and PrivateTmp= are implied (and cannot be turned - off). This ensures that the lifetime of IPC objects and temporary files created by the executed - processes is bound to the runtime of the service, and hence the lifetime of the dynamic - user/group. Since /tmp/ and /var/tmp/ are usually the only - world-writable directories on a system this ensures that a unit making use of dynamic user/group - allocation cannot leave files around after unit termination. Furthermore + RemoveIPC= is implied (and cannot be turned off). This ensures that the lifetime + of IPC objects and temporary files created by the executed processes is bound to the runtime of the + service, and hence the lifetime of the dynamic user/group. Since /tmp/ and + /var/tmp/ are usually the only world-writable directories on a system, unless + PrivateTmp= is manually enabled, those directories will be placed on a private + tmpfs filesystem, as this ensures that a unit making use of dynamic user/group allocation cannot + leave files around after unit termination. Furthermore NoNewPrivileges= and RestrictSUIDSGID= are implicitly enabled (and cannot be disabled), to ensure that processes invoked cannot take benefit or create SUID/SGID files or directories. Moreover ProtectSystem=strict and diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index 21c260b26b8..530a0f7a0be 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1055,7 +1055,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("NoExecPaths", "as", NULL, offsetof(ExecContext, no_exec_paths), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ExecSearchPath", "as", NULL, offsetof(ExecContext, exec_search_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("MountFlags", "t", bus_property_get_ulong, offsetof(ExecContext, mount_propagation_flag), SD_BUS_VTABLE_PROPERTY_CONST), - SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_private_tmp, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectClock", "b", bus_property_get_bool, offsetof(ExecContext, protect_clock), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST), @@ -1736,7 +1736,7 @@ int bus_exec_context_set_transient_property( return bus_set_transient_unsigned(u, name, &c->tty_cols, message, flags, error); if (streq(name, "PrivateTmp")) - return bus_set_transient_bool(u, name, &c->private_tmp, message, flags, error); + return bus_set_transient_private_tmp(u, name, &c->private_tmp, message, flags, error); if (streq(name, "PrivateDevices")) return bus_set_transient_bool(u, name, &c->private_devices, message, flags, error); diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c index b871d893682..46676d7a210 100644 --- a/src/core/dbus-util.c +++ b/src/core/dbus-util.c @@ -150,6 +150,45 @@ int bus_set_transient_usec_internal( return 1; } +int bus_set_transient_private_tmp( + Unit *u, + const char *name, + PrivateTmp *p, + sd_bus_message *message, + UnitWriteFlags flags, + sd_bus_error *error) { + + int v, r; + + assert(p); + + r = sd_bus_message_read(message, "b", &v); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + *p = v ? PRIVATE_TMP_CONNECTED : PRIVATE_TMP_OFF; + unit_write_settingf(u, flags, name, "%s=%s", name, yes_no(v)); + } + + return 1; +} + +int bus_property_get_private_tmp( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + PrivateTmp *p = ASSERT_PTR(userdata); + int b = *p != PRIVATE_TMP_OFF; + + return sd_bus_message_append_basic(reply, 'b', &b); +} + int bus_verify_manage_units_async_full( Unit *u, const char *verb, diff --git a/src/core/dbus-util.h b/src/core/dbus-util.h index 0fc3a949610..29796eb249f 100644 --- a/src/core/dbus-util.h +++ b/src/core/dbus-util.h @@ -4,6 +4,7 @@ #include "sd-bus.h" #include "dissect-image.h" +#include "execute.h" #include "unit.h" int bus_property_get_triggered_unit(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); @@ -244,6 +245,7 @@ int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_tristate(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +int bus_set_transient_private_tmp(Unit *u, const char *name, PrivateTmp *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); } @@ -255,3 +257,4 @@ int bus_verify_manage_units_async_full(Unit *u, const char *verb, const char *po int bus_read_mount_options(sd_bus_message *message, sd_bus_error *error, MountOptions **ret_options, char **ret_format_str, const char *separator); int bus_property_get_activation_details(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +int bus_property_get_private_tmp(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index ee8db04e763..8b88ccb1e98 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3054,7 +3054,7 @@ static int apply_mount_namespace( _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL, **read_write_paths_cleanup = NULL; _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL, - *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL; + *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL; const char *tmp_dir = NULL, *var_tmp_dir = NULL; char **read_write_paths; bool setup_os_release_symlink; @@ -3108,7 +3108,7 @@ static int apply_mount_namespace( * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to * use here. This does not apply when we are using /run/systemd/empty as fallback. */ - if (context->private_tmp && runtime && runtime->shared) { + if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime && runtime->shared) { if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY)) tmp_dir = runtime->shared->tmp_dir; else if (runtime->shared->tmp_dir) @@ -3145,8 +3145,8 @@ static int apply_mount_namespace( if (!incoming_dir) return -ENOMEM; - extension_dir = strdup("/run/systemd/unit-extensions"); - if (!extension_dir) + private_namespace_dir = strdup("/run/systemd"); + if (!private_namespace_dir) return -ENOMEM; /* If running under a different root filesystem, propagate the host's os-release. We make a @@ -3159,7 +3159,7 @@ static int apply_mount_namespace( } else { assert(params->runtime_scope == RUNTIME_SCOPE_USER); - if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) + if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0) return -ENOMEM; if (setup_os_release_symlink) { @@ -3205,6 +3205,8 @@ static int apply_mount_namespace( .temporary_filesystems = context->temporary_filesystems, .n_temporary_filesystems = context->n_temporary_filesystems, + .private_tmp = context->private_tmp, + .mount_images = context->mount_images, .n_mount_images = context->n_mount_images, .mount_image_policy = context->mount_image_policy ?: &image_policy_service, @@ -3225,7 +3227,7 @@ static int apply_mount_namespace( .propagate_dir = propagate_dir, .incoming_dir = incoming_dir, - .extension_dir = extension_dir, + .private_namespace_dir = private_namespace_dir, .notify_socket = root_dir || root_image ? params->notify_socket : NULL, .host_os_release_stage = host_os_release_stage, @@ -3857,7 +3859,7 @@ static bool exec_context_need_unprivileged_private_users( return false; return context->private_users || - context->private_tmp || + context->private_tmp != PRIVATE_TMP_OFF || context->private_devices || context->private_network || context->network_namespace_path || diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index ecd1e70db67..d6c0162a7c2 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1840,7 +1840,7 @@ static int exec_context_serialize(const ExecContext *c, FILE *f) { if (r < 0) return r; - r = serialize_bool_elide(f, "exec-context-private-tmp", c->private_tmp); + r = serialize_item(f, "exec-context-private-tmp", private_tmp_to_string(c->private_tmp)); if (r < 0) return r; @@ -2720,10 +2720,9 @@ static int exec_context_deserialize(ExecContext *c, FILE *f) { if (r < 0) return r; } else if ((val = startswith(l, "exec-context-private-tmp="))) { - r = parse_boolean(val); - if (r < 0) - return r; - c->private_tmp = r; + c->private_tmp = private_tmp_from_string(val); + if (c->private_tmp < 0) + return c->private_tmp; } else if ((val = startswith(l, "exec-context-private-devices="))) { r = parse_boolean(val); if (r < 0) diff --git a/src/core/execute.c b/src/core/execute.c index 513e95e09d7..9ba5a77975a 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -231,7 +231,10 @@ bool exec_needs_mount_namespace( if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED)) return true; - if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir)) + if (context->private_tmp == PRIVATE_TMP_DISCONNECTED) + return true; + + if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir)) return true; if (context->private_devices || @@ -964,7 +967,7 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { prefix, empty_to_root(c->root_directory), prefix, yes_no(c->root_ephemeral), prefix, yes_no(c->non_blocking), - prefix, yes_no(c->private_tmp), + prefix, private_tmp_to_string(c->private_tmp), prefix, yes_no(c->private_devices), prefix, yes_no(c->protect_kernel_tunables), prefix, yes_no(c->protect_kernel_modules), @@ -2155,12 +2158,12 @@ static int exec_shared_runtime_make( assert(id); /* It is not necessary to create ExecSharedRuntime object. */ - if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) { + if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && c->private_tmp != PRIVATE_TMP_CONNECTED) { *ret = NULL; return 0; } - if (c->private_tmp && + if (c->private_tmp == PRIVATE_TMP_CONNECTED && !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") && (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") || prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) { diff --git a/src/core/execute.h b/src/core/execute.h index 107ae252438..ef8dadfe202 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -314,7 +314,7 @@ struct ExecContext { int private_mounts; int mount_apivfs; int memory_ksm; - bool private_tmp; + PrivateTmp private_tmp; bool private_network; bool private_devices; bool private_users; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index df219d86dfe..46c2198ac71 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -119,7 +119,7 @@ {{type}}.BindPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context) {{type}}.BindReadOnlyPaths, config_parse_bind_paths, 0, offsetof({{type}}, exec_context) {{type}}.TemporaryFileSystem, config_parse_temporary_filesystems, 0, offsetof({{type}}, exec_context) -{{type}}.PrivateTmp, config_parse_bool, 0, offsetof({{type}}, exec_context.private_tmp) +{{type}}.PrivateTmp, config_parse_private_tmp, 0, offsetof({{type}}, exec_context) {{type}}.PrivateDevices, config_parse_bool, 0, offsetof({{type}}, exec_context.private_devices) {{type}}.ProtectKernelTunables, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_tunables) {{type}}.ProtectKernelModules, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_kernel_modules) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index 4ff1e66d7ba..3701270ab53 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -5199,6 +5199,34 @@ int config_parse_temporary_filesystems( } } +int config_parse_private_tmp( + const char* unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + ExecContext *c = ASSERT_PTR(data); + int r; + + assert(filename); + assert(rvalue); + + r = parse_boolean(rvalue); + if (r < 0) { + log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse boolean value: %s ignoring", rvalue); + return 0; + } + + c->private_tmp = r ? PRIVATE_TMP_CONNECTED : PRIVATE_TMP_OFF; + return 0; +} + int config_parse_bind_paths( const char *unit, const char *filename, diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 005b9151060..94b13a4bb31 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -113,6 +113,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_import_credential); CONFIG_PARSER_PROTOTYPE(config_parse_set_status); CONFIG_PARSER_PROTOTYPE(config_parse_namespace_path_strv); CONFIG_PARSER_PROTOTYPE(config_parse_temporary_filesystems); +CONFIG_PARSER_PROTOTYPE(config_parse_private_tmp); CONFIG_PARSER_PROTOTYPE(config_parse_cpu_quota); CONFIG_PARSER_PROTOTYPE(config_parse_allowed_cpuset); CONFIG_PARSER_PROTOTYPE(config_parse_protect_home); diff --git a/src/core/namespace.c b/src/core/namespace.c index e521dc964c4..da96fc2f33c 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -72,6 +72,7 @@ typedef enum MountMode { MOUNT_EXEC, MOUNT_TMPFS, MOUNT_RUN, + MOUNT_PRIVATE_TMPFS, /* Mounted outside the root directory, and used by subsequent mounts */ MOUNT_EXTENSION_DIRECTORY, /* Bind-mounted outside the root directory, and used by subsequent mounts */ MOUNT_EXTENSION_IMAGE, /* Mounted outside the root directory, and used by subsequent mounts */ MOUNT_MQUEUEFS, @@ -97,6 +98,8 @@ typedef struct MountEntry { bool nosuid:1; /* Shall set MS_NOSUID on the mount itself */ bool noexec:1; /* Shall set MS_NOEXEC on the mount itself */ bool exec:1; /* Shall clear MS_NOEXEC on the mount itself */ + bool create_source_dir:1; /* Create the source directory if it doesn't exist - for implicit bind mounts */ + mode_t source_dir_mode; /* Mode for the source directory, if it is to be created */ MountEntryState state; /* Whether it was already processed or skipped */ char *path_malloc; /* Use this instead of 'path_const' if we had to allocate memory */ const char *unprefixed_path_const; /* If the path was amended with a prefix, these will save the original */ @@ -246,6 +249,7 @@ static const char * const mount_mode_table[_MOUNT_MODE_MAX] = { [MOUNT_EXEC] = "exec", [MOUNT_TMPFS] = "tmpfs", [MOUNT_RUN] = "run", + [MOUNT_PRIVATE_TMPFS] = "private-tmpfs", [MOUNT_EXTENSION_DIRECTORY] = "extension-directory", [MOUNT_EXTENSION_IMAGE] = "extension-image", [MOUNT_MQUEUEFS] = "mqueuefs", @@ -467,7 +471,7 @@ static int append_mount_images(MountList *ml, const MountImage *mount_images, si static int append_extensions( MountList *ml, const char *root, - const char *extension_dir, + const char *private_namespace_dir, char **hierarchies, const MountImage *mount_images, size_t n_mount_images, @@ -482,7 +486,7 @@ static int append_extensions( if (n_mount_images == 0 && strv_isempty(extension_directories)) return 0; - assert(extension_dir); + assert(private_namespace_dir); n_overlays = strv_length(hierarchies); if (n_overlays == 0) @@ -519,7 +523,7 @@ static int append_extensions( "No matching entry in .v/ directory %s found.", m->source); - if (asprintf(&mount_point, "%s/%zu", extension_dir, i) < 0) + if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, i) < 0) return -ENOMEM; for (size_t j = 0; hierarchies && hierarchies[j]; ++j) { @@ -556,7 +560,7 @@ static int append_extensions( bool ignore_enoent = false; /* Pick up the counter where the ExtensionImages left it. */ - if (asprintf(&mount_point, "%s/%zu", extension_dir, n_mount_images++) < 0) + if (asprintf(&mount_point, "%s/unit-extensions/%zu", private_namespace_dir, n_mount_images++) < 0) return -ENOMEM; /* Look for any prefixes */ @@ -888,8 +892,12 @@ static void drop_outside_root(MountList *ml, const char *root_directory) { for (f = ml->mounts, t = ml->mounts; f < ml->mounts + ml->n_mounts; f++) { - /* ExtensionImages/Directories bases are opened in /run/systemd/unit-extensions on the host */ - if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) && !path_startswith(mount_entry_path(f), root_directory)) { + /* ExtensionImages/Directories bases are opened in /run/[user/xyz/]systemd/unit-extensions + * on the host, and a private (invisible to the guest) tmpfs instance is mounted on + * /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private /tmp and + * /var/tmp. */ + if (!IN_SET(f->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) && + !path_startswith(mount_entry_path(f), root_directory)) { log_debug("%s is outside of root directory.", mount_entry_path(f)); mount_entry_done(f); continue; @@ -1618,6 +1626,14 @@ static int apply_one_mount( * that bind mount source paths are always relative to the host root, hence we pass NULL as * root directory to chase() here. */ + /* When we create implicit mounts, we might need to create the path ourselves as it is on a + * just-created tmpfs, for example. */ + if (m->create_source_dir) { + r = mkdir_p(mount_entry_source(m), m->source_dir_mode); + if (r < 0) + return log_debug_errno(r, "Failed to create source directory %s: %m", mount_entry_source(m)); + } + r = chase(mount_entry_source(m), NULL, CHASE_TRAIL_SLASH, &chased, NULL); if (r == -ENOENT && m->ignore) { log_debug_errno(r, "Path %s does not exist, ignoring.", mount_entry_source(m)); @@ -1637,6 +1653,7 @@ static int apply_one_mount( } case MOUNT_EMPTY_DIR: + case MOUNT_PRIVATE_TMPFS: case MOUNT_TMPFS: return mount_tmpfs(m); @@ -1743,7 +1760,7 @@ static int make_read_only(const MountEntry *m, char **deny_list, FILE *proc_self * and running Linux <= 4.17. */ submounts = mount_entry_read_only(m) && - !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS); if (submounts) r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo); else @@ -1783,7 +1800,7 @@ static int make_noexec(const MountEntry *m, char **deny_list, FILE *proc_self_mo if (flags_mask == 0) /* No Change? */ return 0; - submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS); if (submounts) r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), new_flags, flags_mask, deny_list, proc_self_mountinfo); @@ -1808,7 +1825,7 @@ static int make_nosuid(const MountEntry *m, FILE *proc_self_mountinfo) { if (m->state != MOUNT_APPLIED) return 0; - submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS); + submounts = !IN_SET(m->mode, MOUNT_EMPTY_DIR, MOUNT_TMPFS, MOUNT_PRIVATE_TMPFS); if (submounts) r = bind_remount_recursive_with_mountinfo(mount_entry_path(m), MS_NOSUID, MS_NOSUID, NULL, proc_self_mountinfo); else @@ -1959,8 +1976,11 @@ static int apply_mounts( if (m->state != MOUNT_PENDING) continue; - /* ExtensionImages/Directories are first opened in the propagate directory, not in the root_directory */ - r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY) ? root : NULL, m); + /* ExtensionImages/Directories are first opened in the propagate directory, not in + * the root_directory. A private (invisible to the guest) tmpfs instance is mounted + * on /run/[user/xyz/]systemd/unit-private-tmp as the storage backend of private + * /tmp and /var/tmp. */ + r = follow_symlink(!IN_SET(m->mode, MOUNT_EXTENSION_IMAGE, MOUNT_EXTENSION_DIRECTORY, MOUNT_PRIVATE_TMPFS) ? root : NULL, m); if (r < 0) { mount_entry_path_debug_string(root, m, error_path); return r; @@ -2245,39 +2265,88 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { if (r < 0) return r; - if (p->tmp_dir) { - bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY); + /* When DynamicUser=yes enforce that /tmp/ and /var/tmp/ are disconnected from the host's + * directories, as they are world writable and ephemeral uid/gid will be used. */ + if (p->private_tmp == PRIVATE_TMP_DISCONNECTED) { + _cleanup_free_ char *tmpfs_dir = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL; + MountEntry *tmpfs_entry, *tmp_entry, *var_tmp_entry; - MountEntry *me = mount_list_extend(&ml); - if (!me) + tmpfs_dir = path_join(p->private_namespace_dir, "unit-private-tmp"); + tmp_dir = path_join(tmpfs_dir, "tmp"); + var_tmp_dir = path_join(tmpfs_dir, "var-tmp"); + if (!tmpfs_dir || !tmp_dir || !var_tmp_dir) return log_oom_debug(); - *me = (MountEntry) { + tmpfs_entry = mount_list_extend(&ml); + if (!tmpfs_entry) + return log_oom_debug(); + *tmpfs_entry = (MountEntry) { + .path_malloc = TAKE_PTR(tmpfs_dir), + .mode = MOUNT_PRIVATE_TMPFS, + .options_const = "mode=0700" NESTED_TMPFS_LIMITS, + .flags = MS_NODEV|MS_STRICTATIME, + .has_prefix = true, + }; + + tmp_entry = mount_list_extend(&ml); + if (!tmp_entry) + return log_oom_debug(); + *tmp_entry = (MountEntry) { + .source_malloc = TAKE_PTR(tmp_dir), .path_const = "/tmp", - .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, - .source_const = p->tmp_dir, + .mode = MOUNT_BIND, + .source_dir_mode = 01777, + .create_source_dir = true, }; - } - if (p->var_tmp_dir) { - bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY); - - MountEntry *me = mount_list_extend(&ml); - if (!me) + var_tmp_entry = mount_list_extend(&ml); + if (!var_tmp_entry) return log_oom_debug(); - - *me = (MountEntry) { + *var_tmp_entry = (MountEntry) { + .source_malloc = TAKE_PTR(var_tmp_dir), .path_const = "/var/tmp", - .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, - .source_const = p->var_tmp_dir, + .mode = MOUNT_BIND, + .source_dir_mode = 01777, + .create_source_dir = true, }; + + /* Ensure that the tmpfs is mounted first, and bind mounts are added later. */ + assert_cc(MOUNT_BIND < MOUNT_PRIVATE_TMPFS); + } else { + if (p->tmp_dir) { + bool ro = streq(p->tmp_dir, RUN_SYSTEMD_EMPTY); + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/tmp", + .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, + .source_const = p->tmp_dir, + }; + } + + if (p->var_tmp_dir) { + bool ro = streq(p->var_tmp_dir, RUN_SYSTEMD_EMPTY); + + MountEntry *me = mount_list_extend(&ml); + if (!me) + return log_oom_debug(); + + *me = (MountEntry) { + .path_const = "/var/tmp", + .mode = ro ? MOUNT_PRIVATE_TMP_READ_ONLY : MOUNT_PRIVATE_TMP, + .source_const = p->var_tmp_dir, + }; + } } r = append_mount_images(&ml, p->mount_images, p->n_mount_images); if (r < 0) return r; - r = append_extensions(&ml, root, p->extension_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories); + r = append_extensions(&ml, root, p->private_namespace_dir, hierarchies, p->extension_images, p->n_extension_images, p->extension_directories); if (r < 0) return r; @@ -2530,10 +2599,12 @@ int setup_namespace(const NamespaceParameters *p, char **error_path) { if (setup_propagate) (void) mkdir_p(p->propagate_dir, 0600); - if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) + if (p->n_extension_images > 0 || !strv_isempty(p->extension_directories)) { /* ExtensionImages/Directories mountpoint directories will be created while parsing the * mounts to create, so have the parent ready */ - (void) mkdir_p(p->extension_dir, 0600); + char *extension_dir = strjoina(p->private_namespace_dir, "/unit-extensions"); + (void) mkdir_p(extension_dir, 0600); + } /* Remount / as SLAVE so that nothing now mounted in the namespace * shows up in the parent */ @@ -3074,3 +3145,11 @@ static const char* const proc_subset_table[_PROC_SUBSET_MAX] = { }; DEFINE_STRING_TABLE_LOOKUP(proc_subset, ProcSubset); + +static const char* const private_tmp_table[_PRIVATE_TMP_MAX] = { + [PRIVATE_TMP_OFF] = "off", + [PRIVATE_TMP_CONNECTED] = "connected", + [PRIVATE_TMP_DISCONNECTED] = "disconnected", +}; + +DEFINE_STRING_TABLE_LOOKUP(private_tmp, PrivateTmp); diff --git a/src/core/namespace.h b/src/core/namespace.h index 921716bf3ec..bff99b9daa8 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -53,6 +53,14 @@ typedef enum ProcSubset { _PROC_SUBSET_INVALID = -EINVAL, } ProcSubset; +typedef enum PrivateTmp { + PRIVATE_TMP_OFF, + PRIVATE_TMP_CONNECTED, /* Bind mounted from the host's filesystem */ + PRIVATE_TMP_DISCONNECTED, /* A completely private tmpfs, invisible from the host */ + _PRIVATE_TMP_MAX, + _PRIVATE_TMP_INVALID = -EINVAL, +} PrivateTmp; + struct BindMount { char *source; char *destination; @@ -127,7 +135,7 @@ struct NamespaceParameters { const char *propagate_dir; const char *incoming_dir; - const char *extension_dir; + const char *private_namespace_dir; const char *notify_socket; const char *host_os_release_stage; @@ -150,6 +158,7 @@ struct NamespaceParameters { ProtectSystem protect_system; ProtectProc protect_proc; ProcSubset proc_subset; + PrivateTmp private_tmp; }; int setup_namespace(const NamespaceParameters *p, char **error_path); @@ -184,6 +193,9 @@ ProtectProc protect_proc_from_string(const char *s) _pure_; const char* proc_subset_to_string(ProcSubset i) _const_; ProcSubset proc_subset_from_string(const char *s) _pure_; +const char* private_tmp_to_string(PrivateTmp i) _const_; +PrivateTmp private_tmp_from_string(const char *s) _pure_; + void bind_mount_free_many(BindMount *b, size_t n); int bind_mount_add(BindMount **b, size_t *n, const BindMount *item); diff --git a/src/core/unit.c b/src/core/unit.c index 2f82902d4f2..ac91af187ac 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -1275,7 +1275,7 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) { return r; } - if (c->private_tmp) { + if (c->private_tmp == PRIVATE_TMP_CONNECTED) { r = unit_add_mounts_for(u, "/tmp", UNIT_DEPENDENCY_FILE, UNIT_MOUNT_WANTS); if (r < 0) return r; @@ -4287,7 +4287,11 @@ int unit_patch_contexts(Unit *u) { * UID/GID around in the file system or on IPC objects. Hence enforce a strict * sandbox. */ - ec->private_tmp = true; + /* With DynamicUser= we want private directories, so if the user hasn't manually + * selected PrivateTmp=, enable it, but to a fully private (disconnected) tmpfs + * instance. */ + if (ec->private_tmp == PRIVATE_TMP_OFF) + ec->private_tmp = PRIVATE_TMP_DISCONNECTED; ec->remove_ipc = true; ec->protect_system = PROTECT_SYSTEM_STRICT; if (ec->protect_home == PROTECT_HOME_NO) diff --git a/test/units/TEST-07-PID1.exec-context.sh b/test/units/TEST-07-PID1.exec-context.sh index a3379ef4020..69274a57439 100755 --- a/test/units/TEST-07-PID1.exec-context.sh +++ b/test/units/TEST-07-PID1.exec-context.sh @@ -340,6 +340,19 @@ if [[ ! -v ASAN_OPTIONS ]] && systemctl --version | grep "+BPF_FRAMEWORK" && ker (! systemd-run --wait --pipe -p RestrictFileSystems="~proc devtmpfs sysfs" ls /sys) fi +if [[ ! -v ASAN_OPTIONS ]]; then + # Ensure DynamicUser=yes does not imply PrivateTmp=yes if TemporaryFileSystem=/tmp /var/tmp is set + systemd-run --unit test-07-dynamic-user-tmp.service \ + --service-type=notify \ + -p DynamicUser=yes \ + -p NotifyAccess=all \ + sh -c 'touch /tmp/a && touch /var/tmp/b && ! test -f /tmp/b && ! test -f /var/tmp/a && systemd-notify --ready && sleep infinity' + (! ls /tmp/systemd-private-"$(tr -d '-' < /proc/sys/kernel/random/boot_id)"-test-07-dynamic-user-tmp.service-* &>/dev/null) + (! ls /var/tmp/systemd-private-"$(tr -d '-' < /proc/sys/kernel/random/boot_id)"-test-07-dynamic-user-tmp.service-* &>/dev/null) + systemctl is-active test-07-dynamic-user-tmp.service + systemctl stop test-07-dynamic-user-tmp.service +fi + # Make sure we properly (de)serialize various string arrays, including whitespaces # See: https://github.com/systemd/systemd/issues/31214 systemd-run --wait --pipe -p Environment="FOO='bar4 '" \