user units: implicitly enable PrivateUsers= when sandboxing options are set

Enabling these options when not running as root requires a user
namespace, so implicitly enable PrivateUsers=.
This has a side effect as it changes which users are visible to the unit.
However until now these options did not work at all for user units, and
in practice just a handful of user units in Fedora, Debian and Ubuntu
mistakenly used them (and they have all been fixed since).

This fixes the long-standing confusing issue that the user and system
units take the same options but the behaviour is wildly (and sometimes
silently) different depending on which is which, with user units
requiring manually specifying PrivateUsers= in order for sandboxing
options to actually work and not be silently ignored.
This commit is contained in:
Luca Boccassi 2022-11-01 23:34:15 +00:00 committed by Luca Boccassi
parent ce963a747f
commit 6ef721cbc7
4 changed files with 98 additions and 52 deletions

View file

@ -8,9 +8,13 @@
<refsect1>
<para id="singular">This option is only available for system services, or for services running in per-user
instances of the service manager when <varname>PrivateUsers=</varname> is enabled.</para>
instances of the service manager in which case <varname>PrivateUsers=</varname> is implicitly enabled
(requires unprivileged user namespaces support to be enabled in the kernel via the
<literal>kernel.unprivileged_userns_clone=</literal> sysctl).</para>
<para id="plural">These options are only available for system services, or for services running in per-user
instances of the service manager when <varname>PrivateUsers=</varname> is enabled.</para>
instances of the service manager in which case <varname>PrivateUsers=</varname> is implicitly enabled
(requires unprivileged user namespaces support to be enabled in the kernel via the
<literal>kernel.unprivileged_userns_clone=</literal> sysctl).</para>
</refsect1>

View file

@ -4400,6 +4400,44 @@ static void log_command_line(Unit *unit, const char *msg, const char *executable
LOG_UNIT_INVOCATION_ID(unit));
}
static bool exec_context_need_unprivileged_private_users(const ExecContext *context, const Manager *manager) {
        assert(context);
        assert(manager);

        /* Tells the caller whether a user namespace has to be in place before the settings below can be
         * applied. The system manager has effective CAP_SYS_ADMIN and hence the privileges to apply them
         * directly, so the answer is always "no" there. For any other manager, a single enabled setting
         * from the groups below is enough to answer "yes". */
        if (MANAGER_IS_SYSTEM(manager))
                return false;

        /* PrivateUsers= requested explicitly, or one of the other Private*=/namespace path settings. */
        if (context->private_users ||
            context->private_tmp ||
            context->private_devices ||
            context->private_network ||
            context->network_namespace_path ||
            context->private_ipc ||
            context->ipc_namespace_path ||
            context->private_mounts)
                return true;

        /* API VFS, bind mounts, temporary file systems, root directory, extension directories. */
        if (context->mount_apivfs ||
            context->n_bind_mounts > 0 ||
            context->n_temporary_filesystems > 0 ||
            context->root_directory ||
            !strv_isempty(context->extension_directories))
                return true;

        /* The Protect*= family. */
        if (context->protect_system != PROTECT_SYSTEM_NO ||
            context->protect_home != PROTECT_HOME_NO ||
            context->protect_kernel_tunables ||
            context->protect_kernel_modules ||
            context->protect_kernel_logs ||
            context->protect_control_groups ||
            context->protect_clock ||
            context->protect_hostname)
                return true;

        /* Finally, any non-empty per-path access list. */
        return !strv_isempty(context->read_write_paths) ||
               !strv_isempty(context->read_only_paths) ||
               !strv_isempty(context->inaccessible_paths) ||
               !strv_isempty(context->exec_paths) ||
               !strv_isempty(context->no_exec_paths);
}
static int exec_child(
Unit *unit,
const ExecCommand *command,
@ -5032,17 +5070,22 @@ static int exec_child(
}
}
if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, unit->manager)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
userns_set_up = true;
r = setup_private_users(saved_uid, saved_gid, uid, gid);
if (r < 0) {
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
* the actual requested operations fail (or silently continue). */
if (r < 0 && context->private_users) {
*exit_status = EXIT_USER;
return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
}
if (r < 0)
log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
else
userns_set_up = true;
}
if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {

View file

@ -401,9 +401,9 @@ static void test_exec_ignoresigpipe(Manager *m) {
static void test_exec_privatetmp(Manager *m) {
assert_se(touch("/tmp/test-exec_privatetmp") >= 0);
test(m, "exec-privatetmp-yes.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatetmp-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatetmp-no.service", 0, CLD_EXITED);
test(m, "exec-privatetmp-disabled-by-prefix.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatetmp-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
unlink("/tmp/test-exec_privatetmp");
}
@ -420,10 +420,10 @@ static void test_exec_privatedevices(Manager *m) {
return;
}
test(m, "exec-privatedevices-yes.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-disabled-by-prefix.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-with-group.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_GROUP, CLD_EXITED);
test(m, "exec-privatedevices-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-yes-with-group.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
/* We use capsh to test if the capabilities are
* properly set, so be sure that it exists */
@ -433,10 +433,10 @@ static void test_exec_privatedevices(Manager *m) {
return;
}
test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-mknod.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED);
}
static void test_exec_protecthome(Manager *m) {
@ -466,23 +466,23 @@ static void test_exec_protectkernelmodules(Manager *m) {
return;
}
test(m, "exec-protectkernelmodules-no-capabilities.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-capabilities.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_readonlypaths(Manager *m) {
test(m, "exec-readonlypaths-simple.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths-simple.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
if (path_is_read_only_fs("/var") > 0) {
log_notice("Directory /var is readonly, skipping remaining tests in %s", __func__);
return;
}
test(m, "exec-readonlypaths.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-readonlypaths-with-bindpaths.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-readonlypaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_readwritepaths(Manager *m) {
@ -492,7 +492,7 @@ static void test_exec_readwritepaths(Manager *m) {
return;
}
test(m, "exec-readwritepaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readwritepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_inaccessiblepaths(Manager *m) {
@ -502,14 +502,14 @@ static void test_exec_inaccessiblepaths(Manager *m) {
return;
}
test(m, "exec-inaccessiblepaths-sys.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-inaccessiblepaths-sys.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
if (path_is_read_only_fs("/") > 0) {
log_notice("Root directory is readonly, skipping remaining tests in %s", __func__);
return;
}
test(m, "exec-inaccessiblepaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-inaccessiblepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static int on_spawn_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
@ -687,14 +687,14 @@ static void test_exec_mount_apivfs(Manager *m) {
assert_se(mkdir_p("/tmp/test-exec-mount-apivfs-no/root", 0755) >= 0);
test(m, "exec-mount-apivfs-no.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-mount-apivfs-no.service", can_unshare || !MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
(void) rm_rf("/tmp/test-exec-mount-apivfs-no/root", REMOVE_ROOT|REMOVE_PHYSICAL);
}
static void test_exec_noexecpaths(Manager *m) {
test(m, "exec-noexecpaths-simple.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-noexecpaths-simple.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_temporaryfilesystem(Manager *m) {
@ -964,8 +964,8 @@ static void test_exec_passenvironment(Manager *m) {
}
static void test_exec_umask(Manager *m) {
test(m, "exec-umask-default.service", 0, CLD_EXITED);
test(m, "exec-umask-0177.service", 0, CLD_EXITED);
test(m, "exec-umask-default.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-umask-0177.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_runtimedirectory(Manager *m) {
@ -1012,7 +1012,7 @@ static void test_exec_capabilityboundingset(Manager *m) {
}
static void test_exec_basic(Manager *m) {
test(m, "exec-basic.service", 0, CLD_EXITED);
test(m, "exec-basic.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_ambientcapabilities(Manager *m) {
@ -1052,7 +1052,7 @@ static void test_exec_ambientcapabilities(Manager *m) {
}
static void test_exec_privatenetwork(Manager *m) {
int r, status;
int r;
r = find_executable("ip", NULL);
if (r < 0) {
@ -1060,9 +1060,8 @@ static void test_exec_privatenetwork(Manager *m) {
return;
}
status = can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_FAILURE;
test(m, "exec-privatenetwork-yes-privatemounts-no.service", status, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-yes.service", status, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-no.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_networknamespacepath(Manager *m) {
@ -1075,7 +1074,7 @@ static void test_exec_networknamespacepath(Manager *m) {
}
test(m, "exec-networknamespacepath-privatemounts-no.service", MANAGER_IS_SYSTEM(m) ? EXIT_SUCCESS : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-networknamespacepath-privatemounts-yes.service", can_unshare ? EXIT_SUCCESS : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-networknamespacepath-privatemounts-yes.service", can_unshare ? EXIT_SUCCESS : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}
static void test_exec_oomscoreadjust(Manager *m) {
@ -1105,12 +1104,12 @@ static void test_exec_unsetenvironment(Manager *m) {
}
static void test_exec_specifier(Manager *m) {
test(m, "exec-specifier.service", 0, CLD_EXITED);
test(m, "exec-specifier.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
if (MANAGER_IS_SYSTEM(m))
test(m, "exec-specifier-system.service", 0, CLD_EXITED);
else
test(m, "exec-specifier-user.service", 0, CLD_EXITED);
test(m, "exec-specifier@foo-bar.service", 0, CLD_EXITED);
test(m, "exec-specifier@foo-bar.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-specifier-interpolation.service", 0, CLD_EXITED);
}

View file

@ -17,7 +17,7 @@ runas testuser systemd-run --wait --user --unit=test-private-users \
runas testuser systemctl --user log-level debug
runas testuser systemd-run --wait --user --unit=test-private-tmp-innerfile \
-p PrivateUsers=yes -p PrivateTmp=yes \
-p PrivateTmp=yes \
-P touch /tmp/innerfile.txt
# File should not exist outside the job's tmp directory.
test ! -e /tmp/innerfile.txt
@ -25,7 +25,7 @@ test ! -e /tmp/innerfile.txt
touch /tmp/outerfile.txt
# File should not appear in unit's private tmp.
runas testuser systemd-run --wait --user --unit=test-private-tmp-outerfile \
-p PrivateUsers=yes -p PrivateTmp=yes \
-p PrivateTmp=yes \
-P test ! -e /tmp/outerfile.txt
# Confirm that creating a file in home works
@ -35,7 +35,7 @@ test -e /home/testuser/works.txt
# Confirm that creating a file in home is blocked under read-only
runas testuser systemd-run --wait --user --unit=test-protect-home-read-only \
-p PrivateUsers=yes -p ProtectHome=read-only \
-p ProtectHome=read-only \
-P bash -c '
test -e /home/testuser/works.txt || exit 10
touch /home/testuser/blocked.txt && exit 11
@ -45,13 +45,13 @@ test ! -e /home/testuser/blocked.txt
# Check that tmpfs hides the whole directory
runas testuser systemd-run --wait --user --unit=test-protect-home-tmpfs \
-p PrivateUsers=yes -p ProtectHome=tmpfs \
-p ProtectHome=tmpfs \
-P test ! -e /home/testuser
# Confirm that home, /root, and /run/user are inaccessible under "yes"
# shellcheck disable=SC2016
runas testuser systemd-run --wait --user --unit=test-protect-home-yes \
-p PrivateUsers=yes -p ProtectHome=yes \
-p ProtectHome=yes \
-P bash -c '
test "$(stat -c %a /home)" = "0"
test "$(stat -c %a /root)" = "0"
@ -70,11 +70,11 @@ runas testuser systemd-run --wait --user --unit=test-group-fail \
# Check that with a new user namespace we can bind mount
# files and use a different root directory
runas testuser systemd-run --wait --user --unit=test-bind-mount \
-p PrivateUsers=yes -p BindPaths=/dev/null:/etc/os-release \
-p BindPaths=/dev/null:/etc/os-release \
test ! -s /etc/os-release
runas testuser systemd-run --wait --user --unit=test-read-write \
-p PrivateUsers=yes -p ReadOnlyPaths=/ \
-p ReadOnlyPaths=/ \
-p ReadWritePaths="/var /run /tmp" \
-p NoExecPaths=/ -p ExecPaths=/usr \
test ! -w /etc/os-release
@ -85,50 +85,50 @@ runas testuser systemd-run --wait --user --unit=test-caps \
test -s /etc/os-release
runas testuser systemd-run --wait --user --unit=test-devices \
-p PrivateUsers=yes -p PrivateDevices=yes -p PrivateIPC=yes \
-p PrivateDevices=yes -p PrivateIPC=yes \
sh -c "ls -1 /dev/ | wc -l | grep -q -F 18"
# Same check as test/test-execute/exec-privatenetwork-yes.service
runas testuser systemd-run --wait --user --unit=test-network \
-p PrivateUsers=yes -p PrivateNetwork=yes \
-p PrivateNetwork=yes \
/bin/sh -x -c '! ip link | grep -E "^[0-9]+: " | grep -Ev ": (lo|(erspan|gre|gretap|ip_vti|ip6_vti|ip6gre|ip6tnl|sit|tunl)0@.*):"'
runas testuser systemd-run --wait --user --unit=test-hostname \
-p PrivateUsers=yes -p ProtectHostname=yes \
-p ProtectHostname=yes \
hostnamectl hostname foo \
&& { echo 'unexpected success'; exit 1; }
runas testuser systemd-run --wait --user --unit=test-clock \
-p PrivateUsers=yes -p ProtectClock=yes \
-p ProtectClock=yes \
timedatectl set-time "2012-10-30 18:17:16" \
&& { echo 'unexpected success'; exit 1; }
runas testuser systemd-run --wait --user --unit=test-kernel-tunable \
-p PrivateUsers=yes -p ProtectKernelTunables=yes \
-p ProtectKernelTunables=yes \
sh -c "echo 0 >/proc/sys/user/max_user_namespaces" \
&& { echo 'unexpected success'; exit 1; }
runas testuser systemd-run --wait --user --unit=test-kernel-mod \
-p PrivateUsers=yes -p ProtectKernelModules=yes \
-p ProtectKernelModules=yes \
sh -c "modprobe -r overlay && modprobe overlay" \
&& { echo 'unexpected success'; exit 1; }
if sysctl kernel.dmesg_restrict=0; then
runas testuser systemd-run --wait --user --unit=test-kernel-log \
-p PrivateUsers=yes -p ProtectKernelLogs=yes -p LogNamespace=yes \
-p ProtectKernelLogs=yes -p LogNamespace=yes \
dmesg \
&& { echo 'unexpected success'; exit 1; }
fi
unsquashfs -no-xattrs -d /tmp/img /usr/share/minimal_0.raw
runas testuser systemd-run --wait --user --unit=test-root-dir \
-p PrivateUsers=yes -p RootDirectory=/tmp/img \
-p RootDirectory=/tmp/img \
grep MARKER=1 /etc/os-release
mkdir /tmp/img_bind
mount --bind /tmp/img /tmp/img_bind
runas testuser systemd-run --wait --user --unit=test-root-dir-bind \
-p PrivateUsers=yes -p RootDirectory=/tmp/img_bind -p MountFlags=private \
-p RootDirectory=/tmp/img_bind -p MountFlags=private \
grep MARKER=1 /etc/os-release
umount /tmp/img_bind
@ -137,7 +137,7 @@ mkdir -p /tmp/a /tmp/b /tmp/c
if unshare --mount --user --map-root-user mount -t overlay overlay /tmp/c -o lowerdir=/tmp/a:/tmp/b; then
unsquashfs -no-xattrs -d /tmp/app2 /usr/share/app1.raw
runas testuser systemd-run --wait --user --unit=test-extension-dir \
-p PrivateUsers=yes -p ExtensionDirectories=/tmp/app2 \
-p ExtensionDirectories=/tmp/app2 \
-p TemporaryFileSystem=/run -p RootDirectory=/tmp/img \
-p MountAPIVFS=yes \
grep PORTABLE_PREFIXES=app1 /usr/lib/extension-release.d/extension-release.app2