core: create inaccessible nodes for users when making runtime dirs

To support ProtectHome=y in a user namespace (which mounts the inaccessible
nodes), the nodes need to be accessible by the user. Create these paths and
devices in the user runtime directory so they can be used later if needed.
This commit is contained in:
Anita Zhang 2019-11-19 14:24:52 -08:00
parent a49ad4c482
commit e5f10cafe0
11 changed files with 96 additions and 30 deletions

View file

@ -536,7 +536,7 @@ int mount_setup(bool loaded_policy) {
/* Also create /run/systemd/inaccessible nodes, so that we always have something to mount inaccessible nodes
* from. */
(void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID);
(void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
return 0;
}

View file

@ -12,6 +12,7 @@
#include "base-filesystem.h"
#include "dev-setup.h"
#include "fd-util.h"
#include "format-util.h"
#include "fs-util.h"
#include "label.h"
#include "loop-util.h"
@ -905,6 +906,7 @@ static int apply_mount(
const char *root_directory,
MountEntry *m) {
_cleanup_free_ char *inaccessible = NULL;
bool rbind = true, make = false;
const char *what;
int r;
@ -916,6 +918,8 @@ static int apply_mount(
switch (m->mode) {
case INACCESSIBLE: {
_cleanup_free_ char *tmp = NULL;
const char *runtime_dir;
struct stat target;
/* First, get rid of everything that is below if there
@ -930,10 +934,20 @@ static int apply_mount(
return log_debug_errno(errno, "Failed to lstat() %s to determine what to mount over it: %m", mount_entry_path(m));
}
what = mode_to_inaccessible_node(target.st_mode);
if (!what)
if (geteuid() == 0)
runtime_dir = "/run/systemd";
else {
if (asprintf(&tmp, "/run/user/"UID_FMT, geteuid()) < 0)
log_oom();
runtime_dir = tmp;
}
r = mode_to_inaccessible_node(runtime_dir, target.st_mode, &inaccessible);
if (r < 0)
return log_debug_errno(SYNTHETIC_ERRNO(ELOOP),
"File type not supported for inaccessible mounts. Note that symlinks are not allowed");
what = inaccessible;
break;
}

View file

@ -6,6 +6,7 @@
#include "sd-bus.h"
#include "bus-error.h"
#include "dev-setup.h"
#include "fs-util.h"
#include "format-util.h"
#include "label.h"
@ -91,6 +92,8 @@ static int user_mkdir_runtime_path(
log_warning_errno(r, "Failed to fix label of \"%s\", ignoring: %m", runtime_path);
}
/* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */
(void) make_inaccessible_nodes(runtime_path, uid, gid);
return 0;
fail:

View file

@ -883,8 +883,7 @@ static int mount_overlay(const char *dest, CustomMount *m) {
}
static int mount_inaccessible(const char *dest, CustomMount *m) {
_cleanup_free_ char *where = NULL;
const char *source;
_cleanup_free_ char *where = NULL, *source = NULL;
struct stat st;
int r;
@ -897,7 +896,9 @@ static int mount_inaccessible(const char *dest, CustomMount *m) {
return m->graceful ? 0 : r;
}
assert_se(source = mode_to_inaccessible_node(st.st_mode));
r = mode_to_inaccessible_node("/run/systemd", st.st_mode, &source);
if (r < 0)
return m->graceful ? 0 : r;
r = mount_verbose(m->graceful ? LOG_DEBUG : LOG_ERR, source, where, NULL, MS_BIND, NULL);
if (r < 0)

View file

@ -3252,6 +3252,7 @@ static int outer_child(
int netns_fd) {
_cleanup_close_ int fd = -1;
const char *p;
pid_t pid;
ssize_t l;
int r;
@ -3447,7 +3448,9 @@ static int outer_child(
return r;
(void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
(void) make_inaccessible_nodes(directory, arg_uid_shift, arg_uid_shift);
p = prefix_roota(directory, "/run/systemd");
(void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);
if (r < 0)

View file

@ -61,20 +61,20 @@ int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid) {
const char *name;
mode_t mode;
} table[] = {
{ "/run/systemd", S_IFDIR | 0755 },
{ "/run/systemd/inaccessible", S_IFDIR | 0000 },
{ "/run/systemd/inaccessible/reg", S_IFREG | 0000 },
{ "/run/systemd/inaccessible/dir", S_IFDIR | 0000 },
{ "/run/systemd/inaccessible/fifo", S_IFIFO | 0000 },
{ "/run/systemd/inaccessible/sock", S_IFSOCK | 0000 },
{ "", S_IFDIR | 0755 },
{ "/inaccessible", S_IFDIR | 0000 },
{ "/inaccessible/reg", S_IFREG | 0000 },
{ "/inaccessible/dir", S_IFDIR | 0000 },
{ "/inaccessible/fifo", S_IFIFO | 0000 },
{ "/inaccessible/sock", S_IFSOCK | 0000 },
/* The following two are likely to fail if we lack the privs for it (for example in a userns
* environment, if CAP_MKNOD is missing, or if a device node policy prohibits creating device nodes
* with major/minor of 0). But that's entirely fine. Consumers of these files should carry a
* fallback to use a different node then, for example /run/systemd/inaccessible/sock, which is close
* fallback to use a different node then, for example <root>/inaccessible/sock, which is close
* enough in behaviour and semantics for most uses. */
{ "/run/systemd/inaccessible/chr", S_IFCHR | 0000 },
{ "/run/systemd/inaccessible/blk", S_IFBLK | 0000 },
{ "/inaccessible/chr", S_IFCHR | 0000 },
{ "/inaccessible/blk", S_IFBLK | 0000 },
};
_cleanup_umask_ mode_t u;

View file

@ -339,38 +339,72 @@ int repeat_unmount(const char *path, int flags) {
}
}
const char* mode_to_inaccessible_node(mode_t mode) {
int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest) {
/* This function maps a node type to a corresponding inaccessible file node. These nodes are created during
* early boot by PID 1. In some cases we lacked the privs to create the character and block devices (maybe
* because we run in a userns environment, or lack CAP_MKNOD, or run with a devices policy that excludes
* device nodes with major and minor of 0), but that's fine, in that case we use an AF_UNIX file node instead,
* which is not the same, but close enough for most uses. And most importantly, the kernel allows bind mounts
* from socket nodes to any non-directory file nodes, and that's the most important thing that matters.
*
* The node path is looked up below runtime_dir, i.e. the result has the form
* <runtime_dir>/inaccessible/<type>. On success 0 is returned and *dest is set to a newly allocated path
* string. If the file type in mode is not one of the types handled below, -EINVAL is returned. If
* building the path fails, the result of log_oom() is returned. */
_cleanup_free_ char *d = NULL;
const char *node = NULL;
char *tmp;
assert(dest);
switch(mode & S_IFMT) {
case S_IFREG:
return "/run/systemd/inaccessible/reg";
node = "/inaccessible/reg";
break;
case S_IFDIR:
return "/run/systemd/inaccessible/dir";
node = "/inaccessible/dir";
break;
case S_IFCHR:
if (access("/run/systemd/inaccessible/chr", F_OK) == 0)
return "/run/systemd/inaccessible/chr";
return "/run/systemd/inaccessible/sock";
/* Prefer the character device node if it exists below runtime_dir, otherwise fall back to the
* socket node. */
d = path_join(runtime_dir, "/inaccessible/chr");
if (!d)
return log_oom();
if (access(d, F_OK) == 0) {
*dest = TAKE_PTR(d);
return 0;
}
node = "/inaccessible/sock";
break;
case S_IFBLK:
if (access("/run/systemd/inaccessible/blk", F_OK) == 0)
return "/run/systemd/inaccessible/blk";
return "/run/systemd/inaccessible/sock";
/* Same as for character devices: use the block device node if it exists, otherwise the socket
* node. */
d = path_join(runtime_dir, "/inaccessible/blk");
if (!d)
return log_oom();
if (access(d, F_OK) == 0) {
*dest = TAKE_PTR(d);
return 0;
}
node = "/inaccessible/sock";
break;
case S_IFIFO:
return "/run/systemd/inaccessible/fifo";
node = "/inaccessible/fifo";
break;
case S_IFSOCK:
return "/run/systemd/inaccessible/sock";
node = "/inaccessible/sock";
break;
}
return NULL;
/* No case above matched, i.e. the file type is not supported. */
if (!node)
return -EINVAL;
tmp = path_join(runtime_dir, node);
if (!tmp)
return log_oom();
*dest = tmp;
return 0;
}
#define FLAG(name) (flags & name ? STRINGIFY(name) "|" : "")

View file

@ -31,4 +31,4 @@ int mount_option_mangle(
unsigned long *ret_mount_flags,
char **ret_remaining_options);
const char* mode_to_inaccessible_node(mode_t mode);
int mode_to_inaccessible_node(const char *runtime_dir, mode_t mode, char **dest);

View file

@ -20,7 +20,8 @@ int main(int argc, char *argv[]) {
f = prefix_roota(p, "/run");
assert_se(mkdir(f, 0755) >= 0);
assert_se(make_inaccessible_nodes(p, 1, 1) >= 0);
f = prefix_roota(p, "/run/systemd");
assert_se(make_inaccessible_nodes(f, 1, 1) >= 0);
f = prefix_roota(p, "/run/systemd/inaccessible/reg");
assert_se(stat(f, &st) >= 0);

View file

@ -11,6 +11,7 @@ test_setup() {
eval $(udevadm info --export --query=env --name=${LOOPDEV}p2)
setup_basic_environment
inst_binary stat
mask_supporting_services

View file

@ -46,6 +46,15 @@ runas nobody systemd-run --user --unit=test-protect-home-tmpfs \
-p PrivateUsers=yes -p ProtectHome=tmpfs \
-P test ! -e /home/nobody
# Confirm that home, /root, and /run/user are inaccessible under "yes"
runas nobody systemd-run --user --unit=test-protect-home-yes \
-p PrivateUsers=yes -p ProtectHome=yes \
-P bash -c '
test "$(stat -c %a /home)" = "0"
test "$(stat -c %a /root)" = "0"
test "$(stat -c %a /run/user)" = "0"
'
# Confirm we cannot change groups because we only have one mapping in the user
# namespace (no CAP_SETGID in the parent namespace to write the additional
# mapping of the user supplied group and thus cannot change groups to an