nspawn: add support for rootidmap bind option

rootidmap bind option will map the root user from the container to the
owner of the mounted directory on the filesystem. This will ensure files
and directories created by the root user in the container will be owned
by the directory owner on the filesystem. All other users will remain
unmapped.
This commit is contained in:
Quentin Deslandes 2022-09-04 20:30:58 +02:00 committed by Quentin Deslandes
parent 1aa18710af
commit 2b2777eda9
6 changed files with 66 additions and 27 deletions

View file

@ -1363,16 +1363,37 @@ After=sys-subsystem-net-devices-ens1.device</programlisting>
multiple times for creating multiple independent bind mount points.</para>
<para>Mount options are comma-separated. <option>rbind</option> and <option>norbind</option> control whether
to create a recursive or a regular bind mount. Defaults to "rbind". <option>idmap</option> and <option>noidmap</option>
control if the bind mount should use filesystem id mappings. Using this option requires support by the source filesystem
for id mappings. Defaults to "noidmap".</para>
to create a recursive or a regular bind mount. Defaults to "rbind". <option>noidmap</option>,
<option>idmap</option>, and <option>rootidmap</option> control ID mapping.</para>
<para>Using <option>idmap</option> or <option>rootidmap</option> requires support by the source filesystem
for user/group ID mapped mounts. Defaults to "noidmap". With <option>x</option> being the container's UID range
offset, <option>y</option> being the length of the container's UID range, and <option>p</option> being the
owner UID of the bind mount source inode on the host:
<itemizedlist>
<listitem><para>If <option>noidmap</option> is used, any user <option>z</option> in the range
<option>0 … y</option> seen from inside of the container is mapped to <option>x + z</option> in the
<option>x … x + y</option> range on the host. All host users outside of that range are mapped to
<option>nobody</option> inside the container.</para></listitem>
<listitem><para>If <option>idmap</option> is used, any user <option>z</option> in the UID range
<option>0 … y</option> as seen from inside the container is mapped to the same <option>z</option>
in the same <option>0 … y</option> range on the host. All host users outside of that range are
mapped to <option>nobody</option> inside the container.</para></listitem>
<listitem><para>If <option>rootidmap</option> is used, the user <option>0</option> seen from inside
of the container is mapped to <option>p</option> on the host. All host users other than
<option>p</option> are mapped to <option>nobody</option> inside the container.</para></listitem>
</itemizedlist></para>
<para>Whichever ID mapping option is used, the same mapping will be used for user and group IDs. If
<option>rootidmap</option> is used, the group owning the bind mounted directory will have no effect.</para>
<para>Note that when this option is used in combination with <option>--private-users</option>, the resulting
mount points will be owned by the <constant>nobody</constant> user. That's because the mount and its files and
directories continue to be owned by the relevant host users and groups, which do not exist in the container,
and thus show up under the wildcard UID 65534 (nobody). If such bind mounts are created, it is recommended to
make them read-only, using <option>--bind-ro=</option>. Alternatively you can use the "idmap" mount option to
map the filesystem ids.</para></listitem>
map the filesystem IDs.</para></listitem>
</varlistentry>
<varlistentry>

View file

@ -733,6 +733,8 @@ static int parse_mount_bind_options(const char *options, unsigned long *mount_fl
new_idmapping = REMOUNT_IDMAPPING_HOST_ROOT;
else if (streq(word, "noidmap"))
new_idmapping = REMOUNT_IDMAPPING_NONE;
else if (streq(word, "rootidmap"))
new_idmapping = REMOUNT_IDMAPPING_HOST_OWNER;
else
return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
"Invalid bind mount option: %s", word);
@ -816,7 +818,7 @@ static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t u
}
if (idmapping != REMOUNT_IDMAPPING_NONE) {
r = remount_idmap(where, uid_shift, uid_range, idmapping);
r = remount_idmap(where, uid_shift, uid_range, source_st.st_uid, idmapping);
if (r < 0)
return log_error_errno(r, "Failed to map ids for bind mount %s: %m", where);
}

View file

@ -3806,7 +3806,7 @@ static int outer_child(
IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
arg_uid_shift != 0) {
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAPPING_HOST_ROOT);
r = remount_idmap(directory, arg_uid_shift, arg_uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
/* This might fail because the kernel or file system doesn't support idmapping. We
* can't really distinguish this nicely, nor do we have any guarantees about the

View file

@ -1380,7 +1380,7 @@ static int mount_partition(
(void) fs_grow(node, p);
if (remap_uid_gid) {
r = remount_idmap(p, uid_shift, uid_range, REMOUNT_IDMAPPING_HOST_ROOT);
r = remount_idmap(p, uid_shift, uid_range, UID_INVALID, REMOUNT_IDMAPPING_HOST_ROOT);
if (r < 0)
return r;
}

View file

@ -1053,32 +1053,43 @@ int make_mount_point(const char *path) {
return 1;
}
static int make_userns(uid_t uid_shift, uid_t uid_range, RemountIdmapping idmapping) {
static int make_userns(uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping) {
_cleanup_close_ int userns_fd = -1;
_cleanup_free_ char *line = NULL;
/* Allocates a userns file descriptor with the mapping we need. For this we'll fork off a child
* process whose only purpose is to give us a new user namespace. It's killed when we got it. */
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
return log_oom_debug();
/* If requested we'll include an entry in the mapping so that the host root user can make changes to
* the uidmapped mount like it normally would. Specifically, we'll map the user with UID_HOST_ROOT on
* the backing fs to UID 0. This is useful, since nspawn code wants to create various missing inodes
* in the OS tree before booting into it, and this becomes very easy and straightforward to do if it
* can just do it under its own regular UID. Note that in that case the container's runtime uidmap
* (i.e. the one the container payload processes run in) will leave this UID unmapped, i.e. if we
* accidentally leave files owned by host root in the already uidmapped tree around they'll show up
* as owned by 'nobody', which is safe. (Of course, we shouldn't leave such inodes around, but always
* chown() them to the container's own UID range, but it's good to have a safety net, in case we
* forget it.) */
if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
if (strextendf(&line,
UID_FMT " " UID_FMT " " UID_FMT "\n",
UID_MAPPED_ROOT, 0u, 1u) < 0)
if (IN_SET(idmapping, REMOUNT_IDMAPPING_NONE, REMOUNT_IDMAPPING_HOST_ROOT)) {
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", 0u, uid_shift, uid_range) < 0)
return log_oom_debug();
/* If requested we'll include an entry in the mapping so that the host root user can make
* changes to the uidmapped mount like it normally would. Specifically, we'll map the user
* with UID_MAPPED_ROOT on the backing fs to UID 0. This is useful, since nspawn code wants
* to create various missing inodes in the OS tree before booting into it, and this becomes
* very easy and straightforward to do if it can just do it under its own regular UID. Note
* that in that case the container's runtime uidmap (i.e. the one the container payload
* processes run in) will leave this UID unmapped, i.e. if we accidentally leave files owned
* by host root in the already uidmapped tree around they'll show up as owned by 'nobody',
* which is safe. (Of course, we shouldn't leave such inodes around, but always chown() them
* to the container's own UID range, but it's good to have a safety net, in case we
* forget it.) */
if (idmapping == REMOUNT_IDMAPPING_HOST_ROOT)
if (strextendf(&line,
UID_FMT " " UID_FMT " " UID_FMT "\n",
UID_MAPPED_ROOT, 0u, 1u) < 0)
return log_oom_debug();
}
if (idmapping == REMOUNT_IDMAPPING_HOST_OWNER) {
/* Remap the owner of the bind mounted directory to the root user within the container. This
* way every file written by root within the container to the bind-mounted directory will
* be owned by the original user. All other users will remain unmapped. */
if (asprintf(&line, UID_FMT " " UID_FMT " " UID_FMT "\n", owner, uid_shift, 1u) < 0)
return log_oom_debug();
}
/* We always assign the same UID and GID ranges */
userns_fd = userns_acquire(line, line);
if (userns_fd < 0)
@ -1091,6 +1102,7 @@ int remount_idmap(
const char *p,
uid_t uid_shift,
uid_t uid_range,
uid_t owner,
RemountIdmapping idmapping) {
_cleanup_close_ int mount_fd = -1, userns_fd = -1;
@ -1107,7 +1119,7 @@ int remount_idmap(
return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);
/* Create a user namespace mapping */
userns_fd = make_userns(uid_shift, uid_range, idmapping);
userns_fd = make_userns(uid_shift, uid_range, owner, idmapping);
if (userns_fd < 0)
return userns_fd;

View file

@ -128,11 +128,15 @@ typedef enum RemountIdmapping {
* to add inodes to file systems mapped this way should set this flag, but given it comes with
* certain security implications defaults to off, and requires explicit opt-in. */
REMOUNT_IDMAPPING_HOST_ROOT,
/* Define a mapping from root user within the container to the owner of the bind mounted directory.
* This ensures no root-owned files will be written in a bind-mounted directory owned by a different
* user. No other users are mapped. */
REMOUNT_IDMAPPING_HOST_OWNER,
_REMOUNT_IDMAPPING_MAX,
_REMOUNT_IDMAPPING_INVALID = -EINVAL,
} RemountIdmapping;
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, RemountIdmapping idmapping);
int remount_idmap(const char *p, uid_t uid_shift, uid_t uid_range, uid_t owner, RemountIdmapping idmapping);
/* Creates a mount point (not parents) based on the source path or stat - ie, a file or a directory */
int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode);