Merge pull request #26704 from poettering/mnt-nosymlinks

Set MS_NOSYMFOLLOW for ESP + XBOOTLDR and many mount option clean-ups
2024-09-16 06:43:18 +00:00 · 2023-03-10 09:34:04 +01:00 · 2023-03-10 09:34:04 +01:00 · 96c96fb250
parent 4b8ce14f6c e1349c045e
commit 96c96fb250
9 changed files with 423 additions and 91 deletions
--- a/meson.build
+++ b/meson.build
@ -629,6 +629,7 @@ foreach ident : [
        ['open_tree',         '''#include <sys/mount.h>'''],
        ['fsopen',            '''#include <sys/mount.h>'''],
        ['fsconfig',          '''#include <sys/mount.h>'''],
+        ['fsmount',           '''#include <sys/mount.h>'''],
        ['getdents64',        '''#include <dirent.h>'''],
 ]

--- a/src/basic/missing_syscall.h
+++ b/src/basic/missing_syscall.h
@ -591,10 +591,22 @@ static inline int missing_fsopen(const char *fsname, unsigned flags) {

 #if !HAVE_FSCONFIG

+#ifndef FSCONFIG_SET_FLAG
+#define FSCONFIG_SET_FLAG 0 /* Set parameter, supplying no value */
+#endif
+
 #ifndef FSCONFIG_SET_STRING
 #define FSCONFIG_SET_STRING 1 /* Set parameter, supplying a string value */
 #endif

+#ifndef FSCONFIG_SET_FD
+#define FSCONFIG_SET_FD 5 /* Set parameter, supplying an object by fd */
+#endif
+
+#ifndef FSCONFIG_CMD_CREATE
+#define FSCONFIG_CMD_CREATE 6 /* Invoke superblock creation */
+#endif
+
 static inline int missing_fsconfig(int fd, unsigned cmd, const char *key, const void *value, int aux) {
 #  if defined __NR_fsconfig && __NR_fsconfig >= 0
        return syscall(__NR_fsconfig, fd, cmd, key, value, aux);
@ -609,6 +621,26 @@ static inline int missing_fsconfig(int fd, unsigned cmd, const char *key, const

 /* ======================================================================= */

+#if !HAVE_FSMOUNT
+
+#ifndef FSMOUNT_CLOEXEC
+#define FSMOUNT_CLOEXEC 0x00000001
+#endif
+
+static inline int missing_fsmount(int fd, unsigned flags, unsigned ms_flags) {
+#  if defined __NR_fsmount && __NR_fsmount >= 0 /* syscall number known: invoke the kernel directly */
+        return syscall(__NR_fsmount, fd, flags, ms_flags);
+#  else /* no usable syscall number: fail the same way an unimplemented syscall would */
+        errno = ENOSYS;
+        return -1;
+#  endif
+}
+
+#  define fsmount missing_fsmount
+#endif
+
+/* ======================================================================= */
+
 #if !HAVE_GETDENTS64

 static inline ssize_t missing_getdents64(int fd, void *buffer, size_t length) {
--- a/src/basic/mountpoint-util.c
+++ b/src/basic/mountpoint-util.c
@ -3,6 +3,9 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <sys/mount.h>
+#if WANT_LINUX_FS_H
+#include <linux/fs.h>
+#endif

 #include "alloc-util.h"
 #include "chase-symlinks.h"
@ -10,6 +13,8 @@
 #include "fileio.h"
 #include "filesystems.h"
 #include "fs-util.h"
+#include "missing_fs.h"
+#include "missing_mount.h"
 #include "missing_stat.h"
 #include "missing_syscall.h"
 #include "mkdir.h"
@ -456,6 +461,15 @@ bool fstype_is_ro(const char *fstype) {
 }

 bool fstype_can_discard(const char *fstype) {
+        int r;
+
+        assert(fstype);
+
+        /* On new kernels we can just ask the kernel */
+        r = mount_option_supported(fstype, "discard", NULL);
+        if (r >= 0)
+                return r;
+
        return STR_IN_SET(fstype,
                          "btrfs",
                          "f2fs",
@ -464,10 +478,42 @@ bool fstype_can_discard(const char *fstype) {
                          "xfs");
 }

-bool fstype_can_uid_gid(const char *fstype) {
+bool fstype_can_norecovery(const char *fstype) {
+        int r;

-        /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and directories,
-         * current and future. */
+        assert(fstype);
+
+        /* On new kernels we can just ask the kernel; if the probe fails (r < 0) use the built-in list below */
+        r = mount_option_supported(fstype, "norecovery", NULL);
+        if (r >= 0)
+                return r;
+
+        return STR_IN_SET(fstype,
+                          "ext3",
+                          "ext4",
+                          "xfs",
+                          "btrfs");
+}
+
+bool fstype_can_umask(const char *fstype) {
+        int r;
+
+        assert(fstype);
+
+        /* On new kernels we can just ask the kernel; if the probe fails (r < 0) assume only vfat knows umask= */
+        r = mount_option_supported(fstype, "umask", "0077");
+        if (r >= 0)
+                return r;
+
+        return streq(fstype, "vfat");
+}
+
+bool fstype_can_uid_gid(const char *fstype) {
+        /* All file systems that have a uid=/gid= mount option that fixates the owners of all files and
+         * directories, current and future. Note that this does *not* ask the kernel via
+         * mount_option_supported() here because the uid=/gid= setting of various file systems mean different
+         * things: some apply it only to the root dir inode, others to all inodes in the file system. Thus we
+         * maintain the curated list below. 😢 */

        return STR_IN_SET(fstype,
                          "adfs",
@ -602,3 +648,111 @@ int mount_propagation_flag_from_string(const char *name, unsigned long *ret) {
 bool mount_propagation_flag_is_valid(unsigned long flag) {
        return IN_SET(flag, 0, MS_SHARED, MS_PRIVATE, MS_SLAVE);
 }
+
+unsigned long ms_nosymfollow_supported(void) {
+        _cleanup_close_ int fsfd = -EBADF, mntfd = -EBADF;
+        static int cache = -1;
+
+        /* Returns MS_NOSYMFOLLOW if it is supported, zero otherwise. Only definitive answers are cached. */
+
+        if (cache >= 0)
+                return cache ? MS_NOSYMFOLLOW : 0;
+
+        /* Checks if MS_NOSYMFOLLOW is supported (which was added in 5.10). We use the new mount API's
+         * mount_setattr() call for that, which was added in 5.12, which is close enough. */
+
+        fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC);
+        if (fsfd < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        goto not_supported;
+
+                log_debug_errno(errno, "Failed to open superblock context for tmpfs: %m");
+                return 0; /* unexpected failure: report "unsupported" for now, but leave the cache unset */
+        }
+
+        if (fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        goto not_supported;
+
+                log_debug_errno(errno, "Failed to create tmpfs superblock: %m");
+                return 0;
+        }
+
+        mntfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+        if (mntfd < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        goto not_supported;
+
+                log_debug_errno(errno, "Failed to turn superblock fd into mount fd: %m");
+                return 0;
+        }
+
+        if (mount_setattr(mntfd, "", AT_EMPTY_PATH|AT_RECURSIVE,
+                          &(struct mount_attr) {
+                                  .attr_set = MOUNT_ATTR_NOSYMFOLLOW,
+                          }, sizeof(struct mount_attr)) < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        goto not_supported;
+
+                log_debug_errno(errno, "Failed to set MOUNT_ATTR_NOSYMFOLLOW mount attribute: %m");
+                return 0;
+        }
+
+        cache = true;
+        return MS_NOSYMFOLLOW;
+
+not_supported:
+        cache = false;
+        return 0;
+}
+
+int mount_option_supported(const char *fstype, const char *key, const char *value) {
+        _cleanup_close_ int fd = -EBADF;
+        int r;
+
+        /* Checks if the specified file system supports a mount option. Returns > 0 if it supports it, == 0 if
+         * it does not. Returns -EAGAIN if we can't determine it, and any other error otherwise. */
+
+        assert(fstype);
+        assert(key);
+
+        fd = fsopen(fstype, FSOPEN_CLOEXEC);
+        if (fd < 0) {
+                if (ERRNO_IS_NOT_SUPPORTED(errno))
+                        return -EAGAIN;  /* new mount API not available → don't know */
+
+                return log_debug_errno(errno, "Failed to open superblock context for '%s': %m", fstype);
+        }
+
+        /* Various file systems have not been converted to the new mount API yet. For such file systems
+         * fsconfig() with FSCONFIG_SET_STRING/FSCONFIG_SET_FLAG never fail. Which sucks, because we want to
+         * use it for testing support, after all. Let's hence do a check if the file system got converted yet
+         * first. */
+        if (fsconfig(fd, FSCONFIG_SET_FD, "adefinitelynotexistingmountoption", NULL, fd) < 0) {
+                /* If FSCONFIG_SET_FD is not supported for the fs, then the file system was not converted to
+                 * the new mount API yet. If it returns EINVAL the mount option doesn't exist, but the fstype
+                 * is converted. */
+                if (errno == EOPNOTSUPP)
+                        return -EAGAIN; /* FSCONFIG_SET_FD not supported on the fs, hence not converted to new mount API → don't know */
+                if (errno != EINVAL)
+                        return log_debug_errno(errno, "Failed to check if file system has been converted to new mount API: %m");
+
+                /* So FSCONFIG_SET_FD worked, but the option didn't exist (we got EINVAL), this means the fs
+                 * is converted. Let's now ask the actual question we wonder about. */
+        } else
+                return log_debug_errno(SYNTHETIC_ERRNO(EAGAIN), "FSCONFIG_SET_FD worked unexpectedly for '%s', whoa!", fstype);
+
+        if (value)
+                r = fsconfig(fd, FSCONFIG_SET_STRING, key, value, 0);
+        else
+                r = fsconfig(fd, FSCONFIG_SET_FLAG, key, NULL, 0);
+        if (r < 0) {
+                if (errno == EINVAL)
+                        return false; /* EINVAL means option not supported. */
+
+                return log_debug_errno(errno, "Failed to set '%s%s%s' on '%s' superblock context: %m",
+                                       key, value ? "=" : "", strempty(value), fstype);
+        }
+
+        return true; /* works! */
+}
--- a/src/basic/mountpoint-util.h
+++ b/src/basic/mountpoint-util.h
@ -49,6 +49,8 @@ bool fstype_is_blockdev_backed(const char *fstype);
 bool fstype_is_ro(const char *fsype);
 bool fstype_can_discard(const char *fstype);
 bool fstype_can_uid_gid(const char *fstype);
+bool fstype_can_norecovery(const char *fstype);
+bool fstype_can_umask(const char *fstype);

 int dev_is_devtmpfs(void);

@ -58,3 +60,7 @@ int mount_nofollow(const char *source, const char *target, const char *filesyste
 const char *mount_propagation_flag_to_string(unsigned long flags);
 int mount_propagation_flag_from_string(const char *name, unsigned long *ret);
 bool mount_propagation_flag_is_valid(unsigned long flag);
+
+unsigned long ms_nosymfollow_supported(void);
+
+int mount_option_supported(const char *fstype, const char *key, const char *value);
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@ -1099,27 +1099,6 @@ static int mount_bind_sysfs(const MountEntry *m) {
        return 1;
 }

-static bool mount_option_supported(const char *fstype, const char *key, const char *value) {
-        _cleanup_close_ int fd = -EBADF;
-        int r;
-
-        /* This function assumes support by default. Only if the fsconfig() call fails with -EINVAL/-EOPNOTSUPP
-         * will it report that the option/value is not supported. */
-
-        fd = fsopen(fstype, FSOPEN_CLOEXEC);
-        if (fd < 0) {
-                if (errno != ENOSYS)
-                        log_debug_errno(errno, "Failed to open superblock context for '%s': %m", fstype);
-                return true; /* If fsopen() fails for whatever reason, assume the value is supported. */
-        }
-
-        r = fsconfig(fd, FSCONFIG_SET_STRING, key, value, 0);
-        if (r < 0 && !IN_SET(errno, EINVAL, EOPNOTSUPP, ENOSYS))
-                log_debug_errno(errno, "Failed to set '%s=%s' on '%s' superblock context: %m", key, value, fstype);
-
-        return r >= 0 || !IN_SET(errno, EINVAL, EOPNOTSUPP);
-}
-
 static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
        _cleanup_free_ char *opts = NULL;
        const char *entry_path;
@ -1147,13 +1126,14 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
                 * fsopen()/fsconfig() was also backported on some distros which allows us to detect
                 * hidepid=/subset= support in even more scenarios. */

-                if (mount_option_supported("proc", "hidepid", hpv)) {
+                if (mount_option_supported("proc", "hidepid", hpv) != 0) {
                        opts = strjoin("hidepid=", hpv);
                        if (!opts)
                                return -ENOMEM;
                }

-                if (ns_info->proc_subset == PROC_SUBSET_PID && mount_option_supported("proc", "subset", "pid"))
+                if (ns_info->proc_subset == PROC_SUBSET_PID &&
+                    mount_option_supported("proc", "subset", "pid") != 0)
                        if (!strextend_with_separator(&opts, ",", "subset=pid"))
                                return -ENOMEM;
        }
--- a/src/gpt-auto-generator/gpt-auto-generator.c
+++ b/src/gpt-auto-generator/gpt-auto-generator.c
@ -245,9 +245,7 @@ static int add_mount(
                fprintf(f, "Type=%s\n", fstype);

        if (options)
-                fprintf(f, "Options=%s,%s\n", options, rw ? "rw" : "ro");
-        else
-                fprintf(f, "Options=%s\n", rw ? "rw" : "ro");
+                fprintf(f, "Options=%s\n", options);

        r = fflush_and_check(f);
        if (r < 0)
@ -301,18 +299,31 @@ static int path_is_busy(const char *where) {
 }

 static int add_partition_mount(
+                PartitionDesignator d,
                DissectedPartition *p,
                const char *id,
                const char *where,
                const char *description) {

+        _cleanup_free_ char *options = NULL;
        int r;
+
        assert(p);

        r = path_is_busy(where);
        if (r != 0)
                return r < 0 ? r : 0;

+        r = partition_pick_mount_options(
+                        d,
+                        dissected_partition_fstype(p),
+                        p->rw,
+                        /* discard= */ true,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return r;
+
        return add_mount(
                        id,
                        p->node,
@ -321,7 +332,7 @@ static int add_partition_mount(
                        p->rw,
                        p->growfs,
                        /* measure= */ STR_IN_SET(id, "root", "var"), /* by default measure rootfs and /var, since they contain the "identity" of the system */
-                        NULL,
+                        options,
                        description,
                        SPECIAL_LOCAL_FS_TARGET);
 }
@ -452,20 +463,8 @@ static int add_automount(
        return generator_add_symlink(arg_dest, SPECIAL_LOCAL_FS_TARGET, "wants", unit);
 }

-static const char *esp_or_xbootldr_options(const DissectedPartition *p) {
-        assert(p);
-
-        /* Discovered ESP and XBOOTLDR partition are always hardened with "noexec,nosuid,nodev".
-         * If we probed vfat or have no idea about the file system then assume these file systems are vfat
-         * and thus understand "umask=0077". */
-
-        if (!p->fstype || streq(p->fstype, "vfat"))
-                return "umask=0077,noexec,nosuid,nodev";
-
-        return "noexec,nosuid,nodev";
-}
-
 static int add_partition_xbootldr(DissectedPartition *p) {
+        _cleanup_free_ char *options = NULL;
        int r;

        assert(p);
@ -489,13 +488,23 @@ static int add_partition_xbootldr(DissectedPartition *p) {
        if (r > 0)
                return 0;

+        r = partition_pick_mount_options(
+                        PARTITION_XBOOTLDR,
+                        dissected_partition_fstype(p),
+                        /* rw= */ true,
+                        /* discard= */ false,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine default mount options for Boot Loader Partition: %m");
+
        return add_automount("boot",
                             p->node,
                             "/boot",
                             p->fstype,
                             /* rw= */ true,
                             /* growfs= */ false,
-                             esp_or_xbootldr_options(p),
+                             options,
                             "Boot Loader Partition",
                             120 * USEC_PER_SEC);
 }
@ -503,6 +512,7 @@ static int add_partition_xbootldr(DissectedPartition *p) {
 #if ENABLE_EFI
 static int add_partition_esp(DissectedPartition *p, bool has_xbootldr) {
        const char *esp_path = NULL, *id = NULL;
+        _cleanup_free_ char *options = NULL;
        int r;

        assert(p);
@ -569,13 +579,23 @@ static int add_partition_esp(DissectedPartition *p, bool has_xbootldr) {
        } else
                log_debug("Not an EFI boot, skipping ESP check.");

+        r = partition_pick_mount_options(
+                        PARTITION_ESP,
+                        dissected_partition_fstype(p),
+                        /* rw= */ true,
+                        /* discard= */ false,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to determine default mount options for EFI System Partition: %m");
+
        return add_automount(id,
                             p->node,
                             esp_path,
                             p->fstype,
                             /* rw= */ true,
                             /* growfs= */ false,
-                             esp_or_xbootldr_options(p),
+                             options,
                             "EFI System Partition Automount",
                             120 * USEC_PER_SEC);
 }
@ -637,6 +657,7 @@ static int add_root_cryptsetup(void) {

 static int add_root_mount(void) {
 #if ENABLE_EFI
+        _cleanup_free_ char *options = NULL;
        int r;

        if (!is_efi_boot()) {
@ -668,6 +689,20 @@ static int add_root_mount(void) {
        /* Note that we do not need to enable systemd-remount-fs.service here. If
         * /etc/fstab exists, systemd-fstab-generator will pull it in for us. */

+        r = partition_pick_mount_options(
+                        PARTITION_ROOT,
+                        arg_root_fstype,
+                        arg_root_rw > 0,
+                        /* discard= */ true,
+                        &options,
+                        /* ret_ms_flags= */ NULL);
+        if (r < 0)
+                return log_error_errno(r, "Failed to pick root mount options: %m");
+
+        if (arg_root_options)
+                if (!strextend_with_separator(&options, ",", arg_root_options))
+                        return log_oom();
+
        return add_mount(
                        "root",
                        "/dev/gpt-auto-root",
@ -676,7 +711,7 @@ static int add_root_mount(void) {
                        /* rw= */ arg_root_rw > 0,
                        /* growfs= */ false,
                        /* measure= */ true,
-                        arg_root_options,
+                        options,
                        "Root Partition",
                        in_initrd() ? SPECIAL_INITRD_ROOT_FS_TARGET : SPECIAL_LOCAL_FS_TARGET);
 #else
@ -745,25 +780,25 @@ static int enumerate_partitions(dev_t devnum) {
        }

        if (m->partitions[PARTITION_HOME].found) {
-                k = add_partition_mount(m->partitions + PARTITION_HOME, "home", "/home", "Home Partition");
+                k = add_partition_mount(PARTITION_HOME, m->partitions + PARTITION_HOME, "home", "/home", "Home Partition");
                if (k < 0)
                        r = k;
        }

        if (m->partitions[PARTITION_SRV].found) {
-                k = add_partition_mount(m->partitions + PARTITION_SRV, "srv", "/srv", "Server Data Partition");
+                k = add_partition_mount(PARTITION_SRV, m->partitions + PARTITION_SRV, "srv", "/srv", "Server Data Partition");
                if (k < 0)
                        r = k;
        }

        if (m->partitions[PARTITION_VAR].found) {
-                k = add_partition_mount(m->partitions + PARTITION_VAR, "var", "/var", "Variable Data Partition");
+                k = add_partition_mount(PARTITION_VAR, m->partitions + PARTITION_VAR, "var", "/var", "Variable Data Partition");
                if (k < 0)
                        r = k;
        }

        if (m->partitions[PARTITION_TMP].found) {
-                k = add_partition_mount(m->partitions + PARTITION_TMP, "var-tmp", "/var/tmp", "Temporary Data Partition");
+                k = add_partition_mount(PARTITION_TMP, m->partitions + PARTITION_TMP, "var-tmp", "/var/tmp", "Temporary Data Partition");
                if (k < 0)
                        r = k;
        }
--- a/src/shared/dissect-image.c
+++ b/src/shared/dissect-image.c
@ -50,6 +50,7 @@
 #include "id128-util.h"
 #include "import-util.h"
 #include "io-util.h"
+#include "missing_mount.h"
 #include "mkdir-label.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
@ -1502,7 +1503,99 @@ static int fs_grow(const char *node_path, const char *mount_path) {
        return 0;
 }

+int partition_pick_mount_options(
+                PartitionDesignator d,
+                const char *fstype,
+                bool rw,
+                bool discard,
+                char **ret_options,
+                unsigned long *ret_ms_flags) {
+
+        _cleanup_free_ char *options = NULL;
+
+        assert(ret_options);
+
+        /* Selects a baseline of bind mount flags, that should always apply.
+         *
+         * Firstly, we set MS_NODEV universally on all mounts, since we don't want to allow device nodes outside of /dev/.
+         *
+         * On /var/tmp/ we'll also set MS_NOSUID, same as we set for /tmp/ on the host.
+         *
+         * On the ESP and XBOOTLDR partitions we'll also disable symlinks, and execution. These file systems
+         * are generally untrusted (i.e. not encrypted or authenticated), and typically VFAT hence we should
+         * be as restrictive as possible, and this shouldn't hurt, since the functionality is not available
+         * there anyway. */
+
+        unsigned long flags = MS_NODEV;
+
+        if (!rw)
+                flags |= MS_RDONLY;
+
+        switch (d) {
+
+        case PARTITION_ESP:
+        case PARTITION_XBOOTLDR:
+                flags |= MS_NOSUID|MS_NOEXEC|ms_nosymfollow_supported();
+
+                /* The ESP might contain a pre-boot random seed. Let's make this inaccessible to regular
+                 * userspace. ESP/XBOOTLDR is almost certainly VFAT, hence if we don't know assume it is. */
+                if (!fstype || fstype_can_umask(fstype))
+                        if (!strextend_with_separator(&options, ",", "umask=0077"))
+                                return -ENOMEM;
+                break;
+
+        case PARTITION_TMP:
+                flags |= MS_NOSUID;
+                break;
+
+        default:
+                break;
+        }
+
+        /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the
+         * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices)
+         * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses
+         * from the upper file system still get propagated through to the underlying file system,
+         * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify
+         * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to
+         * carry a per file system table here.
+         *
+         * Note that this means that we might not be able to mount corrupted file systems as read-only
+         * anymore (since in some cases the kernel implementations will refuse mounting when corrupted,
+         * read-only and "norecovery" is specified). But I think for the case of automatically determined
+         * mount options for loopback devices this is the right choice, since otherwise using the same
+         * loopback file twice even in read-only mode, is going to fail badly sooner or later. The usecase of
+         * making reuse of the immutable images "just work" is more relevant to us than having read-only
+         * access that actually modifies stuff work on such image files. Or to say this differently: if
+         * people want their file systems to be fixed up they should just open them in writable mode, where
+         * all these problems don't exist. */
+        if (!rw && fstype && fstype_can_norecovery(fstype))
+                if (!strextend_with_separator(&options, ",", "norecovery"))
+                        return -ENOMEM;
+
+        if (discard && fstype && fstype_can_discard(fstype))
+                if (!strextend_with_separator(&options, ",", "discard"))
+                        return -ENOMEM;
+
+        if (!ret_ms_flags) /* Fold flags into option string if ret_ms_flags is specified as NULL */
+                if (!strextend_with_separator(&options, ",",
+                                              FLAGS_SET(flags, MS_RDONLY) ? "ro" : "rw",
+                                              FLAGS_SET(flags, MS_NODEV) ? "nodev" : "dev",
+                                              FLAGS_SET(flags, MS_NOSUID) ? "nosuid" : "suid",
+                                              FLAGS_SET(flags, MS_NOEXEC) ? "noexec" : "exec",
+                                              FLAGS_SET(flags, MS_NOSYMFOLLOW) ? "nosymfollow" : NULL))
+                        /* NB: we suppress 'symfollow' here, since it's the default, and old /bin/mount might not know it */
+                        return -ENOMEM;
+
+        if (ret_ms_flags)
+                *ret_ms_flags = flags;
+
+        *ret_options = TAKE_PTR(options);
+        return 0;
+}
+
 static int mount_partition(
+                PartitionDesignator d,
                DissectedPartition *m,
                const char *where,
                const char *directory,
@ -1511,8 +1604,9 @@ static int mount_partition(
                DissectImageFlags flags) {

        _cleanup_free_ char *chased = NULL, *options = NULL;
+        bool rw, discard, remap_uid_gid = false;
        const char *p, *node, *fstype;
-        bool rw, remap_uid_gid = false;
+        unsigned long ms_flags;
        int r;

        assert(m);
@ -1523,7 +1617,7 @@ static int mount_partition(

        /* Use decrypted node and matching fstype if available, otherwise use the original device */
        node = FORMAT_PROC_FD_PATH(m->mount_node_fd);
-        fstype = m->decrypted_node ? m->decrypted_fstype: m->fstype;
+        fstype = dissected_partition_fstype(m);

        if (!fstype)
                return -EAFNOSUPPORT;
@ -1541,6 +1635,9 @@ static int mount_partition(

        rw = m->rw && !(flags & DISSECT_IMAGE_MOUNT_READ_ONLY);

+        discard = ((flags & DISSECT_IMAGE_DISCARD) ||
+                   ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && is_loop_device(m->node) > 0));
+
        if (FLAGS_SET(flags, DISSECT_IMAGE_FSCK) && rw) {
                r = run_fsck(m->mount_node_fd, fstype);
                if (r < 0)
@ -1571,14 +1668,9 @@ static int mount_partition(
                p = where;
        }

-        /* If requested, turn on discard support. */
-        if (fstype_can_discard(fstype) &&
-            ((flags & DISSECT_IMAGE_DISCARD) ||
-             ((flags & DISSECT_IMAGE_DISCARD_ON_LOOP) && is_loop_device(m->node) > 0))) {
-                options = strdup("discard");
-                if (!options)
-                        return -ENOMEM;
-        }
+        r = partition_pick_mount_options(d, dissected_partition_fstype(m), rw, discard, &options, &ms_flags);
+        if (r < 0)
+                return r;

        if (uid_is_valid(uid_shift) && uid_shift != 0) {

@ -1598,28 +1690,7 @@ static int mount_partition(
                if (!strextend_with_separator(&options, ",", m->mount_options))
                        return -ENOMEM;

-        /* So, when you request MS_RDONLY from ext4, then this means nothing. It happily still writes to the
-         * backing storage. What's worse, the BLKRO[GS]ET flag and (in case of loopback devices)
-         * LO_FLAGS_READ_ONLY don't mean anything, they affect userspace accesses only, and write accesses
-         * from the upper file system still get propagated through to the underlying file system,
-         * unrestricted. To actually get ext4/xfs/btrfs to stop writing to the device we need to specify
-         * "norecovery" as mount option, in addition to MS_RDONLY. Yes, this sucks, since it means we need to
-         * carry a per file system table here.
-         *
-         * Note that this means that we might not be able to mount corrupted file systems as read-only
-         * anymore (since in some cases the kernel implementations will refuse mounting when corrupted,
-         * read-only and "norecovery" is specified). But I think for the case of automatically determined
-         * mount options for loopback devices this is the right choice, since otherwise using the same
-         * loopback file twice even in read-only mode, is going to fail badly sooner or later. The usecase of
-         * making reuse of the immutable images "just work" is more relevant to us than having read-only
-         * access that actually modifies stuff work on such image files. Or to say this differently: if
-         * people want their file systems to be fixed up they should just open them in writable mode, where
-         * all these problems don't exist. */
-        if (!rw && STRPTR_IN_SET(fstype, "ext3", "ext4", "xfs", "btrfs"))
-                if (!strextend_with_separator(&options, ",", "norecovery"))
-                        return -ENOMEM;
-
-        r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), options);
+        r = mount_nofollow_verbose(LOG_DEBUG, node, p, fstype, ms_flags, options);
        if (r < 0)
                return r;

@ -1692,14 +1763,14 @@ int dissected_image_mount(

                /* First mount the root fs. If there's none we use a tmpfs. */
                if (m->partitions[PARTITION_ROOT].found)
-                        r = mount_partition(m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, flags);
+                        r = mount_partition(PARTITION_ROOT, m->partitions + PARTITION_ROOT, where, NULL, uid_shift, uid_range, flags);
                else
                        r = mount_root_tmpfs(where, uid_shift, flags);
                if (r < 0)
                        return r;

                /* For us mounting root always means mounting /usr as well */
-                r = mount_partition(m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, flags);
+                r = mount_partition(PARTITION_USR, m->partitions + PARTITION_USR, where, "/usr", uid_shift, uid_range, flags);
                if (r < 0)
                        return r;

@ -1731,23 +1802,23 @@ int dissected_image_mount(
        if (flags & DISSECT_IMAGE_MOUNT_ROOT_ONLY)
                return 0;

-        r = mount_partition(m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, flags);
+        r = mount_partition(PARTITION_HOME, m->partitions + PARTITION_HOME, where, "/home", uid_shift, uid_range, flags);
        if (r < 0)
                return r;

-        r = mount_partition(m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, flags);
+        r = mount_partition(PARTITION_SRV, m->partitions + PARTITION_SRV, where, "/srv", uid_shift, uid_range, flags);
        if (r < 0)
                return r;

-        r = mount_partition(m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, flags);
+        r = mount_partition(PARTITION_VAR, m->partitions + PARTITION_VAR, where, "/var", uid_shift, uid_range, flags);
        if (r < 0)
                return r;

-        r = mount_partition(m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, flags);
+        r = mount_partition(PARTITION_TMP, m->partitions + PARTITION_TMP, where, "/var/tmp", uid_shift, uid_range, flags);
        if (r < 0)
                return r;

-        xbootldr_mounted = mount_partition(m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, flags);
+        xbootldr_mounted = mount_partition(PARTITION_XBOOTLDR, m->partitions + PARTITION_XBOOTLDR, where, "/boot", uid_shift, uid_range, flags);
        if (xbootldr_mounted < 0)
                return xbootldr_mounted;

@ -1773,7 +1844,7 @@ int dissected_image_mount(
                                                return r;
                                } else if (dir_is_empty(p, /* ignore_hidden_or_backup= */ false) > 0) {
                                        /* It exists and is an empty directory. Let's mount the ESP there. */
-                                        r = mount_partition(m->partitions + PARTITION_ESP, where, "/boot", uid_shift, uid_range, flags);
+                                        r = mount_partition(PARTITION_ESP, m->partitions + PARTITION_ESP, where, "/boot", uid_shift, uid_range, flags);
                                        if (r < 0)
                                                return r;

@ -1785,7 +1856,7 @@ int dissected_image_mount(
                if (!esp_done) {
                        /* OK, let's mount the ESP now to /efi (possibly creating the dir if missing) */

-                        r = mount_partition(m->partitions + PARTITION_ESP, where, "/efi", uid_shift, uid_range, flags);
+                        r = mount_partition(PARTITION_ESP, m->partitions + PARTITION_ESP, where, "/efi", uid_shift, uid_range, flags);
                        if (r < 0)
                                return r;
                }
--- a/src/shared/dissect-image.h
+++ b/src/shared/dissect-image.h
@ -193,3 +193,11 @@ int dissect_fstype_ok(const char *fstype);

 int probe_sector_size(int fd, uint32_t *ret);
 int probe_sector_size_prefer_ioctl(int fd, uint32_t *ret);
+
+int partition_pick_mount_options(PartitionDesignator d, const char *fstype, bool rw, bool discard, char **ret_options, unsigned long *ret_ms_flags); /* NOTE(review): definition is not part of this header hunk; going by the ret_ prefix, ret_options and ret_ms_flags are presumably output parameters -- confirm against the implementation */
+
+static inline const char *dissected_partition_fstype(const DissectedPartition *m) { /* Picks which of the two fstype fields of m applies: decrypted_fstype or fstype */
+        assert(m); /* m must not be NULL */

+        return m->decrypted_node ? m->decrypted_fstype : m->fstype; /* decrypted_node set: use decrypted_fstype; otherwise fall back to fstype */
+}
--- a/src/test/test-mountpoint-util.c
+++ b/src/test/test-mountpoint-util.c
@ -321,6 +321,51 @@ TEST(fd_is_mount_point) {
        assert_se(fd_is_mount_point(fd, "", 0) == -EINVAL);
 }

+TEST(ms_nosymfollow_supported) { /* Smoke test: logs what ms_nosymfollow_supported() reports; nothing is asserted about the value */
+        log_info("MS_NOSYMFOLLOW supported: %s", yes_no(ms_nosymfollow_supported()));
+}
+
+TEST(mount_option_supported) { /* Calls mount_option_supported() for several fstype/option/value triples, logs each result and asserts on the return code */
+        int r; /* return code of the most recent mount_option_supported() call */

+        r = mount_option_supported("tmpfs", "size", "64M");
+        log_info("tmpfs supports size=64M: %s (%i)", r < 0 ? "dont know" : yes_no(r), r);
+        assert_se(r > 0 || (r < 0 && ERRNO_IS_PRIVILEGE(r))); /* passes on a positive result, or on a negative code accepted by ERRNO_IS_PRIVILEGE() */

+        r = mount_option_supported("ext4", "discard", NULL); /* option passed without a value (NULL) */
+        log_info("ext4 supports discard: %s (%i)", r < 0 ? "dont know" : yes_no(r), r);
+        assert_se(r > 0 || r == -EAGAIN || (r < 0 && ERRNO_IS_PRIVILEGE(r))); /* NOTE(review): only this check also tolerates -EAGAIN; what -EAGAIN signals is not visible here -- confirm the tmpfs/proc checks should not accept it too */

+        r = mount_option_supported("tmpfs", "idontexist", "64M");
+        log_info("tmpfs supports idontexist: %s (%i)", r < 0 ? "dont know" : yes_no(r), r);
+        assert_se(r == 0 || (r < 0 && ERRNO_IS_PRIVILEGE(r))); /* expects 0 for this option name, or a privilege error */

+        r = mount_option_supported("tmpfs", "ialsodontexist", NULL); /* same expectation as above, this time without a value */
+        log_info("tmpfs supports ialsodontexist: %s (%i)", r < 0 ? "dont know" : yes_no(r), r);
+        assert_se(r == 0 || (r < 0 && ERRNO_IS_PRIVILEGE(r)));

+        r = mount_option_supported("proc", "hidepid", "1");
+        log_info("proc supports hidepid=1: %s (%i)", r < 0 ? "dont know" : yes_no(r), r);
+        assert_se(r >= 0 || (r < 0 && ERRNO_IS_PRIVILEGE(r))); /* any non-negative result passes; the inner r < 0 is already implied once r >= 0 is false */
+}
+
+TEST(fstype_can_discard) { /* Expects fstype_can_discard() to be true for "ext4" and false for "squashfs" and "iso9660" */
+        assert_se(fstype_can_discard("ext4"));
+        assert_se(!fstype_can_discard("squashfs"));
+        assert_se(!fstype_can_discard("iso9660"));
+}
+
+TEST(fstype_can_norecovery) { /* Expects fstype_can_norecovery() to be true for "ext4" and false for "vfat" and "tmpfs" */
+        assert_se(fstype_can_norecovery("ext4"));
+        assert_se(!fstype_can_norecovery("vfat"));
+        assert_se(!fstype_can_norecovery("tmpfs"));
+}
+
+TEST(fstype_can_umask) { /* Expects fstype_can_umask() to be true for "vfat" and false for "tmpfs" */
+        assert_se(fstype_can_umask("vfat"));
+        assert_se(!fstype_can_umask("tmpfs"));
+}
+
 static int intro(void) {
        /* let's move into our own mount namespace with all propagation from the host turned off, so
         * that /proc/self/mountinfo is static and constant for the whole time our test runs. */