Merge pull request #25723 from keszybz/generators-tmp

Run generators with / ro and /tmp mounted
This commit is contained in:
Yu Watanabe 2022-12-15 12:53:49 +09:00 committed by GitHub
commit a6e16d949c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
35 changed files with 341 additions and 289 deletions

View file

@ -43,7 +43,7 @@
// src/basic/umask-util.h
#define _cleanup_umask_
#define RUN_WITH_UMASK(mask) \
#define WITH_UMASK(mask) \
for (_cleanup_umask_ mode_t _saved_umask_ = umask(mask) | S_IFMT; \
FLAGS_SET(_saved_umask_, S_IFMT); \
_saved_umask_ &= 0777)

View file

@ -21,7 +21,7 @@ ArchLinux initrds.
* The initrd should mount `/run/` as a tmpfs and pass it pre-mounted when
jumping into the main system when executing systemd. The mount options should
be `mode=755,nodev,nosuid,strictatime`.
be `mode=0755,nodev,nosuid,strictatime`.
* It's highly recommended that the initrd also mounts `/usr/` (if split off) as
appropriate and passes it pre-mounted to the main system, to avoid the

View file

@ -522,6 +522,52 @@ int dev_is_devtmpfs(void) {
return false;
}
int mount_fd(const char *source,
int target_fd,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) < 0) {
if (errno != ENOENT)
return -errno;
/* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
* mounted. Check for the latter to generate better error messages. */
if (proc_mounted() == 0)
return -ENOSYS;
return -ENOENT;
}
return 0;
}
int mount_nofollow(
const char *source,
const char *target,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
_cleanup_close_ int fd = -1;
/* In almost all cases we want to manipulate the mount table without following symlinks, hence
* mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
* not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
* initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
* fs to mount) we can only use traditional mount() directly.
*
* Note that this disables following only for the final component of the target, i.e symlinks within
* the path of the target are honoured, as are symlinks in the source path everywhere. */
fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
if (fd < 0)
return -errno;
return mount_fd(source, fd, filesystemtype, mountflags, data);
}
const char *mount_propagation_flags_to_string(unsigned long flags) {
switch (flags & (MS_SHARED|MS_SLAVE|MS_PRIVATE)) {

View file

@ -5,6 +5,36 @@
#include <stdbool.h>
#include <sys/types.h>
/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
* consume any space and udev isn't supposed to create regular file either. There's no limit on the
* max number of inodes since such limit is hard to guess especially on large storage array
* systems. */
#define TMPFS_LIMITS_DEV ",size=4m"
/* The limit used for /dev in private namespaces. 4MB for contents of regular files. The number of
* inodes should be relatively low in private namespaces but for now use a 64k limit. */
#define TMPFS_LIMITS_PRIVATE_DEV ",size=4m,nr_inodes=64k"
/* Very little, if any use expected */
#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k"
#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST
#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST
/* On an extremely small device with only 256MB of RAM, 20% of RAM should be enough for the re-execution of
* PID1 because 16MB of free space is required. */
#define TMPFS_LIMITS_RUN ",size=20%,nr_inodes=800k"
/* The limit used for various nested tmpfs mounts, in particular for guests started by systemd-nspawn.
* 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25%
* translates to 1M inodes.
* (On the host, /tmp is configured through a .mount unit file.) */
#define NESTED_TMPFS_LIMITS ",size=10%,nr_inodes=400k"
/* More space for volatile root and /var */
#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m"
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
int name_to_handle_at_loop(int fd, const char *path, struct file_handle **ret_handle, int *ret_mnt_id, int flags);
int path_get_mnt_id(const char *path, int *ret);
@ -22,5 +52,8 @@ bool fstype_can_uid_gid(const char *fstype);
int dev_is_devtmpfs(void);
int mount_fd(const char *source, int target_fd, const char *filesystemtype, unsigned long mountflags, const void *data);
int mount_nofollow(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
const char *mount_propagation_flags_to_string(unsigned long flags);
int mount_propagation_flags_from_string(const char *name, unsigned long *ret);

View file

@ -36,6 +36,7 @@
#include "memory-util.h"
#include "missing_sched.h"
#include "missing_syscall.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
@ -1252,15 +1253,26 @@ int safe_fork_full(
}
if (FLAGS_SET(flags, FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE)) {
/* Optionally, make sure we never propagate mounts to the host. */
if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) < 0) {
log_full_errno(prio, errno, "Failed to remount root directory as MS_SLAVE: %m");
_exit(EXIT_FAILURE);
}
}
if (FLAGS_SET(flags, FORK_PRIVATE_TMP)) {
assert(FLAGS_SET(flags, FORK_NEW_MOUNTNS));
/* Optionally, overmount new tmpfs instance on /tmp/. */
r = mount_nofollow("tmpfs", "/tmp", "tmpfs",
MS_NOSUID|MS_NODEV,
"mode=01777" TMPFS_LIMITS_RUN);
if (r < 0) {
log_full_errno(prio, r, "Failed to overmount /tmp/: %m");
_exit(EXIT_FAILURE);
}
}
if (flags & FORK_CLOSE_ALL_FDS) {
/* Close the logs here in case it got reopened above, as close_all_fds() would close them for us */
log_close();

View file

@ -147,11 +147,12 @@ typedef enum ForkFlags {
FORK_WAIT = 1 << 7, /* Wait until child exited */
FORK_NEW_MOUNTNS = 1 << 8, /* Run child in its own mount namespace */
FORK_MOUNTNS_SLAVE = 1 << 9, /* Make child's mount namespace MS_SLAVE */
FORK_RLIMIT_NOFILE_SAFE = 1 << 10, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 11, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 12, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 13, /* Run child in its own user namespace */
FORK_CLOEXEC_OFF = 1 << 14, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
FORK_PRIVATE_TMP = 1 << 10, /* Mount new /tmp/ in the child (combine with FORK_NEW_MOUNTNS!) */
FORK_RLIMIT_NOFILE_SAFE = 1 << 11, /* Set RLIMIT_NOFILE soft limit to 1K for select() compat */
FORK_STDOUT_TO_STDERR = 1 << 12, /* Make stdout a copy of stderr */
FORK_FLUSH_STDIO = 1 << 13, /* fflush() stdout (and stderr) before forking */
FORK_NEW_USERNS = 1 << 14, /* Run child in its own user namespace */
FORK_CLOEXEC_OFF = 1 << 15, /* In the child: turn off O_CLOEXEC on all fds in except_fds[] */
} ForkFlags;
int safe_fork_full(const char *name, const int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);

View file

@ -15,12 +15,12 @@ static inline void umaskp(mode_t *u) {
/* We make use of the fact here that the umask() concept is using only the lower 9 bits of mode_t, although
* mode_t has space for the file type in the bits further up. We simply OR in the file type mask S_IFMT to
* distinguish the first and the second iteration of the RUN_WITH_UMASK() loop, so that we can run the first
* one, and exit on the second. */
* distinguish the first and the second iteration of the WITH_UMASK() loop, so that we can run the first one,
* and exit on the second. */
assert_cc((S_IFMT & 0777) == 0);
#define RUN_WITH_UMASK(mask) \
#define WITH_UMASK(mask) \
for (_cleanup_umask_ mode_t _saved_umask_ = umask(mask) | S_IFMT; \
FLAGS_SET(_saved_umask_, S_IFMT); \
_saved_umask_ &= 0777)

View file

@ -828,7 +828,7 @@ static int copy_file_with_version_check(const char *from, const char *to, bool f
if (r < 0)
return log_oom();
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
fd_to = open(t, O_WRONLY|O_CREAT|O_CLOEXEC|O_EXCL|O_NOFOLLOW, 0644);
if (fd_to < 0)
return log_error_errno(errno, "Failed to open \"%s\" for writing: %m", t);
@ -2073,7 +2073,7 @@ static int install_random_seed(const char *esp) {
/* Let's write this variable with an umask in effect, so that unprivileged users can't see the token
* and possibly get identification information or too much insight into the kernel's entropy pool
* state. */
RUN_WITH_UMASK(0077) {
WITH_UMASK(0077) {
r = efi_set_variable(EFI_LOADER_VARIABLE(LoaderSystemToken), buffer, sizeof(buffer));
if (r < 0) {
if (!arg_graceful)
@ -2147,7 +2147,7 @@ static int verb_install(int argc, char *argv[], void *userdata) {
const char *arch = arg_arch_all ? "" : get_efi_arch();
RUN_WITH_UMASK(0002) {
WITH_UMASK(0002) {
if (install) {
/* Don't create any of these directories when we are just updating. When we update
* we'll drop-in our files (unless there are newer ones already), but we won't create

View file

@ -941,7 +941,7 @@ int bus_init_private(Manager *m) {
if (fd < 0)
return log_error_errno(errno, "Failed to allocate private socket: %m");
RUN_WITH_UMASK(0077)
WITH_UMASK(0077)
r = bind(fd, &sa.sa, sa_len);
if (r < 0)
return log_error_errno(errno, "Failed to bind private socket: %m");

View file

@ -1406,7 +1406,7 @@ static int write_container_id(void) {
if (isempty(c))
return 0;
RUN_WITH_UMASK(0022)
WITH_UMASK(0022)
r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
if (r < 0)
return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");

View file

@ -6,6 +6,7 @@
#include <sys/epoll.h>
#include <sys/inotify.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/reboot.h>
#include <sys/timerfd.h>
#include <sys/utsname.h>
@ -62,6 +63,7 @@
#include "manager-serialize.h"
#include "memory-util.h"
#include "mkdir-label.h"
#include "mount-util.h"
#include "os-util.h"
#include "parse-util.h"
#include "path-lookup.h"
@ -1103,7 +1105,7 @@ static int manager_setup_cgroups_agent(Manager *m) {
(void) sockaddr_un_unlink(&sa.un);
/* Only allow root to connect to this socket */
RUN_WITH_UMASK(0077)
WITH_UMASK(0077)
r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
if (r < 0)
return log_error_errno(errno, "bind(%s) failed: %m", sa.un.sun_path);
@ -3674,7 +3676,7 @@ static int manager_run_environment_generators(Manager *m) {
if (!generator_path_any((const char* const*) paths))
return 0;
RUN_WITH_UMASK(0022)
WITH_UMASK(0022)
r = execute_directories((const char* const*) paths, DEFAULT_TIMEOUT_USEC, gather_environment,
args, NULL, m->transient_environment,
EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
@ -3740,8 +3742,45 @@ static int build_generator_environment(Manager *m, char ***ret) {
return 0;
}
static int manager_execute_generators(Manager *m, char **paths, bool remount_ro) {
_cleanup_strv_free_ char **ge = NULL;
const char *argv[] = {
NULL, /* Leave this empty, execute_directory() will fill something in */
m->lookup_paths.generator,
m->lookup_paths.generator_early,
m->lookup_paths.generator_late,
NULL,
};
int r;
r = build_generator_environment(m, &ge);
if (r < 0)
return log_error_errno(r, "Failed to build generator environment: %m");
if (remount_ro) {
/* Remount most of the filesystem tree read-only. We leave /sys/ as-is, because our code
* checks whether it is read-only to detect containerized execution environments. We leave
* /run/ as-is too, because that's where our output goes. We also leave /proc/ and /dev/shm/
* because they're API, and /tmp/ that safe_fork() mounted for us.
*/
r = bind_remount_recursive("/", MS_RDONLY, MS_RDONLY,
STRV_MAKE("/sys", "/run", "/proc", "/dev/shm", "/tmp"));
if (r < 0)
log_warning_errno(r, "Read-only bind remount failed, ignoring: %m");
}
BLOCK_WITH_UMASK(0022);
return execute_directories(
(const char* const*) paths,
DEFAULT_TIMEOUT_USEC,
/* callbacks= */ NULL, /* callback_args= */ NULL,
(char**) argv,
ge,
EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
}
static int manager_run_generators(Manager *m) {
_cleanup_strv_free_ char **paths = NULL, **ge = NULL;
_cleanup_strv_free_ char **paths = NULL;
int r;
assert(m);
@ -3762,30 +3801,22 @@ static int manager_run_generators(Manager *m) {
goto finish;
}
const char *argv[] = {
NULL, /* Leave this empty, execute_directory() will fill something in */
m->lookup_paths.generator,
m->lookup_paths.generator_early,
m->lookup_paths.generator_late,
NULL,
};
r = build_generator_environment(m, &ge);
if (r < 0) {
log_error_errno(r, "Failed to build generator environment: %m");
/* If we are the system manager, we fork and invoke the generators in a sanitized mount namespace. If
* we are the user manager, let's just execute the generators directly. We might not have the
* necessary privileges, and the system manager has already mounted /tmp/ and everything else for us.
*/
if (MANAGER_IS_USER(m)) {
r = manager_execute_generators(m, paths, /* remount_ro= */ false);
goto finish;
}
RUN_WITH_UMASK(0022)
(void) execute_directories(
(const char* const*) paths,
DEFAULT_TIMEOUT_USEC,
/* callbacks= */ NULL, /* callback_args= */ NULL,
(char**) argv,
ge,
EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS | EXEC_DIR_SET_SYSTEMD_EXEC_PID);
r = 0;
r = safe_fork("(sd-gens)",
FORK_RESET_SIGNALS | FORK_LOG | FORK_WAIT | FORK_NEW_MOUNTNS | FORK_MOUNTNS_SLAVE | FORK_PRIVATE_TMP,
NULL);
if (r == 0) {
r = manager_execute_generators(m, paths, /* remount_ro= */ true);
_exit(r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE);
}
finish:
lookup_paths_trim_generator(&m->lookup_paths);

View file

@ -104,7 +104,7 @@ static const MountEntry apivfs_table[] = {
{ "/proc", PROCFS, false },
{ "/dev", BIND_DEV, false },
{ "/sys", SYSFS, false },
{ "/run", RUN, false, .options_const = "mode=755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
{ "/run", RUN, false, .options_const = "mode=0755" TMPFS_LIMITS_RUN, .flags = MS_NOSUID|MS_NODEV|MS_STRICTATIME },
};
/* ProtectKernelTunables= option and the related filesystem APIs */
@ -366,7 +366,7 @@ static int append_empty_dir_mounts(MountEntry **p, char **strv) {
.mode = EMPTY_DIR,
.ignore = false,
.read_only = true,
.options_const = "mode=755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
.options_const = "mode=0755" TMPFS_LIMITS_EMPTY_OR_ALMOST,
.flags = MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
};
}
@ -927,7 +927,7 @@ static int mount_private_dev(MountEntry *m) {
dev = strjoina(temporary_mount, "/dev");
(void) mkdir(dev, 0755);
r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=755" TMPFS_LIMITS_PRIVATE_DEV);
r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", dev, "tmpfs", DEV_MOUNT_OPTIONS, "mode=0755" TMPFS_LIMITS_PRIVATE_DEV);
if (r < 0)
goto fail;
@ -1113,8 +1113,8 @@ static int mount_procfs(const MountEntry *m, const NamespaceInfo *ns_info) {
* added in the same commit: if it's supported it is thus also per-instance. */
const char *hpv = ns_info->protect_proc == PROTECT_PROC_DEFAULT ?
"off" :
protect_proc_to_string(ns_info->protect_proc);
"off" :
protect_proc_to_string(ns_info->protect_proc);
/* hidepid= support was added in 5.8, so we can use fsconfig()/fsopen() (which were added in
* 5.2) to check if hidepid= is supported. This avoids a noisy dmesg log by the kernel when
@ -1256,8 +1256,8 @@ static int mount_image(const MountEntry *m, const char *root_directory) {
}
r = verity_dissect_and_mount(
/* src_fd= */ -1, mount_entry_source(m), mount_entry_path(m), m->image_options,
host_os_release_id, host_os_release_version_id, host_os_release_sysext_level, NULL);
/* src_fd= */ -1, mount_entry_source(m), mount_entry_path(m), m->image_options,
host_os_release_id, host_os_release_version_id, host_os_release_sysext_level, NULL);
if (r == -ENOENT && m->ignore)
return 0;
if (r == -ESTALE && host_os_release_id)
@ -1704,7 +1704,7 @@ static size_t namespace_calculate_mounts(
n_bind_mounts +
n_mount_images +
(n_extension_images > 0 || n_extension_directories > 0 ? /* Mount each image and directory plus an overlay per hierarchy */
n_hierarchies + n_extension_images + n_extension_directories: 0) +
n_hierarchies + n_extension_images + n_extension_directories: 0) +
n_temporary_filesystems +
ns_info->private_dev +
(ns_info->protect_kernel_tunables ?
@ -2708,7 +2708,7 @@ static int make_tmp_prefix(const char *prefix) {
if (errno != ENOENT)
return -errno;
RUN_WITH_UMASK(000)
WITH_UMASK(000)
r = mkdir_parents(prefix, 0755);
if (r < 0)
return r;
@ -2765,7 +2765,7 @@ static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, ch
if (r < 0)
return r;
RUN_WITH_UMASK(0077)
WITH_UMASK(0077)
if (!mkdtemp(x)) {
if (errno == EROFS || ERRNO_IS_DISK_SPACE(errno))
rw = false;
@ -2778,9 +2778,9 @@ static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, ch
if (!y)
return -ENOMEM;
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
if (mkdir(y, 0777 | S_ISVTX) < 0)
return -errno;
return -errno;
r = label_fix_full(AT_FDCWD, y, prefix, 0);
if (r < 0)
@ -2792,7 +2792,7 @@ static int setup_one_tmp_dir(const char *id, const char *prefix, char **path, ch
/* Trouble: we failed to create the directory. Instead of failing, let's simulate /tmp being
* read-only. This way the service will get the EROFS result as if it was writing to the real
* file system. */
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
r = mkdir_p(RUN_SYSTEMD_EMPTY, 0500);
if (r < 0)
return r;

View file

@ -4469,7 +4469,7 @@ int unit_make_transient(Unit *u) {
/* Let's open the file we'll write the transient settings into. This file is kept open as long as we are
* creating the transient, and is closed in unit_load(), as soon as we start loading the file. */
RUN_WITH_UMASK(0022) {
WITH_UMASK(0022) {
f = fopen(path, "we");
if (!f)
return -errno;

View file

@ -671,12 +671,12 @@ static int parse_fstab(bool initrd) {
if (path_is_read_only_fs("/sys") > 0) {
if (streq(what, "sysfs")) {
log_info("Running in a container, ignoring fstab entry for %s.", what);
log_info("/sys/ is read-only (running in a container?), ignoring fstab entry for %s.", me->mnt_dir);
continue;
}
if (is_device_path(what)) {
log_info("Running in a container, ignoring fstab device entry for %s.", what);
log_info("/sys/ is read-only (running in a container?), ignoring fstab device entry for %s.", what);
continue;
}
}

View file

@ -130,7 +130,7 @@ int home_create_directory_or_subvolume(UserRecord *h, HomeSetup *setup, UserReco
switch (user_record_storage(h)) {
case USER_SUBVOLUME:
RUN_WITH_UMASK(0077)
WITH_UMASK(0077)
r = btrfs_subvol_make(d);
if (r >= 0) {

View file

@ -317,7 +317,7 @@ static int mount_legacy_cgns_supported(
* uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
* pass uid 0 and not uid_shift to tmpfs_patch_options().
*/
r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, 0, selinux_apifs_context, &options);
r = tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP, 0, selinux_apifs_context, &options);
if (r < 0)
return log_oom();
@ -390,7 +390,8 @@ skip_controllers:
if (!userns)
return mount_nofollow_verbose(LOG_ERR, NULL, cgroup_root, NULL,
MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
"mode=0755");
return 0;
}
@ -419,7 +420,10 @@ static int mount_legacy_cgns_unsupported(
if (r == 0) {
_cleanup_free_ char *options = NULL;
r = tmpfs_patch_options("mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &options);
r = tmpfs_patch_options("mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP,
uid_shift == 0 ? UID_INVALID : uid_shift,
selinux_apifs_context,
&options);
if (r < 0)
return log_oom();
@ -498,7 +502,8 @@ skip_controllers:
return r;
return mount_nofollow_verbose(LOG_ERR, NULL, cgroup_root, NULL,
MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
"mode=0755");
}
static int mount_unified_cgroups(const char *dest) {

View file

@ -576,19 +576,19 @@ int mount_all(const char *dest,
MOUNT_IN_USERNS|MOUNT_MKDIR },
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
{ "tmpfs", "/tmp", "tmpfs", "mode=1777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/tmp", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
{ "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "tmpfs", "/sys", "tmpfs", "mode=0555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
{ "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
{ "tmpfs", "/dev", "tmpfs", "mode=755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
{ "tmpfs", "/dev", "tmpfs", "mode=0755" TMPFS_LIMITS_PRIVATE_DEV, MS_NOSUID|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777" NESTED_TMPFS_LIMITS, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
MOUNT_FATAL|MOUNT_MKDIR },
{ "/run/host", "/run/host", NULL, NULL, MS_BIND,
MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PREFIX_ROOT }, /* Prepare this so that we can make it read-only when we are done */
@ -1043,7 +1043,7 @@ static int setup_volatile_state(const char *directory, uid_t uid_shift, const ch
if (r < 0 && errno != EEXIST)
return log_error_errno(errno, "Failed to create %s: %m", directory);
options = "mode=755" TMPFS_LIMITS_VOLATILE_STATE;
options = "mode=0755" TMPFS_LIMITS_VOLATILE_STATE;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
return log_oom();
@ -1087,7 +1087,7 @@ static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
options = "mode=755" TMPFS_LIMITS_ROOTFS;
options = "mode=0755" TMPFS_LIMITS_ROOTFS;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
goto fail;
@ -1154,7 +1154,7 @@ static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
options = "mode=755" TMPFS_LIMITS_ROOTFS;
options = "mode=0755" TMPFS_LIMITS_ROOTFS;
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
goto finish;

View file

@ -33,90 +33,90 @@ static int add_syscall_filters(
const char* name;
} allow_list[] = {
/* Let's use set names where we can */
{ 0, "@aio" },
{ 0, "@basic-io" },
{ 0, "@chown" },
{ 0, "@default" },
{ 0, "@file-system" },
{ 0, "@io-event" },
{ 0, "@ipc" },
{ 0, "@mount" },
{ 0, "@network-io" },
{ 0, "@process" },
{ 0, "@resources" },
{ 0, "@setuid" },
{ 0, "@signal" },
{ 0, "@sync" },
{ 0, "@timer" },
{ 0, "@aio" },
{ 0, "@basic-io" },
{ 0, "@chown" },
{ 0, "@default" },
{ 0, "@file-system" },
{ 0, "@io-event" },
{ 0, "@ipc" },
{ 0, "@mount" },
{ 0, "@network-io" },
{ 0, "@process" },
{ 0, "@resources" },
{ 0, "@setuid" },
{ 0, "@signal" },
{ 0, "@sync" },
{ 0, "@timer" },
/* The following four are sets we optionally enable, in case the caps have been configured for it */
{ CAP_SYS_TIME, "@clock" },
{ CAP_SYS_MODULE, "@module" },
{ CAP_SYS_RAWIO, "@raw-io" },
{ CAP_IPC_LOCK, "@memlock" },
/* The following four are sets we optionally enable, n case the caps have been configured for it */
{ CAP_SYS_TIME, "@clock" },
{ CAP_SYS_MODULE, "@module" },
{ CAP_SYS_RAWIO, "@raw-io" },
{ CAP_IPC_LOCK, "@memlock" },
/* Plus a good set of additional syscalls which are not part of any of the groups above */
{ 0, "brk" },
{ 0, "capget" },
{ 0, "capset" },
{ 0, "copy_file_range" },
{ 0, "fadvise64" },
{ 0, "fadvise64_64" },
{ 0, "flock" },
{ 0, "get_mempolicy" },
{ 0, "getcpu" },
{ 0, "getpriority" },
{ 0, "getrandom" },
{ 0, "ioctl" },
{ 0, "ioprio_get" },
{ 0, "kcmp" },
{ 0, "madvise" },
{ 0, "mincore" },
{ 0, "mprotect" },
{ 0, "mremap" },
{ 0, "name_to_handle_at" },
{ 0, "oldolduname" },
{ 0, "olduname" },
{ 0, "personality" },
{ 0, "readahead" },
{ 0, "readdir" },
{ 0, "remap_file_pages" },
{ 0, "sched_get_priority_max" },
{ 0, "sched_get_priority_min" },
{ 0, "sched_getaffinity" },
{ 0, "sched_getattr" },
{ 0, "sched_getparam" },
{ 0, "sched_getscheduler" },
{ 0, "sched_rr_get_interval" },
{ 0, "brk" },
{ 0, "capget" },
{ 0, "capset" },
{ 0, "copy_file_range" },
{ 0, "fadvise64" },
{ 0, "fadvise64_64" },
{ 0, "flock" },
{ 0, "get_mempolicy" },
{ 0, "getcpu" },
{ 0, "getpriority" },
{ 0, "getrandom" },
{ 0, "ioctl" },
{ 0, "ioprio_get" },
{ 0, "kcmp" },
{ 0, "madvise" },
{ 0, "mincore" },
{ 0, "mprotect" },
{ 0, "mremap" },
{ 0, "name_to_handle_at" },
{ 0, "oldolduname" },
{ 0, "olduname" },
{ 0, "personality" },
{ 0, "readahead" },
{ 0, "readdir" },
{ 0, "remap_file_pages" },
{ 0, "sched_get_priority_max" },
{ 0, "sched_get_priority_min" },
{ 0, "sched_getaffinity" },
{ 0, "sched_getattr" },
{ 0, "sched_getparam" },
{ 0, "sched_getscheduler" },
{ 0, "sched_rr_get_interval" },
{ 0, "sched_rr_get_interval_time64" },
{ 0, "sched_yield" },
{ 0, "seccomp" },
{ 0, "sendfile" },
{ 0, "sendfile64" },
{ 0, "setdomainname" },
{ 0, "setfsgid" },
{ 0, "setfsgid32" },
{ 0, "setfsuid" },
{ 0, "setfsuid32" },
{ 0, "sethostname" },
{ 0, "setpgid" },
{ 0, "setsid" },
{ 0, "splice" },
{ 0, "sysinfo" },
{ 0, "tee" },
{ 0, "umask" },
{ 0, "uname" },
{ 0, "userfaultfd" },
{ 0, "vmsplice" },
{ 0, "sched_yield" },
{ 0, "seccomp" },
{ 0, "sendfile" },
{ 0, "sendfile64" },
{ 0, "setdomainname" },
{ 0, "setfsgid" },
{ 0, "setfsgid32" },
{ 0, "setfsuid" },
{ 0, "setfsuid32" },
{ 0, "sethostname" },
{ 0, "setpgid" },
{ 0, "setsid" },
{ 0, "splice" },
{ 0, "sysinfo" },
{ 0, "tee" },
{ 0, "umask" },
{ 0, "uname" },
{ 0, "userfaultfd" },
{ 0, "vmsplice" },
/* The following individual syscalls are added depending on specified caps */
{ CAP_SYS_PACCT, "acct" },
{ CAP_SYS_PTRACE, "process_vm_readv" },
{ CAP_SYS_PTRACE, "process_vm_writev" },
{ CAP_SYS_PTRACE, "ptrace" },
{ CAP_SYS_BOOT, "reboot" },
{ CAP_SYSLOG, "syslog" },
{ CAP_SYS_TTY_CONFIG, "vhangup" },
{ CAP_SYS_PACCT, "acct" },
{ CAP_SYS_PTRACE, "process_vm_readv" },
{ CAP_SYS_PTRACE, "process_vm_writev" },
{ CAP_SYS_PTRACE, "ptrace" },
{ CAP_SYS_BOOT, "reboot" },
{ CAP_SYSLOG, "syslog" },
{ CAP_SYS_TTY_CONFIG, "vhangup" },
/*
* The following syscalls and groups are knowingly excluded:

View file

@ -687,7 +687,7 @@ static int create_socket(char **ret) {
return r;
sa_len = r;
RUN_WITH_UMASK(0177)
WITH_UMASK(0177)
if (bind(fd, &sa.sa, sa_len) < 0)
return -errno;

View file

@ -178,7 +178,7 @@ int base_filesystem_create(const char *root, uid_t uid, gid_t gid) {
continue;
}
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
r = mkdirat(fd, table[i].dir, table[i].mode);
if (r < 0) {
log_full_errno(IN_SET(errno, EEXIST, EROFS) || table[i].ignore_failure ? LOG_DEBUG : LOG_ERR, errno,

View file

@ -1357,7 +1357,7 @@ int copy_file_full(
if (r < 0)
return r;
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
if (copy_flags & COPY_MAC_CREATE) {
r = mac_selinux_create_file_prepare(to, S_IFREG);
if (r < 0)

View file

@ -89,7 +89,7 @@ int machine_id_setup(const char *root, bool force_transient, sd_id128_t machine_
etc_machine_id = prefix_roota(root, "/etc/machine-id");
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
/* We create this 0444, to indicate that this isn't really
* something you should ever modify. Of course, since the file
* will be owned by root it doesn't matter much, but maybe
@ -166,7 +166,7 @@ int machine_id_setup(const char *root, bool force_transient, sd_id128_t machine_
run_machine_id = prefix_roota(root, "/run/machine-id");
RUN_WITH_UMASK(0022)
WITH_UMASK(0022)
r = id128_write(run_machine_id, ID128_FORMAT_PLAIN, machine_id);
if (r < 0) {
(void) unlink(run_machine_id);

View file

@ -62,55 +62,55 @@ typedef struct MountPoint {
#endif
static const MountPoint mount_table[] = {
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_FATAL|MNT_IN_CONTAINER|MNT_FOLLOW_SYMLINK },
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "devtmpfs", "/dev", "devtmpfs", "mode=755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME,
{ "devtmpfs", "/dev", "devtmpfs", "mode=0755" TMPFS_LIMITS_DEV, MS_NOSUID|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "securityfs", "/sys/kernel/security", "securityfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#if ENABLE_SMACK
{ "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "smackfs", "/sys/fs/smackfs", "smackfs", "smackfsdef=*", MS_NOSUID|MS_NOEXEC|MS_NODEV,
mac_smack_use, MNT_FATAL },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777,smackfsroot=*", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
mac_smack_use, MNT_FATAL },
#endif
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/dev/shm", "tmpfs", "mode=01777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "devpts", "/dev/pts", "devpts", "mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
{ "devpts", "/dev/pts", "devpts", "mode=0620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC,
NULL, MNT_IN_CONTAINER },
#if ENABLE_SMACK
{ "tmpfs", "/run", "tmpfs", "mode=755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=0755,smackfsroot=*" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
mac_smack_use, MNT_FATAL },
#endif
{ "tmpfs", "/run", "tmpfs", "mode=755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/run", "tmpfs", "mode=0755" TMPFS_LIMITS_RUN, MS_NOSUID|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate,memory_recursiveprot", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP, MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", "nsdelegate", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup2", "/sys/fs/cgroup/unified", "cgroup2", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_hybrid_wanted, MNT_IN_CONTAINER|MNT_CHECK_WRITABLE },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_wanted, MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
#if ENABLE_PSTORE
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#endif
#if ENABLE_EFI
{ "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "efivarfs", "/sys/firmware/efi/efivars", "efivarfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
is_efi_boot, MNT_NONE },
#endif
{ "bpf", "/sys/fs/bpf", "bpf", "mode=700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
{ "bpf", "/sys/fs/bpf", "bpf", "mode=0700", MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE, },
};
@ -356,7 +356,9 @@ int mount_cgroup_controllers(void) {
}
/* Now that we mounted everything, let's make the tmpfs the cgroup file systems are mounted into read-only. */
(void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs", MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755" TMPFS_LIMITS_SYS_FS_CGROUP);
(void) mount_nofollow("tmpfs", "/sys/fs/cgroup", "tmpfs",
MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY,
"mode=0755" TMPFS_LIMITS_SYS_FS_CGROUP);
return 0;
}

View file

@ -42,52 +42,6 @@
#include "tmpfile-util.h"
#include "user-util.h"
int mount_fd(const char *source,
int target_fd,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) < 0) {
if (errno != ENOENT)
return -errno;
/* ENOENT can mean two things: either that the source is missing, or that /proc/ isn't
* mounted. Check for the latter to generate better error messages. */
if (proc_mounted() == 0)
return -ENOSYS;
return -ENOENT;
}
return 0;
}
int mount_nofollow(
const char *source,
const char *target,
const char *filesystemtype,
unsigned long mountflags,
const void *data) {
_cleanup_close_ int fd = -1;
/* In almost all cases we want to manipulate the mount table without following symlinks, hence
* mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
* not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
* initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
* fs to mount) we can only use traditional mount() directly.
*
* Note that this disables following only for the final component of the target, i.e symlinks within
* the path of the target are honoured, as are symlinks in the source path everywhere. */
fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
if (fd < 0)
return -errno;
return mount_fd(source, fd, filesystemtype, mountflags, data);
}
int umount_recursive(const char *prefix, int flags) {
int n = 0, r;
bool again;
@ -809,9 +763,9 @@ int mount_option_mangle(
/* This extracts mount flags from the mount options, and stores
* non-mount-flag options to '*ret_remaining_options'.
* E.g.,
* "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
* "rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000"
* is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
* "size=1630748k,mode=700,uid=1000,gid=1000".
* "size=1630748k,mode=0700,uid=1000,gid=1000".
* See more examples in test-mount-util.c.
*
* If 'options' does not contain any non-mount-flag options,

View file

@ -25,39 +25,6 @@ const char* mount_attr_propagation_type_to_string(MountAttrPropagationType t) _c
MountAttrPropagationType mount_attr_propagation_type_from_string(const char *s) _pure_;
unsigned int mount_attr_propagation_type_to_flag(MountAttrPropagationType t);
/* The limit used for /dev itself. 4MB should be enough since device nodes and symlinks don't
* consume any space and udev isn't supposed to create regular file either. There's no limit on the
* max number of inodes since such limit is hard to guess especially on large storage array
* systems. */
#define TMPFS_LIMITS_DEV ",size=4m"
/* The limit used for /dev in private namespaces. 4MB for contents of regular files. The number of
* inodes should be relatively low in private namespaces but for now use a 64k limit. */
#define TMPFS_LIMITS_PRIVATE_DEV ",size=4m,nr_inodes=64k"
/* Very little, if any use expected */
#define TMPFS_LIMITS_EMPTY_OR_ALMOST ",size=4m,nr_inodes=1k"
#define TMPFS_LIMITS_SYS TMPFS_LIMITS_EMPTY_OR_ALMOST
#define TMPFS_LIMITS_SYS_FS_CGROUP TMPFS_LIMITS_EMPTY_OR_ALMOST
/* On an extremely small device with only 256MB of RAM, 20% of RAM should be enough for the re-execution of
* PID1 because 16MB of free space is required. */
#define TMPFS_LIMITS_RUN ",size=20%,nr_inodes=800k"
/* The limit used for various nested tmpfs mounts, in particular for guests started by systemd-nspawn.
* 10% of RAM (using 16GB of RAM as a baseline) translates to 400k inodes (assuming 4k each) and 25%
* translates to 1M inodes.
* (On the host, /tmp is configured through a .mount unit file.) */
#define NESTED_TMPFS_LIMITS ",size=10%,nr_inodes=400k"
/* More space for volatile root and /var */
#define TMPFS_LIMITS_VAR ",size=25%,nr_inodes=1m"
#define TMPFS_LIMITS_ROOTFS TMPFS_LIMITS_VAR
#define TMPFS_LIMITS_VOLATILE_STATE TMPFS_LIMITS_VAR
int mount_fd(const char *source, int target_fd, const char *filesystemtype, unsigned long mountflags, const void *data);
int mount_nofollow(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const void *data);
int repeat_unmount(const char *path, int flags);
int umount_recursive(const char *target, int flags);

View file

@ -42,7 +42,7 @@ int update_reboot_parameter_and_warn(const char *parameter, bool keep) {
return 0;
}
RUN_WITH_UMASK(0022) {
WITH_UMASK(0022) {
r = write_string_file("/run/systemd/reboot-param", parameter,
WRITE_STRING_FILE_CREATE|WRITE_STRING_FILE_ATOMIC);
if (r < 0)

View file

@ -101,7 +101,7 @@ int socket_address_listen(
(void) mkdir_parents_label(p, directory_mode);
/* Enforce the right access mode for the socket */
RUN_WITH_UMASK(~socket_mode) {
WITH_UMASK(~socket_mode) {
r = mac_selinux_bind(fd, &a->sockaddr.sa, a->size);
if (r == -EADDRINUSE) {
/* Unlink and try again */

View file

@ -2326,7 +2326,7 @@ int varlink_server_listen_address(VarlinkServer *s, const char *address, mode_t
(void) sockaddr_un_unlink(&sockaddr.un);
RUN_WITH_UMASK(~m & 0777) {
WITH_UMASK(~m & 0777) {
r = mac_selinux_bind(fd, &sockaddr.sa, sockaddr_len);
if (r < 0)
return r;

View file

@ -17,7 +17,7 @@ TEST(install_file) {
assert_se(a = path_join(p, "foo"));
assert_se(b = path_join(p, "bar"));
RUN_WITH_UMASK(0077)
WITH_UMASK(0077)
assert_se(write_string_file(a, "wups", WRITE_STRING_FILE_CREATE) >= 0);
assert_se(lstat(a, &stat1) >= 0);

View file

@ -11,6 +11,7 @@
#include "missing_mount.h"
#include "mkdir.h"
#include "mount-util.h"
#include "mountpoint-util.h"
#include "namespace-util.h"
#include "path-util.h"
#include "process-util.h"
@ -36,14 +37,14 @@ TEST(mount_option_mangle) {
assert_se(f == (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC));
assert_se(opts == NULL);
assert_se(mount_option_mangle("ro,nosuid,nodev,noexec,mode=755", 0, &f, &opts) == 0);
assert_se(mount_option_mangle("ro,nosuid,nodev,noexec,mode=0755", 0, &f, &opts) == 0);
assert_se(f == (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC));
assert_se(streq(opts, "mode=755"));
assert_se(streq(opts, "mode=0755"));
opts = mfree(opts);
assert_se(mount_option_mangle("rw,nosuid,foo,hogehoge,nodev,mode=755", 0, &f, &opts) == 0);
assert_se(mount_option_mangle("rw,nosuid,foo,hogehoge,nodev,mode=0755", 0, &f, &opts) == 0);
assert_se(f == (MS_NOSUID|MS_NODEV));
assert_se(streq(opts, "foo,hogehoge,mode=755"));
assert_se(streq(opts, "foo,hogehoge,mode=0755"));
opts = mfree(opts);
assert_se(mount_option_mangle("rw,nosuid,nodev,noexec,relatime,net_cls,net_prio", MS_RDONLY, &f, &opts) == 0);
@ -51,19 +52,19 @@ TEST(mount_option_mangle) {
assert_se(streq(opts, "net_cls,net_prio"));
opts = mfree(opts);
assert_se(mount_option_mangle("rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000", MS_RDONLY, &f, &opts) == 0);
assert_se(mount_option_mangle("rw,nosuid,nodev,relatime,size=1630748k,mode=0700,uid=1000,gid=1000", MS_RDONLY, &f, &opts) == 0);
assert_se(f == (MS_NOSUID|MS_NODEV|MS_RELATIME));
assert_se(streq(opts, "size=1630748k,mode=700,uid=1000,gid=1000"));
assert_se(streq(opts, "size=1630748k,mode=0700,uid=1000,gid=1000"));
opts = mfree(opts);
assert_se(mount_option_mangle("size=1630748k,rw,gid=1000,,,nodev,relatime,,mode=700,nosuid,uid=1000", MS_RDONLY, &f, &opts) == 0);
assert_se(mount_option_mangle("size=1630748k,rw,gid=1000,,,nodev,relatime,,mode=0700,nosuid,uid=1000", MS_RDONLY, &f, &opts) == 0);
assert_se(f == (MS_NOSUID|MS_NODEV|MS_RELATIME));
assert_se(streq(opts, "size=1630748k,gid=1000,mode=700,uid=1000"));
assert_se(streq(opts, "size=1630748k,gid=1000,mode=0700,uid=1000"));
opts = mfree(opts);
assert_se(mount_option_mangle("rw,exec,size=8143984k,nr_inodes=2035996,mode=755", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, &f, &opts) == 0);
assert_se(mount_option_mangle("rw,exec,size=8143984k,nr_inodes=2035996,mode=0755", MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, &f, &opts) == 0);
assert_se(f == (MS_NOSUID|MS_NODEV));
assert_se(streq(opts, "size=8143984k,nr_inodes=2035996,mode=755"));
assert_se(streq(opts, "size=8143984k,nr_inodes=2035996,mode=0755"));
opts = mfree(opts);
assert_se(mount_option_mangle("rw,relatime,fmask=0022,,,dmask=0022", MS_RDONLY, &f, &opts) == 0);
@ -73,9 +74,9 @@ TEST(mount_option_mangle) {
assert_se(mount_option_mangle("rw,relatime,fmask=0022,dmask=0022,\"hogehoge", MS_RDONLY, &f, &opts) < 0);
assert_se(mount_option_mangle("mode=1777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\"", 0, &f, &opts) == 0);
assert_se(mount_option_mangle("mode=01777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\"", 0, &f, &opts) == 0);
assert_se(f == 0);
assert_se(streq(opts, "mode=1777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\""));
assert_se(streq(opts, "mode=01777,size=10%,nr_inodes=400k,uid=496107520,gid=496107520,context=\"system_u:object_r:svirt_sandbox_file_t:s0:c0,c1\""));
opts = mfree(opts);
}

View file

@ -12,7 +12,7 @@ int main(int argc, char *argv[]) {
u = umask(0111);
n = 0;
RUN_WITH_UMASK(0123) {
WITH_UMASK(0123) {
assert_se(umask(000) == 0123);
n++;
}
@ -20,7 +20,7 @@ int main(int argc, char *argv[]) {
assert_se(n == 1);
assert_se(umask(u) == 0111);
RUN_WITH_UMASK(0135) {
WITH_UMASK(0135) {
assert_se(umask(000) == 0135);
n++;
}
@ -28,7 +28,7 @@ int main(int argc, char *argv[]) {
assert_se(n == 2);
assert_se(umask(0111) == u);
RUN_WITH_UMASK(0315) {
WITH_UMASK(0315) {
assert_se(umask(000) == 0315);
n++;
break;

View file

@ -1499,7 +1499,7 @@ static int create_file(Item *i, const char *path) {
if (dir_fd < 0)
return dir_fd;
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(path, S_IFREG);
fd = RET_NERRNO(openat(dir_fd, bn, O_CREAT|O_EXCL|O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY, i->mode));
mac_selinux_create_file_clear();
@ -1572,7 +1572,7 @@ static int truncate_file(Item *i, const char *path) {
if (fd == -ENOENT) {
creation = CREATION_NORMAL; /* Didn't work without O_CREATE, try again with */
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(path, S_IFREG);
fd = RET_NERRNO(openat(dir_fd, bn, O_CREAT|O_NOFOLLOW|O_NONBLOCK|O_CLOEXEC|O_WRONLY|O_NOCTTY, i->mode));
mac_selinux_create_file_clear();
@ -1716,14 +1716,14 @@ static int create_directory_or_subvolume(
subvol = false;
else {
RUN_WITH_UMASK((~mode) & 0777)
WITH_UMASK((~mode) & 0777)
r = btrfs_subvol_make_fd(pfd, bn);
}
} else
r = 0;
if (!subvol || ERRNO_IS_NOT_SUPPORTED(r))
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
r = mkdirat_label(pfd, bn, mode);
creation = r >= 0 ? CREATION_NORMAL : CREATION_EXISTING;
@ -1869,7 +1869,7 @@ static int create_device(Item *i, mode_t file_type) {
if (dfd < 0)
return dfd;
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(i->path, file_type);
r = RET_NERRNO(mknodat(dfd, bn, i->mode | file_type, i->major_minor));
mac_selinux_create_file_clear();
@ -1900,7 +1900,7 @@ static int create_device(Item *i, mode_t file_type) {
if (i->append_or_force) {
fd = safe_close(fd);
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(i->path, file_type);
r = mknodat_atomic(dfd, bn, i->mode | file_type, i->major_minor);
mac_selinux_create_file_clear();
@ -1971,7 +1971,7 @@ static int create_fifo(Item *i) {
if (pfd < 0)
return pfd;
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(i->path, S_IFIFO);
r = RET_NERRNO(mkfifoat(pfd, bn, i->mode));
mac_selinux_create_file_clear();
@ -1996,7 +1996,7 @@ static int create_fifo(Item *i) {
if (i->append_or_force) {
fd = safe_close(fd);
RUN_WITH_UMASK(0000) {
WITH_UMASK(0000) {
mac_selinux_create_file_prepare(i->path, S_IFIFO);
r = mkfifoat_atomic(pfd, bn, i->mode);
mac_selinux_create_file_clear();
@ -2378,7 +2378,7 @@ static int mkdir_parents_rm_if_wrong_type(mode_t child_mode, const char *path) {
if (r == -ENOENT)
r = rm_if_wrong_type_safe(S_IFDIR, parent_fd, &parent_st, t, AT_SYMLINK_NOFOLLOW);
if (r == -ENOENT) {
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
r = mkdirat_label(parent_fd, t, 0755);
if (r < 0) {
_cleanup_free_ char *parent_name = NULL;
@ -2416,7 +2416,7 @@ static int mkdir_parents_item(Item *i, mode_t child_mode) {
if (r < 0 && r != -ENOENT)
return r;
} else
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
(void) mkdir_parents_label(i->path, 0755);
return 0;

View file

@ -277,7 +277,7 @@ int manager_startup(Manager *m) {
(void) sockaddr_un_unlink(&sockaddr.un);
RUN_WITH_UMASK(0000)
WITH_UMASK(0000)
if (bind(m->listen_fd, &sockaddr.sa, SOCKADDR_UN_LEN(sockaddr.un)) < 0)
return log_error_errno(errno, "Failed to bind socket: %m");

View file

@ -29,7 +29,7 @@ static int make_volatile(const char *path) {
if (r < 0)
return log_error_errno(r, "Couldn't generate volatile sysroot directory: %m");
r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS);
r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/volatile-sysroot", "tmpfs", MS_STRICTATIME, "mode=0755" TMPFS_LIMITS_ROOTFS);
if (r < 0)
goto finish_rmdir;
@ -80,7 +80,7 @@ static int make_overlay(const char *path) {
if (r < 0)
return log_error_errno(r, "Couldn't create overlay sysroot directory: %m");
r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755" TMPFS_LIMITS_ROOTFS);
r = mount_nofollow_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=0755" TMPFS_LIMITS_ROOTFS);
if (r < 0)
goto finish;

View file

@ -2444,7 +2444,7 @@ sub udev_setup {
rmdir($udev_tmpfs);
mkdir($udev_tmpfs) || die "unable to create udev_tmpfs: $udev_tmpfs\n";
if (system("mount", "-o", "rw,mode=755,nosuid,noexec", "-t", "tmpfs", "tmpfs", $udev_tmpfs)) {
if (system("mount", "-o", "rw,mode=0755,nosuid,noexec", "-t", "tmpfs", "tmpfs", $udev_tmpfs)) {
warn "unable to mount tmpfs";
return 0;
}