nspawn: add --suppress-sync=yes mode for turning sync() and friends into NOPs via seccomp

This is supposed to be used by package/image builders such as mkosi to
speed up building, since it allows us to suppress sync() inside a
container.

This does what Debian's eatmydata tool does, but for a container, and
via seccomp (instead of LD_PRELOAD).
This commit is contained in:
Lennart Poettering 2021-10-19 14:56:49 +02:00
parent 231c7645ca
commit 4a4654e024
9 changed files with 167 additions and 5 deletions

View file

@ -138,6 +138,12 @@ All tools:
* `$SYSTEMD_NSPAWN_TMPFS_TMP=0` — if set, do not overmount `/tmp/` in the
container with a tmpfs, but leave the directory from the image in place.
* `$SYSTEMD_SUPPRESS_SYNC=1` — if set, all disk synchronization syscalls
issued by the container payload (e.g. `sync()`, `fsync()`, `syncfs()`, …) are
turned into no-ops, and the `O_SYNC`/`O_DSYNC` flags are made unavailable to
`open()` and friends. This is equivalent to passing `--suppress-sync=yes` on
the `systemd-nspawn` command line.
`systemd-logind`:
* `$SYSTEMD_BYPASS_HIBERNATION_MEMORY_CHECK=1` — if set, report that

View file

@ -570,6 +570,24 @@
before sending its own to systemd. For more details about notifications
see <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--suppress-sync=</option></term>
<listitem><para>Expects a boolean argument. If true, turns off any form of on-disk file system
synchronization for the container payload. This means all system calls such as <citerefentry
project='man-pages'><refentrytitle>sync</refentrytitle><manvolnum>2</manvolnum></citerefentry>,
<function>fsync()</function>, <function>syncfs()</function>, … will execute no operation, and the
<constant>O_SYNC</constant>/<constant>O_DSYNC</constant> flags to <citerefentry
project='man-pages'><refentrytitle>open</refentrytitle><manvolnum>2</manvolnum></citerefentry> and
related calls will be made unavailable. This is potentially dangerous, as assumed data integrity
guarantees to the container payload are not actually enforced (i.e. data assumed to have been written
to disk might be lost if the system is shut down abnormally). However, this can dramatically improve
container runtime performance as long as these guarantees are not required or desirable, for
example because any data written by the container is of temporary, redundant nature, or just an
intermediary artifact that will be further processed and finalized by a later step in a
pipeline. Defaults to false.</para></listitem>
</varlistentry>
</variablelist>
</refsect2><refsect2>

View file

@ -365,6 +365,16 @@
details.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>SuppressSync=</varname></term>
<listitem><para>Configures whether to suppress disk synchronization for the container payload. This
is equivalent to the <option>--suppress-sync=</option> command line switch, and takes the same
parameter. See
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
for details.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View file

@ -63,7 +63,7 @@ _systemd_nspawn() {
local -A OPTS=(
[STANDALONE]='-h --help --version --private-network -b --boot --read-only -q --quiet --share-system
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U'
--keep-unit -n --network-veth -j -x --ephemeral -a --as-pid2 -U --suppress-sync=yes'
[ARG]='-D --directory -u --user --uuid --capability --drop-capability --link-journal --bind --bind-ro
-M --machine -S --slice -E --setenv -Z --selinux-context -L --selinux-apifs-context
--register --network-interface --network-bridge --personality -i --image --tmpfs

View file

@ -59,6 +59,7 @@ Exec.CPUAffinity, config_parse_cpu_affinity, 0, 0
Exec.ResolvConf, config_parse_resolv_conf, 0, offsetof(Settings, resolv_conf)
Exec.LinkJournal, config_parse_link_journal, 0, 0
Exec.Timezone, config_parse_timezone, 0, offsetof(Settings, timezone)
Exec.SuppressSync, config_parse_bool, 0, offsetof(Settings, suppress_sync)
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0

View file

@ -127,9 +127,10 @@ typedef enum SettingsMask {
SETTING_CONSOLE_MODE = UINT64_C(1) << 29,
SETTING_CREDENTIALS = UINT64_C(1) << 30,
SETTING_BIND_USER = UINT64_C(1) << 31,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 32, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (32 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (32 + _RLIMIT_MAX)) -1,
SETTING_SUPPRESS_SYNC = UINT64_C(1) << 32,
SETTING_RLIMIT_FIRST = UINT64_C(1) << 33, /* we define one bit per resource limit here */
SETTING_RLIMIT_LAST = UINT64_C(1) << (33 + _RLIMIT_MAX - 1),
_SETTINGS_MASK_ALL = (UINT64_C(1) << (33 + _RLIMIT_MAX)) -1,
_SETTING_FORCE_ENUM_WIDTH = UINT64_MAX
} SettingsMask;
@ -189,6 +190,7 @@ typedef struct Settings {
LinkJournal link_journal;
bool link_journal_try;
TimezoneMode timezone;
bool suppress_sync;
/* [Files] */
int read_only;

View file

@ -229,6 +229,7 @@ static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
static Credential *arg_credentials = NULL;
static size_t arg_n_credentials = 0;
static char **arg_bind_user = NULL;
static bool arg_suppress_sync = false;
STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
@ -342,7 +343,9 @@ static int help(void) {
" -E --setenv=NAME[=VALUE] Pass an environment variable to PID 1\n"
" -u --user=USER Run the command under specified user or UID\n"
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
" --notify-ready=BOOLEAN Receive notifications from the child init process\n\n"
" --notify-ready=BOOLEAN Receive notifications from the child init process\n"
" --suppress-sync=BOOLEAN\n"
" Suppress any form of disk data synchronization\n\n"
"%3$sSystem Identity:%4$s\n"
" -M --machine=NAME Set the machine name for the container\n"
" --hostname=NAME Override the hostname for the container\n"
@ -654,6 +657,12 @@ static int parse_environment(void) {
if (e)
arg_container_service_name = e;
r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
if (r >= 0)
arg_suppress_sync = r;
else if (r != -ENXIO)
log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
return detect_unified_cgroup_hierarchy_from_environment();
}
@ -713,6 +722,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_SET_CREDENTIAL,
ARG_LOAD_CREDENTIAL,
ARG_BIND_USER,
ARG_SUPPRESS_SYNC,
};
static const struct option options[] = {
@ -785,6 +795,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "set-credential", required_argument, NULL, ARG_SET_CREDENTIAL },
{ "load-credential", required_argument, NULL, ARG_LOAD_CREDENTIAL },
{ "bind-user", required_argument, NULL, ARG_BIND_USER },
{ "suppress-sync", required_argument, NULL, ARG_SUPPRESS_SYNC },
{}
};
@ -1668,6 +1679,14 @@ static int parse_argv(int argc, char *argv[]) {
arg_settings_mask |= SETTING_BIND_USER;
break;
case ARG_SUPPRESS_SYNC:
r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
if (r < 0)
return r;
arg_settings_mask |= SETTING_SUPPRESS_SYNC;
break;
case '?':
return -EINVAL;
@ -3385,6 +3404,12 @@ static int inner_child(
return r;
}
if (arg_suppress_sync) {
r = seccomp_suppress_sync();
if (r < 0)
log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
}
#if HAVE_SELINUX
if (arg_selinux_context)
if (setexeccon(arg_selinux_context) < 0)
@ -4552,6 +4577,9 @@ static int merge_settings(Settings *settings, const char *path) {
arg_console_mode = settings->console_mode;
}
if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0)
arg_suppress_sync = settings->suppress_sync;
/* The following properties can only be set through the OCI settings logic, not from the command line, hence we
* don't consult arg_settings_mask for them. */

View file

@ -2205,3 +2205,98 @@ int parse_syscall_and_errno(const char *in, char **name, int *error) {
return 0;
}
static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
        bool installed = false;
        int rc;

        /* Adds rules to the given filter context that refuse opening files with the specified flag
         * (O_SYNC or similar) set. Affected open()/openat() calls fail with EINVAL, in the hope that the
         * caller then retries without the flag.
         *
         * Every rule is added on a best-effort basis: a failure is logged at debug level and the
         * remaining rules are still attempted. Returns 0 if at least one rule could be added, otherwise
         * the negative error returned by the last attempted rule. */

#if SCMP_SYS(open) > 0
        /* open() takes its flags as second argument, i.e. argument index 1. */
        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EINVAL), SCMP_SYS(open), 1,
                                    SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
        if (rc >= 0)
                installed = true;
        else
                log_debug_errno(rc, "Failed to add filter for open: %m");
#endif

        /* openat() takes its flags as third argument, i.e. argument index 2. */
        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EINVAL), SCMP_SYS(openat), 1,
                                    SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
        if (rc >= 0)
                installed = true;
        else
                log_debug_errno(rc, "Failed to add filter for openat: %m");

#if defined(__SNR_openat2)
        /* openat2() can't be filtered sensibly on its flags, hence refuse the whole system call with
         * ENOSYS, regardless of its arguments. */
        rc = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(ENOSYS), SCMP_SYS(openat2), 0);
        if (rc >= 0)
                installed = true;
        else
                log_debug_errno(rc, "Failed to add filter for openat2: %m");
#endif

        if (installed)
                return 0;

        return rc;
}
int seccomp_suppress_sync(void) {
        uint32_t arch;
        int rc;

        /* Installs, for each local architecture, a seccomp filter that turns the system calls of the
         * @sync set into NOPs and refuses open() and friends with O_SYNC/O_DSYNC. Mostly the same as
         * SystemCallFilter=~@sync:0, but simpler to use and separately manageable, and it additionally
         * masks the two open flags.
         *
         * Failing to set up the filter context, or a fatal error when loading it, is returned to the
         * caller. Everything else is only logged at debug level and otherwise ignored. */

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
                const char *syscall_name;

                rc = seccomp_init_for_arch(&ctx, arch, SCMP_ACT_ALLOW);
                if (rc < 0)
                        return rc;

                NULSTR_FOREACH(syscall_name, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
                        int nr = seccomp_syscall_resolve_name(syscall_name);

                        if (nr != __NR_SCMP_ERROR) {
                                /* Errno 0 means "success": the call returns immediately without doing
                                 * anything, which is precisely the NOP behaviour we are after. */
                                rc = seccomp_rule_add_exact(ctx, SCMP_ACT_ERRNO(0), nr, 0);
                                if (rc < 0)
                                        log_debug_errno(rc, "Failed to add filter for system call %s, ignoring: %m", syscall_name);
                        } else
                                log_debug("System call %s is not known, ignoring.", syscall_name);
                }

                /* Best effort only, hence the results are ignored explicitly. */
                (void) block_open_flag(ctx, O_SYNC);
#if O_DSYNC != O_SYNC
                (void) block_open_flag(ctx, O_DSYNC);
#endif

                rc = seccomp_load(ctx);
                if (ERRNO_IS_SECCOMP_FATAL(rc))
                        return rc;
                if (rc < 0)
                        log_debug_errno(rc, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
        }

        return 0;
}

View file

@ -150,3 +150,5 @@ static inline const char *seccomp_errno_or_action_to_string(int num) {
}
int parse_syscall_and_errno(const char *in, char **name, int *error);
/* Installs seccomp filters for all local architectures that make the system calls of the @sync filter set
 * succeed without doing anything, and that make open()/openat() with O_SYNC/O_DSYNC fail with EINVAL
 * (openat2(), where known at build time, is refused with ENOSYS). Returns 0 on success, or a negative
 * error if a filter context could not be set up or loading a filter failed fatally; other failures are
 * only logged at debug level. */
int seccomp_suppress_sync(void);