From 85614c6e2fb791b742941a8f98ea1851cf705240 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 28 Feb 2023 12:39:35 -0800 Subject: [PATCH] add support for KSM This adds support for KSM (kernel samepage merging). It adds a new boolean parameter called MemoryKSM to enable the feature. The feature can only be enabled with newer kernels. --- man/org.freedesktop.systemd1.xml | 24 ++++++++++++++++++++++++ man/systemd.exec.xml | 16 ++++++++++++++++ mkosi.kernel.config | 1 + src/basic/missing_prctl.h | 4 ++++ src/core/dbus-execute.c | 4 ++++ src/core/execute.c | 11 +++++++++++ src/core/execute.h | 1 + src/core/load-fragment-gperf.gperf.in | 1 + src/shared/bus-unit-util.c | 1 + src/shared/exit-status.c | 1 + src/shared/exit-status.h | 1 + 11 files changed, 65 insertions(+) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index a08108dd885..70273bbf64b 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -3174,6 +3174,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectHostname = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b MemoryKSM = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s IPCNamespacePath = '...'; @@ -3739,6 +3741,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4405,6 +4409,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -5184,6 +5190,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectHostname = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b MemoryKSM = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; 
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s IPCNamespacePath = '...'; @@ -5761,6 +5769,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6407,6 +6417,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7061,6 +7073,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectHostname = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b MemoryKSM = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s IPCNamespacePath = '...'; @@ -7566,6 +7580,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8130,6 +8146,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8911,6 +8929,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly b ProtectHostname = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly b MemoryKSM = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s NetworkNamespacePath = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s IPCNamespacePath = '...'; @@ -9402,6 +9422,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -9952,6 +9974,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 3f3ed77f460..9fb6c9b9084 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1773,6 +1773,22 @@ BindReadOnlyPaths=/var/lib/systemd + + MemoryKSM= + + Takes a boolean argument. When set, it enables KSM (kernel samepage merging) for + the processes. KSM is a memory-saving de-duplication feature. Anonymous memory pages with identical + content can be replaced by a single write-protected page. 
This feature should only be enabled for + jobs that share the same security domain. For details, see + Kernel Samepage Merging in the + kernel documentation. + + Note that this functionality might not be available, for example if KSM is disabled in the + kernel, or the kernel doesn't support controlling KSM at the process level through + prctl(). + + + PrivateUsers= diff --git a/mkosi.kernel.config b/mkosi.kernel.config index 7866b7da8c6..f6141370ac3 100644 --- a/mkosi.kernel.config +++ b/mkosi.kernel.config @@ -202,6 +202,7 @@ CONFIG_X86_MSR=y CONFIG_XFRM_USER=y CONFIG_XFS_FS=y CONFIG_XFS_POSIX_ACL=y +CONFIG_KSM=y # CONFIG_WIRELESS is not set # CONFIG_WLAN is not set diff --git a/src/basic/missing_prctl.h b/src/basic/missing_prctl.h index 016085bb02d..7d9e395c921 100644 --- a/src/basic/missing_prctl.h +++ b/src/basic/missing_prctl.h @@ -20,3 +20,7 @@ #ifndef PR_MDWE_REFUSE_EXEC_GAIN #define PR_MDWE_REFUSE_EXEC_GAIN 1 #endif + +#ifndef PR_SET_MEMORY_MERGE +#define PR_SET_MEMORY_MERGE 67 +#endif diff --git a/src/core/dbus-execute.c b/src/core/dbus-execute.c index a8553c962c3..fb22a9769d8 100644 --- a/src/core/dbus-execute.c +++ b/src/core/dbus-execute.c @@ -1347,6 +1347,7 @@ const sd_bus_vtable bus_exec_vtable[] = { SD_BUS_PROPERTY("ProtectProc", "s", property_get_protect_proc, offsetof(ExecContext, protect_proc), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProcSubset", "s", property_get_proc_subset, offsetof(ExecContext, proc_subset), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("ProtectHostname", "b", bus_property_get_bool, offsetof(ExecContext, protect_hostname), SD_BUS_VTABLE_PROPERTY_CONST), + SD_BUS_PROPERTY("MemoryKSM", "b", bus_property_get_tristate, offsetof(ExecContext, memory_ksm), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("NetworkNamespacePath", "s", NULL, offsetof(ExecContext, network_namespace_path), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("IPCNamespacePath", "s", NULL, offsetof(ExecContext, ipc_namespace_path), 
SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("RootImagePolicy", "s", property_get_image_policy, offsetof(ExecContext, root_image_policy), SD_BUS_VTABLE_PROPERTY_CONST), @@ -2024,6 +2025,9 @@ int bus_exec_context_set_transient_property( if (streq(name, "ProtectHostname")) return bus_set_transient_bool(u, name, &c->protect_hostname, message, flags, error); + if (streq(name, "MemoryKSM")) + return bus_set_transient_tristate(u, name, &c->memory_ksm, message, flags, error); + if (streq(name, "UtmpIdentifier")) return bus_set_transient_string(u, name, &c->utmp_id, message, flags, error); diff --git a/src/core/execute.c b/src/core/execute.c index 3b327ac88d7..1802ae05b30 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -5193,6 +5193,16 @@ static int exec_child( return r; } + if (context->memory_ksm >= 0) + if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) { /* arg3..arg5 must be passed explicitly as 0: the kernel rejects non-zero values with EINVAL */ + if (ERRNO_IS_NOT_SUPPORTED(errno)) + log_unit_debug_errno(unit, errno, "KSM support not available, ignoring."); + else { + *exit_status = EXIT_KSM; + return log_unit_error_errno(unit, errno, "Failed to set KSM: %m"); + } + } + /* Drop groups as early as possible. * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root. * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody.
*/ @@ -5773,6 +5783,7 @@ void exec_context_init(ExecContext *c) { c->tty_cols = UINT_MAX; numa_policy_reset(&c->numa_policy); c->private_mounts = -1; + c->memory_ksm = -1; } void exec_context_done(ExecContext *c) { diff --git a/src/core/execute.h b/src/core/execute.h index e46f31037e3..1c8378c8b09 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -310,6 +310,7 @@ struct ExecContext { ProcSubset proc_subset; /* subset= */ int private_mounts; + int memory_ksm; bool private_tmp; bool private_network; bool private_devices; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 83efe844562..64a00fef28d 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -176,6 +176,7 @@ {{type}}.SmackProcessLabel, config_parse_warn_compat, DISABLED_CONFIGURATION, 0 {% endif %} {{type}}.ProtectHostname, config_parse_bool, 0, offsetof({{type}}, exec_context.protect_hostname) +{{type}}.MemoryKSM, config_parse_tristate, 0, offsetof({{type}}, exec_context.memory_ksm) {%- endmacro -%} {%- macro KILL_CONTEXT_CONFIG_ITEMS(type) -%} diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index b32071104b6..8b1a353a9b3 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -991,6 +991,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con "CPUSchedulingResetOnFork", "LockPersonality", "ProtectHostname", + "MemoryKSM", "RestrictSUIDSGID")) return bus_append_parse_boolean(m, field, eq); diff --git a/src/shared/exit-status.c b/src/shared/exit-status.c index 9063f231e15..623adda89e7 100644 --- a/src/shared/exit-status.c +++ b/src/shared/exit-status.c @@ -72,6 +72,7 @@ const ExitStatusMapping exit_status_mappings[256] = { [EXIT_NUMA_POLICY] = { "NUMA_POLICY", EXIT_STATUS_SYSTEMD }, [EXIT_CREDENTIALS] = { "CREDENTIALS", EXIT_STATUS_SYSTEMD }, [EXIT_BPF] = { "BPF", EXIT_STATUS_SYSTEMD }, + [EXIT_KSM] = { "KSM", EXIT_STATUS_SYSTEMD }, 
[EXIT_EXCEPTION] = { "EXCEPTION", EXIT_STATUS_SYSTEMD }, diff --git a/src/shared/exit-status.h b/src/shared/exit-status.h index 3f9a2ad54fb..c22cba05b2a 100644 --- a/src/shared/exit-status.h +++ b/src/shared/exit-status.h @@ -72,6 +72,7 @@ enum { EXIT_NUMA_POLICY, EXIT_CREDENTIALS, EXIT_BPF, + EXIT_KSM, EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */ };