1
0
mirror of https://github.com/systemd/systemd synced 2024-07-09 04:26:06 +00:00

oom: Add support for user unit ManagedOOM property updates

Compared to PID1 where systemd-oomd has to be the client to PID1
because PID1 is a more privileged process than systemd-oomd, systemd-oomd
is the more privileged process compared to a user manager so we have
user managers be the client whereas systemd-oomd is now the server.

The same varlink protocol is used between user managers and systemd-oomd
to deliver ManagedOOM property updates. systemd-oomd now sets up a varlink
server that user managers connect to to send ManagedOOM property updates.

We also add extra validation to make sure that non-root senders don't
send updates for cgroups they don't own.

The integration test was extended to repeat the chill/bloat test using
a user manager instead of PID1.
This commit is contained in:
Daan De Meyer 2021-09-09 16:12:55 +01:00
parent f2ed82d510
commit 064a5c1438
13 changed files with 299 additions and 29 deletions

View File

@ -64,4 +64,7 @@
.un.sun_path = "\0/org/freedesktop/plymouthd", \
}
#define VARLINK_ADDR_PATH_MANAGED_OOM "/run/systemd/io.system.ManagedOOM"
/* Path where PID1 listens for varlink subscriptions from systemd-oomd to notify of changes in ManagedOOM settings. */
#define VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM "/run/systemd/io.system.ManagedOOM"
/* Path where systemd-oomd listens for varlink connections from user managers to report changes in ManagedOOM settings. */
#define VARLINK_ADDR_PATH_MANAGED_OOM_USER "/run/systemd/oom/io.system.ManagedOOM"

View File

@ -93,9 +93,22 @@ int manager_varlink_send_managed_oom_update(Unit *u) {
assert(u);
if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->manager->managed_oom_varlink_request || !u->cgroup_path)
if (!UNIT_VTABLE(u)->can_set_managed_oom || !u->manager || !u->cgroup_path)
return 0;
if (MANAGER_IS_SYSTEM(u->manager)) {
/* In system mode we can't send any notifications unless oomd connected back to us. In this
* mode oomd must initiate communication, not us. */
if (!u->manager->managed_oom_varlink)
return 0;
} else {
/* If we are in user mode, let's connect to oomd if we aren't connected yet. In this mode we
* must initiate communication to oomd, not the other way round. */
r = manager_varlink_init(u->manager);
if (r <= 0)
return r;
}
c = unit_get_cgroup_context(u);
if (!c)
return 0;
@ -120,7 +133,16 @@ int manager_varlink_send_managed_oom_update(Unit *u) {
if (r < 0)
return r;
return varlink_notify(u->manager->managed_oom_varlink_request, v);
if (MANAGER_IS_SYSTEM(u->manager))
/* in system mode, oomd is our client, thus send out notifications as replies to the
* initiating method call from them. */
r = varlink_notify(u->manager->managed_oom_varlink, v);
else
/* in user mode, we are oomd's client, thus send out notifications as method calls that do
* not expect a reply. */
r = varlink_send(u->manager->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
return r;
}
static int build_managed_oom_cgroups_json(Manager *m, JsonVariant **ret) {
@ -194,7 +216,7 @@ static int vl_method_subscribe_managed_oom_cgroups(
/* We only take one subscriber for this method so return an error if there's already an existing one.
* This shouldn't happen since systemd-oomd is the only client of this method. */
if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink_request)
if (FLAGS_SET(flags, VARLINK_METHOD_MORE) && m->managed_oom_varlink)
return varlink_error(link, VARLINK_ERROR_SUBSCRIPTION_TAKEN, NULL);
r = build_managed_oom_cgroups_json(m, &v);
@ -204,9 +226,27 @@ static int vl_method_subscribe_managed_oom_cgroups(
if (!FLAGS_SET(flags, VARLINK_METHOD_MORE))
return varlink_reply(link, v);
assert(!m->managed_oom_varlink_request);
m->managed_oom_varlink_request = varlink_ref(link);
return varlink_notify(m->managed_oom_varlink_request, v);
assert(!m->managed_oom_varlink);
m->managed_oom_varlink = varlink_ref(link);
return varlink_notify(m->managed_oom_varlink, v);
}
static int manager_varlink_send_managed_oom_initial(Manager *m) {
_cleanup_(json_variant_unrefp) JsonVariant *v = NULL;
int r;
assert(m);
if (MANAGER_IS_SYSTEM(m))
return 0;
assert(m->managed_oom_varlink);
r = build_managed_oom_cgroups_json(m, &v);
if (r < 0)
return r;
return varlink_send(m->managed_oom_varlink, "io.systemd.oom.ReportManagedOOMCGroups", v);
}
static int vl_method_get_user_record(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
@ -433,18 +473,18 @@ static void vl_disconnect(VarlinkServer *s, Varlink *link, void *userdata) {
assert(s);
assert(link);
if (link == m->managed_oom_varlink_request)
m->managed_oom_varlink_request = varlink_unref(link);
if (link == m->managed_oom_varlink)
m->managed_oom_varlink = varlink_unref(link);
}
int manager_varlink_init(Manager *m) {
static int manager_varlink_init_system(Manager *m) {
_cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
int r;
assert(m);
if (m->varlink_server)
return 0;
return 1;
if (!MANAGER_IS_SYSTEM(m))
return 0;
@ -475,7 +515,7 @@ int manager_varlink_init(Manager *m) {
if (r < 0)
return log_error_errno(r, "Failed to bind to varlink socket: %m");
r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM, 0666);
r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM, 0666);
if (r < 0)
return log_error_errno(r, "Failed to bind to varlink socket: %m");
}
@ -485,9 +525,73 @@ int manager_varlink_init(Manager *m) {
return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
m->varlink_server = TAKE_PTR(s);
return 1;
}
static int vl_reply(Varlink *link, JsonVariant *parameters, const char *error_id, VarlinkReplyFlags flags, void *userdata) {
Manager *m = userdata;
int r;
assert(m);
if (error_id)
log_debug("varlink systemd-oomd client error: %s", error_id);
if (FLAGS_SET(flags, VARLINK_REPLY_ERROR) && FLAGS_SET(flags, VARLINK_REPLY_LOCAL)) {
/* Varlink connection was closed, likely because of systemd-oomd restart. Let's try to
* reconnect and send the initial ManagedOOM update again. */
m->managed_oom_varlink = varlink_unref(link);
log_debug("Reconnecting to %s", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
r = manager_varlink_init(m);
if (r <= 0)
return r;
}
return 0;
}
static int manager_varlink_init_user(Manager *m) {
_cleanup_(varlink_close_unrefp) Varlink *link = NULL;
int r;
assert(m);
if (m->managed_oom_varlink)
return 1;
r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_USER);
if (r == -ENOENT || ERRNO_IS_DISCONNECT(r)) {
log_debug("systemd-oomd varlink unix socket not found, skipping user manager varlink setup");
return 0;
}
if (r < 0)
return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM_USER);
varlink_set_userdata(link, m);
r = varlink_bind_reply(link, vl_reply);
if (r < 0)
return r;
r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
if (r < 0)
return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
m->managed_oom_varlink = TAKE_PTR(link);
/* Queue the initial ManagedOOM update. */
(void) manager_varlink_send_managed_oom_initial(m);
return 1;
}
int manager_varlink_init(Manager *m) {
return MANAGER_IS_SYSTEM(m) ? manager_varlink_init_system(m) : manager_varlink_init_user(m);
}
void manager_varlink_done(Manager *m) {
assert(m);
@ -495,7 +599,8 @@ void manager_varlink_done(Manager *m) {
* the manager, and only then disconnect it in two steps so that we don't end up accidentally
* unreffing it twice. After all, closing the connection might cause the disconnect handler we
* installed (vl_disconnect() above) to be called, where we will unref it too. */
varlink_close_unref(TAKE_PTR(m->managed_oom_varlink_request));
varlink_close_unref(TAKE_PTR(m->managed_oom_varlink));
m->varlink_server = varlink_server_unref(m->varlink_server);
m->managed_oom_varlink = varlink_close_unref(m->managed_oom_varlink);
}

View File

@ -1810,7 +1810,7 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
r = manager_varlink_init(m);
if (r < 0)
log_warning_errno(r, "Failed to set up Varlink server, ignoring: %m");
log_warning_errno(r, "Failed to set up Varlink, ignoring: %m");
/* Third, fire things up! */
manager_coldplug(m);

View File

@ -446,8 +446,11 @@ struct Manager {
bool honor_device_enumeration;
VarlinkServer *varlink_server;
/* Only systemd-oomd should be using this to subscribe to changes in ManagedOOM settings */
Varlink *managed_oom_varlink_request;
/* When we're a system manager, this object manages the subscription from systemd-oomd to PID1 that's
* used to report changes in ManagedOOM settings (systemd server - oomd client). When
* we're a user manager, this object manages the client connection from the user manager to
* systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */
Varlink *managed_oom_varlink;
};
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {

View File

@ -1,11 +1,14 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "sd-daemon.h"
#include "bus-log-control-api.h"
#include "bus-util.h"
#include "bus-polkit.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "memory-util.h"
#include "oomd-manager-bus.h"
#include "oomd-manager.h"
@ -40,7 +43,7 @@ static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags
return 0;
}
static int process_managed_oom_message(Manager *m, JsonVariant *parameters) {
static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
JsonVariant *c, *cgroups;
int r;
@ -75,6 +78,23 @@ static int process_managed_oom_message(Manager *m, JsonVariant *parameters) {
if (r < 0)
continue;
if (uid != 0) {
uid_t cg_uid;
r = cg_path_get_owner_uid(message.path, &cg_uid);
if (r < 0) {
log_debug("Failed to get cgroup %s owner uid: %m", message.path);
continue;
}
/* Let's not be lenient for permission errors and skip processing if we receive an
* update for a cgroup that doesn't belong to the user. */
if (uid != cg_uid)
return log_error_errno(SYNTHETIC_ERRNO(EPERM),
"cgroup path owner UID does not match sender uid "
"(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
}
monitor_hm = streq(message.property, "ManagedOOMSwap") ?
m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
@ -112,6 +132,24 @@ static int process_managed_oom_message(Manager *m, JsonVariant *parameters) {
return 0;
}
static int process_managed_oom_request(
Varlink *link,
JsonVariant *parameters,
VarlinkMethodFlags flags,
void *userdata) {
Manager *m = userdata;
uid_t uid;
int r;
assert(m);
r = varlink_get_peer_uid(link, &uid);
if (r < 0)
return log_error_errno(r, "Failed to get varlink peer uid: %m");
return process_managed_oom_message(m, uid, parameters);
}
static int process_managed_oom_reply(
Varlink *link,
JsonVariant *parameters,
@ -119,6 +157,7 @@ static int process_managed_oom_reply(
VarlinkReplyFlags flags,
void *userdata) {
Manager *m = userdata;
uid_t uid;
int r;
assert(m);
@ -129,11 +168,17 @@ static int process_managed_oom_reply(
goto finish;
}
r = process_managed_oom_message(m, parameters);
r = varlink_get_peer_uid(link, &uid);
if (r < 0) {
log_error_errno(r, "Failed to get varlink peer uid: %m");
goto finish;
}
r = process_managed_oom_message(m, uid, parameters);
finish:
if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
m->varlink = varlink_close_unref(link);
m->varlink_client = varlink_close_unref(link);
return r;
}
@ -279,9 +324,9 @@ static int acquire_managed_oom_connect(Manager *m) {
assert(m);
assert(m->event);
r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM);
r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
if (r < 0)
return log_error_errno(r, "Failed to connect to %s: %m", VARLINK_ADDR_PATH_MANAGED_OOM);
return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");
(void) varlink_set_userdata(link, m);
(void) varlink_set_description(link, "oomd");
@ -299,7 +344,7 @@ static int acquire_managed_oom_connect(Manager *m) {
if (r < 0)
return log_error_errno(r, "Failed to observe varlink call: %m");
m->varlink = TAKE_PTR(link);
m->varlink_client = TAKE_PTR(link);
return 0;
}
@ -321,7 +366,7 @@ static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void
return log_error_errno(r, "Failed to set relative time for timer: %m");
/* Reconnect if our connection dropped */
if (!m->varlink) {
if (!m->varlink_client) {
r = acquire_managed_oom_connect(m);
if (r < 0)
return log_error_errno(r, "Failed to acquire varlink connection: %m");
@ -411,7 +456,7 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
return log_error_errno(r, "Failed to set relative time for timer: %m");
/* Reconnect if our connection dropped */
if (!m->varlink) {
if (!m->varlink_client) {
r = acquire_managed_oom_connect(m);
if (r < 0)
return log_error_errno(r, "Failed to acquire varlink connection: %m");
@ -568,7 +613,8 @@ static int monitor_memory_pressure_contexts(Manager *m) {
Manager* manager_free(Manager *m) {
assert(m);
varlink_close_unref(m->varlink);
varlink_server_unref(m->varlink_server);
varlink_close_unref(m->varlink_client);
sd_event_source_unref(m->swap_context_event_source);
sd_event_source_unref(m->mem_pressure_context_event_source);
sd_event_unref(m->event);
@ -652,12 +698,47 @@ static int manager_connect_bus(Manager *m) {
return 0;
}
static int manager_varlink_init(Manager *m, int fd) {
_cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
int r;
assert(m);
assert(!m->varlink_server);
r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
if (r < 0)
return log_error_errno(r, "Failed to allocate varlink server object: %m");
varlink_server_set_userdata(s, m);
r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
if (r < 0)
return log_error_errno(r, "Failed to register varlink method: %m");
if (fd < 0)
r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666);
else
r = varlink_server_listen_fd(s, fd);
if (r < 0)
return log_error_errno(r, "Failed to bind to varlink socket: %m");
r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
if (r < 0)
return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");
log_debug("Initialized systemd-oomd varlink server");
m->varlink_server = TAKE_PTR(s);
return 0;
}
int manager_start(
Manager *m,
bool dry_run,
int swap_used_limit_permyriad,
int mem_pressure_limit_permyriad,
usec_t mem_pressure_usec) {
usec_t mem_pressure_usec,
int fd) {
unsigned long l, f;
int r;
@ -692,6 +773,10 @@ int manager_start(
if (r < 0)
return r;
r = manager_varlink_init(m, fd);
if (r < 0)
return r;
r = monitor_memory_pressure_contexts(m);
if (r < 0)
return r;

View File

@ -53,7 +53,12 @@ struct Manager {
sd_event_source *swap_context_event_source;
sd_event_source *mem_pressure_context_event_source;
Varlink *varlink;
/* This varlink object is used to manage the subscription from systemd-oomd to PID1 which it uses to
* listen for changes in ManagedOOM settings (oomd client - systemd server). */
Varlink *varlink_client;
/* This varlink server object is used to manage systemd-oomd's varlink server which is used by user
* managers to report changes in ManagedOOM settings (oomd server - systemd client). */
VarlinkServer *varlink_server;
};
Manager* manager_free(Manager *m);
@ -61,7 +66,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
int manager_new(Manager **ret);
int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec);
int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec, int fd);
int manager_get_dump_string(Manager *m, char **ret);

View File

@ -136,6 +136,12 @@ static int run(int argc, char *argv[]) {
/* Do some basic requirement checks for running systemd-oomd. It's not exhaustive as some of the other
* requirements do not have a reliable means to check for in code. */
int n = sd_listen_fds(0);
if (n > 1)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Received too many file descriptors");
int fd = n == 1 ? SD_LISTEN_FDS_START : -1;
/* SwapTotal is always available in /proc/meminfo and defaults to 0, even on swap-disabled kernels. */
r = get_proc_field("/proc/meminfo", "SwapTotal", WHITESPACE, &swap);
if (r < 0)
@ -175,7 +181,8 @@ static int run(int argc, char *argv[]) {
arg_dry_run,
arg_swap_used_limit_permyriad,
arg_mem_pressure_limit_permyriad,
arg_mem_pressure_usec);
arg_mem_pressure_usec,
fd);
if (r < 0)
return log_error_errno(r, "Failed to start up daemon: %m");

View File

@ -1030,6 +1030,9 @@ install_systemd() {
echo LogLevel=debug >>"$initdir/etc/systemd/system.conf"
# store coredumps in journal
echo Storage=journal >>"$initdir/etc/systemd/coredump.conf"
# Propagate SYSTEMD_UNIT_PATH to user systemd managers
mkdir "$initdir/etc/systemd/system/user@.service.d/"
echo -e "[Service]\nPassEnvironment=SYSTEMD_UNIT_PATH\n" >"$initdir/etc/systemd/system/user@.service.d/override.conf"
}
get_ldpath() {

View File

@ -1,5 +1,7 @@
[Unit]
Description=TESTSUITE-55-OOMD
After=user@4711.service
Wants=user@4711.service
[Service]
ExecStartPre=rm -f /failed /skipped /testok

View File

@ -57,6 +57,39 @@ done
if systemctl status testsuite-55-testbloat.service; then exit 42; fi
if ! systemctl status testsuite-55-testchill.service; then exit 24; fi
# Make sure we also work correctly on user units.
runas() {
declare userid=$1
shift
# shellcheck disable=SC2016
su "$userid" -s /bin/sh -c 'XDG_RUNTIME_DIR=/run/user/$UID exec "$@"' -- sh "$@"
}
runas testuser systemctl start --user testsuite-55-testchill.service
runas testuser systemctl start --user testsuite-55-testbloat.service
# Verify systemd-oomd is monitoring the expected units
oomctl | grep -E "/user.slice.*/testsuite-55-workload.slice"
oomctl | grep "20.00%"
oomctl | grep "Default Memory Pressure Duration: 2s"
runas testuser systemctl --user status testsuite-55-testchill.service
# systemd-oomd watches for elevated pressure for 2 seconds before acting.
# It can take time to build up pressure so either wait 2 minutes or for the service to fail.
timeout="$(date -ud "2 minutes" +%s)"
while [[ $(date -u +%s) -le $timeout ]]; do
if ! runas testuser systemctl --user status testsuite-55-testbloat.service; then
break
fi
sleep 2
done
# testbloat should be killed and testchill should be fine
if runas testuser systemctl --user status testsuite-55-testbloat.service; then exit 42; fi
if ! runas testuser systemctl --user status testsuite-55-testchill.service; then exit 24; fi
# only run this portion of the test if we can set xattrs
if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down

View File

@ -163,6 +163,7 @@ units = [
['user.slice', ''],
['var-lib-machines.mount', 'ENABLE_MACHINED',
'remote-fs.target.wants/ machines.target.wants/'],
['systemd-oomd.socket', 'ENABLE_OOMD'],
]
in_units = [

View File

@ -18,6 +18,8 @@ ConditionControlGroupController=memory
ConditionPathExists=/proc/pressure/cpu
ConditionPathExists=/proc/pressure/io
ConditionPathExists=/proc/pressure/memory
Requires=systemd-oomd.socket
After=systemd-oomd.socket
[Service]
AmbientCapabilities=CAP_KILL CAP_DAC_OVERRIDE

21
units/systemd-oomd.socket Normal file
View File

@ -0,0 +1,21 @@
# SPDX-License-Identifier: LGPL-2.1-or-later
#
# This file is part of systemd.
#
# systemd is free software; you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or
# (at your option) any later version.
[Unit]
Description=Userspace Out-Of-Memory (OOM) Killer Socket
Documentation=man:systemd-oomd.service(8)
DefaultDependencies=no
Before=sockets.target
[Socket]
ListenStream=/run/systemd/oom/io.system.ManagedOOM
SocketMode=0666
[Install]
WantedBy=sockets.target