diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 400e35c4574..b104044cc26 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -505,7 +505,7 @@ DefaultOOMPolicy= Configure the default policy for reacting to processes being killed by the Linux - Out-Of-Memory (OOM) killer. This may be used to pick a global default for the per-unit + Out-Of-Memory (OOM) killer or systemd-oomd. This may be used to pick a global default for the per-unit OOMPolicy= setting. See systemd.service5 for details. Note that this default is not used for services that have Delegate= diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 89a6c337594..5bb1679aea4 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1039,7 +1039,7 @@ CapabilityBoundingSet=~CAP_B CAP_C normally at 0. Use the OOMPolicy= setting of service units to configure how the service - manager shall react to the kernel OOM killer terminating a process of the service. See + manager shall react to the kernel OOM killer or systemd-oomd terminating a process of the service. See systemd.service5 for details. diff --git a/man/systemd.service.xml b/man/systemd.service.xml index 95cb0aca3d4..4e4a9732e41 100644 --- a/man/systemd.service.xml +++ b/man/systemd.service.xml @@ -1140,7 +1140,11 @@ shall be considered preferred or less preferred candidates for process termination by the Linux OOM killer logic. See systemd.exec5 for - details. + details. + + This setting also applies to systemd-oomd. Similar to kernel OOM kills, + this setting determines the state of the service after systemd-oomd kills a cgroup associated + with the service. 
diff --git a/src/core/cgroup.c b/src/core/cgroup.c index dda87db7e17..15ab363548a 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -2958,6 +2958,10 @@ static int on_cgroup_empty_event(sd_event_source *s, void *userdata) { log_debug_errno(r, "Failed to reenable cgroup empty event source, ignoring: %m"); } + /* Update state based on OOM kills before we notify about cgroup empty event */ + (void) unit_check_oom(u); + (void) unit_check_oomd_kill(u); + unit_add_to_gc_queue(u); if (UNIT_VTABLE(u)->notify_cgroup_empty) @@ -3037,7 +3041,7 @@ int unit_check_oomd_kill(Unit *u) { else if (r == 0) return 0; - r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value); + r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_ooms", &value); if (r < 0 && r != -ENODATA) return r; @@ -3053,11 +3057,25 @@ int unit_check_oomd_kill(Unit *u) { if (!increased) return 0; + n = 0; + value = mfree(value); + r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value); + if (r >= 0 && !isempty(value)) + (void) safe_atou64(value, &n); + if (n > 0) log_unit_struct(u, LOG_NOTICE, "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR, LOG_UNIT_INVOCATION_ID(u), - LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n)); + LOG_UNIT_MESSAGE(u, "systemd-oomd killed %"PRIu64" process(es) in this unit.", n), + "N_PROCESSES=%" PRIu64, n); + else + log_unit_struct(u, LOG_NOTICE, + "MESSAGE_ID=" SD_MESSAGE_UNIT_OOMD_KILL_STR, + LOG_UNIT_INVOCATION_ID(u), + LOG_UNIT_MESSAGE(u, "systemd-oomd killed some process(es) in this unit.")); + + unit_notify_cgroup_oom(u, /* ManagedOOM= */ true); return 1; } @@ -3093,8 +3111,7 @@ int unit_check_oom(Unit *u) { LOG_UNIT_INVOCATION_ID(u), LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer.")); - if (UNIT_VTABLE(u)->notify_cgroup_oom) - UNIT_VTABLE(u)->notify_cgroup_oom(u); + unit_notify_cgroup_oom(u, /* ManagedOOM= */ false); 
return 1; } diff --git a/src/core/manager.c b/src/core/manager.c index a379bbefeaa..69717e5ba6e 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -2644,9 +2644,7 @@ static int manager_dispatch_sigchld(sd_event_source *source, void *userdata) { * We only do this for the cgroup the PID belonged to. */ (void) unit_check_oom(u1); - /* This only logs for now. In the future when the interface for kills/notifications - * is more stable we can extend service results table similar to how kernel oom kills - * are managed. */ + /* We check if systemd-oomd performed a kill so that we log and notify appropriately */ (void) unit_check_oomd_kill(u1); manager_invoke_sigchld_event(m, u1, &si); diff --git a/src/core/manager.h b/src/core/manager.h index f9096cf3485..c989ce9c32e 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -60,9 +60,9 @@ typedef enum StatusType { } StatusType; typedef enum OOMPolicy { - OOM_CONTINUE, /* The kernel kills the process it wants to kill, and that's it */ - OOM_STOP, /* The kernel kills the process it wants to kill, and we stop the unit */ - OOM_KILL, /* The kernel kills the process it wants to kill, and all others in the unit, and we stop the unit */ + OOM_CONTINUE, /* The kernel or systemd-oomd kills the process it wants to kill, and that's it */ + OOM_STOP, /* The kernel or systemd-oomd kills the process it wants to kill, and we stop the unit */ + OOM_KILL, /* The kernel or systemd-oomd kills the process it wants to kill, and all others in the unit, and we stop the unit */ _OOM_POLICY_MAX, _OOM_POLICY_INVALID = -EINVAL, } OOMPolicy; diff --git a/src/core/service.c b/src/core/service.c index 396c27956c0..2d7a0868524 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -3404,10 +3404,13 @@ static void service_notify_cgroup_empty_event(Unit *u) { } } -static void service_notify_cgroup_oom_event(Unit *u) { +static void service_notify_cgroup_oom_event(Unit *u, bool managed_oom) { Service *s = SERVICE(u); - log_unit_debug(u, 
"Process of control group was killed by the OOM killer."); + if (managed_oom) + log_unit_debug(u, "Process(es) of control group were killed by systemd-oomd."); + else + log_unit_debug(u, "Process of control group was killed by the OOM killer."); if (s->oom_policy == OOM_CONTINUE) return; diff --git a/src/core/service.h b/src/core/service.h index 4116e40d8f3..91e02e6d7ee 100644 --- a/src/core/service.h +++ b/src/core/service.h @@ -75,7 +75,7 @@ typedef enum ServiceResult { SERVICE_FAILURE_CORE_DUMP, SERVICE_FAILURE_WATCHDOG, SERVICE_FAILURE_START_LIMIT_HIT, - SERVICE_FAILURE_OOM_KILL, + SERVICE_FAILURE_OOM_KILL, /* OOM Kill by the Kernel or systemd-oomd */ SERVICE_SKIP_CONDITION, _SERVICE_RESULT_MAX, _SERVICE_RESULT_INVALID = -EINVAL, diff --git a/src/core/unit.c b/src/core/unit.c index 69ece074479..42fb4220f60 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -3801,6 +3801,13 @@ int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error) { return UNIT_VTABLE(u)->kill(u, w, signo, error); } +void unit_notify_cgroup_oom(Unit *u, bool managed_oom) { + assert(u); + + if (UNIT_VTABLE(u)->notify_cgroup_oom) + UNIT_VTABLE(u)->notify_cgroup_oom(u, managed_oom); +} + static Set *unit_pid_set(pid_t main_pid, pid_t control_pid) { _cleanup_set_free_ Set *pid_set = NULL; int r; diff --git a/src/core/unit.h b/src/core/unit.h index 94f2180951c..733eeecd7f0 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -285,7 +285,7 @@ typedef struct Unit { nsec_t cpu_usage_base; nsec_t cpu_usage_last; /* the most recently read value */ - /* The current counter of processes sent SIGKILL by systemd-oomd */ + /* The current counter of OOM kills initiated by systemd-oomd */ uint64_t managed_oom_kill_last; /* The current counter of the oom_kill field in the memory.events cgroup attribute */ @@ -596,7 +596,7 @@ typedef struct UnitVTable { void (*notify_cgroup_empty)(Unit *u); /* Called whenever an OOM kill event on this unit was seen */ - void (*notify_cgroup_oom)(Unit *u); + void 
(*notify_cgroup_oom)(Unit *u, bool managed_oom); /* Called whenever a process of this unit sends us a message */ void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds); @@ -811,6 +811,8 @@ int unit_reload(Unit *u); int unit_kill(Unit *u, KillWho w, int signo, sd_bus_error *error); int unit_kill_common(Unit *u, KillWho who, int signo, pid_t main_pid, pid_t control_pid, sd_bus_error *error); +void unit_notify_cgroup_oom(Unit *u, bool managed_oom); + typedef enum UnitNotifyFlags { UNIT_NOTIFY_RELOAD_FAILURE = 1 << 0, UNIT_NOTIFY_WILL_AUTO_RESTART = 1 << 1, diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c index 77718d9c9e3..a135824c536 100644 --- a/src/oom/oomd-util.c +++ b/src/oom/oomd-util.c @@ -192,6 +192,10 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { if (!pids_killed) return -ENOMEM; + r = increment_oomd_xattr(path, "user.oomd_ooms", 1); + if (r < 0) + log_debug_errno(r, "Failed to set user.oomd_ooms before kill: %m"); + if (recurse) r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL); else diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c index 13d9e60f16c..82a60ad8803 100644 --- a/src/oom/test-oomd-util.c +++ b/src/oom/test-oomd-util.c @@ -77,12 +77,16 @@ static void test_oomd_cgroup_kill(void) { abort(); } + assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_ooms", &v) >= 0); + assert_se(streq(v, i == 0 ? "1" : "2")); + v = mfree(v); + /* Wait a bit since processes may take some time to be cleaned up. */ sleep(2); assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0); - assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0); + assert_se(streq(v, i == 0 ? "2" : "4")); } }