Merge pull request #19126 from anitazha/oomdimprovements

systemd-oomd post-test week improvements
Zbigniew Jędrzejewski-Szmek 2021-04-06 07:59:59 +02:00 committed by GitHub
commit 9d5ae3a121
7 changed files with 281 additions and 226 deletions

@ -52,9 +52,9 @@
<listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command>
will take action. If the fraction of swap used on the system is more than what is defined here,
<command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and
what action gets taken depends on what the unit has configured for
<command>systemd-oomd</command> will act on eligible descendant control groups with swap usage greater
than 5% of total swap, starting from the ones with the highest swap usage. Which
control groups are monitored and what action gets taken depends on what the unit has configured for
<varname>ManagedOOMSwap=</varname>. Takes a value specified in percent (when suffixed with "%"),
permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
</varlistentry>
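
As an illustration of the arithmetic described above, here is a minimal sketch of the two checks involved (hypothetical helper names, not the functions from this patch; as elsewhere in this change, the limit is assumed to be stored in permyriad, so 90% corresponds to 9000):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical sketch: SwapUsedLimit= stored in permyriad (90% == 9000).
 * systemd-oomd acts once the used fraction of swap exceeds that limit, and only
 * considers cgroups that use more than 5% of total swap as kill candidates. */
static bool swap_limit_exceeded(uint64_t swap_used, uint64_t swap_total, uint64_t limit_permyriad) {
        if (swap_total == 0)
                return false;
        return swap_used * 10000 / swap_total > limit_permyriad;
}

static bool swap_kill_candidate(uint64_t cgroup_swap_usage, uint64_t swap_total) {
        return cgroup_swap_usage > swap_total * 5 / 100;
}
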
@ -81,7 +81,7 @@
<listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
<varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para></listitem>
</varlistentry>
</variablelist>
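
A small sketch of how the duration rule above can be read (hypothetical names; the actual validation added to oomd.c appears further down in this diff):

#include <errno.h>
#include <stdint.h>

#define USEC_PER_SEC UINT64_C(1000000)

/* Hypothetical sketch: 0 (or unset) means "use the 30 second default",
 * values between 0 and 1 second are rejected, anything else is taken as-is. */
static int resolve_mem_pressure_duration(uint64_t requested_usec, uint64_t *ret_usec) {
        if (requested_usec == 0) {
                *ret_usec = 30 * USEC_PER_SEC;
                return 0;
        }
        if (requested_usec < 1 * USEC_PER_SEC)
                return -EINVAL;
        *ret_usec = requested_usec;
        return 0;
}
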

@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
return 0;
}
static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
_cleanup_set_free_ Set *targets = NULL;
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
Manager *m = userdata;
usec_t usec_now;
int r;
@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
if (r < 0)
return log_error_errno(r, "Failed to reset event timer: %m");
r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
if (r < 0)
return log_error_errno(r, "Failed to set relative time for timer: %m");
@ -324,13 +323,98 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
return log_error_errno(r, "Failed to acquire varlink connection: %m");
}
/* Update the cgroups used for detection/action */
r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
/* We still try to acquire swap information for oomctl even if no units want swap monitoring */
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
/* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
* Allow ENOENT in the event that swap is disabled on the system. */
if (r == -ENOENT) {
zero(m->system_context);
return 0;
} else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
return log_error_errno(r, "Failed to acquire system context: %m");
/* Return early if nothing is requesting swap monitoring */
if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
return 0;
/* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
* system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
is only used to decide which cgroups to kill (and even then only the resource usages of its descendant
* nodes are the ones that matter). */
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
_cleanup_free_ char *selected = NULL;
uint64_t threshold;
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
m->system_context.swap_used, m->system_context.swap_total,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
else {
if (selected)
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
PERMYRIAD_AS_PERCENT_FORMAT_STR,
selected, m->system_context.swap_used, m->system_context.swap_total,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
return 0;
}
}
return 0;
}
static void clear_candidate_hashmapp(Manager **m) {
if (*m)
hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
}
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
/* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
* update the candidate data (in which case clear_candidates will be NULL). */
_cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
_cleanup_set_free_ Set *targets = NULL;
bool in_post_action_delay = false;
Manager *m = userdata;
usec_t usec_now;
int r;
assert(s);
assert(userdata);
/* Reset timer */
r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
if (r < 0)
return log_error_errno(r, "Failed to reset event timer: %m");
r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
if (r < 0)
return log_error_errno(r, "Failed to set relative time for timer: %m");
/* Reconnect if our connection dropped */
if (!m->varlink) {
r = acquire_managed_oom_connect(m);
if (r < 0)
return log_error_errno(r, "Failed to acquire varlink connection: %m");
}
/* Return early if nothing is requesting memory pressure monitoring */
if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
return 0;
/* Update the cgroups used for detection/action */
r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
if (r == -ENOMEM)
return log_oom();
@ -344,23 +428,13 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
if (r < 0)
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
* Allow ENOENT in the event that swap is disabled on the system. */
if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
return log_error_errno(r, "Failed to acquire system context: %m");
else if (r == -ENOENT)
zero(m->system_context);
if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
m->last_reclaim_at = usec_now;
/* If we're still recovering from a kill, don't try to kill again yet */
if (m->post_action_delay_start > 0) {
if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
return 0;
/* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
* values and go on a kill storm. */
if (m->mem_pressure_post_action_delay_start > 0) {
if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
in_post_action_delay = true;
else
m->post_action_delay_start = 0;
m->mem_pressure_post_action_delay_start = 0;
}
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
@ -368,91 +442,92 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
else if (r == 1) {
/* Check if there was reclaim activity in the given interval. The concern is the following case:
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
* cgroup. Even after this, well-behaved processes will fault in recently resident pages and
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
* to kill something (it won't help anyways). */
if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
OomdCGroupContext *t;
else if (r == 1 && !in_post_action_delay) {
OomdCGroupContext *t;
SET_FOREACH(t, targets) {
_cleanup_free_ char *selected = NULL;
char ts[FORMAT_TIMESPAN_MAX];
SET_FOREACH(t, targets) {
_cleanup_free_ char *selected = NULL;
char ts[FORMAT_TIMESPAN_MAX];
/* Check if there was reclaim activity in the given interval. The concern is the following case:
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
* cgroup. Even after this, well-behaved processes will fault in recently resident pages and
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
* to kill something (it won't help anyways). */
if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
continue;
log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
t->path,
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
format_timespan(ts, sizeof ts,
m->default_mem_pressure_duration_usec,
USEC_PER_SEC));
log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
t->path,
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
format_timespan(ts, sizeof ts,
m->default_mem_pressure_duration_usec,
USEC_PER_SEC));
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
else {
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
m->post_action_delay_start = usec_now;
if (selected)
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
" for > %s with reclaim activity",
selected, t->path,
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
format_timespan(ts, sizeof ts,
m->default_mem_pressure_duration_usec,
USEC_PER_SEC));
return 0;
}
r = update_monitored_cgroup_contexts_candidates(
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
else
clear_candidates = NULL;
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
else {
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
m->mem_pressure_post_action_delay_start = usec_now;
if (selected)
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
" for > %s with reclaim activity",
selected, t->path,
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
format_timespan(ts, sizeof ts,
m->default_mem_pressure_duration_usec,
USEC_PER_SEC));
return 0;
}
}
}
} else {
/* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
* monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
* might happen.
* Candidate cgroup data will continue to get updated during the post-action delay period in case
* pressure continues to be high after a kill. */
OomdCGroupContext *c;
HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
if (c->mem_pressure_limit_hit_start == 0)
continue;
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
_cleanup_free_ char *selected = NULL;
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
m->system_context.swap_used, m->system_context.swap_total,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
else {
m->post_action_delay_start = usec_now;
if (selected)
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
PERMYRIAD_AS_PERCENT_FORMAT_STR,
selected, m->system_context.swap_used, m->system_context.swap_total,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
return 0;
r = update_monitored_cgroup_contexts_candidates(
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
else {
clear_candidates = NULL;
break;
}
}
}
return 0;
}
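
The clear_candidates variable above relies on the compiler's cleanup attribute (which is what the _cleanup_() macro expands to) as a "disarmable" destructor: stale candidate data is cleared on every return path unless the pointer is reset to NULL after a successful refresh. A stripped-down sketch of that pattern, with generic names rather than the systemd types:

#include <stdio.h>

/* Called automatically when the annotated variable goes out of scope. */
static void maybe_clear(int **armed) {
        if (*armed)
                printf("clearing stale candidate data\n");
}

static int handler(int *state, int refresh_succeeded) {
        /* Armed by default: the cleanup runs on any return path... */
        __attribute__((cleanup(maybe_clear))) int *clear_on_return = state;

        if (refresh_succeeded)
                clear_on_return = NULL; /* ...unless we disarm it after a successful update. */

        return 0;
}

int main(void) {
        int state = 42;
        handler(&state, 0); /* prints the message */
        handler(&state, 1); /* disarmed, prints nothing */
        return 0;
}
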
static int monitor_cgroup_contexts(Manager *m) {
static int monitor_swap_contexts(Manager *m) {
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
int r;
assert(m);
assert(m->event);
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
if (r < 0)
return r;
@ -464,9 +539,34 @@ static int monitor_cgroup_contexts(Manager *m) {
if (r < 0)
return r;
(void) sd_event_source_set_description(s, "oomd-timer");
(void) sd_event_source_set_description(s, "oomd-swap-timer");
m->cgroup_context_event_source = TAKE_PTR(s);
m->swap_context_event_source = TAKE_PTR(s);
return 0;
}
static int monitor_memory_pressure_contexts(Manager *m) {
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
int r;
assert(m);
assert(m->event);
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
if (r < 0)
return r;
r = sd_event_source_set_exit_on_failure(s, true);
if (r < 0)
return r;
r = sd_event_source_set_enabled(s, SD_EVENT_ON);
if (r < 0)
return r;
(void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
m->mem_pressure_context_event_source = TAKE_PTR(s);
return 0;
}
@ -474,7 +574,8 @@ Manager* manager_free(Manager *m) {
assert(m);
varlink_close_unref(m->varlink);
sd_event_source_unref(m->cgroup_context_event_source);
sd_event_source_unref(m->swap_context_event_source);
sd_event_source_unref(m->mem_pressure_context_event_source);
sd_event_unref(m->event);
bus_verify_polkit_async_registry_free(m->polkit_registry);
@ -596,7 +697,11 @@ int manager_start(
if (r < 0)
return r;
r = monitor_cgroup_contexts(m);
r = monitor_memory_pressure_contexts(m);
if (r < 0)
return r;
r = monitor_swap_contexts(m);
if (r < 0)
return r;

@ -7,10 +7,9 @@
#include "varlink.h"
/* Polling interval for monitoring stats */
#define INTERVAL_USEC (1 * USEC_PER_SEC)
/* Used to weight the averages */
#define AVERAGE_SIZE_DECAY 4
#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
* percentage of time all tasks were delayed (i.e. unproductive).
@ -20,6 +19,9 @@
#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
/* Only tackle candidates with large swap usage. */
#define THRESHOLD_SWAP_USED_PERCENT 5
#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
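
For context on the limit above: the value compared against it is the "full" avg10 field of a cgroup's memory.pressure file (cgroup v2 PSI). A rough, simplified sketch of pulling that number out — the real parser in systemd is more careful about the file format:

#include <stdio.h>

/* memory.pressure looks roughly like:
 *   some avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *   full avg10=71.32 avg60=50.11 avg300=12.30 total=123456
 * "full avg10" is the share of the last 10 seconds during which all tasks in
 * the cgroup were stalled on memory, i.e. the time was unproductive. */
static int read_full_avg10(const char *path, double *ret) {
        char line[256];
        FILE *f = fopen(path, "re");
        if (!f)
                return -1;
        while (fgets(line, sizeof line, f))
                if (sscanf(line, "full avg10=%lf", ret) == 1) {
                        fclose(f);
                        return 0;
                }
        fclose(f);
        return -1;
}

With the defaults above, action is considered only after this value has stayed above 60% for the configured duration (30 seconds by default).
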
@ -44,10 +46,10 @@ struct Manager {
OomdSystemContext system_context;
usec_t last_reclaim_at;
usec_t post_action_delay_start;
usec_t mem_pressure_post_action_delay_start;
sd_event_source *cgroup_context_event_source;
sd_event_source *swap_context_event_source;
sd_event_source *mem_pressure_context_event_source;
Varlink *varlink;
};

@ -82,17 +82,17 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
usec_t diff;
if (ctx->last_hit_mem_pressure_limit == 0)
ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
if (ctx->mem_pressure_limit_hit_start == 0)
ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
if (diff >= duration) {
r = set_put(targets, ctx);
if (r < 0)
return -ENOMEM;
}
} else
ctx->last_hit_mem_pressure_limit = 0;
ctx->mem_pressure_limit_hit_start = 0;
}
if (!set_isempty(targets)) {
@ -104,34 +104,21 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
return 0;
}
bool oomd_memory_reclaim(Hashmap *h) {
uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
OomdCGroupContext *ctx;
uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
uint64_t last_pgscan;
assert(h);
assert(c);
/* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
* there was reclaim activity. Used along with pressure checks to decide whether to take action. */
HASHMAP_FOREACH(ctx, h) {
uint64_t sum;
sum = pgscan + ctx->pgscan;
if (sum < pgscan || sum < ctx->pgscan)
pgscan_of++; /* count overflows */
pgscan = sum;
sum = last_pgscan + ctx->last_pgscan;
if (sum < last_pgscan || sum < ctx->last_pgscan)
last_pgscan_of++; /* count overflows */
last_pgscan = sum;
/* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
* pgscan is monotonic and in practice should not decrease (except in the recreation case). */
last_pgscan = c->last_pgscan;
if (c->last_pgscan > c->pgscan) {
log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
c->last_pgscan, c->pgscan, c->path);
last_pgscan = 0;
}
/* overflow counts are the same, return sums comparison */
if (last_pgscan_of == pgscan_of)
return pgscan > last_pgscan;
return pgscan_of > last_pgscan_of;
return c->pgscan - last_pgscan;
}
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
@ -246,7 +233,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
return ret;
}
int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
_cleanup_free_ OomdCGroupContext **sorted = NULL;
int n, r, ret = 0;
@ -257,12 +244,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
if (n < 0)
return n;
/* Try to kill cgroups with non-zero swap usage until we either succeed in
* killing or we get to a cgroup with no swap usage. */
/* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
* no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
for (int i = 0; i < n; i++) {
/* Skip over cgroups with no resource usage.
* Continue break since there might be "avoid" cgroups at the end. */
if (sorted[i]->swap_usage == 0)
/* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
* cgroups at the end. */
if (sorted[i]->swap_usage <= threshold_usage)
continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
@ -430,9 +417,13 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
if (old_ctx) {
curr_ctx->last_pgscan = old_ctx->pgscan;
curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
}
if (oomd_pgscan_rate(curr_ctx) > 0)
curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
if (r < 0)
return r;
@ -456,7 +447,11 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
ctx->last_pgscan = old_ctx->pgscan;
ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
if (oomd_pgscan_rate(ctx) > 0)
ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
}
}

@ -32,10 +32,10 @@ struct OomdCGroupContext {
ManagedOOMPreference preference;
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
/* These are only used for acting on high memory pressure. */
loadavg_t mem_pressure_limit;
usec_t mem_pressure_duration_usec;
usec_t last_hit_mem_pressure_limit;
usec_t mem_pressure_limit_hit_start;
usec_t last_had_mem_reclaim;
};
struct OomdSystemContext {
@ -51,23 +51,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
* if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
* `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
* `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
* below the limit.
* Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
* Returns -ENOMEM for allocation errors. */
int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
* current sum is higher than the last interval's sum (there was some reclaim activity). */
bool oomd_memory_reclaim(Hashmap *h);
/* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
/* Returns pgscan - last_pgscan, accounting for corner cases. */
uint64_t oomd_pgscan_rate(const OomdCGroupContext *c);
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
* (after the smallest values). */
static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
uint64_t last1, last2;
uint64_t diff1, diff2;
int r;
assert(c1);
@ -77,22 +76,9 @@ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const
if (r != 0)
return r;
/* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */
last2 = (*c2)->last_pgscan;
if ((*c2)->last_pgscan > (*c2)->pgscan) {
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
(*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path);
last2 = 0;
}
last1 = (*c1)->last_pgscan;
if ((*c1)->last_pgscan > (*c1)->pgscan) {
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
(*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path);
last1 = 0;
}
r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1);
diff1 = oomd_pgscan_rate(*c1);
diff2 = oomd_pgscan_rate(*c2);
r = CMP(diff2, diff1);
if (r != 0)
return r;
@ -125,7 +111,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
* everything in `h` is a candidate.
* Returns the killed cgroup in ret_selected. */
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected);
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);

@ -155,6 +155,9 @@ static int run(int argc, char *argv[]) {
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s");
r = manager_new(&m);
if (r < 0)
return log_error_errno(r, "Failed to create manager: %m");

@ -160,9 +160,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
/* make sure certain values from h1 get updated in h2 */
c1->pgscan = 5555;
c1->pgscan = UINT64_MAX;
c1->mem_pressure_limit = 6789;
c1->last_hit_mem_pressure_limit = 42;
c1->mem_pressure_limit_hit_start = 42;
c1->last_had_mem_reclaim = 888;
assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
c1 = hashmap_get(h1, cgroup);
@ -170,9 +171,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(c1);
assert_se(c2);
assert_se(c1 != c2);
assert_se(c2->last_pgscan == 5555);
assert_se(c2->last_pgscan == UINT64_MAX);
assert_se(c2->mem_pressure_limit == 6789);
assert_se(c2->last_hit_mem_pressure_limit == 42);
assert_se(c2->mem_pressure_limit_hit_start == 42);
assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
/* Assert that avoid/omit are not set if the cgroup is not owned by root */
if (test_xattrs) {
@ -189,20 +191,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
char **paths = STRV_MAKE("/0.slice",
"/1.slice");
OomdCGroupContext ctx_old[3] = {
OomdCGroupContext ctx_old[2] = {
{ .path = paths[0],
.mem_pressure_limit = 5,
.last_hit_mem_pressure_limit = 777,
.mem_pressure_limit_hit_start = 777,
.last_had_mem_reclaim = 888,
.pgscan = 57 },
{ .path = paths[1],
.mem_pressure_limit = 6,
.last_hit_mem_pressure_limit = 888,
.mem_pressure_limit_hit_start = 888,
.last_had_mem_reclaim = 888,
.pgscan = 42 },
};
OomdCGroupContext ctx_new[3] = {
OomdCGroupContext ctx_new[2] = {
{ .path = paths[0],
.pgscan = 100 },
.pgscan = 57 },
{ .path = paths[1],
.pgscan = 101 },
};
@ -221,13 +225,15 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
assert_se(c_new = hashmap_get(h_new, "/0.slice"));
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
assert_se(c_old = hashmap_get(h_old, "/1.slice"));
assert_se(c_new = hashmap_get(h_new, "/1.slice"));
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
}
static void test_oomd_system_context_acquire(void) {
@ -290,7 +296,7 @@ static void test_oomd_pressure_above(void) {
assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
assert_se(set_contains(t1, &ctx[0]) == true);
assert_se(c = hashmap_get(h1, "/herp.slice"));
assert_se(c->last_hit_mem_pressure_limit > 0);
assert_se(c->mem_pressure_limit_hit_start > 0);
/* Low memory pressure */
assert_se(h2 = hashmap_new(&string_hash_ops));
@ -298,7 +304,7 @@ static void test_oomd_pressure_above(void) {
assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
assert_se(t2 == NULL);
assert_se(c = hashmap_get(h2, "/derp.slice"));
assert_se(c->last_hit_mem_pressure_limit == 0);
assert_se(c->mem_pressure_limit_hit_start == 0);
/* High memory pressure w/ multiple cgroups */
assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
@ -306,50 +312,9 @@ static void test_oomd_pressure_above(void) {
assert_se(set_contains(t3, &ctx[0]) == true);
assert_se(set_size(t3) == 1);
assert_se(c = hashmap_get(h1, "/herp.slice"));
assert_se(c->last_hit_mem_pressure_limit > 0);
assert_se(c->mem_pressure_limit_hit_start > 0);
assert_se(c = hashmap_get(h1, "/derp.slice"));
assert_se(c->last_hit_mem_pressure_limit == 0);
}
static void test_oomd_memory_reclaim(void) {
_cleanup_hashmap_free_ Hashmap *h1 = NULL;
char **paths = STRV_MAKE("/0.slice",
"/1.slice",
"/2.slice",
"/3.slice",
"/4.slice");
OomdCGroupContext ctx[5] = {
{ .path = paths[0],
.last_pgscan = 100,
.pgscan = 100 },
{ .path = paths[1],
.last_pgscan = 100,
.pgscan = 100 },
{ .path = paths[2],
.last_pgscan = 77,
.pgscan = 33 },
{ .path = paths[3],
.last_pgscan = UINT64_MAX,
.pgscan = 100 },
{ .path = paths[4],
.last_pgscan = 100,
.pgscan = UINT64_MAX },
};
assert_se(h1 = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
assert_se(oomd_memory_reclaim(h1) == false);
assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
assert_se(oomd_memory_reclaim(h1) == false);
assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
assert_se(oomd_memory_reclaim(h1) == true);
assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
assert_se(oomd_memory_reclaim(h1) == false);
assert_se(c->mem_pressure_limit_hit_start == 0);
}
static void test_oomd_swap_free_below(void) {
@ -468,7 +433,6 @@ int main(void) {
test_oomd_update_cgroup_contexts_between_hashmaps();
test_oomd_system_context_acquire();
test_oomd_pressure_above();
test_oomd_memory_reclaim();
test_oomd_swap_free_below();
test_oomd_sort_cgroups();