oomd: increase accuracy of SwapUsedLimit= to permyriads too

oomd.conf has two parameters with fractionals: SwapUsedLimit= and
DefaultMemoryPressureLimit=, but one accepts permyriads, the other only
percentages, for no apparent reason. One carries the "Percent" in the
name, the other doesn't.

Let's clean this up: always accept permyriads, and drop the suffix,
given that it is misleading.

I figure we should internally try to focus on scaling everything
relative to UINT32_MAX, and if that isn't in the cards at least 10000,
but never permille nor percent unless there's a really really good
reason for it (e.g. interface defined by someone else).
This commit is contained in:
Lennart Poettering 2021-02-17 17:56:26 +01:00
parent d9d3f05def
commit d06e7fb532
8 changed files with 71 additions and 61 deletions

View file

@ -48,36 +48,38 @@
<variablelist class='config-directives'>
<varlistentry>
<term><varname>SwapUsedLimitPercent=</varname></term>
<term><varname>SwapUsedLimit=</varname></term>
<listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command> will
take action. If the percentage of swap used on the system is more than what is defined here,
<command>systemd-oomd</command> will act on eligible descendant cgroups, starting from the ones with the
highest swap usage to the lowest swap usage. Which cgroups are monitored and what
action gets taken depends on what the unit has configured for <varname>ManagedOOMSwap=</varname>.
Takes a percentage value between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
<listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command>
will take action. If the fraction of swap used on the system is more than what is defined here,
<command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and
what action gets taken depends on what the unit has configured for
<varname>ManagedOOMSwap=</varname>. Takes a value specified in percent (when suffixed with "%"),
permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>DefaultMemoryPressureLimit=</varname></term>
<listitem><para>Sets the limit for memory pressure on the unit's cgroup before <command>systemd-oomd</command>
will take action. A unit can override this value with <varname>ManagedOOMMemoryPressureLimit=</varname>.
The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks
in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the
limit set for longer than the duration set by <varname>DefaultMemoryPressureDurationSec=</varname>,
<command>systemd-oomd</command> will act on eligible descendant cgroups,
starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are
monitored and what action gets taken depends on what the unit has configured for
<varname>ManagedOOMMemoryPressure=</varname>. Takes a percentage value between 0% and 100%, inclusive.
Defaults to 60%.</para></listitem>
<listitem><para>Sets the limit for memory pressure on the unit's control group before
<command>systemd-oomd</command> will take action. A unit can override this value with
<varname>ManagedOOMMemoryPressureLimit=</varname>. The memory pressure for this property represents
the fraction of time in a 10 second window in which all tasks in the control group were delayed. For
each monitored control group, if the memory pressure on that control group exceeds the limit set for
longer than the duration set by <varname>DefaultMemoryPressureDurationSec=</varname>,
<command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
ones with the most reclaim activity to the least reclaim activity. Which control groups are monitored
and what action gets taken depends on what the unit has configured for
<varname>ManagedOOMMemoryPressure=</varname>. Takes a fraction specified in the same way as
<varname>SwapUsedLimit=</varname> above. Defaults to 60%.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>DefaultMemoryPressureDurationSec=</varname></term>
<listitem><para>Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before
<command>systemd-oomd</command> will take action. Memory pressure limits are defined by
<listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
<varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
</varlistentry>

View file

@ -16,7 +16,7 @@ typedef struct ManagedOOMReply {
ManagedOOMMode mode;
char *path;
char *property;
unsigned limit;
uint32_t limit;
} ManagedOOMReply;
static void managed_oom_reply_destroy(ManagedOOMReply *reply) {
@ -53,10 +53,10 @@ static int process_managed_oom_reply(
assert(m);
static const JsonDispatch dispatch_table[] = {
{ "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
{ "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
{ "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
{ "limit", JSON_VARIANT_UNSIGNED, json_dispatch_unsigned, offsetof(ManagedOOMReply, limit), 0 },
{ "mode", JSON_VARIANT_STRING, managed_oom_mode, offsetof(ManagedOOMReply, mode), JSON_MANDATORY },
{ "path", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, path), JSON_MANDATORY },
{ "property", JSON_VARIANT_STRING, json_dispatch_string, offsetof(ManagedOOMReply, property), JSON_MANDATORY },
{ "limit", JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMReply, limit), 0 },
{},
};
@ -87,7 +87,8 @@ static int process_managed_oom_reply(
if (ret == -ENOMEM) {
r = ret;
goto finish;
} else if (ret < 0)
}
if (ret < 0)
continue;
monitor_hm = streq(reply.property, "ManagedOOMSwap") ?
@ -100,19 +101,15 @@ static int process_managed_oom_reply(
limit = m->default_mem_pressure_limit;
if (streq(reply.property, "ManagedOOMMemoryPressure")) {
if (reply.limit > UINT32_MAX) /* out of range */
continue;
if (reply.limit != 0) {
int permyriad = UINT32_SCALE_TO_PERMYRIAD(reply.limit);
if (streq(reply.property, "ManagedOOMMemoryPressure") && reply.limit > 0) {
int permyriad = UINT32_SCALE_TO_PERMYRIAD(reply.limit);
ret = store_loadavg_fixed_point(
(unsigned long) permyriad / 100,
(unsigned long) permyriad % 100,
&limit);
if (ret < 0)
continue;
}
ret = store_loadavg_fixed_point(
(unsigned long) permyriad / 100,
(unsigned long) permyriad % 100,
&limit);
if (ret < 0)
continue;
}
ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path));
@ -354,11 +351,11 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
}
}
if (oomd_swap_free_below(&m->system_context, (100 - m->swap_used_limit))) {
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than %u%%",
m->system_context.swap_used, m->system_context.swap_total, m->swap_used_limit);
log_notice("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
m->system_context.swap_used, m->system_context.swap_total, PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
if (r == -ENOMEM)
@ -484,7 +481,13 @@ static int manager_connect_bus(Manager *m) {
return 0;
}
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec) {
int manager_start(
Manager *m,
bool dry_run,
int swap_used_limit_permyriad,
int mem_pressure_limit_permyriad,
usec_t mem_pressure_usec) {
unsigned long l, f;
int r;
@ -492,10 +495,10 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur
m->dry_run = dry_run;
m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT;
assert(m->swap_used_limit <= 100);
m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
assert(m->swap_used_limit_permyriad <= 10000);
if (mem_pressure_limit_permyriad != -1) {
if (mem_pressure_limit_permyriad >= 0) {
assert(mem_pressure_limit_permyriad <= 10000);
l = mem_pressure_limit_permyriad / 100;
@ -543,12 +546,12 @@ int manager_get_dump_string(Manager *m, char **ret) {
fprintf(f,
"Dry Run: %s\n"
"Swap Used Limit: %u%%\n"
"Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
"Default Memory Pressure Limit: %lu.%02lu%%\n"
"Default Memory Pressure Duration: %s\n"
"System Context:\n",
yes_no(m->dry_run),
m->swap_used_limit,
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit),
format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
oomd_dump_system_context(&m->system_context, f, "\t");

View file

@ -18,7 +18,7 @@
* system.slice are assumed to be less latency sensitive. */
#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
#define DEFAULT_SWAP_USED_LIMIT 90
#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
@ -32,7 +32,7 @@ struct Manager {
Hashmap *polkit_registry;
bool dry_run;
unsigned swap_used_limit;
int swap_used_limit_permyriad;
loadavg_t default_mem_pressure_limit;
usec_t default_mem_pressure_duration_usec;
@ -56,7 +56,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
int manager_new(Manager **ret);
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec);
int manager_start(Manager *m, bool dry_run, int swap_used_limit_permyriad, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec);
int manager_get_dump_string(Manager *m, char **ret);

View file

@ -134,13 +134,13 @@ bool oomd_memory_reclaim(Hashmap *h) {
return pgscan_of > last_pgscan_of;
}
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) {
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
uint64_t swap_threshold;
assert(ctx);
assert(threshold_percent <= 100);
assert(threshold_permyriad <= 10000);
swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100);
swap_threshold = ctx->swap_total * threshold_permyriad / (uint64_t) 10000;
return (ctx->swap_total - ctx->swap_used) < swap_threshold;
}

View file

@ -61,8 +61,8 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
* current sum is higher than the last interval's sum (there was some reclaim activity). */
bool oomd_memory_reclaim(Hashmap *h);
/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
/* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
* (after the smallest values). */

View file

@ -17,13 +17,13 @@
#include "signal-util.h"
static bool arg_dry_run = false;
static int arg_swap_used_limit = -1;
static int arg_swap_used_limit_permyriad = -1;
static int arg_mem_pressure_limit_permyriad = -1;
static usec_t arg_mem_pressure_usec = 0;
static int parse_config(void) {
static const ConfigTableItem items[] = {
{ "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit },
{ "OOM", "SwapUsedLimit", config_parse_permyriad, 0, &arg_swap_used_limit_permyriad },
{ "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad },
{ "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec },
{}
@ -159,7 +159,12 @@ static int run(int argc, char *argv[]) {
if (r < 0)
return log_error_errno(r, "Failed to create manager: %m");
r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit_permyriad, arg_mem_pressure_usec);
r = manager_start(
m,
arg_dry_run,
arg_swap_used_limit_permyriad,
arg_mem_pressure_limit_permyriad,
arg_mem_pressure_usec);
if (r < 0)
return log_error_errno(r, "Failed to start up daemon: %m");

View file

@ -12,6 +12,6 @@
# See oomd.conf(5) for details
[OOM]
#SwapUsedLimitPercent=90%
#SwapUsedLimit=90%
#DefaultMemoryPressureLimit=60%
#DefaultMemoryPressureDurationSec=30s

View file

@ -302,19 +302,19 @@ static void test_oomd_swap_free_below(void) {
.swap_total = 20971512 * 1024U,
.swap_used = 20971440 * 1024U,
};
assert_se(oomd_swap_free_below(&ctx, 20) == true);
assert_se(oomd_swap_free_below(&ctx, 2000) == true);
ctx = (OomdSystemContext) {
.swap_total = 20971512 * 1024U,
.swap_used = 3310136 * 1024U,
};
assert_se(oomd_swap_free_below(&ctx, 20) == false);
assert_se(oomd_swap_free_below(&ctx, 2000) == false);
ctx = (OomdSystemContext) {
.swap_total = 0,
.swap_used = 0,
};
assert_se(oomd_swap_free_below(&ctx, 20) == false);
assert_se(oomd_swap_free_below(&ctx, 2000) == false);
}
static void test_oomd_sort_cgroups(void) {