diff --git a/src/core/cgroup.c b/src/core/cgroup.c index edd10fc31d7..25c7880f6b7 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -208,6 +208,126 @@ void cgroup_context_done(CGroupContext *c) { cpu_set_reset(&c->cpuset_mems); } +static int unit_get_kernel_memory_limit(Unit *u, const char *file, uint64_t *ret) { + _cleanup_free_ char *raw_kval = NULL; + uint64_t kval; + int r; + + assert(u); + + if (!u->cgroup_realized) + return -EOWNERDEAD; + + r = cg_get_attribute("memory", u->cgroup_path, file, &raw_kval); + if (r < 0) + return r; + + if (streq(raw_kval, "max")) { + *ret = CGROUP_LIMIT_MAX; + return 0; + } + + r = safe_atou64(raw_kval, &kval); + if (r < 0) + return r; + + *ret = kval; + + return 0; +} + +static int unit_compare_memory_limit(Unit *u, const char *property_name, uint64_t *ret_unit_value, uint64_t *ret_kernel_value) { + CGroupContext *c; + CGroupMask m; + const char *file; + uint64_t unit_value; + int r; + + /* Compare kernel memcg configuration against our internal systemd state. Unsupported (and will + * return -ENODATA) on cgroup v1. + * + * Returns: + * + * <0: On error. + * 0: If the kernel memory setting doesn't match our configuration. + * >0: If the kernel memory setting matches our configuration. + * + * The following values are only guaranteed to be populated on return >=0: + * + * - ret_unit_value will contain our internal expected value for the unit, page-aligned. + * - ret_kernel_value will contain the actual value presented by the kernel. */ + + assert(u); + + r = cg_all_unified(); + if (r < 0) + return log_debug_errno(r, "Failed to determine cgroup hierarchy version: %m"); + + /* Unsupported on v1. + * + * We don't return ENOENT, since that could actually mask a genuine problem where somebody else has + * silently masked the controller. */ + if (r == 0) + return -ENODATA; + + /* The root slice doesn't have any controller files, so we can't compare anything. */ + if (unit_has_name(u, SPECIAL_ROOT_SLICE)) + return -ENODATA; + + /* It's possible to have MemoryFoo set without systemd wanting to have the memory controller enabled, + * for example, in the case of DisableControllers= or cgroup_disable on the kernel command line. To + * avoid specious errors in these scenarios, check that we even expect the memory controller to be + * enabled at all. */ + m = unit_get_target_mask(u); + if (!FLAGS_SET(m, CGROUP_MASK_MEMORY)) + return -ENODATA; + + c = unit_get_cgroup_context(u); + assert(c); + + if (streq(property_name, "MemoryLow")) { + unit_value = unit_get_ancestor_memory_low(u); + file = "memory.low"; + } else if (streq(property_name, "MemoryMin")) { + unit_value = unit_get_ancestor_memory_min(u); + file = "memory.min"; + } else if (streq(property_name, "MemoryHigh")) { + unit_value = c->memory_high; + file = "memory.high"; + } else if (streq(property_name, "MemoryMax")) { + unit_value = c->memory_max; + file = "memory.max"; + } else if (streq(property_name, "MemorySwapMax")) { + unit_value = c->memory_swap_max; + file = "memory.swap.max"; + } else + return -EINVAL; + + r = unit_get_kernel_memory_limit(u, file, ret_kernel_value); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to parse %s: %m", file); + + /* It's intended (soon) in a future kernel to not expose cgroup memory limits rounded to page + * boundaries, but instead separate the user-exposed limit, which is whatever userspace told us, from + * our internal page-counting. To support those future kernels, just check the value itself first + * without any page-alignment. */ + if (*ret_kernel_value == unit_value) { + *ret_unit_value = unit_value; + return 1; + } + + /* The current kernel behaviour, by comparison, is that even if you write a particular number of + * bytes into a cgroup memory file, it always returns that number page-aligned down (since the kernel + * internally stores cgroup limits in pages). As such, so long as it aligns properly, everything is + * cricket. */ + if (unit_value != CGROUP_LIMIT_MAX) + unit_value = PAGE_ALIGN_DOWN(unit_value); + + *ret_unit_value = unit_value; + + return *ret_kernel_value == *ret_unit_value; +} + void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix) { _cleanup_free_ char *disable_controllers_str = NULL; _cleanup_free_ char *cpuset_cpus = NULL;