diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 3f19d2f56a7..141fde05b42 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -529,6 +529,10 @@ node /org/freedesktop/systemd1 { readonly t DefaultLimitRTTIMESoft = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly t DefaultTasksMax = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t DefaultMemoryPressureThresholdUSec = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s DefaultMemoryPressureWatch = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly t TimerSlackNSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -782,6 +786,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -1208,6 +1216,10 @@ node /org/freedesktop/systemd1 { + + + + @@ -2803,6 +2815,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -3395,6 +3411,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -3995,6 +4015,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + + @@ -4747,6 +4771,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s 
MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -5359,6 +5387,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -5949,6 +5981,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + + @@ -6590,6 +6626,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -7130,6 +7170,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -7638,6 +7682,10 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + + @@ -8406,6 +8454,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -8932,6 +8984,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + + @@ -9426,6 +9482,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap 
{ + + + + @@ -10053,6 +10113,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; }; interface org.freedesktop.DBus.Peer { ... }; interface org.freedesktop.DBus.Introspectable { ... }; @@ -10219,6 +10283,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -10391,6 +10459,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + + @@ -10586,6 +10658,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { readonly a(iiqq) SocketBindDeny = [...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly (bas) RestrictNetworkInterfaces = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s MemoryPressureWatch = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s KillMode = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -10772,6 +10848,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + @@ -10974,6 +11054,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + + diff --git a/man/systemd-system.conf.xml b/man/systemd-system.conf.xml index 82a52e21509..71d403db8d0 100644 --- a/man/systemd-system.conf.xml +++ b/man/systemd-system.conf.xml @@ -556,6 +556,18 @@ to configure the rate limit window, and ReloadLimitBurst= takes a positive integer to configure the maximum allowed number of reloads within the configured time window. 
+ + + DefaultMemoryPressureWatch= + DefaultMemoryPressureThresholdSec= + + Configures the default settings for the per-unit + MemoryPressureWatch= and MemoryPressureThresholdSec= + settings. See + systemd.resource-control5 + for details. Defaults to auto and 100ms, respectively. This + also sets the memory pressure monitoring threshold for the service manager itself. + diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index e6969416822..795e2f0671c 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -3779,6 +3779,16 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX + + $MEMORY_PRESSURE_WATCH + $MEMORY_PRESSURE_WRITE + + If memory pressure monitoring is enabled for this service unit, the path to watch + and the data to write into it. See Memory Pressure + Handling for details about these variables and the service protocol data they + convey. + + For system services, when PAMName= is enabled and pam_systemd is part diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 27e29e853a1..f057433973d 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1169,6 +1169,53 @@ DeviceAllow=/dev/loop-control + + + MemoryPressureWatch= + + Controls memory pressure monitoring for invoked processes. Takes one of + off, on, auto or skip. If + off tells the service not to watch for memory pressure events, by setting the + $MEMORY_PRESSURE_WATCH environment variable to the literal string + /dev/null. If on tells the service to watch for memory + pressure events. This enables memory accounting for the service, and ensures the + memory.pressure cgroup attribute file is accessible for read and write to the + service's user. It then sets the $MEMORY_PRESSURE_WATCH environment variable for + processes invoked by the unit to the file system path to this file. 
The threshold information + configured with MemoryPressureThresholdSec= is encoded in the + $MEMORY_PRESSURE_WRITE environment variable. If the auto value + is set the protocol is enabled if memory accounting is enabled anyway for the unit, and disabled + otherwise. If set to skip the logic is neither enabled, nor disabled and the two + environment variables are not set. + + Note that services are free to use the two environment variables, but it's unproblematic if + they ignore them. Memory pressure handling must be implemented individually in each service, and + usually means different things for different software. For further details on memory pressure + handling see Memory Pressure Handling in + systemd. + + Services implemented using + sd-event3 may use + sd_event_add_memory_pressure3 + to watch for and handle memory pressure events. + + If not explicitly set, defaults to the DefaultMemoryPressureWatch= setting in + systemd-system.conf5. + + + + MemoryPressureThresholdSec= + + Sets the memory pressure threshold time for the memory pressure monitor as configured via + MemoryPressureWatch=. Specifies the maximum allocation latency before a memory + pressure event is signalled to the service, per 1s window. If not specified, defaults to the + DefaultMemoryPressureThresholdSec= setting in + systemd-system.conf5 + (which in turn defaults to 100ms). The specified value expects a time unit such as + ms or µs, see + systemd.time7 for + details on the permitted syntax. 
+ diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 441d1125d29..41eb755fa5d 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -175,6 +175,9 @@ void cgroup_context_init(CGroupContext *c) { .moom_swap = MANAGED_OOM_AUTO, .moom_mem_pressure = MANAGED_OOM_AUTO, .moom_preference = MANAGED_OOM_PREFERENCE_NONE, + + .memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID, + .memory_pressure_threshold_usec = USEC_INFINITY, }; } @@ -517,7 +520,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sManagedOOMSwap: %s\n" "%sManagedOOMMemoryPressure: %s\n" "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" - "%sManagedOOMPreference: %s\n", + "%sManagedOOMPreference: %s\n" + "%sMemoryPressureWatch: %s\n", prefix, yes_no(c->cpu_accounting), prefix, yes_no(c->io_accounting), prefix, yes_no(c->blockio_accounting), @@ -559,7 +563,12 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_mode_to_string(c->moom_swap), prefix, managed_oom_mode_to_string(c->moom_mem_pressure), prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)), - prefix, managed_oom_preference_to_string(c->moom_preference)); + prefix, managed_oom_preference_to_string(c->moom_preference), + prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + + if (c->memory_pressure_threshold_usec != USEC_INFINITY) + fprintf(f, "%sMemoryPressureThresholdSec: %s\n", + prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1)); if (c->delegate) { _cleanup_free_ char *t = NULL; @@ -4376,3 +4385,12 @@ static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { }; DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); + +static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = { + [CGROUP_PRESSURE_WATCH_OFF] = "off", + [CGROUP_PRESSURE_WATCH_AUTO] = "auto", + [CGROUP_PRESSURE_WATCH_ON] = "on", + [CGROUP_PRESSURE_WATCH_SKIP] = "skip", +}; + 
+DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON); diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 7d905096420..8e1f0939016 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -110,6 +110,15 @@ struct CGroupSocketBindItem { uint16_t port_min; }; +typedef enum CGroupPressureWatch { + CGROUP_PRESSURE_WATCH_OFF, /* → tells the service payload explicitly not to watch for memory pressure */ + CGROUP_PRESSURE_WATCH_AUTO, /* → on if memory account is on anyway for the unit, otherwise off */ + CGROUP_PRESSURE_WATCH_ON, + CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */ + _CGROUP_PRESSURE_WATCH_MAX, + _CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, +} CGroupPressureWatch; + struct CGroupContext { bool cpu_accounting; bool io_accounting; @@ -207,6 +216,12 @@ struct CGroupContext { ManagedOOMMode moom_mem_pressure; uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */ ManagedOOMPreference moom_preference; + + /* Memory pressure logic */ + CGroupPressureWatch memory_pressure_watch; + usec_t memory_pressure_threshold_usec; + /* NB: For now we don't make the period configurable, not the type, nor do we allow multiple + * triggers, nor triggers for non-memory pressure. We might add that later. 
*/ }; /* Used when querying IP accounting data */ @@ -248,6 +263,13 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p); void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head); +static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) { + assert(c); + + return c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON || + (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_AUTO && c->memory_accounting); +} + int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path); @@ -351,3 +373,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action); const char* freezer_action_to_string(FreezerAction a) _const_; FreezerAction freezer_action_from_string(const char *s) _pure_; + +const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_; +CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_; diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index a3313c417f3..3a02fcbdb10 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -24,6 +24,7 @@ #include "socket-util.h" BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve); +BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); @@ -494,6 +495,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0), SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, 
offsetof(CGroupContext, socket_bind_deny), 0), SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0), + SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0), + SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0), SD_BUS_VTABLE_END }; @@ -743,6 +746,47 @@ static int bus_cgroup_set_transient_property( } } + return 1; + + } else if (streq(name, "MemoryPressureWatch")) { + CGroupPressureWatch p; + const char *t; + + r = sd_bus_message_read(message, "s", &t); + if (r < 0) + return r; + + if (isempty(t)) + p = _CGROUP_PRESSURE_WATCH_INVALID; + else { + p = cgroup_pressure_watch_from_string(t); + if (p < 0) + return p; + } + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->memory_pressure_watch = p; + unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p))); + } + + return 1; + + } else if (streq(name, "MemoryPressureThresholdUSec")) { + uint64_t t; + + r = sd_bus_message_read(message, "t", &t); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->memory_pressure_threshold_usec = t; + + if (t == UINT64_MAX) + unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec="); + else + unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t); + } + return 1; } diff --git a/src/core/dbus-cgroup.h b/src/core/dbus-cgroup.h index 5bf45eb972e..dd0d5da65df 100644 --- a/src/core/dbus-cgroup.h +++ b/src/core/dbus-cgroup.h @@ -10,5 +10,6 @@ extern const sd_bus_vtable bus_cgroup_vtable[]; int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error); +int bus_property_get_cgroup_pressure_watch(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message 
*reply, void *userdata, sd_bus_error *ret_error); int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index 047a7b44240..6dd75d9e93e 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -2943,6 +2943,8 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, default_tasks_max), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, default_memory_pressure_threshold_usec), 0), + SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, default_memory_pressure_watch), 0), SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST), SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST), diff --git a/src/core/execute.c b/src/core/execute.c index bba2753bee3..e23faf25bd9 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -80,6 +80,7 @@ #include "parse-util.h" #include "path-util.h" #include "process-util.h" +#include "psi-util.h" #include "random-util.h" #include "recurse-dir.h" #include "rlimit-util.h" @@ -1808,6 +1809,7 @@ static int build_environment( const Unit *u, const ExecContext *c, const ExecParameters *p, + const CGroupContext *cgroup_context, size_t n_fds, char **fdnames, const char *home, @@ 
-1815,6 +1817,7 @@ static int build_environment( const char *shell, dev_t journal_stream_dev, ino_t journal_stream_ino, + const char *memory_pressure_path, char ***ret) { _cleanup_strv_free_ char **our_env = NULL; @@ -1826,7 +1829,7 @@ static int build_environment( assert(p); assert(ret); -#define N_ENV_VARS 17 +#define N_ENV_VARS 19 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); if (!our_env) return -ENOMEM; @@ -1990,8 +1993,35 @@ static int build_environment( our_env[n_env++] = x; - our_env[n_env++] = NULL; - assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); + if (memory_pressure_path) { + x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + + if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) { + _cleanup_free_ char *b = NULL, *e = NULL; + + if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT, + MEMORY_PRESSURE_DEFAULT_TYPE, + cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? 
MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC : + CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC), + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + return -ENOMEM; + + if (base64mem(b, strlen(b) + 1, &e) < 0) + return -ENOMEM; + + x = strjoin("MEMORY_PRESSURE_WRITE=", e); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + } + + assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX); #undef N_ENV_VARS *ret = TAKE_PTR(our_env); @@ -4246,6 +4276,7 @@ static int exec_child( const ExecParameters *params, ExecRuntime *runtime, DynamicCreds *dcreds, + const CGroupContext *cgroup_context, int socket_fd, const int named_iofds[static 3], int *params_fds, @@ -4259,7 +4290,7 @@ static int exec_child( int r, ngids = 0, exec_fd; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; - _cleanup_free_ char *home_buffer = NULL; + _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL; const char *home = NULL, *shell = NULL; char **final_argv = NULL; dev_t journal_stream_dev = 0; @@ -4672,15 +4703,41 @@ static int exec_child( } } - /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 - * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not - * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only - * touch a single hierarchy too. */ - if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) { - r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); - if (r < 0) { - *exit_status = EXIT_CGROUP; - return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); + if (params->cgroup_path) { + /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1 + * this is only about systemd's own hierarchy, i.e. 
not the controller hierarchies, simply because that's not + * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only + * touch a single hierarchy too. */ + + if (params->flags & EXEC_CGROUP_DELEGATE) { + r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m"); + } + } + + if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) { + if (cgroup_context_want_memory_pressure(cgroup_context)) { + r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path); + if (r < 0) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + + r = chmod_and_chown(memory_pressure_path, 0644, uid, gid); + if (r < 0) { + log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path); + memory_pressure_path = mfree(memory_pressure_path); + } + } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) { + memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */ + if (!memory_pressure_path) { + *exit_status = EXIT_MEMORY; + return log_oom(); + } + } } } @@ -4704,6 +4761,7 @@ static int exec_child( unit, context, params, + cgroup_context, n_fds, fdnames, home, @@ -4711,6 +4769,7 @@ static int exec_child( shell, journal_stream_dev, journal_stream_ino, + memory_pressure_path, &our_env); if (r < 0) { *exit_status = EXIT_MEMORY; @@ -5358,6 +5417,7 @@ int exec_spawn(Unit *unit, const ExecParameters *params, ExecRuntime *runtime, DynamicCreds *dcreds, + const CGroupContext *cgroup_context, pid_t *ret) { int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL; @@ -5445,6 +5505,7 @@ int exec_spawn(Unit *unit, params, runtime, dcreds, + cgroup_context, socket_fd, 
named_iofds, fds, diff --git a/src/core/execute.h b/src/core/execute.h index 79f98daf303..0cfbd3b1d2b 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -441,6 +441,7 @@ int exec_spawn(Unit *unit, const ExecParameters *exec_params, ExecRuntime *runtime, DynamicCreds *dynamic_creds, + const CGroupContext *cgroup_context, pid_t *ret); void exec_command_done_array(ExecCommand *c, size_t n); diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index d4a874eafa5..8577e9ef543 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -146,6 +146,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_time DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference="); +DEFINE_CONFIG_PARSE_ENUM(config_parse_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch, "Failed to parse CGroupPressureWatch="); DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 11d43dda923..68ceeaec818 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -152,6 +152,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec); CONFIG_PARSER_PROTOTYPE(config_parse_tty_size); CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns); CONFIG_PARSER_PROTOTYPE(config_parse_open_file); +CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_pressure_watch); /* 
gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/core/main.c b/src/core/main.c index c9849d05c1a..5469c55da73 100644 --- a/src/core/main.c +++ b/src/core/main.c @@ -75,6 +75,7 @@ #include "pretty-print.h" #include "proc-cmdline.h" #include "process-util.h" +#include "psi-util.h" #include "random-util.h" #include "rlimit-util.h" #if HAVE_SECCOMP @@ -162,6 +163,8 @@ static bool arg_default_blockio_accounting; static bool arg_default_memory_accounting; static bool arg_default_tasks_accounting; static TasksMax arg_default_tasks_max; +static usec_t arg_default_memory_pressure_threshold_usec; +static CGroupPressureWatch arg_default_memory_pressure_watch; static sd_id128_t arg_machine_id; static EmergencyAction arg_cad_burst_action; static OOMPolicy arg_default_oom_policy; @@ -686,6 +689,8 @@ static int parse_config_file(void) { { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting }, { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting }, { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max }, + { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_default_memory_pressure_threshold_usec }, + { "Manager", "DefaultMemoryPressureWatch", config_parse_cgroup_pressure_watch, 0, &arg_default_memory_pressure_watch }, { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_system, &arg_cad_burst_action }, { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy }, { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL }, @@ -767,6 +772,8 @@ static void set_manager_defaults(Manager *m) { m->default_memory_accounting = arg_default_memory_accounting; m->default_tasks_accounting = arg_default_tasks_accounting; m->default_tasks_max = arg_default_tasks_max; + m->default_memory_pressure_watch = 
arg_default_memory_pressure_watch; + m->default_memory_pressure_threshold_usec = arg_default_memory_pressure_threshold_usec; m->default_oom_policy = arg_default_oom_policy; m->default_oom_score_adjust_set = arg_default_oom_score_adjust_set; m->default_oom_score_adjust = arg_default_oom_score_adjust; @@ -2474,6 +2481,8 @@ static void reset_arguments(void) { arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT; arg_default_tasks_accounting = true; arg_default_tasks_max = DEFAULT_TASKS_MAX; + arg_default_memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC; + arg_default_memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO; arg_machine_id = (sd_id128_t) {}; arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE; arg_default_oom_policy = OOM_STOP; diff --git a/src/core/manager.c b/src/core/manager.c index 8dd4f098912..634aa386eab 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -70,6 +70,7 @@ #include "path-lookup.h" #include "path-util.h" #include "process-util.h" +#include "psi-util.h" #include "ratelimit.h" #include "rlimit-util.h" #include "rm-rf.h" @@ -643,6 +644,8 @@ static char** sanitize_environment(char **l) { "LOG_NAMESPACE", "MAINPID", "MANAGERPID", + "MEMORY_PRESSURE_WATCH", + "MEMORY_PRESSURE_WRITE", "MONITOR_EXIT_CODE", "MONITOR_EXIT_STATUS", "MONITOR_INVOCATION_ID", @@ -803,6 +806,16 @@ int manager_setup_memory_pressure_event_source(Manager *m) { if (r < 0) log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? 
LOG_DEBUG : LOG_NOTICE, r, "Failed to establish memory pressure event source, ignoring: %m"); + else if (m->default_memory_pressure_threshold_usec != USEC_INFINITY) { + + /* If there's a default memory pressure threshold set, also apply it to the service manager itself */ + r = sd_event_source_set_memory_pressure_period( + m->memory_pressure_event_source, + m->default_memory_pressure_threshold_usec, + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC); + if (r < 0) + log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m"); + } return 0; } @@ -897,6 +910,9 @@ int manager_new(LookupScope scope, ManagerTestRunFlags test_run_flags, Manager * .test_run_flags = test_run_flags, .default_oom_policy = OOM_STOP, + + .default_memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO, + .default_memory_pressure_threshold_usec = USEC_INFINITY, }; #if ENABLE_EFI diff --git a/src/core/manager.h b/src/core/manager.h index 66e6a9e1cfc..085fffef509 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -377,6 +377,9 @@ struct Manager { int default_oom_score_adjust; bool default_oom_score_adjust_set; + CGroupPressureWatch default_memory_pressure_watch; + usec_t default_memory_pressure_threshold_usec; + int original_log_level; LogTarget original_log_target; bool log_level_overridden; diff --git a/src/core/mount.c b/src/core/mount.c index 95bd04f6e9f..a833adc1ded 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -922,6 +922,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) { &exec_params, m->exec_runtime, &m->dynamic_creds, + &m->cgroup_context, &pid); if (r < 0) return r; diff --git a/src/core/service.c b/src/core/service.c index dc5ccfd239d..0481416f228 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1709,6 +1709,7 @@ static int service_spawn_internal( &exec_params, s->exec_runtime, &s->dynamic_creds, + &s->cgroup_context, &pid); if (r < 0) return r; diff --git a/src/core/socket.c b/src/core/socket.c index 3dd726d52a1..307a8f20612 100644 
--- a/src/core/socket.c +++ b/src/core/socket.c @@ -1948,6 +1948,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) { &exec_params, s->exec_runtime, &s->dynamic_creds, + &s->cgroup_context, &pid); if (r < 0) return r; diff --git a/src/core/swap.c b/src/core/swap.c index ab901a2cd17..d0b557cf400 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -690,6 +690,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) { &exec_params, s->exec_runtime, &s->dynamic_creds, + &s->cgroup_context, &pid); if (r < 0) goto fail; diff --git a/src/core/unit.c b/src/core/unit.c index 8cd1e0370dd..a67dceb31fa 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -184,6 +184,9 @@ static void unit_init(Unit *u) { if (u->type != UNIT_SLICE) cc->tasks_max = u->manager->default_tasks_max; + + cc->memory_pressure_watch = u->manager->default_memory_pressure_watch; + cc->memory_pressure_threshold_usec = u->manager->default_memory_pressure_threshold_usec; } ec = unit_get_exec_context(u); diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 1e95e366787..badd61656c6 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -460,7 +460,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons "Slice", "ManagedOOMSwap", "ManagedOOMMemoryPressure", - "ManagedOOMPreference")) + "ManagedOOMPreference", + "MemoryPressureWatch")) return bus_append_string(m, field, eq); if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { @@ -913,6 +914,9 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons return 1; } + if (streq(field, "MemoryPressureThresholdSec")) + return bus_append_parse_sec_rename(m, field, eq); + return 0; }