pid1: add unit file settings to control memory pressure logic

This commit is contained in:
Lennart Poettering 2023-02-15 19:00:14 +01:00
parent 29e6b0c171
commit 6bb0084204
22 changed files with 362 additions and 16 deletions

View file

@ -529,6 +529,10 @@ node /org/freedesktop/systemd1 {
readonly t DefaultLimitRTTIMESoft = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t DefaultTasksMax = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t DefaultMemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s DefaultMemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t TimerSlackNSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -782,6 +786,10 @@ node /org/freedesktop/systemd1 {
<!--property DefaultTasksMax is not documented!-->
<!--property DefaultMemoryPressureThresholdUSec is not documented!-->
<!--property DefaultMemoryPressureWatch is not documented!-->
<!--property TimerSlackNSec is not documented!-->
<!--property DefaultOOMPolicy is not documented!-->
@ -1208,6 +1216,10 @@ node /org/freedesktop/systemd1 {
<variablelist class="dbus-property" generated="True" extra-ref="DefaultTasksMax"/>
<variablelist class="dbus-property" generated="True" extra-ref="DefaultMemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="DefaultMemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="TimerSlackNSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="DefaultOOMPolicy"/>
@ -2803,6 +2815,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -3395,6 +3411,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -3995,6 +4015,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -4747,6 +4771,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -5359,6 +5387,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -5949,6 +5981,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -6590,6 +6626,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -7130,6 +7170,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -7638,6 +7682,10 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -8406,6 +8454,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -8932,6 +8984,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -9426,6 +9482,10 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -10053,6 +10113,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
};
interface org.freedesktop.DBus.Peer { ... };
interface org.freedesktop.DBus.Introspectable { ... };
@ -10219,6 +10283,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--Autogenerated cross-references for systemd.directives, do not edit-->
<variablelist class="dbus-interface" generated="True" extra-ref="org.freedesktop.systemd1.Unit"/>
@ -10391,6 +10459,10 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<!--End of Autogenerated section-->
<refsect2>
@ -10586,6 +10658,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
readonly a(iiqq) SocketBindDeny = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly (bas) RestrictNetworkInterfaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s MemoryPressureWatch = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly t MemoryPressureThresholdUSec = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s KillMode = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -10772,6 +10848,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<!--property RestrictNetworkInterfaces is not documented!-->
<!--property MemoryPressureWatch is not documented!-->
<!--property MemoryPressureThresholdUSec is not documented!-->
<!--property KillMode is not documented!-->
<!--property KillSignal is not documented!-->
@ -10974,6 +11054,10 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNetworkInterfaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureWatch"/>
<variablelist class="dbus-property" generated="True" extra-ref="MemoryPressureThresholdUSec"/>
<variablelist class="dbus-property" generated="True" extra-ref="KillMode"/>
<variablelist class="dbus-property" generated="True" extra-ref="KillSignal"/>

View file

@ -556,6 +556,18 @@
to configure the rate limit window, and <varname>ReloadLimitBurst=</varname> takes a positive integer to
configure the maximum allowed number of reloads within the configured time window.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>DefaultMemoryPressureWatch=</varname></term>
<term><varname>DefaultMemoryPressureThresholdSec=</varname></term>
<listitem><para>Configures the default settings for the per-unit
<varname>MemoryPressureWatch=</varname> and <varname>MemoryPressureThresholdSec=</varname>
settings. See
<citerefentry><refentrytitle>systemd.resource-control</refentrytitle><manvolnum>5</manvolnum></citerefentry>
for details. Defaults to <literal>auto</literal> and <literal>100ms</literal>, respectively. This
also sets the memory pressure monitoring threshold for the service manager itself.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View file

@ -3779,6 +3779,16 @@ StandardInputData=V2XigLJyZSBubyBzdHJhbmdlcnMgdG8gbG92ZQpZb3Uga25vdyB0aGUgcnVsZX
</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>$MEMORY_PRESSURE_WATCH</varname></term>
<term><varname>$MEMORY_PRESSURE_WRITE</varname></term>
<listitem><para>If memory pressure monitoring is enabled for this service unit, the path to watch
and the data to write into it. See <ulink url="https://systemd.io/MEMORY_PRESSURE">Memory Pressure
Handling</ulink> for details about these variables and the service protocol data they
convey.</para></listitem>
</varlistentry>
</variablelist>
<para>For system services, when <varname>PAMName=</varname> is enabled and <command>pam_systemd</command> is part

View file

@ -1169,6 +1169,53 @@ DeviceAllow=/dev/loop-control
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>MemoryPressureWatch=</varname></term>
<listitem><para>Controls memory pressure monitoring for invoked processes. Takes one of
<literal>off</literal>, <literal>on</literal>, <literal>auto</literal> or <literal>skip</literal>. If
set to <literal>off</literal>, tells the service not to watch for memory pressure events, by setting the
<varname>$MEMORY_PRESSURE_WATCH</varname> environment variable to the literal string
<filename>/dev/null</filename>. If set to <literal>on</literal>, tells the service to watch for memory
pressure events. This enables memory accounting for the service, and ensures the
<filename>memory.pressure</filename> cgroup attribute file is accessible for read and write to the
service's user. It then sets the <varname>$MEMORY_PRESSURE_WATCH</varname> environment variable for
processes invoked by the unit to the file system path to this file. The threshold information
configured with <varname>MemoryPressureThresholdSec=</varname> is encoded in the
<varname>$MEMORY_PRESSURE_WRITE</varname> environment variable. If set to <literal>auto</literal>,
the protocol is enabled if memory accounting is enabled for the unit anyway, and disabled
otherwise. If set to <literal>skip</literal>, the logic is neither enabled nor disabled, and the two
environment variables are not set.</para>
<para>Note that services are free to use the two environment variables, but it's unproblematic if
they ignore them. Memory pressure handling must be implemented individually in each service, and
usually means different things for different software. For further details on memory pressure
handling see <ulink url="https://systemd.io/MEMORY_PRESSURE">Memory Pressure Handling in
systemd</ulink>.</para>
<para>Services implemented using
<citerefentry><refentrytitle>sd-event</refentrytitle><manvolnum>3</manvolnum></citerefentry> may use
<citerefentry><refentrytitle>sd_event_add_memory_pressure</refentrytitle><manvolnum>3</manvolnum></citerefentry>
to watch for and handle memory pressure events.</para>
<para>If not explicitly set, defaults to the <varname>DefaultMemoryPressureWatch=</varname> setting in
<citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>MemoryPressureThresholdSec=</varname></term>
<listitem><para>Sets the memory pressure threshold time for the memory pressure monitor as configured via
<varname>MemoryPressureWatch=</varname>. Specifies the maximum allocation latency before a memory
pressure event is signalled to the service, per 1s window. If not specified defaults to the
<varname>DefaultMemoryPressureThresholdSec=</varname> setting in
<citerefentry><refentrytitle>systemd-system.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry>
(which in turn defaults to 100ms). The specified value expects a time unit such as
<literal>ms</literal> or <literal>µs</literal>, see
<citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry> for
details on the permitted syntax.</para></listitem>
</varlistentry>
</variablelist>
</refsect1>

View file

@ -175,6 +175,9 @@ void cgroup_context_init(CGroupContext *c) {
.moom_swap = MANAGED_OOM_AUTO,
.moom_mem_pressure = MANAGED_OOM_AUTO,
.moom_preference = MANAGED_OOM_PREFERENCE_NONE,
.memory_pressure_watch = _CGROUP_PRESSURE_WATCH_INVALID,
.memory_pressure_threshold_usec = USEC_INFINITY,
};
}
@ -517,7 +520,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
"%sManagedOOMSwap: %s\n"
"%sManagedOOMMemoryPressure: %s\n"
"%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
"%sManagedOOMPreference: %s\n",
"%sManagedOOMPreference: %s\n"
"%sMemoryPressureWatch: %s\n",
prefix, yes_no(c->cpu_accounting),
prefix, yes_no(c->io_accounting),
prefix, yes_no(c->blockio_accounting),
@ -559,7 +563,12 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, managed_oom_mode_to_string(c->moom_swap),
prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)),
prefix, managed_oom_preference_to_string(c->moom_preference));
prefix, managed_oom_preference_to_string(c->moom_preference),
prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch));
if (c->memory_pressure_threshold_usec != USEC_INFINITY)
fprintf(f, "%sMemoryPressureThresholdSec: %s\n",
prefix, FORMAT_TIMESPAN(c->memory_pressure_threshold_usec, 1));
if (c->delegate) {
_cleanup_free_ char *t = NULL;
@ -4376,3 +4385,12 @@ static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {
};
DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction);
/* Maps every valid CGroupPressureWatch value to its textual name. These are the strings written back into
 * unit settings and returned by the bus property getter generated for this enum. */
static const char* const cgroup_pressure_watch_table[_CGROUP_PRESSURE_WATCH_MAX] = {
        [CGROUP_PRESSURE_WATCH_ON]   = "on",
        [CGROUP_PRESSURE_WATCH_OFF]  = "off",
        [CGROUP_PRESSURE_WATCH_AUTO] = "auto",
        [CGROUP_PRESSURE_WATCH_SKIP] = "skip",
};

/* Generates cgroup_pressure_watch_to_string() and cgroup_pressure_watch_from_string().
 * NOTE(review): the _WITH_BOOLEAN flavour presumably also accepts boolean spellings and maps a "true" value to
 * CGROUP_PRESSURE_WATCH_ON (the third argument) — confirm against the string table macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(cgroup_pressure_watch, CGroupPressureWatch, CGROUP_PRESSURE_WATCH_ON);

View file

@ -110,6 +110,15 @@ struct CGroupSocketBindItem {
uint16_t port_min;
};
/* Policy for memory pressure monitoring of a unit's processes. The order of the enumerators determines their
 * numeric values, so do not reorder them. */
typedef enum CGroupPressureWatch {
CGROUP_PRESSURE_WATCH_OFF, /* → tells the service payload explicitly not to watch for memory pressure */
CGROUP_PRESSURE_WATCH_AUTO, /* → on if memory accounting is on anyway for the unit, otherwise off */
CGROUP_PRESSURE_WATCH_ON, /* → memory pressure watch is wanted regardless of whether memory accounting is on */
CGROUP_PRESSURE_WATCH_SKIP, /* → doesn't set up memory pressure watch, but also doesn't explicitly tell payload to avoid it */
_CGROUP_PRESSURE_WATCH_MAX, /* number of valid values above; sizes the string lookup table */
_CGROUP_PRESSURE_WATCH_INVALID = -EINVAL, /* negative "unset" marker; the initial value set by cgroup_context_init() */
} CGroupPressureWatch;
struct CGroupContext {
bool cpu_accounting;
bool io_accounting;
@ -207,6 +216,12 @@ struct CGroupContext {
ManagedOOMMode moom_mem_pressure;
uint32_t moom_mem_pressure_limit; /* Normalized to 2^32-1 == 100% */
ManagedOOMPreference moom_preference;
/* Memory pressure logic */
CGroupPressureWatch memory_pressure_watch;
usec_t memory_pressure_threshold_usec;
/* NB: For now we don't make the period configurable, nor the type, nor do we allow multiple
* triggers, nor triggers for non-memory pressure. We might add that later. */
};
/* Used when querying IP accounting data */
@ -248,6 +263,13 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI
void cgroup_context_remove_bpf_foreign_program(CGroupContext *c, CGroupBPFForeignProgram *p);
void cgroup_context_remove_socket_bind(CGroupSocketBindItem **head);
/* Tells whether memory pressure monitoring shall be set up for the given cgroup context: always when the
 * watch mode is "on", and for "auto" only if memory accounting is enabled. Any other mode yields false. */
static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
        assert(c);

        /* Explicitly requested: no further conditions apply */
        if (c->memory_pressure_watch == CGROUP_PRESSURE_WATCH_ON)
                return true;

        /* Everything except "auto" (i.e. off, skip, unset) means: not wanted */
        if (c->memory_pressure_watch != CGROUP_PRESSURE_WATCH_AUTO)
                return false;

        /* "auto" piggy-backs on memory accounting */
        return c->memory_accounting;
}
int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
int cgroup_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
@ -351,3 +373,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action);
const char* freezer_action_to_string(FreezerAction a) _const_;
FreezerAction freezer_action_from_string(const char *s) _pure_;
/* Conversions between CGroupPressureWatch values and their textual names ("off", "auto", "on", "skip").
 * NOTE(review): callers treat a negative result of the _from_string() variant as a parse failure and wrap the
 * _to_string() result in strempty(), which suggests NULL is returned for invalid values — confirm against the
 * string table macro that generates these. */
const char* cgroup_pressure_watch_to_string(CGroupPressureWatch a) _const_;
CGroupPressureWatch cgroup_pressure_watch_from_string(const char *s) _pure_;

View file

@ -24,6 +24,7 @@
#include "socket-util.h"
BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_resolve);
BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
@ -494,6 +495,8 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("SocketBindAllow", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_allow), 0),
SD_BUS_PROPERTY("SocketBindDeny", "a(iiqq)", property_get_socket_bind, offsetof(CGroupContext, socket_bind_deny), 0),
SD_BUS_PROPERTY("RestrictNetworkInterfaces", "(bas)", property_get_restrict_network_interfaces, 0, 0),
SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0),
SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0),
SD_BUS_VTABLE_END
};
@ -743,6 +746,47 @@ static int bus_cgroup_set_transient_property(
}
}
return 1;
} else if (streq(name, "MemoryPressureWatch")) {
CGroupPressureWatch p;
const char *t;
r = sd_bus_message_read(message, "s", &t);
if (r < 0)
return r;
if (isempty(t))
p = _CGROUP_PRESSURE_WATCH_INVALID;
else {
p = cgroup_pressure_watch_from_string(t);
if (p < 0)
return p;
}
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->memory_pressure_watch = p;
unit_write_settingf(u, flags, name, "MemoryPressureWatch=%s", strempty(cgroup_pressure_watch_to_string(p)));
}
return 1;
} else if (streq(name, "MemoryPressureThresholdUSec")) {
uint64_t t;
r = sd_bus_message_read(message, "t", &t);
if (r < 0)
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->memory_pressure_threshold_usec = t;
if (t == UINT64_MAX)
unit_write_setting(u, flags, name, "MemoryPressureThresholdUSec=");
else
unit_write_settingf(u, flags, name, "MemoryPressureThresholdUSec=%" PRIu64, t);
}
return 1;
}

View file

@ -10,5 +10,6 @@
extern const sd_bus_vtable bus_cgroup_vtable[];
int bus_property_get_tasks_max(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
int bus_property_get_cgroup_pressure_watch(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
int bus_cgroup_set_property(Unit *u, CGroupContext *c, const char *name, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error);

View file

@ -2943,6 +2943,8 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_PROPERTY("DefaultLimitRTTIME", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultTasksMax", "t", bus_property_get_tasks_max, offsetof(Manager, default_tasks_max), 0),
SD_BUS_PROPERTY("DefaultMemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(Manager, default_memory_pressure_threshold_usec), 0),
SD_BUS_PROPERTY("DefaultMemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(Manager, default_memory_pressure_watch), 0),
SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("DefaultOOMScoreAdjust", "i", property_get_oom_score_adjust, 0, SD_BUS_VTABLE_PROPERTY_CONST),

View file

@ -80,6 +80,7 @@
#include "parse-util.h"
#include "path-util.h"
#include "process-util.h"
#include "psi-util.h"
#include "random-util.h"
#include "recurse-dir.h"
#include "rlimit-util.h"
@ -1808,6 +1809,7 @@ static int build_environment(
const Unit *u,
const ExecContext *c,
const ExecParameters *p,
const CGroupContext *cgroup_context,
size_t n_fds,
char **fdnames,
const char *home,
@ -1815,6 +1817,7 @@ static int build_environment(
const char *shell,
dev_t journal_stream_dev,
ino_t journal_stream_ino,
const char *memory_pressure_path,
char ***ret) {
_cleanup_strv_free_ char **our_env = NULL;
@ -1826,7 +1829,7 @@ static int build_environment(
assert(p);
assert(ret);
#define N_ENV_VARS 17
#define N_ENV_VARS 19
our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
if (!our_env)
return -ENOMEM;
@ -1990,8 +1993,35 @@ static int build_environment(
our_env[n_env++] = x;
our_env[n_env++] = NULL;
assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
if (memory_pressure_path) {
x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
_cleanup_free_ char *b = NULL, *e = NULL;
if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
MEMORY_PRESSURE_DEFAULT_TYPE,
cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
return -ENOMEM;
if (base64mem(b, strlen(b) + 1, &e) < 0)
return -ENOMEM;
x = strjoin("MEMORY_PRESSURE_WRITE=", e);
if (!x)
return -ENOMEM;
our_env[n_env++] = x;
}
}
assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS
*ret = TAKE_PTR(our_env);
@ -4246,6 +4276,7 @@ static int exec_child(
const ExecParameters *params,
ExecRuntime *runtime,
DynamicCreds *dcreds,
const CGroupContext *cgroup_context,
int socket_fd,
const int named_iofds[static 3],
int *params_fds,
@ -4259,7 +4290,7 @@ static int exec_child(
int r, ngids = 0, exec_fd;
_cleanup_free_ gid_t *supplementary_gids = NULL;
const char *username = NULL, *groupname = NULL;
_cleanup_free_ char *home_buffer = NULL;
_cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
const char *home = NULL, *shell = NULL;
char **final_argv = NULL;
dev_t journal_stream_dev = 0;
@ -4672,15 +4703,41 @@ static int exec_child(
}
}
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
* safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
* touch a single hierarchy too. */
if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
if (params->cgroup_path) {
/* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
* this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
* safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
* touch a single hierarchy too. */
if (params->flags & EXEC_CGROUP_DELEGATE) {
r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
if (r < 0) {
*exit_status = EXIT_CGROUP;
return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
}
}
if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
if (cgroup_context_want_memory_pressure(cgroup_context)) {
r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
if (r < 0) {
*exit_status = EXIT_MEMORY;
return log_oom();
}
r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
if (r < 0) {
log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
memory_pressure_path = mfree(memory_pressure_path);
}
} else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
if (!memory_pressure_path) {
*exit_status = EXIT_MEMORY;
return log_oom();
}
}
}
}
@ -4704,6 +4761,7 @@ static int exec_child(
unit,
context,
params,
cgroup_context,
n_fds,
fdnames,
home,
@ -4711,6 +4769,7 @@ static int exec_child(
shell,
journal_stream_dev,
journal_stream_ino,
memory_pressure_path,
&our_env);
if (r < 0) {
*exit_status = EXIT_MEMORY;
@ -5358,6 +5417,7 @@ int exec_spawn(Unit *unit,
const ExecParameters *params,
ExecRuntime *runtime,
DynamicCreds *dcreds,
const CGroupContext *cgroup_context,
pid_t *ret) {
int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
@ -5445,6 +5505,7 @@ int exec_spawn(Unit *unit,
params,
runtime,
dcreds,
cgroup_context,
socket_fd,
named_iofds,
fds,

View file

@ -441,6 +441,7 @@ int exec_spawn(Unit *unit,
const ExecParameters *exec_params,
ExecRuntime *runtime,
DynamicCreds *dynamic_creds,
const CGroupContext *cgroup_context,
pid_t *ret);
void exec_command_done_array(ExecCommand *c, size_t n);

View file

@ -146,6 +146,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_time
DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
/* Config parser for CGroupPressureWatch values; used for DefaultMemoryPressureWatch= in the manager
 * configuration (and presumably for the per-unit MemoryPressureWatch= setting). The message describes the
 * setting rather than naming the C type, since "CGroupPressureWatch=" is not something users can configure. */
DEFINE_CONFIG_PARSE_ENUM(config_parse_cgroup_pressure_watch, cgroup_pressure_watch, CGroupPressureWatch, "Failed to parse memory pressure watch setting");
DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");

View file

@ -152,6 +152,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_watchdog_sec);
CONFIG_PARSER_PROTOTYPE(config_parse_tty_size);
CONFIG_PARSER_PROTOTYPE(config_parse_log_filter_patterns);
CONFIG_PARSER_PROTOTYPE(config_parse_open_file);
CONFIG_PARSER_PROTOTYPE(config_parse_cgroup_pressure_watch);
/* gperf prototypes */
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);

View file

@ -75,6 +75,7 @@
#include "pretty-print.h"
#include "proc-cmdline.h"
#include "process-util.h"
#include "psi-util.h"
#include "random-util.h"
#include "rlimit-util.h"
#if HAVE_SECCOMP
@ -162,6 +163,8 @@ static bool arg_default_blockio_accounting;
static bool arg_default_memory_accounting;
static bool arg_default_tasks_accounting;
static TasksMax arg_default_tasks_max;
static usec_t arg_default_memory_pressure_threshold_usec;
static CGroupPressureWatch arg_default_memory_pressure_watch;
static sd_id128_t arg_machine_id;
static EmergencyAction arg_cad_burst_action;
static OOMPolicy arg_default_oom_policy;
@ -686,6 +689,8 @@ static int parse_config_file(void) {
{ "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting },
{ "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
{ "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
{ "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_default_memory_pressure_threshold_usec },
{ "Manager", "DefaultMemoryPressureWatch", config_parse_cgroup_pressure_watch, 0, &arg_default_memory_pressure_watch },
{ "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_system, &arg_cad_burst_action },
{ "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
{ "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL },
@ -767,6 +772,8 @@ static void set_manager_defaults(Manager *m) {
m->default_memory_accounting = arg_default_memory_accounting;
m->default_tasks_accounting = arg_default_tasks_accounting;
m->default_tasks_max = arg_default_tasks_max;
m->default_memory_pressure_watch = arg_default_memory_pressure_watch;
m->default_memory_pressure_threshold_usec = arg_default_memory_pressure_threshold_usec;
m->default_oom_policy = arg_default_oom_policy;
m->default_oom_score_adjust_set = arg_default_oom_score_adjust_set;
m->default_oom_score_adjust = arg_default_oom_score_adjust;
@ -2474,6 +2481,8 @@ static void reset_arguments(void) {
arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
arg_default_tasks_accounting = true;
arg_default_tasks_max = DEFAULT_TASKS_MAX;
arg_default_memory_pressure_threshold_usec = MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC;
arg_default_memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO;
arg_machine_id = (sd_id128_t) {};
arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
arg_default_oom_policy = OOM_STOP;

View file

@ -70,6 +70,7 @@
#include "path-lookup.h"
#include "path-util.h"
#include "process-util.h"
#include "psi-util.h"
#include "ratelimit.h"
#include "rlimit-util.h"
#include "rm-rf.h"
@ -643,6 +644,8 @@ static char** sanitize_environment(char **l) {
"LOG_NAMESPACE",
"MAINPID",
"MANAGERPID",
"MEMORY_PRESSURE_WATCH",
"MEMORY_PRESSURE_WRITE",
"MONITOR_EXIT_CODE",
"MONITOR_EXIT_STATUS",
"MONITOR_INVOCATION_ID",
@ -803,6 +806,16 @@ int manager_setup_memory_pressure_event_source(Manager *m) {
if (r < 0)
log_full_errno(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r) || (r == -EHOSTDOWN) ? LOG_DEBUG : LOG_NOTICE, r,
"Failed to establish memory pressure event source, ignoring: %m");
else if (m->default_memory_pressure_threshold_usec != USEC_INFINITY) {
/* If there's a default memory pressure threshold set, also apply it to the service manager itself */
r = sd_event_source_set_memory_pressure_period(
m->memory_pressure_event_source,
m->default_memory_pressure_threshold_usec,
MEMORY_PRESSURE_DEFAULT_WINDOW_USEC);
if (r < 0)
log_warning_errno(r, "Failed to adjust memory pressure threshold, ignoring: %m");
}
return 0;
}
@ -897,6 +910,9 @@ int manager_new(LookupScope scope, ManagerTestRunFlags test_run_flags, Manager *
.test_run_flags = test_run_flags,
.default_oom_policy = OOM_STOP,
.default_memory_pressure_watch = CGROUP_PRESSURE_WATCH_AUTO,
.default_memory_pressure_threshold_usec = USEC_INFINITY,
};
#if ENABLE_EFI

View file

@ -377,6 +377,9 @@ struct Manager {
int default_oom_score_adjust;
bool default_oom_score_adjust_set;
CGroupPressureWatch default_memory_pressure_watch;
usec_t default_memory_pressure_threshold_usec;
int original_log_level;
LogTarget original_log_target;
bool log_level_overridden;

View file

@ -922,6 +922,7 @@ static int mount_spawn(Mount *m, ExecCommand *c, pid_t *_pid) {
&exec_params,
m->exec_runtime,
&m->dynamic_creds,
&m->cgroup_context,
&pid);
if (r < 0)
return r;

View file

@ -1709,6 +1709,7 @@ static int service_spawn_internal(
&exec_params,
s->exec_runtime,
&s->dynamic_creds,
&s->cgroup_context,
&pid);
if (r < 0)
return r;

View file

@ -1948,6 +1948,7 @@ static int socket_spawn(Socket *s, ExecCommand *c, pid_t *_pid) {
&exec_params,
s->exec_runtime,
&s->dynamic_creds,
&s->cgroup_context,
&pid);
if (r < 0)
return r;

View file

@ -690,6 +690,7 @@ static int swap_spawn(Swap *s, ExecCommand *c, pid_t *_pid) {
&exec_params,
s->exec_runtime,
&s->dynamic_creds,
&s->cgroup_context,
&pid);
if (r < 0)
goto fail;

View file

@ -184,6 +184,9 @@ static void unit_init(Unit *u) {
if (u->type != UNIT_SLICE)
cc->tasks_max = u->manager->default_tasks_max;
cc->memory_pressure_watch = u->manager->default_memory_pressure_watch;
cc->memory_pressure_threshold_usec = u->manager->default_memory_pressure_threshold_usec;
}
ec = unit_get_exec_context(u);

View file

@ -460,7 +460,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
"Slice",
"ManagedOOMSwap",
"ManagedOOMMemoryPressure",
"ManagedOOMPreference"))
"ManagedOOMPreference",
"MemoryPressureWatch"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {
@ -913,6 +914,9 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
return 1;
}
if (streq(field, "MemoryPressureThresholdSec"))
return bus_append_parse_sec_rename(m, field, eq);
return 0;
}