mirror of
https://github.com/systemd/systemd
synced 2024-10-15 04:24:19 +00:00
pid1: add SurviveFinalKillSignal= to skip units on final sigterm/sigkill spree
Add a new boolean for units, SurviveFinalKillSignal=yes/no. Units that set it will not have their processes receive the final SIGTERM/SIGKILL in the shutdown phase. This is implemented by checking whether a process is part of a cgroup marked with a user.survive_final_kill_signal xattr (or a trusted.survive_final_kill_signal xattr if a user one cannot be set, since user xattr support was added only in kernel v5.7 and is not available in CentOS 8).
This commit is contained in:
parent
69feab97f9
commit
559214cbbd
|
@ -2028,6 +2028,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
|||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b DefaultDependencies = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly b SurviveFinalKillSignal = ...;
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s OnSuccessJobMode = '...';
|
||||
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
|
||||
readonly s OnFailureJobMode = '...';
|
||||
|
@ -2142,6 +2144,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
|||
|
||||
<!--property CanFreeze is not documented!-->
|
||||
|
||||
<!--property SurviveFinalKillSignal is not documented!-->
|
||||
|
||||
<!--property OnSuccessJobMode is not documented!-->
|
||||
|
||||
<!--property OnFailureJobMode is not documented!-->
|
||||
|
@ -2354,6 +2358,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
|
|||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="DefaultDependencies"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="SurviveFinalKillSignal"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="OnSuccessJobMode"/>
|
||||
|
||||
<variablelist class="dbus-property" generated="True" extra-ref="OnFailureJobMode"/>
|
||||
|
@ -11613,6 +11619,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
|
|||
<para><varname>AccessSELinuxContext</varname> and
|
||||
<varname>ActivationDetails</varname> were added in version 252.</para>
|
||||
<para><function>QueueSignal()</function> was added in version 254.</para>
|
||||
<para><varname>SurviveFinalKillSignal</varname> was added in version 255.</para>
|
||||
</refsect2>
|
||||
<refsect2>
|
||||
<title>Service Unit Objects</title>
|
||||
|
|
|
@ -96,12 +96,41 @@
|
|||
<listitem><para>The <filename>/run/</filename> file system remains mounted and populated and may be
|
||||
used to pass state information between such userspace reboot cycles.</para></listitem>
|
||||
|
||||
<listitem><para>Service processes may continue to run over the transition, if they are placed in
|
||||
services that remain active until the very end of shutdown (which again is achieved via
|
||||
<varname>DefaultDependencies=no</varname>). They must also be set up to avoid being killed by the
|
||||
aforementioned <constant>SIGTERM</constant> spree (as per <ulink
|
||||
url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
|
||||
System</ulink>).</para></listitem>
|
||||
<listitem><para>Service processes may continue to run over the transition, past soft-reboot and into
|
||||
the next session, if they are placed in services that remain active until the very end of shutdown
|
||||
(which again is achieved via <varname>DefaultDependencies=no</varname>). They must also be set up to
|
||||
avoid being killed by the aforementioned <constant>SIGTERM</constant> and <constant>SIGKILL</constant>
|
||||
via <varname>SurviveFinalKillSignal=yes</varname>, and also be configured to avoid being stopped on
|
||||
isolate via <varname>IgnoreOnIsolate=yes</varname>. They also have to be configured to be stopped on
|
||||
normal shutdown, reboot and maintenance mode. Finally, they have to be ordered after
|
||||
<constant>basic.target</constant> to ensure correct ordering on boot. Note that in case any new or
|
||||
custom units are used to isolate to, or that implement an equivalent shutdown functionality, they will
|
||||
also have to be configured manually for correct ordering and conflicting. For example:</para>
|
||||
|
||||
<programlisting>[Unit]
|
||||
Description=My surviving service
|
||||
SurviveFinalKillSignal=yes
|
||||
IgnoreOnIsolate=yes
|
||||
DefaultDependencies=no
|
||||
After=basic.target
|
||||
Conflicts=reboot.target
|
||||
Before=reboot.target
|
||||
Conflicts=kexec.target
|
||||
Before=kexec.target
|
||||
Conflicts=poweroff.target
|
||||
Before=poweroff.target
|
||||
Conflicts=halt.target
|
||||
Before=halt.target
|
||||
Conflicts=rescue.target
|
||||
Before=rescue.target
|
||||
Conflicts=emergency.target
|
||||
Before=emergency.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=sleep infinity
|
||||
</programlisting>
|
||||
</listitem>
|
||||
|
||||
<listitem><para>File system mounts may remain mounted during the transition, and complex storage
|
||||
attached, if configured to remain until the very end of the shutdown process. (Also achieved via
|
||||
|
|
|
@ -1023,6 +1023,20 @@
|
|||
<xi:include href="version-info.xml" xpointer="v201"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>SurviveFinalKillSignal=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument. Defaults to <option>no</option>. If <option>yes</option>,
|
||||
processes belonging to this unit will not be sent the final <literal>SIGTERM</literal> and
|
||||
<literal>SIGKILL</literal> signals during the final phase of the system shutdown process.
|
||||
This functionality replaces the older mechanism that allowed a program to set
|
||||
<literal>argv[0][0] = '@'</literal> as described at
|
||||
<ulink url="https://systemd.io/ROOT_STORAGE_DAEMONS">systemd and Storage Daemons for the Root File
|
||||
System</ulink>, which however continues to be supported.</para>
|
||||
|
||||
<xi:include href="version-info.xml" xpointer="v255"/></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>CollectMode=</varname></term>
|
||||
|
||||
|
|
|
@ -888,6 +888,7 @@ int cgroup_log_xattr_apply(Unit *u, const char *cgroup_path) {
|
|||
|
||||
static void cgroup_xattr_apply(Unit *u) {
|
||||
bool b;
|
||||
int r;
|
||||
|
||||
assert(u);
|
||||
|
||||
|
@ -921,6 +922,32 @@ static void cgroup_xattr_apply(Unit *u) {
|
|||
else
|
||||
unit_remove_xattr_graceful(u, NULL, xn);
|
||||
}
|
||||
|
||||
if (u->survive_final_kill_signal) {
|
||||
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
|
||||
u->cgroup_path,
|
||||
"user.survive_final_kill_signal",
|
||||
"1",
|
||||
1,
|
||||
/* flags= */ 0);
|
||||
/* user xattr support was added in kernel v5.7 */
|
||||
if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
|
||||
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER,
|
||||
u->cgroup_path,
|
||||
"trusted.survive_final_kill_signal",
|
||||
"1",
|
||||
1,
|
||||
/* flags= */ 0);
|
||||
if (r < 0)
|
||||
log_unit_debug_errno(u,
|
||||
r,
|
||||
"Failed to set 'survive_final_kill_signal' xattr on control "
|
||||
"group %s, ignoring: %m",
|
||||
empty_to_root(u->cgroup_path));
|
||||
} else {
|
||||
unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "user.survive_final_kill_signal");
|
||||
unit_remove_xattr_graceful(u, /* cgroup_path= */ NULL, "trusted.survive_final_kill_signal");
|
||||
}
|
||||
}
|
||||
|
||||
static int lookup_block_device(const char *p, dev_t *ret) {
|
||||
|
|
|
@ -921,6 +921,7 @@ const sd_bus_vtable bus_unit_vtable[] = {
|
|||
SD_BUS_PROPERTY("RefuseManualStop", "b", bus_property_get_bool, offsetof(Unit, refuse_manual_stop), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("AllowIsolate", "b", bus_property_get_bool, offsetof(Unit, allow_isolate), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("DefaultDependencies", "b", bus_property_get_bool, offsetof(Unit, default_dependencies), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("SurviveFinalKillSignal", "b", bus_property_get_bool, offsetof(Unit, survive_final_kill_signal), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("OnSuccesJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST|SD_BUS_VTABLE_HIDDEN), /* deprecated */
|
||||
SD_BUS_PROPERTY("OnSuccessJobMode", "s", property_get_job_mode, offsetof(Unit, on_success_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("OnFailureJobMode", "s", property_get_job_mode, offsetof(Unit, on_failure_job_mode), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
|
@ -2174,6 +2175,9 @@ static int bus_unit_set_transient_property(
|
|||
if (streq(name, "DefaultDependencies"))
|
||||
return bus_set_transient_bool(u, name, &u->default_dependencies, message, flags, error);
|
||||
|
||||
if (streq(name, "SurviveFinalKillSignal"))
|
||||
return bus_set_transient_bool(u, name, &u->survive_final_kill_signal, message, flags, error);
|
||||
|
||||
if (streq(name, "OnSuccessJobMode"))
|
||||
return bus_set_transient_job_mode(u, name, &u->on_success_job_mode, message, flags, error);
|
||||
|
||||
|
|
|
@ -313,6 +313,7 @@ Unit.RefuseManualStart, config_parse_bool,
|
|||
Unit.RefuseManualStop, config_parse_bool, 0, offsetof(Unit, refuse_manual_stop)
|
||||
Unit.AllowIsolate, config_parse_bool, 0, offsetof(Unit, allow_isolate)
|
||||
Unit.DefaultDependencies, config_parse_bool, 0, offsetof(Unit, default_dependencies)
|
||||
Unit.SurviveFinalKillSignal, config_parse_bool, 0, offsetof(Unit, survive_final_kill_signal)
|
||||
Unit.OnSuccessJobMode, config_parse_job_mode, 0, offsetof(Unit, on_success_job_mode)
|
||||
Unit.OnFailureJobMode, config_parse_job_mode, 0, offsetof(Unit, on_failure_job_mode)
|
||||
{# The following is a legacy alias name for compatibility #}
|
||||
|
|
|
@ -1766,7 +1766,8 @@ static void finish_remaining_processes(ManagerObjective objective) {
|
|||
if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
|
||||
broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
|
||||
|
||||
/* On soft reboot really make sure nothing is left */
|
||||
/* On soft reboot really make sure nothing is left. Note that this will skip cgroups
|
||||
* of units that were configured with SurviveFinalKillSignal=yes. */
|
||||
if (objective == MANAGER_SOFT_REBOOT)
|
||||
broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
|
||||
}
|
||||
|
|
|
@ -826,6 +826,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
|
|||
"%s\tRefuseManualStart: %s\n"
|
||||
"%s\tRefuseManualStop: %s\n"
|
||||
"%s\tDefaultDependencies: %s\n"
|
||||
"%s\tSurviveFinalKillSignal: %s\n"
|
||||
"%s\tOnSuccessJobMode: %s\n"
|
||||
"%s\tOnFailureJobMode: %s\n"
|
||||
"%s\tIgnoreOnIsolate: %s\n",
|
||||
|
@ -833,6 +834,7 @@ void unit_dump(Unit *u, FILE *f, const char *prefix) {
|
|||
prefix, yes_no(u->refuse_manual_start),
|
||||
prefix, yes_no(u->refuse_manual_stop),
|
||||
prefix, yes_no(u->default_dependencies),
|
||||
prefix, yes_no(u->survive_final_kill_signal),
|
||||
prefix, job_mode_to_string(u->on_success_job_mode),
|
||||
prefix, job_mode_to_string(u->on_failure_job_mode),
|
||||
prefix, yes_no(u->ignore_on_isolate));
|
||||
|
|
|
@ -451,6 +451,9 @@ typedef struct Unit {
|
|||
/* Create default dependencies */
|
||||
bool default_dependencies;
|
||||
|
||||
/* If true, processes of this unit are not sent the final SIGTERM/SIGKILL during the last phase of shutdown. */
|
||||
bool survive_final_kill_signal;
|
||||
|
||||
/* Refuse manual starting, allow starting only indirectly via dependency. */
|
||||
bool refuse_manual_start;
|
||||
|
||||
|
|
|
@ -2607,6 +2607,7 @@ static int bus_append_unit_property(sd_bus_message *m, const char *field, const
|
|||
"RefuseManualStop",
|
||||
"AllowIsolate",
|
||||
"IgnoreOnIsolate",
|
||||
"SurviveFinalKillSignal",
|
||||
"DefaultDependencies"))
|
||||
return bus_append_parse_boolean(m, field, eq);
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "alloc-util.h"
|
||||
#include "constants.h"
|
||||
#include "dirent-util.h"
|
||||
#include "errno-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "format-util.h"
|
||||
#include "initrd-util.h"
|
||||
|
@ -22,10 +23,54 @@
|
|||
#include "string-util.h"
|
||||
#include "terminal-util.h"
|
||||
|
||||
static bool ignore_proc(pid_t pid, bool warn_rootfs) {
|
||||
static bool argv_has_at(pid_t pid) {
|
||||
_cleanup_fclose_ FILE *f = NULL;
|
||||
const char *p;
|
||||
char c = 0;
|
||||
|
||||
p = procfs_file_alloca(pid, "cmdline");
|
||||
f = fopen(p, "re");
|
||||
if (!f) {
|
||||
log_debug_errno(errno, "Failed to open %s, ignoring: %m", p);
|
||||
return true; /* not really, but has the desired effect */
|
||||
}
|
||||
|
||||
/* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
|
||||
* kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
|
||||
* actual kernel threads are already filtered out above. */
|
||||
(void) fread(&c, 1, 1, f);
|
||||
|
||||
/* Processes with argv[0][0] = '@' we ignore from the killing spree.
|
||||
*
|
||||
* https://systemd.io/ROOT_STORAGE_DAEMONS */
|
||||
return c == '@';
|
||||
}
|
||||
|
||||
static bool is_survivor_cgroup(pid_t pid) {
|
||||
_cleanup_free_ char *cgroup_path = NULL;
|
||||
int r;
|
||||
|
||||
r = cg_pid_get_path(/* root= */ NULL, pid, &cgroup_path);
|
||||
if (r < 0) {
|
||||
log_warning_errno(r, "Failed to get cgroup path of process " PID_FMT ", ignoring: %m", pid);
|
||||
return false;
|
||||
}
|
||||
|
||||
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.survive_final_kill_signal");
|
||||
/* user xattr support was added in kernel v5.7, try with the trusted namespace as a fallback */
|
||||
if (ERRNO_IS_NEG_XATTR_ABSENT(r))
|
||||
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER,
|
||||
cgroup_path,
|
||||
"trusted.survive_final_kill_signal");
|
||||
if (r < 0)
|
||||
log_debug_errno(r,
|
||||
"Failed to get survive_final_kill_signal xattr of %s, ignoring: %m",
|
||||
cgroup_path);
|
||||
|
||||
return r > 0;
|
||||
}
|
||||
|
||||
static bool ignore_proc(pid_t pid, bool warn_rootfs) {
|
||||
uid_t uid;
|
||||
int r;
|
||||
|
||||
|
@ -38,6 +83,10 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
|
|||
if (r != 0)
|
||||
return true; /* also ignore processes where we can't determine this */
|
||||
|
||||
/* Ignore processes that are part of a cgroup marked with the user.survive_final_kill_signal xattr (or its trusted.survive_final_kill_signal fallback) */
|
||||
if (is_survivor_cgroup(pid))
|
||||
return true;
|
||||
|
||||
r = get_process_uid(pid, &uid);
|
||||
if (r < 0)
|
||||
return true; /* not really, but better safe than sorry */
|
||||
|
@ -46,20 +95,7 @@ static bool ignore_proc(pid_t pid, bool warn_rootfs) {
|
|||
if (uid != 0)
|
||||
return false;
|
||||
|
||||
p = procfs_file_alloca(pid, "cmdline");
|
||||
f = fopen(p, "re");
|
||||
if (!f)
|
||||
return true; /* not really, but has the desired effect */
|
||||
|
||||
/* Try to read the first character of the command line. If the cmdline is empty (which might be the case for
|
||||
* kernel threads but potentially also other stuff), this line won't do anything, but we don't care much, as
|
||||
* actual kernel threads are already filtered out above. */
|
||||
(void) fread(&c, 1, 1, f);
|
||||
|
||||
/* Processes with argv[0][0] = '@' we ignore from the killing spree.
|
||||
*
|
||||
* https://systemd.io/ROOT_STORAGE_DAEMONS */
|
||||
if (c != '@')
|
||||
if (!argv_has_at(pid))
|
||||
return false;
|
||||
|
||||
if (warn_rootfs &&
|
||||
|
|
|
@ -20,8 +20,8 @@ if [ -f /run/testsuite82.touch3 ]; then
|
|||
read -r x <&5
|
||||
test "$x" = "oinkoink"
|
||||
|
||||
# Check that no service is still around
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
|
||||
# Check that the surviving service is still around
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
|
||||
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
|
||||
|
||||
# All succeeded, exit cleanly now
|
||||
|
@ -43,8 +43,8 @@ elif [ -f /run/testsuite82.touch2 ]; then
|
|||
systemd-notify --fd=3 --pid=parent 3<"$T"
|
||||
rm "$T"
|
||||
|
||||
# Check that no service is still around
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
|
||||
# Check that the surviving service is still around
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
|
||||
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
|
||||
|
||||
# Test that we really are in the new overlayfs root fs
|
||||
|
@ -57,6 +57,9 @@ elif [ -f /run/testsuite82.touch2 ]; then
|
|||
mount --bind /original-root /run/nextroot
|
||||
mount
|
||||
|
||||
# Restart the unit that is not supposed to survive
|
||||
systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
|
||||
|
||||
# Now issue the soft reboot. We should be right back soon.
|
||||
touch /run/testsuite82.touch3
|
||||
systemctl --no-block soft-reboot
|
||||
|
@ -85,8 +88,8 @@ elif [ -f /run/testsuite82.touch ]; then
|
|||
systemd-notify --fd=3 --pid=parent 3<"$T"
|
||||
rm "$T"
|
||||
|
||||
# Check that no service survived, regardless of the configuration
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" != "active"
|
||||
# Check that the surviving service is still around
|
||||
test "$(systemctl show -P ActiveState testsuite-82-survive.service)" = "active"
|
||||
test "$(systemctl show -P ActiveState testsuite-82-nosurvive.service)" != "active"
|
||||
|
||||
# This time we test the /run/nextroot/ root switching logic. (We synthesize a new rootfs from the old via overlayfs)
|
||||
|
@ -107,6 +110,9 @@ elif [ -f /run/testsuite82.touch ]; then
|
|||
# Bind our current root into the target so that we later can return to it
|
||||
mount --bind / /run/nextroot/original-root
|
||||
|
||||
# Restart the unit that is not supposed to survive
|
||||
systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
|
||||
|
||||
# Now issue the soft reboot. We should be right back soon.
|
||||
touch /run/testsuite82.touch2
|
||||
systemctl --no-block soft-reboot
|
||||
|
@ -123,23 +129,17 @@ else
|
|||
systemd-notify --fd=3 --pid=parent 3<"$T"
|
||||
rm "$T"
|
||||
|
||||
# Create a script that can survive the soft reboot by ignoring SIGTERM (we
|
||||
# do this instead of the argv[0][0] = '@' thing because that's so hard to
|
||||
# do from a shell)
|
||||
T="/dev/shm/survive-$RANDOM.sh"
|
||||
cat >$T <<EOF
|
||||
#!/bin/bash
|
||||
trap "" TERM
|
||||
systemd-notify --ready
|
||||
rm "$T"
|
||||
exec sleep infinity
|
||||
EOF
|
||||
chmod +x "$T"
|
||||
# This sets DefaultDependencies=no so that it remains running until the
|
||||
# very end, and IgnoreOnIsolate=yes so that it isn't stopped via the
|
||||
# "testsuite.target" isolation we do on next boot
|
||||
systemd-run -p Type=notify -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-survive.service "$T"
|
||||
systemd-run -p Type=exec -p DefaultDependencies=no -p IgnoreOnIsolate=yes --unit=testsuite-82-nosurvive.service sleep infinity
|
||||
# Configure this transient unit to survive the soft reboot - it will not conflict with shutdown.target
|
||||
# and it will be ignored on the isolate that happens in the next boot.
|
||||
systemd-run -p Type=exec --unit=testsuite-82-survive.service \
|
||||
--property SurviveFinalKillSignal=yes \
|
||||
--property IgnoreOnIsolate=yes \
|
||||
--property DefaultDependencies=no \
|
||||
--property After=basic.target \
|
||||
--property "Conflicts=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
|
||||
--property "Before=reboot.target kexec.target poweroff.target halt.target emergency.target rescue.target" \
|
||||
sleep infinity
|
||||
systemd-run -p Type=exec --unit=testsuite-82-nosurvive.service sleep infinity
|
||||
|
||||
# Check that we can set up an inhibitor, and that busctl monitor sees the
|
||||
# PrepareForShutdownWithMetadata signal and that it says 'soft-reboot'.
|
||||
|
|
Loading…
Reference in a new issue