core: add ExecMainHandoverTimestamp property recording time-of-execve

Enable the exec_fd logic for Type=notify* services too, and change it
to send a timestamp instead of a '1' byte. Record the timestamp in a
new ExecMainHandoverTimestamp property so that users can track accurately
when control is handed over from systemd to the service payload, so
that latency and startup performance can be trivially and accurately
tracked and attributed.
This commit is contained in:
Luca Boccassi 2024-04-20 00:50:16 +01:00 committed by Zbigniew Jędrzejewski-Szmek
parent 489ae6b3d5
commit 93cb78aee2
7 changed files with 100 additions and 29 deletions

View file

@ -2768,6 +2768,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly t ExecMainStartTimestampMonotonic = ...;
readonly t ExecMainExitTimestamp = ...;
readonly t ExecMainExitTimestampMonotonic = ...;
readonly t ExecMainHandoverTimestamp = ...;
readonly t ExecMainHandoverTimestampMonotonic = ...;
readonly u ExecMainPID = ...;
readonly i ExecMainCode = ...;
readonly i ExecMainStatus = ...;
@ -4057,6 +4059,10 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainExitTimestampMonotonic"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestamp"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestampMonotonic"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainPID"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainCode"/>
@ -4706,12 +4712,17 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<para><varname>ExecMainStartTimestamp</varname>, <varname>ExecMainStartTimestampMonotonic</varname>,
<varname>ExecMainExitTimestamp</varname>, <varname>ExecMainExitTimestampMonotonic</varname>,
<varname>ExecMainHandoverTimestamp</varname>, <varname>ExecMainHandoverTimestampMonotonic</varname>,
<varname>ExecMainPID</varname>, <varname>ExecMainCode</varname>, <varname>ExecMainStatus</varname>
contain information about the main process of the service as far as it is known. This is often the same
runtime information that is stored in <varname>ExecStart</varname>. However, it deviates for
<varname>Type=forking</varname> services where the main process of the service is not forked off
systemd directly. These fields either contain information of the last run of the process or of the
current running process.</para>
contain information about the main process of the service as far as it is known. The
<varname>ExecMainStartTimestamp</varname> timestamps record when the main child process is spawned by
the service manager. <varname>ExecMainExitTimestamp</varname> timestamps record when the main child
process exit has been detected by the service manager. <varname>ExecMainHandoverTimestamp</varname>
timestamps record when the service executable is executed by <command>systemd-executor</command>.
This is often the same runtime information that is stored in <varname>ExecStart=</varname>. However,
it deviates for <varname>Type=forking</varname> services where the main process of the service is not
forked off systemd directly. These fields either contain information of the last run of the process or
of the current running process.</para>
<para><varname>MainPID</varname> and <varname>ControlPID</varname> contain the main and control PID of
the service. The main PID is the current main PID of the service and is 0 when the service currently
@ -12055,8 +12066,10 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>MemoryZSwapCurrent</varname> were added in version 255.</para>
<para><varname>EffectiveMemoryHigh</varname>,
<varname>EffectiveMemoryMax</varname>,
<varname>EffectiveTasksMax</varname>, and
<varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
<varname>EffectiveTasksMax</varname>,
<varname>MemoryZSwapWriteback</varname>,
<varname>ExecMainHandoverTimestampMonotonic</varname>, and
<varname>ExecMainHandoverTimestamp</varname> were added in version 256.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>

View file

@ -9,6 +9,7 @@
#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \
SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)

View file

@ -5265,12 +5265,14 @@ int exec_invoke(
log_command_line(context, params, "Executing", executable, final_argv);
if (params->exec_fd >= 0) {
uint8_t hot = 1;
usec_t t = now(CLOCK_MONOTONIC);
/* We have finished with all our initializations. Let's now let the manager know that. From this point
* on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
* on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a
* timestamp so that the service manager and users can track the precise moment we handed
* over execution of the service to the kernel. */
if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
if (write(params->exec_fd, &t, sizeof(t)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
}
@ -5279,7 +5281,7 @@ int exec_invoke(
r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
if (params->exec_fd >= 0) {
uint8_t hot = 0;
uint64_t hot = 0;
/* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
* that POLLHUP on it no longer means execve() succeeded. */

View file

@ -1866,6 +1866,11 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
"%sStart Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
if (dual_timestamp_is_set(&s->handover_timestamp))
fprintf(f,
"%sHandover Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime));
if (dual_timestamp_is_set(&s->exit_timestamp))
fprintf(f,
"%sExit Timestamp: %s\n"

View file

@ -91,6 +91,7 @@ typedef enum ExecKeyringMode {
struct ExecStatus {
dual_timestamp start_timestamp;
dual_timestamp exit_timestamp;
dual_timestamp handover_timestamp;
pid_t pid;
int code; /* as in siginfo_t::si_code */
int status; /* as in siginfo_t::si_status */
@ -443,7 +444,9 @@ struct ExecParameters {
int stdout_fd;
int stderr_fd;
/* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done */
/* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It
* will also be used to send a timestamp taken as the very last operation before execve, for
* tracking purposes. */
int exec_fd;
char *notify_socket;

View file

@ -24,6 +24,7 @@
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "io-util.h"
#include "load-dropin.h"
#include "load-fragment.h"
#include "log.h"
@ -1675,7 +1676,8 @@ static int service_spawn_internal(
log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
}
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) &&
IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) {
r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
if (r < 0)
return r;
@ -3008,6 +3010,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
(void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
(void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp);
if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
(void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
@ -3292,6 +3295,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
else if (streq(key, "main-exec-status-exit"))
deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
else if (streq(key, "main-exec-status-handover"))
deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp);
else if (streq(key, "notify-access-override")) {
NotifyAccess notify_access;
@ -3520,21 +3525,22 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
* the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
* the child signalled us first that it is about to call the execve(). It does so by sending us a simple
* non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
* sends a zero byte we'll ignore POLLHUP on the fd again. */
* sends a zero byte we'll ignore POLLHUP on the fd again.
* For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately
* before the target executable is execve'd, so that we can accurately track when control is handed over to the
* payload. */
for (;;) {
uint8_t x;
uint64_t x = 0;
ssize_t n;
n = read(fd, &x, sizeof(x));
if (n < 0) {
if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
}
if (n == 0) { /* EOF → the event we are waiting for */
n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false);
if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
if (n < 0)
return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m");
if (n == 0) {
/* EOF → the event we are waiting for in case of Type=exec */
s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@ -3550,13 +3556,37 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
return 0;
}
if (n == 1) {
/* Backward compatibility block: a single byte was read, which means somehow an old
* executor before this logic was introduced sent the notification, so there is no
* timestamp (reset it). */
/* A byte was read → this turns on/off the exec fd logic */
assert(n == sizeof(x));
s->exec_fd_hot = x;
s->exec_fd_hot = x;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
continue;
}
if (x == 0) {
/* If we get x=0 then the execve actually failed, which means the exec fd logic is
* now off. Also reset the exec timestamp, given it has no meaning anymore. */
s->exec_fd_hot = false;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
} else {
/* A non-zero value was read, which means the exec fd logic is now enabled. Record
* the received timestamp so that users can track when control is handed over to the
* service payload. */
s->exec_fd_hot = true;
if (s->state == SERVICE_START) {
assert_cc(sizeof(uint64_t) == sizeof(usec_t));
dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x);
}
}
}
return 0;
}
static void service_notify_cgroup_empty_event(Unit *u) {

View file

@ -0,0 +1,17 @@
#!/usr/bin/env bash
# SPDX-License-Identifier: LGPL-2.1-or-later
set -eux
set -o pipefail
# Check that timestamps of a Type=notify service are consistent
systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || :
start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service)
handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service)
active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service)
exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service)
[[ $start -le $handover ]]
[[ $handover -le $active ]]
[[ $active -le $exit ]]