1
0
mirror of https://github.com/systemd/systemd synced 2024-07-09 04:26:06 +00:00

Merge pull request #32441 from poettering/rework-handoff-timestamp

pid1: rework handoff timestamp
This commit is contained in:
Lennart Poettering 2024-04-25 15:19:17 +02:00 committed by GitHub
commit 5693208019
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
18 changed files with 426 additions and 134 deletions

13
NEWS
View File

@ -206,13 +206,12 @@ CHANGES WITH 256-rc1:
sent at the same time and carries the string passed to "systemctl
--reboot-argument=" if there was one.
* New D-Bus properties ExecMainHandoverTimestamp and
ExecMainHandoverTimestampMonotonic are now published by services of
type exec, dbus, notify, and notify-reload.
This timestamp is taken as the very last operation before executing
a service's binary, which allows users to accurately track when
execution control of the process is handed over from systemd to the
payload.
* New D-Bus properties ExecMainHandoffTimestamp and
ExecMainHandoffTimestampMonotonic are now published by services
units. This timestamp is taken as the very last operation before
handing off control to invoked binaries. This information is
available for other unit types that fork off processes (i.e. mount,
swap, socket units), but currently only via "systemd-analyze dump".
* An additional timestamp is now taken by the service manager when a
system shutdown operation is initiated. It can be queried via D-Bus

7
TODO
View File

@ -130,6 +130,13 @@ Deprecations and removals:
Features:
* expose the handoff timestamp fully via the D-Bus properties that contain
ExecStatus information
* properly serialize the ExecStatus data from all ExecCommand objects
associated with services, sockets, mounts and swaps. Currently, the data is
flushed out on reload, which is quite a limitation.
* Clean up "reboot argument" handling, i.e. set it through some IPC service
instead of directly via /run/, so that it can be sensible set remotely.

View File

@ -2766,8 +2766,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly t ExecMainStartTimestampMonotonic = ...;
readonly t ExecMainExitTimestamp = ...;
readonly t ExecMainExitTimestampMonotonic = ...;
readonly t ExecMainHandoverTimestamp = ...;
readonly t ExecMainHandoverTimestampMonotonic = ...;
readonly t ExecMainHandoffTimestamp = ...;
readonly t ExecMainHandoffTimestampMonotonic = ...;
readonly u ExecMainPID = ...;
readonly i ExecMainCode = ...;
readonly i ExecMainStatus = ...;
@ -4057,9 +4057,9 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainExitTimestampMonotonic"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestamp"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoffTimestamp"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoverTimestampMonotonic"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainHandoffTimestampMonotonic"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExecMainPID"/>
@ -4710,18 +4710,20 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<para><varname>ExecMainStartTimestamp</varname>, <varname>ExecMainStartTimestampMonotonic</varname>,
<varname>ExecMainExitTimestamp</varname>, <varname>ExecMainExitTimestampMonotonic</varname>,
<varname>ExecMainHandoverTimestamp</varname>, <varname>ExecMainHandoverTimestampMonotonic</varname>,
<varname>ExecMainHandoffTimestamp</varname>, <varname>ExecMainHandoffTimestampMonotonic</varname>,
<varname>ExecMainPID</varname>, <varname>ExecMainCode</varname>, <varname>ExecMainStatus</varname>
contain information about the main process of the service as far as it is known. The
<varname>ExecMainStartTimestamp</varname> timestamps record when the main child process is spawned by
the service manager. <varname>ExecMainExitTimestamp</varname> timestamps record when the main child
process exit has been detected by the service manager. <varname>ExecMainHandoverTimestamp</varname>
timestamps record when the service executable is executed by <command>systemd-executor</command> for
services of type <literal>exec</literal>, <literal>dbus</literal>, <literal>notify</literal>, and
<literal>notify-reload</literal>. This is often the same runtime information that is stored in
<varname>ExecStart=</varname>. However, it deviates for <varname>Type=forking</varname> services where
the main process of the service is not forked off systemd directly. These fields either contain
information of the last run of the process or of the current running process.</para>
<varname>ExecMainStartTimestamp</varname> timestamps record when the main process of the service is
created. <varname>ExecMainExitTimestamp</varname> timestamps record when the main process exit has been
detected by the service manager. <varname>ExecMainHandoffTimestamp</varname> timestamps records when
the service binary is about to be executed by <command>systemd-executor</command> (this timestamp is
recorded regardless if the immediately following <function>execve()</function> system call succeeds or
fails). This is often the same runtime information that is also maintained for
<varname>ExecStart=</varname>. However, it deviates for services with <varname>Type=forking</varname>
as well as services that use <varname>MAINPID=</varname> <function>sd_notify()</function> messages as
the main process of the service is not forked off by the service manager directly in that case. These
fields either contain information of the last run of the process or of the current running
process.</para>
<para><varname>MainPID</varname> and <varname>ControlPID</varname> contain the main and control PID of
the service. The main PID is the current main PID of the service and is 0 when the service currently
@ -12067,8 +12069,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>EffectiveMemoryMax</varname>,
<varname>EffectiveTasksMax</varname>,
<varname>MemoryZSwapWriteback</varname>,
<varname>ExecMainHandoverTimestampMonotonic</varname>, and
<varname>ExecMainHandoverTimestamp</varname> were added in version 256.</para>
<varname>ExecMainHandoffTimestampMonotonic</varname>, and
<varname>ExecMainHandoffTimestamp</varname> were added in version 256.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>

View File

@ -9,7 +9,7 @@
#define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \
BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoffTimestamp", (offset) + offsetof(ExecStatus, handoff_timestamp), flags), \
SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \
SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \
SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags)

View File

@ -3519,7 +3519,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
int dont_close[n_fds + 15];
int dont_close[n_fds + 16];
assert(params);
@ -3555,6 +3555,9 @@ static int close_remaining_fds(
if (params->user_lookup_fd >= 0)
dont_close[n_dont_close++] = params->user_lookup_fd;
if (params->handoff_timestamp_fd >= 0)
dont_close[n_dont_close++] = params->handoff_timestamp_fd;
assert(n_dont_close <= ELEMENTSOF(dont_close));
return close_all_fds(dont_close, n_dont_close);
@ -3974,6 +3977,52 @@ static void exec_params_close(ExecParameters *p) {
p->stderr_fd = safe_close(p->stderr_fd);
}
static int exec_fd_mark_hot(
const ExecContext *c,
ExecParameters *p,
bool hot,
int *reterr_exit_status) {
assert(c);
assert(p);
if (p->exec_fd < 0)
return 0;
uint8_t x = hot;
if (write(p->exec_fd, &x, sizeof(x)) < 0) {
if (reterr_exit_status)
*reterr_exit_status = EXIT_EXEC;
return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
}
return 1;
}
static int send_handoff_timestamp(
const ExecContext *c,
ExecParameters *p,
int *reterr_exit_status) {
assert(c);
assert(p);
if (p->handoff_timestamp_fd < 0)
return 0;
dual_timestamp dt;
dual_timestamp_now(&dt);
if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) {
if (reterr_exit_status)
*reterr_exit_status = EXIT_EXEC;
return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m");
}
return 1;
}
int exec_invoke(
const ExecCommand *command,
const ExecContext *context,
@ -4105,7 +4154,7 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
}
int keep_fds[n_fds + 3];
int keep_fds[n_fds + 4];
memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
n_keep_fds = n_fds;
@ -4115,6 +4164,12 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
}
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
}
#if HAVE_LIBBPF
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
if (r < 0) {
@ -4849,8 +4904,9 @@ int exec_invoke(
/* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
* we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
* more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
* execve(). But first, close the remaining sockets in the context objects. */
* more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
* them open until the final execve(). But first, close the remaining sockets in the context
* objects. */
exec_runtime_close(runtime);
exec_params_close(params);
@ -5266,33 +5322,29 @@ int exec_invoke(
log_command_line(context, params, "Executing", executable, final_argv);
if (params->exec_fd >= 0) {
usec_t t = now(CLOCK_MONOTONIC);
/* We have finished with all our initializations. Let's now let the manager know that. From this
* point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
/* We have finished with all our initializations. Let's now let the manager know that. From this point
* on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a
* timestamp so that the service manager and users can track the precise moment we handed
* over execution of the service to the kernel. */
r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
if (r < 0)
return r;
if (write(params->exec_fd, &t, sizeof(t)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
}
/* As last thing before the execve(), let's send the handoff timestamp */
r = send_handoff_timestamp(context, params, exit_status);
if (r < 0) {
/* If this handoff timestamp failed, let's undo the marking as hot */
(void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
return r;
}
/* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
* have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
* exec_fd this is pretty much the whole raison d'etre. */
r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
if (params->exec_fd >= 0) {
uint64_t hot = 0;
/* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
* that POLLHUP on it no longer means execve() succeeded. */
if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
}
}
/* The execve() failed, let's undo the marking as hot */
(void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);

View File

@ -1373,6 +1373,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
if (r < 0)
return r;
r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd);
if (r < 0)
return r;
if (c && exec_context_restrict_filesystems_set(c)) {
r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
if (r < 0)
@ -1620,6 +1624,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
continue;
p->exec_fd = fd;
} else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) {
int fd;
fd = deserialize_fd(fds, val);
if (fd < 0)
continue;
close_and_replace(p->handoff_timestamp_fd, fd);
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
int fd;

View File

@ -1842,6 +1842,19 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int
(void) utmp_put_dead_process(context->utmp_id, pid, code, status);
}
void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts) {
assert(s);
assert(ucred);
assert(ts);
if (ucred->pid != s->pid)
*s = (ExecStatus) {
.pid = ucred->pid,
};
s->handoff_timestamp = *ts;
}
void exec_status_reset(ExecStatus *s) {
assert(s);
@ -1864,21 +1877,42 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
if (dual_timestamp_is_set(&s->start_timestamp))
fprintf(f,
"%sStart Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
prefix, FORMAT_TIMESTAMP_STYLE(s->start_timestamp.realtime, TIMESTAMP_US));
if (dual_timestamp_is_set(&s->handover_timestamp))
if (dual_timestamp_is_set(&s->handoff_timestamp) && dual_timestamp_is_set(&s->start_timestamp) &&
s->handoff_timestamp.monotonic > s->start_timestamp.monotonic)
fprintf(f,
"%sHandover Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime));
"%sHandoff Timestamp: %s since start\n",
prefix,
FORMAT_TIMESPAN(usec_sub_unsigned(s->handoff_timestamp.monotonic, s->start_timestamp.monotonic), 1));
else
fprintf(f,
"%sHandoff Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP_STYLE(s->handoff_timestamp.realtime, TIMESTAMP_US));
if (dual_timestamp_is_set(&s->exit_timestamp)) {
if (dual_timestamp_is_set(&s->handoff_timestamp) && s->exit_timestamp.monotonic > s->handoff_timestamp.monotonic)
fprintf(f,
"%sExit Timestamp: %s since handoff\n",
prefix,
FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->handoff_timestamp.monotonic), 1));
else if (dual_timestamp_is_set(&s->start_timestamp) && s->exit_timestamp.monotonic > s->start_timestamp.monotonic)
fprintf(f,
"%sExit Timestamp: %s since start\n",
prefix,
FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->start_timestamp.monotonic), 1));
else
fprintf(f,
"%sExit Timestamp: %s\n",
prefix, FORMAT_TIMESTAMP_STYLE(s->exit_timestamp.realtime, TIMESTAMP_US));
if (dual_timestamp_is_set(&s->exit_timestamp))
fprintf(f,
"%sExit Timestamp: %s\n"
"%sExit Code: %s\n"
"%sExit Status: %i\n",
prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
prefix, sigchld_code_to_string(s->code),
prefix, s->status);
}
}
void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {

View File

@ -91,7 +91,7 @@ typedef enum ExecKeyringMode {
struct ExecStatus {
dual_timestamp start_timestamp;
dual_timestamp exit_timestamp;
dual_timestamp handover_timestamp;
dual_timestamp handoff_timestamp;
pid_t pid;
int code; /* as in siginfo_t::si_code */
int status; /* as in siginfo_t::si_status */
@ -444,9 +444,7 @@ struct ExecParameters {
int stdout_fd;
int stderr_fd;
/* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It
* will also be used to send a timestamp taken as the very last operation before execve, for
* tracking purposes. */
/* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. */
int exec_fd;
char *notify_socket;
@ -457,6 +455,7 @@ struct ExecParameters {
char **files_env;
int user_lookup_fd;
int handoff_timestamp_fd;
int bpf_restrict_fs_map_fd;
@ -475,6 +474,7 @@ struct ExecParameters {
.exec_fd = -EBADF, \
.bpf_restrict_fs_map_fd = -EBADF, \
.user_lookup_fd = -EBADF, \
.handoff_timestamp_fd = -EBADF, \
}
#include "unit.h"
@ -545,6 +545,7 @@ char** exec_context_get_restrict_filesystems(const ExecContext *c);
void exec_status_start(ExecStatus *s, pid_t pid);
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status);
void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts);
void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix);
void exec_status_reset(ExecStatus *s);

View File

@ -148,6 +148,12 @@ int manager_serialize(
return r;
}
if (m->handoff_timestamp_fds[0] >= 0) {
r = serialize_fd_many(f, fds, "handoff-timestamp-fds", m->handoff_timestamp_fds, 2);
if (r < 0)
return r;
}
(void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit);
(void) serialize_ratelimit(f, "reload-reexec-ratelimit", &m->reload_reexec_ratelimit);

View File

@ -56,6 +56,7 @@
#include "inotify-util.h"
#include "install.h"
#include "io-util.h"
#include "iovec-util.h"
#include "label-util.h"
#include "load-fragment.h"
#include "locale-setup.h"
@ -124,6 +125,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t
static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata);
static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata);
static int manager_dispatch_run_queue(sd_event_source *source, void *userdata);
static int manager_dispatch_sigchld(sd_event_source *source, void *userdata);
@ -904,6 +906,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags,
.cgroups_agent_fd = -EBADF,
.signal_fd = -EBADF,
.user_lookup_fds = EBADF_PAIR,
.handoff_timestamp_fds = EBADF_PAIR,
.private_listen_fd = -EBADF,
.dev_autofs_fd = -EBADF,
.cgroup_inotify_fd = -EBADF,
@ -1253,6 +1256,49 @@ static int manager_setup_user_lookup_fd(Manager *m) {
return 0;
}
static int manager_setup_handoff_timestamp_fd(Manager *m) {
int r;
assert(m);
/* Set up the socket pair used for for passing timestamps back when the executor processes we fork
* off invokes execve(), i.e. when we hand off control to our payload processes. */
if (m->handoff_timestamp_fds[0] < 0) {
m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source);
safe_close_pair(m->handoff_timestamp_fds);
if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->handoff_timestamp_fds) < 0)
return log_error_errno(errno, "Failed to allocate handoff timestamp socket: %m");
/* Make sure children never have to block */
(void) fd_increase_rxbuf(m->handoff_timestamp_fds[0], NOTIFY_RCVBUF_SIZE);
r = setsockopt_int(m->handoff_timestamp_fds[0], SOL_SOCKET, SO_PASSCRED, true);
if (r < 0)
return log_error_errno(r, "SO_PASSCRED failed: %m");
/* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */
r = fd_nonblock(m->handoff_timestamp_fds[0], true);
if (r < 0)
return log_error_errno(r, "Failed to make handoff timestamp socket O_NONBLOCK: %m");
}
if (!m->handoff_timestamp_event_source) {
r = sd_event_add_io(m->event, &m->handoff_timestamp_event_source, m->handoff_timestamp_fds[0], EPOLLIN, manager_dispatch_handoff_timestamp_fd, m);
if (r < 0)
return log_error_errno(errno, "Failed to allocate handoff timestamp event source: %m");
r = sd_event_source_set_priority(m->handoff_timestamp_event_source, EVENT_PRIORITY_HANDOFF_TIMESTAMP);
if (r < 0)
return log_error_errno(errno, "Failed to set priority of handoff timestamp event source: %m");
(void) sd_event_source_set_description(m->handoff_timestamp_event_source, "handoff-timestamp");
}
return 0;
}
static unsigned manager_dispatch_cleanup_queue(Manager *m) {
Unit *u;
unsigned n = 0;
@ -1667,12 +1713,14 @@ Manager* manager_free(Manager *m) {
sd_event_source_unref(m->jobs_in_progress_event_source);
sd_event_source_unref(m->run_queue_event_source);
sd_event_source_unref(m->user_lookup_event_source);
sd_event_source_unref(m->handoff_timestamp_event_source);
sd_event_source_unref(m->memory_pressure_event_source);
safe_close(m->signal_fd);
safe_close(m->notify_fd);
safe_close(m->cgroups_agent_fd);
safe_close_pair(m->user_lookup_fds);
safe_close_pair(m->handoff_timestamp_fds);
manager_close_ask_password(m);
@ -2013,6 +2061,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo
/* This shouldn't fail, except if things are really broken. */
return r;
r = manager_setup_handoff_timestamp_fd(m);
if (r < 0)
/* This shouldn't fail, except if things are really broken. */
return r;
/* Connect to the bus if we are good for it */
manager_setup_bus(m);
@ -3655,6 +3708,7 @@ int manager_reload(Manager *m) {
(void) manager_setup_notify(m);
(void) manager_setup_cgroups_agent(m);
(void) manager_setup_user_lookup_fd(m);
(void) manager_setup_handoff_timestamp_fd(m);
/* Third, fire things up! */
manager_coldplug(m);
@ -4776,13 +4830,12 @@ static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint
char unit_name[UNIT_NAME_MAX+1];
} _packed_ buffer;
Manager *m = userdata;
Manager *m = ASSERT_PTR(userdata);
ssize_t l;
size_t n;
Unit *u;
assert_se(source);
assert_se(m);
assert(source);
/* Invoked whenever a child process succeeded resolving its user/group to use and sent us the
* resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that
@ -4831,6 +4884,73 @@ static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint
return 0;
}
static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
Manager *m = ASSERT_PTR(userdata);
usec_t ts[2] = {};
CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control;
struct msghdr msghdr = {
.msg_iov = &IOVEC_MAKE(ts, sizeof(ts)),
.msg_iovlen = 1,
.msg_control = &control,
.msg_controllen = sizeof(control),
};
ssize_t n;
assert(source);
n = recvmsg_safe(m->handoff_timestamp_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC);
if (ERRNO_IS_NEG_TRANSIENT(n))
return 0; /* Spurious wakeup, try again */
if (n == -EXFULL) {
log_warning("Got message with truncated control, ignoring.");
return 0;
}
if (n < 0)
return log_error_errno(n, "Failed to receive handoff timestamp message: %m");
if (msghdr.msg_flags & MSG_TRUNC) {
log_warning("Got truncated handoff timestamp message, ignoring.");
return 0;
}
if (n != sizeof(ts)) {
log_warning("Got handoff timestamp message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(ts));
return 0;
}
struct ucred *ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
if (!ucred || !pid_is_valid(ucred->pid)) {
log_warning("Received notify message without valid credentials. Ignoring.");
return 0;
}
log_debug("Got handoff timestamp event for PID " PID_FMT ".", ucred->pid);
_cleanup_free_ Unit **units = NULL;
int n_units = manager_get_units_for_pidref(m, &PIDREF_MAKE_FROM_PID(ucred->pid), &units);
if (n_units < 0) {
log_warning_errno(n_units, "Unable to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid);
return 0;
}
if (n_units == 0) {
log_debug("Got handoff timestamp for process " PID_FMT " we are not interested in, ignoring.", ucred->pid);
return 0;
}
dual_timestamp dt = {
.realtime = ts[0],
.monotonic = ts[1],
};
FOREACH_ARRAY(u, units, n_units) {
if (!UNIT_VTABLE(*u)->notify_handoff_timestamp)
continue;
UNIT_VTABLE(*u)->notify_handoff_timestamp(*u, ucred, &dt);
}
return 0;
}
void manager_ref_console(Manager *m) {
assert(m);

View File

@ -286,6 +286,9 @@ struct Manager {
int user_lookup_fds[2];
sd_event_source *user_lookup_event_source;
int handoff_timestamp_fds[2];
sd_event_source *handoff_timestamp_event_source;
RuntimeScope runtime_scope;
LookupPaths lookup_paths;
@ -656,22 +659,23 @@ void unit_defaults_done(UnitDefaults *defaults);
enum {
/* most important … */
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-10,
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-9,
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-9,
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-8, /* cgroupv1 */
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-8, /* cgroupv2 */
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-7,
EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,
EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4,
EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3,
EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2,
EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1,
EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1,
EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL,
EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE,
EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1,
EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2,
EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11,
EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10,
EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */
EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */
EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8,
EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7,
EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6,
EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5,
EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4,
EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3,
EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2,
EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1,
EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1,
EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL,
EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE,
EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1,
EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2,
/* … to least important */
};

View File

@ -1927,6 +1927,22 @@ static void mount_shutdown(Manager *m) {
m->mount_monitor = NULL;
}
static void mount_handoff_timestamp(
Unit *u,
const struct ucred *ucred,
const dual_timestamp *ts) {
Mount *m = ASSERT_PTR(MOUNT(u));
assert(ucred);
assert(ts);
if (m->control_pid.pid == ucred->pid && m->control_command) {
exec_status_handoff(&m->control_command->exec_status, ucred, ts);
unit_add_to_dbus_queue(u);
}
}
static int mount_get_timeout(Unit *u, usec_t *timeout) {
Mount *m = ASSERT_PTR(MOUNT(u));
usec_t t;
@ -2453,6 +2469,8 @@ const UnitVTable mount_vtable = {
.reset_failed = mount_reset_failed,
.notify_handoff_timestamp = mount_handoff_timestamp,
.control_pid = mount_control_pid,
.bus_set_property = bus_mount_set_property,

View File

@ -1676,8 +1676,7 @@ static int service_spawn_internal(
log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
}
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) &&
IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) {
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
if (r < 0)
return r;
@ -3010,7 +3009,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) {
(void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid);
(void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp);
(void) serialize_dual_timestamp(f, "main-exec-status-handoff", &s->main_exec_status.handoff_timestamp);
if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) {
(void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code);
@ -3295,8 +3294,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value,
deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp);
else if (streq(key, "main-exec-status-exit"))
deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp);
else if (streq(key, "main-exec-status-handover"))
deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp);
else if (streq(key, "main-exec-status-handoff"))
deserialize_dual_timestamp(value, &s->main_exec_status.handoff_timestamp);
else if (streq(key, "notify-access-override")) {
NotifyAccess notify_access;
@ -3519,28 +3518,27 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
log_unit_debug(UNIT(s), "got exec-fd event");
/* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
* successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
* closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
* parent. We need to be careful however, as there are other reasons that we might cause the child's side of
* the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
* the child signalled us first that it is about to call the execve(). It does so by sending us a simple
* non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
* sends a zero byte we'll ignore POLLHUP on the fd again.
* For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately
* before the target executable is execve'd, so that we can accurately track when control is handed over to the
* payload. */
* successfully for it. We implement this through a pipe() towards the child, which the kernel
* automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on
* the pipe in the parent. We need to be careful however, as there are other reasons that we might
* cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that
* we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the
* execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child
* with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on
* the fd again. */
for (;;) {
uint64_t x = 0;
uint8_t x;
ssize_t n;
n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false);
if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
if (n < 0)
return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m");
if (n == 0) {
/* EOF → the event we are waiting for in case of Type=exec */
n = read(fd, &x, sizeof(x));
if (n < 0) {
if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
}
if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */
s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@ -3556,36 +3554,11 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
return 0;
}
if (n == 1) {
/* Backward compatibility block: a single byte was read, which means somehow an old
* executor before this logic was introduced sent the notification, so there is no
* timestamp (reset it). */
s->exec_fd_hot = x;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
/* A byte was read → this turns on/off the exec fd logic */
assert(n == sizeof(x));
continue;
}
if (x == 0) {
/* If we get x=0 then the execve actually failed, which means the exec fd logic is
* now off. Also reset the exec timestamp, given it has no meaning anymore. */
s->exec_fd_hot = false;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
} else {
/* A non-zero value was read, which means the exec fd logic is now enabled. Record
* the received timestamp so that users can track when control is handed over to the
* service payload. */
s->exec_fd_hot = true;
if (s->state == SERVICE_START) {
assert_cc(sizeof(uint64_t) == sizeof(usec_t));
dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x);
}
}
s->exec_fd_hot = x;
}
}
@ -4605,6 +4578,29 @@ static void service_notify_message(
unit_add_to_dbus_queue(u);
}
static void service_handoff_timestamp(
Unit *u,
const struct ucred *ucred,
const dual_timestamp *ts) {
Service *s = ASSERT_PTR(SERVICE(u));
assert(ucred);
assert(ts);
if (s->main_pid.pid == ucred->pid) {
if (s->main_command)
exec_status_handoff(&s->main_command->exec_status, ucred, ts);
exec_status_handoff(&s->main_exec_status, ucred, ts);
} else if (s->control_pid.pid == ucred->pid && s->control_command)
exec_status_handoff(&s->control_command->exec_status, ucred, ts);
else
return;
unit_add_to_dbus_queue(u);
}
static int service_get_timeout(Unit *u, usec_t *timeout) {
Service *s = ASSERT_PTR(SERVICE(u));
uint64_t t;
@ -5190,6 +5186,7 @@ const UnitVTable service_vtable = {
.notify_cgroup_empty = service_notify_cgroup_empty_event,
.notify_cgroup_oom = service_notify_cgroup_oom_event,
.notify_message = service_notify_message,
.notify_handoff_timestamp = service_handoff_timestamp,
.main_pid = service_main_pid,
.control_pid = service_control_pid,

View File

@ -3378,6 +3378,22 @@ static void socket_trigger_notify(Unit *u, Unit *other) {
socket_set_state(s, SOCKET_RUNNING);
}
static void socket_handoff_timestamp(
Unit *u,
const struct ucred *ucred,
const dual_timestamp *ts) {
Socket *s = ASSERT_PTR(SOCKET(u));
assert(ucred);
assert(ts);
if (s->control_pid.pid == ucred->pid && s->control_command) {
exec_status_handoff(&s->control_command->exec_status, ucred, ts);
unit_add_to_dbus_queue(u);
}
}
static int socket_get_timeout(Unit *u, usec_t *timeout) {
Socket *s = ASSERT_PTR(SOCKET(u));
usec_t t;
@ -3580,6 +3596,8 @@ const UnitVTable socket_vtable = {
.reset_failed = socket_reset_failed,
.notify_handoff_timestamp = socket_handoff_timestamp,
.control_pid = socket_control_pid,
.bus_set_property = bus_socket_set_property,

View File

@ -1422,6 +1422,22 @@ static void swap_reset_failed(Unit *u) {
s->clean_result = SWAP_SUCCESS;
}
static void swap_handoff_timestamp(
Unit *u,
const struct ucred *ucred,
const dual_timestamp *ts) {
Swap *s = ASSERT_PTR(SWAP(u));
assert(ucred);
assert(ts);
if (s->control_pid.pid == ucred->pid && s->control_command) {
exec_status_handoff(&s->control_command->exec_status, ucred, ts);
unit_add_to_dbus_queue(u);
}
}
static int swap_get_timeout(Unit *u, usec_t *timeout) {
Swap *s = ASSERT_PTR(SWAP(u));
usec_t t;
@ -1608,6 +1624,8 @@ const UnitVTable swap_vtable = {
.reset_failed = swap_reset_failed,
.notify_handoff_timestamp = swap_handoff_timestamp,
.control_pid = swap_control_pid,
.bus_set_property = bus_swap_set_property,

View File

@ -5367,6 +5367,7 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
}
p->user_lookup_fd = u->manager->user_lookup_fds[1];
p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
p->cgroup_id = crt ? crt->cgroup_id : 0;
p->invocation_id = u->invocation_id;

View File

@ -632,6 +632,9 @@ typedef struct UnitVTable {
/* Called whenever a process of this unit sends us a message */
void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds);
/* Called whenever we learn a handoff timestamp */
void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts);
/* Called whenever a name this Unit registered for comes or goes away. */
void (*bus_name_owner_change)(Unit *u, const char *new_owner);

View File

@ -8,10 +8,10 @@ set -o pipefail
systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || :
start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service)
handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service)
handoff=$(systemctl show --property=ExecMainHandoffTimestampMonotonic --value notify.service)
active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service)
exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service)
[[ $start -le $handover ]]
[[ $handover -le $active ]]
[[ $start -le $handoff ]]
[[ $handoff -le $active ]]
[[ $active -le $exit ]]