execute: send handoff timestamps from executor to service manager

This changes the executor to systematically send handoff timestamps to
the service manager if a socket for that is supplied. This drops the
code that did this via Type=exec messages, and reverts that part to the
old behaviour before 93cb78aee2.

Benefits of this approach:

1. We can collect the handoff for any command we fork off, regardless
   if it's ExecStart= something else, regardless whether it's Type=exec,
   Type=simple or some any other service type, regardless of the unit
   type.

2. We collect both CLOCK_REALTIME and CLOCK_MONOTONIC, as we do for the
   other process timestamps.

3. It's entirely backwards compatible, as this doesn't change the
   protocol between service manager and executor, but just extends it.
This commit is contained in:
Lennart Poettering 2024-04-23 23:22:07 +02:00
parent 817062e621
commit 12001b1bf0
5 changed files with 113 additions and 73 deletions

View file

@ -3519,7 +3519,7 @@ static int close_remaining_fds(
const int *fds, size_t n_fds) {
size_t n_dont_close = 0;
int dont_close[n_fds + 15];
int dont_close[n_fds + 16];
assert(params);
@ -3555,6 +3555,9 @@ static int close_remaining_fds(
if (params->user_lookup_fd >= 0)
dont_close[n_dont_close++] = params->user_lookup_fd;
if (params->handoff_timestamp_fd >= 0)
dont_close[n_dont_close++] = params->handoff_timestamp_fd;
assert(n_dont_close <= ELEMENTSOF(dont_close));
return close_all_fds(dont_close, n_dont_close);
@ -3974,6 +3977,52 @@ static void exec_params_close(ExecParameters *p) {
p->stderr_fd = safe_close(p->stderr_fd);
}
static int exec_fd_mark_hot(
const ExecContext *c,
ExecParameters *p,
bool hot,
int *reterr_exit_status) {
assert(c);
assert(p);
if (p->exec_fd < 0)
return 0;
uint8_t x = hot;
if (write(p->exec_fd, &x, sizeof(x)) < 0) {
if (reterr_exit_status)
*reterr_exit_status = EXIT_EXEC;
return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
}
return 1;
}
static int send_handoff_timestamp(
const ExecContext *c,
ExecParameters *p,
int *reterr_exit_status) {
assert(c);
assert(p);
if (p->handoff_timestamp_fd < 0)
return 0;
dual_timestamp dt;
dual_timestamp_now(&dt);
if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) {
if (reterr_exit_status)
*reterr_exit_status = EXIT_EXEC;
return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m");
}
return 1;
}
int exec_invoke(
const ExecCommand *command,
const ExecContext *context,
@ -4105,7 +4154,7 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
}
int keep_fds[n_fds + 3];
int keep_fds[n_fds + 4];
memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
n_keep_fds = n_fds;
@ -4115,6 +4164,12 @@ int exec_invoke(
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
}
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
}
#if HAVE_LIBBPF
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
if (r < 0) {
@ -4849,8 +4904,9 @@ int exec_invoke(
/* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
* we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
* more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
* execve(). But first, close the remaining sockets in the context objects. */
* more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
* them open until the final execve(). But first, close the remaining sockets in the context
* objects. */
exec_runtime_close(runtime);
exec_params_close(params);
@ -5266,33 +5322,29 @@ int exec_invoke(
log_command_line(context, params, "Executing", executable, final_argv);
if (params->exec_fd >= 0) {
usec_t t = now(CLOCK_MONOTONIC);
/* We have finished with all our initializations. Let's now let the manager know that. From this
* point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
/* We have finished with all our initializations. Let's now let the manager know that. From this point
* on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a
* timestamp so that the service manager and users can track the precise moment we handed
* over execution of the service to the kernel. */
r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
if (r < 0)
return r;
if (write(params->exec_fd, &t, sizeof(t)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
}
/* As last thing before the execve(), let's send the handoff timestamp */
r = send_handoff_timestamp(context, params, exit_status);
if (r < 0) {
/* If this handoff timestamp failed, let's undo the marking as hot */
(void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
return r;
}
/* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
* have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
* exec_fd this is pretty much the whole raison d'etre. */
r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
if (params->exec_fd >= 0) {
uint64_t hot = 0;
/* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
* that POLLHUP on it no longer means execve() succeeded. */
if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
}
}
/* The execve() failed, let's undo the marking as hot */
(void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
*exit_status = EXIT_EXEC;
return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);

View file

@ -1373,6 +1373,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext
if (r < 0)
return r;
r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd);
if (r < 0)
return r;
if (c && exec_context_restrict_filesystems_set(c)) {
r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd);
if (r < 0)
@ -1620,6 +1624,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) {
continue;
p->exec_fd = fd;
} else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) {
int fd;
fd = deserialize_fd(fds, val);
if (fd < 0)
continue;
close_and_replace(p->handoff_timestamp_fd, fd);
} else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) {
int fd;

View file

@ -457,6 +457,7 @@ struct ExecParameters {
char **files_env;
int user_lookup_fd;
int handoff_timestamp_fd;
int bpf_restrict_fs_map_fd;
@ -475,6 +476,7 @@ struct ExecParameters {
.exec_fd = -EBADF, \
.bpf_restrict_fs_map_fd = -EBADF, \
.user_lookup_fd = -EBADF, \
.handoff_timestamp_fd = -EBADF, \
}
#include "unit.h"

View file

@ -1676,8 +1676,7 @@ static int service_spawn_internal(
log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds);
}
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) &&
IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) {
if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) {
r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd);
if (r < 0)
return r;
@ -3519,28 +3518,27 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
log_unit_debug(UNIT(s), "got exec-fd event");
/* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve()
* successfully for it. We implement this through a pipe() towards the child, which the kernel automatically
* closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the
* parent. We need to be careful however, as there are other reasons that we might cause the child's side of
* the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless
* the child signalled us first that it is about to call the execve(). It does so by sending us a simple
* non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it
* sends a zero byte we'll ignore POLLHUP on the fd again.
* For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately
* before the target executable is execve'd, so that we can accurately track when control is handed over to the
* payload. */
* successfully for it. We implement this through a pipe() towards the child, which the kernel
* automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on
* the pipe in the parent. We need to be careful however, as there are other reasons that we might
* cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that
* we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the
* execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child
* with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on
* the fd again. */
for (;;) {
uint64_t x = 0;
uint8_t x;
ssize_t n;
n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false);
if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
if (n < 0)
return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m");
if (n == 0) {
/* EOF → the event we are waiting for in case of Type=exec */
n = read(fd, &x, sizeof(x));
if (n < 0) {
if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */
return 0;
return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m");
}
if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */
s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source);
if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */
@ -3556,36 +3554,11 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev
return 0;
}
if (n == 1) {
/* Backward compatibility block: a single byte was read, which means somehow an old
* executor before this logic was introduced sent the notification, so there is no
* timestamp (reset it). */
s->exec_fd_hot = x;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
/* A byte was read → this turns on/off the exec fd logic */
assert(n == sizeof(x));
continue;
}
if (x == 0) {
/* If we get x=0 then the execve actually failed, which means the exec fd logic is
* now off. Also reset the exec timestamp, given it has no meaning anymore. */
s->exec_fd_hot = false;
if (s->state == SERVICE_START)
s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL;
} else {
/* A non-zero value was read, which means the exec fd logic is now enabled. Record
* the received timestamp so that users can track when control is handed over to the
* service payload. */
s->exec_fd_hot = true;
if (s->state == SERVICE_START) {
assert_cc(sizeof(uint64_t) == sizeof(usec_t));
dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x);
}
}
s->exec_fd_hot = x;
}
}

View file

@ -5367,6 +5367,7 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) {
}
p->user_lookup_fd = u->manager->user_lookup_fds[1];
p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1];
p->cgroup_id = crt ? crt->cgroup_id : 0;
p->invocation_id = u->invocation_id;