diff --git a/NEWS b/NEWS index 806fb8abf35..38a1abb78ca 100644 --- a/NEWS +++ b/NEWS @@ -206,13 +206,12 @@ CHANGES WITH 256-rc1: sent at the same time and carries the string passed to "systemctl --reboot-argument=" if there was one. - * New D-Bus properties ExecMainHandoverTimestamp and - ExecMainHandoverTimestampMonotonic are now published by services of - type exec, dbus, notify, and notify-reload. - This timestamp is taken as the very last operation before executing - a service's binary, which allows users to accurately track when - execution control of the process is handed over from systemd to the - payload. + * New D-Bus properties ExecMainHandoffTimestamp and + ExecMainHandoffTimestampMonotonic are now published by services + units. This timestamp is taken as the very last operation before + handing off control to invoked binaries. This information is + available for other unit types that fork off processes (i.e. mount, + swap, socket units), but currently only via "systemd-analyze dump". * An additional timestamp is now taken by the service manager when a system shutdown operation is initiated. It can be queried via D-Bus diff --git a/TODO b/TODO index 163f66f1527..5bc8a7c683c 100644 --- a/TODO +++ b/TODO @@ -130,6 +130,13 @@ Deprecations and removals: Features: +* expose the handoff timestamp fully via the D-Bus properties that contain + ExecStatus information + +* properly serialize the ExecStatus data from all ExecCommand objects + associated with services, sockets, mounts and swaps. Currently, the data is + flushed out on reload, which is quite a limitation. + * Clean up "reboot argument" handling, i.e. set it through some IPC service instead of directly via /run/, so that it can be sensible set remotely. diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index ca8696bed5e..9e29eda4e07 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2766,8 +2766,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly t ExecMainStartTimestampMonotonic = ...; readonly t ExecMainExitTimestamp = ...; readonly t ExecMainExitTimestampMonotonic = ...; - readonly t ExecMainHandoverTimestamp = ...; - readonly t ExecMainHandoverTimestampMonotonic = ...; + readonly t ExecMainHandoffTimestamp = ...; + readonly t ExecMainHandoffTimestampMonotonic = ...; readonly u ExecMainPID = ...; readonly i ExecMainCode = ...; readonly i ExecMainStatus = ...; @@ -4057,9 +4057,9 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { - + - + @@ -4710,18 +4710,20 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { ExecMainStartTimestamp, ExecMainStartTimestampMonotonic, ExecMainExitTimestamp, ExecMainExitTimestampMonotonic, - ExecMainHandoverTimestamp, ExecMainHandoverTimestampMonotonic, + ExecMainHandoffTimestamp, ExecMainHandoffTimestampMonotonic, ExecMainPID, ExecMainCode, ExecMainStatus contain information about the main process of the service as far as it is known. The - ExecMainStartTimestamp timestamps record when the main child process is spawned by - the service manager. ExecMainExitTimestamp timestamps record when the main child - process exit has been detected by the service manager. ExecMainHandoverTimestamp - timestamps record when the service executable is executed by systemd-executor for - services of type exec, dbus, notify, and - notify-reload. This is often the same runtime information that is stored in - ExecStart=. However, it deviates for Type=forking services where - the main process of the service is not forked off systemd directly. These fields either contain - information of the last run of the process or of the current running process. + ExecMainStartTimestamp timestamps record when the main process of the service is + created. ExecMainExitTimestamp timestamps record when the main process exit has been + detected by the service manager. ExecMainHandoffTimestamp timestamps records when + the service binary is about to be executed by systemd-executor (this timestamp is + recorded regardless if the immediately following execve() system call succeeds or + fails). This is often the same runtime information that is also maintained for + ExecStart=. However, it deviates for services with Type=forking + as well as services that use MAINPID= sd_notify() messages as + the main process of the service is not forked off by the service manager directly in that case. These + fields either contain information of the last run of the process or of the current running + process. MainPID and ControlPID contain the main and control PID of the service. The main PID is the current main PID of the service and is 0 when the service currently @@ -12067,8 +12069,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ EffectiveMemoryMax, EffectiveTasksMax, MemoryZSwapWriteback, - ExecMainHandoverTimestampMonotonic, and - ExecMainHandoverTimestamp were added in version 256. + ExecMainHandoffTimestampMonotonic, and + ExecMainHandoffTimestamp were added in version 256. Socket Unit Objects diff --git a/src/core/dbus-execute.h b/src/core/dbus-execute.h index a1205b09971..4b7cb86d3e5 100644 --- a/src/core/dbus-execute.h +++ b/src/core/dbus-execute.h @@ -9,7 +9,7 @@ #define BUS_EXEC_STATUS_VTABLE(prefix, offset, flags) \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "StartTimestamp", (offset) + offsetof(ExecStatus, start_timestamp), flags), \ BUS_PROPERTY_DUAL_TIMESTAMP(prefix "ExitTimestamp", (offset) + offsetof(ExecStatus, exit_timestamp), flags), \ - BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoverTimestamp", (offset) + offsetof(ExecStatus, handover_timestamp), flags), \ + BUS_PROPERTY_DUAL_TIMESTAMP(prefix "HandoffTimestamp", (offset) + offsetof(ExecStatus, handoff_timestamp), flags), \ SD_BUS_PROPERTY(prefix "PID", "u", bus_property_get_pid, (offset) + offsetof(ExecStatus, pid), flags), \ SD_BUS_PROPERTY(prefix "Code", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, code), flags), \ SD_BUS_PROPERTY(prefix "Status", "i", bus_property_get_int, (offset) + offsetof(ExecStatus, status), flags) diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 8b16acb399c..1a734a972a2 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -3519,7 +3519,7 @@ static int close_remaining_fds( const int *fds, size_t n_fds) { size_t n_dont_close = 0; - int dont_close[n_fds + 15]; + int dont_close[n_fds + 16]; assert(params); @@ -3555,6 +3555,9 @@ static int close_remaining_fds( if (params->user_lookup_fd >= 0) dont_close[n_dont_close++] = params->user_lookup_fd; + if (params->handoff_timestamp_fd >= 0) + dont_close[n_dont_close++] = params->handoff_timestamp_fd; + assert(n_dont_close <= ELEMENTSOF(dont_close)); return close_all_fds(dont_close, n_dont_close); @@ -3974,6 +3977,52 @@ static void exec_params_close(ExecParameters *p) { p->stderr_fd = safe_close(p->stderr_fd); } +static int exec_fd_mark_hot( + const ExecContext *c, + ExecParameters *p, + bool hot, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->exec_fd < 0) + return 0; + + uint8_t x = hot; + + if (write(p->exec_fd, &x, sizeof(x)) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold"); + } + + return 1; +} + +static int send_handoff_timestamp( + const ExecContext *c, + ExecParameters *p, + int *reterr_exit_status) { + + assert(c); + assert(p); + + if (p->handoff_timestamp_fd < 0) + return 0; + + dual_timestamp dt; + dual_timestamp_now(&dt); + + if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) { + if (reterr_exit_status) + *reterr_exit_status = EXIT_EXEC; + return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m"); + } + + return 1; +} + int exec_invoke( const ExecCommand *command, const ExecContext *context, @@ -4105,7 +4154,7 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m"); } - int keep_fds[n_fds + 3]; + int keep_fds[n_fds + 4]; memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int)); n_keep_fds = n_fds; @@ -4115,6 +4164,12 @@ int exec_invoke( return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); } + r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->handoff_timestamp_fd); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m"); + } + #if HAVE_LIBBPF r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, ¶ms->bpf_restrict_fs_map_fd); if (r < 0) { @@ -4849,8 +4904,9 @@ int exec_invoke( /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any - * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final - * execve(). But first, close the remaining sockets in the context objects. */ + * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep + * them open until the final execve(). But first, close the remaining sockets in the context + * objects. */ exec_runtime_close(runtime); exec_params_close(params); @@ -5266,33 +5322,29 @@ int exec_invoke( log_command_line(context, params, "Executing", executable, final_argv); - if (params->exec_fd >= 0) { - usec_t t = now(CLOCK_MONOTONIC); + /* We have finished with all our initializations. Let's now let the manager know that. From this + * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ - /* We have finished with all our initializations. Let's now let the manager know that. From this point - * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. We send a - * timestamp so that the service manager and users can track the precise moment we handed - * over execution of the service to the kernel. */ + r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status); + if (r < 0) + return r; - if (write(params->exec_fd, &t, sizeof(t)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m"); - } + /* As last thing before the execve(), let's send the handoff timestamp */ + r = send_handoff_timestamp(context, params, exit_status); + if (r < 0) { + /* If this handoff timestamp failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); + return r; } + /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they + * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for + * exec_fd this is pretty much the whole raison d'etre. */ + r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env); - if (params->exec_fd >= 0) { - uint64_t hot = 0; - - /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager - * that POLLHUP on it no longer means execve() succeeded. */ - - if (write(params->exec_fd, &hot, sizeof(hot)) < 0) { - *exit_status = EXIT_EXEC; - return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m"); - } - } + /* The execve() failed, let's undo the marking as hot */ + (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL); *exit_status = EXIT_EXEC; return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable); diff --git a/src/core/execute-serialize.c b/src/core/execute-serialize.c index 9e402e5e697..0b6939b5d5a 100644 --- a/src/core/execute-serialize.c +++ b/src/core/execute-serialize.c @@ -1373,6 +1373,10 @@ static int exec_parameters_serialize(const ExecParameters *p, const ExecContext if (r < 0) return r; + r = serialize_fd(f, fds, "exec-parameters-handoff-timestamp-fd", p->handoff_timestamp_fd); + if (r < 0) + return r; + if (c && exec_context_restrict_filesystems_set(c)) { r = serialize_fd(f, fds, "exec-parameters-bpf-outer-map-fd", p->bpf_restrict_fs_map_fd); if (r < 0) @@ -1620,6 +1624,14 @@ static int exec_parameters_deserialize(ExecParameters *p, FILE *f, FDSet *fds) { continue; p->exec_fd = fd; + } else if ((val = startswith(l, "exec-parameters-handoff-timestamp-fd="))) { + int fd; + + fd = deserialize_fd(fds, val); + if (fd < 0) + continue; + + close_and_replace(p->handoff_timestamp_fd, fd); } else if ((val = startswith(l, "exec-parameters-bpf-outer-map-fd="))) { int fd; diff --git a/src/core/execute.c b/src/core/execute.c index 26a4bf1b6f5..05a7f907a9b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -1842,6 +1842,19 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int (void) utmp_put_dead_process(context->utmp_id, pid, code, status); } +void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts) { + assert(s); + assert(ucred); + assert(ts); + + if (ucred->pid != s->pid) + *s = (ExecStatus) { + .pid = ucred->pid, + }; + + s->handoff_timestamp = *ts; +} + void exec_status_reset(ExecStatus *s) { assert(s); @@ -1864,21 +1877,42 @@ void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { if (dual_timestamp_is_set(&s->start_timestamp)) fprintf(f, "%sStart Timestamp: %s\n", - prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime)); + prefix, FORMAT_TIMESTAMP_STYLE(s->start_timestamp.realtime, TIMESTAMP_US)); - if (dual_timestamp_is_set(&s->handover_timestamp)) + if (dual_timestamp_is_set(&s->handoff_timestamp) && dual_timestamp_is_set(&s->start_timestamp) && + s->handoff_timestamp.monotonic > s->start_timestamp.monotonic) fprintf(f, - "%sHandover Timestamp: %s\n", - prefix, FORMAT_TIMESTAMP(s->handover_timestamp.realtime)); + "%sHandoff Timestamp: %s since start\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->handoff_timestamp.monotonic, s->start_timestamp.monotonic), 1)); + else + fprintf(f, + "%sHandoff Timestamp: %s\n", + prefix, FORMAT_TIMESTAMP_STYLE(s->handoff_timestamp.realtime, TIMESTAMP_US)); + + if (dual_timestamp_is_set(&s->exit_timestamp)) { + + if (dual_timestamp_is_set(&s->handoff_timestamp) && s->exit_timestamp.monotonic > s->handoff_timestamp.monotonic) + fprintf(f, + "%sExit Timestamp: %s since handoff\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->handoff_timestamp.monotonic), 1)); + else if (dual_timestamp_is_set(&s->start_timestamp) && s->exit_timestamp.monotonic > s->start_timestamp.monotonic) + fprintf(f, + "%sExit Timestamp: %s since start\n", + prefix, + FORMAT_TIMESPAN(usec_sub_unsigned(s->exit_timestamp.monotonic, s->start_timestamp.monotonic), 1)); + else + fprintf(f, + "%sExit Timestamp: %s\n", + prefix, FORMAT_TIMESTAMP_STYLE(s->exit_timestamp.realtime, TIMESTAMP_US)); - if (dual_timestamp_is_set(&s->exit_timestamp)) fprintf(f, - "%sExit Timestamp: %s\n" "%sExit Code: %s\n" "%sExit Status: %i\n", - prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime), prefix, sigchld_code_to_string(s->code), prefix, s->status); + } } void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) { diff --git a/src/core/execute.h b/src/core/execute.h index f6595cf93e1..6a313640de2 100644 --- a/src/core/execute.h +++ b/src/core/execute.h @@ -91,7 +91,7 @@ typedef enum ExecKeyringMode { struct ExecStatus { dual_timestamp start_timestamp; dual_timestamp exit_timestamp; - dual_timestamp handover_timestamp; + dual_timestamp handoff_timestamp; pid_t pid; int code; /* as in siginfo_t::si_code */ int status; /* as in siginfo_t::si_status */ @@ -444,9 +444,7 @@ struct ExecParameters { int stdout_fd; int stderr_fd; - /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. It - * will also be used to send a timestamp taken as the very last operation before execve, for - * tracking purposes. */ + /* An fd that is closed by the execve(), and thus will result in EOF when the execve() is done. */ int exec_fd; char *notify_socket; @@ -457,6 +455,7 @@ struct ExecParameters { char **files_env; int user_lookup_fd; + int handoff_timestamp_fd; int bpf_restrict_fs_map_fd; @@ -475,6 +474,7 @@ struct ExecParameters { .exec_fd = -EBADF, \ .bpf_restrict_fs_map_fd = -EBADF, \ .user_lookup_fd = -EBADF, \ + .handoff_timestamp_fd = -EBADF, \ } #include "unit.h" @@ -545,6 +545,7 @@ char** exec_context_get_restrict_filesystems(const ExecContext *c); void exec_status_start(ExecStatus *s, pid_t pid); void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status); +void exec_status_handoff(ExecStatus *s, const struct ucred *ucred, const dual_timestamp *ts); void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix); void exec_status_reset(ExecStatus *s); diff --git a/src/core/manager-serialize.c b/src/core/manager-serialize.c index 0b3def3dabe..39a8a0533f0 100644 --- a/src/core/manager-serialize.c +++ b/src/core/manager-serialize.c @@ -148,6 +148,12 @@ int manager_serialize( return r; } + if (m->handoff_timestamp_fds[0] >= 0) { + r = serialize_fd_many(f, fds, "handoff-timestamp-fds", m->handoff_timestamp_fds, 2); + if (r < 0) + return r; + } + (void) serialize_ratelimit(f, "dump-ratelimit", &m->dump_ratelimit); (void) serialize_ratelimit(f, "reload-reexec-ratelimit", &m->reload_reexec_ratelimit); diff --git a/src/core/manager.c b/src/core/manager.c index 11a81139868..04103e0fe93 100644 --- a/src/core/manager.c +++ b/src/core/manager.c @@ -56,6 +56,7 @@ #include "inotify-util.h" #include "install.h" #include "io-util.h" +#include "iovec-util.h" #include "label-util.h" #include "load-fragment.h" #include "locale-setup.h" @@ -124,6 +125,7 @@ static int manager_dispatch_signal_fd(sd_event_source *source, int fd, uint32_t static int manager_dispatch_time_change_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_idle_pipe_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); +static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata); static int manager_dispatch_jobs_in_progress(sd_event_source *source, usec_t usec, void *userdata); static int manager_dispatch_run_queue(sd_event_source *source, void *userdata); static int manager_dispatch_sigchld(sd_event_source *source, void *userdata); @@ -904,6 +906,7 @@ int manager_new(RuntimeScope runtime_scope, ManagerTestRunFlags test_run_flags, .cgroups_agent_fd = -EBADF, .signal_fd = -EBADF, .user_lookup_fds = EBADF_PAIR, + .handoff_timestamp_fds = EBADF_PAIR, .private_listen_fd = -EBADF, .dev_autofs_fd = -EBADF, .cgroup_inotify_fd = -EBADF, @@ -1253,6 +1256,49 @@ static int manager_setup_user_lookup_fd(Manager *m) { return 0; } +static int manager_setup_handoff_timestamp_fd(Manager *m) { + int r; + + assert(m); + + /* Set up the socket pair used for for passing timestamps back when the executor processes we fork + * off invokes execve(), i.e. when we hand off control to our payload processes. */ + + if (m->handoff_timestamp_fds[0] < 0) { + m->handoff_timestamp_event_source = sd_event_source_disable_unref(m->handoff_timestamp_event_source); + safe_close_pair(m->handoff_timestamp_fds); + + if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, m->handoff_timestamp_fds) < 0) + return log_error_errno(errno, "Failed to allocate handoff timestamp socket: %m"); + + /* Make sure children never have to block */ + (void) fd_increase_rxbuf(m->handoff_timestamp_fds[0], NOTIFY_RCVBUF_SIZE); + + r = setsockopt_int(m->handoff_timestamp_fds[0], SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_error_errno(r, "SO_PASSCRED failed: %m"); + + /* Mark the receiving socket as O_NONBLOCK (but leave sending side as-is) */ + r = fd_nonblock(m->handoff_timestamp_fds[0], true); + if (r < 0) + return log_error_errno(r, "Failed to make handoff timestamp socket O_NONBLOCK: %m"); + } + + if (!m->handoff_timestamp_event_source) { + r = sd_event_add_io(m->event, &m->handoff_timestamp_event_source, m->handoff_timestamp_fds[0], EPOLLIN, manager_dispatch_handoff_timestamp_fd, m); + if (r < 0) + return log_error_errno(errno, "Failed to allocate handoff timestamp event source: %m"); + + r = sd_event_source_set_priority(m->handoff_timestamp_event_source, EVENT_PRIORITY_HANDOFF_TIMESTAMP); + if (r < 0) + return log_error_errno(errno, "Failed to set priority of handoff timestamp event source: %m"); + + (void) sd_event_source_set_description(m->handoff_timestamp_event_source, "handoff-timestamp"); + } + + return 0; +} + static unsigned manager_dispatch_cleanup_queue(Manager *m) { Unit *u; unsigned n = 0; @@ -1667,12 +1713,14 @@ Manager* manager_free(Manager *m) { sd_event_source_unref(m->jobs_in_progress_event_source); sd_event_source_unref(m->run_queue_event_source); sd_event_source_unref(m->user_lookup_event_source); + sd_event_source_unref(m->handoff_timestamp_event_source); sd_event_source_unref(m->memory_pressure_event_source); safe_close(m->signal_fd); safe_close(m->notify_fd); safe_close(m->cgroups_agent_fd); safe_close_pair(m->user_lookup_fds); + safe_close_pair(m->handoff_timestamp_fds); manager_close_ask_password(m); @@ -2013,6 +2061,11 @@ int manager_startup(Manager *m, FILE *serialization, FDSet *fds, const char *roo /* This shouldn't fail, except if things are really broken. */ return r; + r = manager_setup_handoff_timestamp_fd(m); + if (r < 0) + /* This shouldn't fail, except if things are really broken. */ + return r; + /* Connect to the bus if we are good for it */ manager_setup_bus(m); @@ -3655,6 +3708,7 @@ int manager_reload(Manager *m) { (void) manager_setup_notify(m); (void) manager_setup_cgroups_agent(m); (void) manager_setup_user_lookup_fd(m); + (void) manager_setup_handoff_timestamp_fd(m); /* Third, fire things up! */ manager_coldplug(m); @@ -4776,13 +4830,12 @@ static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint char unit_name[UNIT_NAME_MAX+1]; } _packed_ buffer; - Manager *m = userdata; + Manager *m = ASSERT_PTR(userdata); ssize_t l; size_t n; Unit *u; - assert_se(source); - assert_se(m); + assert(source); /* Invoked whenever a child process succeeded resolving its user/group to use and sent us the * resulting UID/GID in a datagram. We parse the datagram here and pass it off to the unit, so that @@ -4831,6 +4884,73 @@ static int manager_dispatch_user_lookup_fd(sd_event_source *source, int fd, uint return 0; } +static int manager_dispatch_handoff_timestamp_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) { + Manager *m = ASSERT_PTR(userdata); + usec_t ts[2] = {}; + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control; + struct msghdr msghdr = { + .msg_iov = &IOVEC_MAKE(ts, sizeof(ts)), + .msg_iovlen = 1, + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + ssize_t n; + + assert(source); + + n = recvmsg_safe(m->handoff_timestamp_fds[0], &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC|MSG_TRUNC); + if (ERRNO_IS_NEG_TRANSIENT(n)) + return 0; /* Spurious wakeup, try again */ + if (n == -EXFULL) { + log_warning("Got message with truncated control, ignoring."); + return 0; + } + if (n < 0) + return log_error_errno(n, "Failed to receive handoff timestamp message: %m"); + + if (msghdr.msg_flags & MSG_TRUNC) { + log_warning("Got truncated handoff timestamp message, ignoring."); + return 0; + } + if (n != sizeof(ts)) { + log_warning("Got handoff timestamp message of unexpected size %zi (expected %zu), ignoring.", n, sizeof(ts)); + return 0; + } + + struct ucred *ucred = CMSG_FIND_DATA(&msghdr, SOL_SOCKET, SCM_CREDENTIALS, struct ucred); + if (!ucred || !pid_is_valid(ucred->pid)) { + log_warning("Received notify message without valid credentials. Ignoring."); + return 0; + } + + log_debug("Got handoff timestamp event for PID " PID_FMT ".", ucred->pid); + + _cleanup_free_ Unit **units = NULL; + int n_units = manager_get_units_for_pidref(m, &PIDREF_MAKE_FROM_PID(ucred->pid), &units); + if (n_units < 0) { + log_warning_errno(n_units, "Unable to determine units for PID " PID_FMT ", ignoring: %m", ucred->pid); + return 0; + } + if (n_units == 0) { + log_debug("Got handoff timestamp for process " PID_FMT " we are not interested in, ignoring.", ucred->pid); + return 0; + } + + dual_timestamp dt = { + .realtime = ts[0], + .monotonic = ts[1], + }; + + FOREACH_ARRAY(u, units, n_units) { + if (!UNIT_VTABLE(*u)->notify_handoff_timestamp) + continue; + + UNIT_VTABLE(*u)->notify_handoff_timestamp(*u, ucred, &dt); + } + + return 0; +} + void manager_ref_console(Manager *m) { assert(m); diff --git a/src/core/manager.h b/src/core/manager.h index f0f814f940c..0641b2726f0 100644 --- a/src/core/manager.h +++ b/src/core/manager.h @@ -286,6 +286,9 @@ struct Manager { int user_lookup_fds[2]; sd_event_source *user_lookup_event_source; + int handoff_timestamp_fds[2]; + sd_event_source *handoff_timestamp_event_source; + RuntimeScope runtime_scope; LookupPaths lookup_paths; @@ -656,22 +659,23 @@ void unit_defaults_done(UnitDefaults *defaults); enum { /* most important … */ - EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-10, - EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-9, - EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-9, - EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-8, /* cgroupv1 */ - EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-8, /* cgroupv2 */ - EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-7, - EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6, - EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5, - EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4, - EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3, - EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2, - EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1, - EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1, - EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL, - EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE, - EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1, - EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2, + EVENT_PRIORITY_USER_LOOKUP = SD_EVENT_PRIORITY_NORMAL-11, + EVENT_PRIORITY_MOUNT_TABLE = SD_EVENT_PRIORITY_NORMAL-10, + EVENT_PRIORITY_SWAP_TABLE = SD_EVENT_PRIORITY_NORMAL-10, + EVENT_PRIORITY_CGROUP_AGENT = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv1 */ + EVENT_PRIORITY_CGROUP_INOTIFY = SD_EVENT_PRIORITY_NORMAL-9, /* cgroupv2 */ + EVENT_PRIORITY_CGROUP_OOM = SD_EVENT_PRIORITY_NORMAL-8, + EVENT_PRIORITY_HANDOFF_TIMESTAMP = SD_EVENT_PRIORITY_NORMAL-7, + EVENT_PRIORITY_EXEC_FD = SD_EVENT_PRIORITY_NORMAL-6, + EVENT_PRIORITY_NOTIFY = SD_EVENT_PRIORITY_NORMAL-5, + EVENT_PRIORITY_SIGCHLD = SD_EVENT_PRIORITY_NORMAL-4, + EVENT_PRIORITY_SIGNALS = SD_EVENT_PRIORITY_NORMAL-3, + EVENT_PRIORITY_CGROUP_EMPTY = SD_EVENT_PRIORITY_NORMAL-2, + EVENT_PRIORITY_TIME_CHANGE = SD_EVENT_PRIORITY_NORMAL-1, + EVENT_PRIORITY_TIME_ZONE = SD_EVENT_PRIORITY_NORMAL-1, + EVENT_PRIORITY_IPC = SD_EVENT_PRIORITY_NORMAL, + EVENT_PRIORITY_REWATCH_PIDS = SD_EVENT_PRIORITY_IDLE, + EVENT_PRIORITY_SERVICE_WATCHDOG = SD_EVENT_PRIORITY_IDLE+1, + EVENT_PRIORITY_RUN_QUEUE = SD_EVENT_PRIORITY_IDLE+2, /* … to least important */ }; diff --git a/src/core/mount.c b/src/core/mount.c index d70a60b3774..3bf7e89e38f 100644 --- a/src/core/mount.c +++ b/src/core/mount.c @@ -1927,6 +1927,22 @@ static void mount_shutdown(Manager *m) { m->mount_monitor = NULL; } +static void mount_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Mount *m = ASSERT_PTR(MOUNT(u)); + + assert(ucred); + assert(ts); + + if (m->control_pid.pid == ucred->pid && m->control_command) { + exec_status_handoff(&m->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int mount_get_timeout(Unit *u, usec_t *timeout) { Mount *m = ASSERT_PTR(MOUNT(u)); usec_t t; @@ -2453,6 +2469,8 @@ const UnitVTable mount_vtable = { .reset_failed = mount_reset_failed, + .notify_handoff_timestamp = mount_handoff_timestamp, + .control_pid = mount_control_pid, .bus_set_property = bus_mount_set_property, diff --git a/src/core/service.c b/src/core/service.c index b2dc42f4401..b3fe2c0b1f8 100644 --- a/src/core/service.c +++ b/src/core/service.c @@ -1676,8 +1676,7 @@ static int service_spawn_internal( log_unit_debug(UNIT(s), "Passing %zu fds to service", exec_params.n_socket_fds + exec_params.n_storage_fds); } - if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && - IN_SET(s->type, SERVICE_NOTIFY, SERVICE_NOTIFY_RELOAD, SERVICE_EXEC, SERVICE_DBUS)) { + if (!FLAGS_SET(exec_params.flags, EXEC_IS_CONTROL) && s->type == SERVICE_EXEC) { r = service_allocate_exec_fd(s, &exec_fd_source, &exec_params.exec_fd); if (r < 0) return r; @@ -3010,7 +3009,7 @@ static int service_serialize(Unit *u, FILE *f, FDSet *fds) { (void) serialize_item_format(f, "main-exec-status-pid", PID_FMT, s->main_exec_status.pid); (void) serialize_dual_timestamp(f, "main-exec-status-start", &s->main_exec_status.start_timestamp); (void) serialize_dual_timestamp(f, "main-exec-status-exit", &s->main_exec_status.exit_timestamp); - (void) serialize_dual_timestamp(f, "main-exec-status-handover", &s->main_exec_status.handover_timestamp); + (void) serialize_dual_timestamp(f, "main-exec-status-handoff", &s->main_exec_status.handoff_timestamp); if (dual_timestamp_is_set(&s->main_exec_status.exit_timestamp)) { (void) serialize_item_format(f, "main-exec-status-code", "%i", s->main_exec_status.code); @@ -3295,8 +3294,8 @@ static int service_deserialize_item(Unit *u, const char *key, const char *value, deserialize_dual_timestamp(value, &s->main_exec_status.start_timestamp); else if (streq(key, "main-exec-status-exit")) deserialize_dual_timestamp(value, &s->main_exec_status.exit_timestamp); - else if (streq(key, "main-exec-status-handover")) - deserialize_dual_timestamp(value, &s->main_exec_status.handover_timestamp); + else if (streq(key, "main-exec-status-handoff")) + deserialize_dual_timestamp(value, &s->main_exec_status.handoff_timestamp); else if (streq(key, "notify-access-override")) { NotifyAccess notify_access; @@ -3519,28 +3518,27 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev log_unit_debug(UNIT(s), "got exec-fd event"); /* If Type=exec is set, we'll consider a service started successfully the instant we invoked execve() - * successfully for it. We implement this through a pipe() towards the child, which the kernel automatically - * closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on the pipe in the - * parent. We need to be careful however, as there are other reasons that we might cause the child's side of - * the pipe to be closed (for example, a simple exit()). To deal with that we'll ignore EOFs on the pipe unless - * the child signalled us first that it is about to call the execve(). It does so by sending us a simple - * non-zero byte via the pipe. We also provide the child with a way to inform us in case execve() failed: if it - * sends a zero byte we'll ignore POLLHUP on the fd again. - * For exec, dbus, notify and notify-reload types we also get a timestamp from sd-executor, taken immediately - * before the target executable is execve'd, so that we can accurately track when control is handed over to the - * payload. */ + * successfully for it. We implement this through a pipe() towards the child, which the kernel + * automatically closes for us due to O_CLOEXEC on execve() in the child, which then triggers EOF on + * the pipe in the parent. We need to be careful however, as there are other reasons that we might + * cause the child's side of the pipe to be closed (for example, a simple exit()). To deal with that + * we'll ignore EOFs on the pipe unless the child signalled us first that it is about to call the + * execve(). It does so by sending us a simple non-zero byte via the pipe. We also provide the child + * with a way to inform us in case execve() failed: if it sends a zero byte we'll ignore POLLHUP on + * the fd again. */ for (;;) { - uint64_t x = 0; + uint8_t x; ssize_t n; - n = loop_read(fd, &x, sizeof(x), /* do_poll= */ false); - if (n == -EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ - return 0; - if (n < 0) - return log_unit_error_errno(UNIT(s), n, "Failed to read from exec_fd: %m"); - if (n == 0) { - /* EOF → the event we are waiting for in case of Type=exec */ + n = read(fd, &x, sizeof(x)); + if (n < 0) { + if (errno == EAGAIN) /* O_NONBLOCK in effect → everything queued has now been processed. */ + return 0; + + return log_unit_error_errno(UNIT(s), errno, "Failed to read from exec_fd: %m"); + } + if (n == 0) { /* EOF → the event we are waiting for in case of Type=exec */ s->exec_fd_event_source = sd_event_source_disable_unref(s->exec_fd_event_source); if (s->exec_fd_hot) { /* Did the child tell us to expect EOF now? */ @@ -3556,36 +3554,11 @@ static int service_dispatch_exec_io(sd_event_source *source, int fd, uint32_t ev return 0; } - if (n == 1) { - /* Backward compatibility block: a single byte was read, which means somehow an old - * executor before this logic was introduced sent the notification, so there is no - * timestamp (reset it). */ - s->exec_fd_hot = x; - if (s->state == SERVICE_START) - s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; + /* A byte was read → this turns on/off the exec fd logic */ + assert(n == sizeof(x)); - continue; - } - - if (x == 0) { - /* If we get x=0 then the execve actually failed, which means the exec fd logic is - * now off. Also reset the exec timestamp, given it has no meaning anymore. */ - - s->exec_fd_hot = false; - if (s->state == SERVICE_START) - s->main_exec_status.handover_timestamp = DUAL_TIMESTAMP_NULL; - } else { - /* A non-zero value was read, which means the exec fd logic is now enabled. Record - * the received timestamp so that users can track when control is handed over to the - * service payload. */ - - s->exec_fd_hot = true; - if (s->state == SERVICE_START) { - assert_cc(sizeof(uint64_t) == sizeof(usec_t)); - dual_timestamp_from_monotonic(&s->main_exec_status.handover_timestamp, (usec_t)x); - } - } + s->exec_fd_hot = x; } } @@ -4605,6 +4578,29 @@ static void service_notify_message( unit_add_to_dbus_queue(u); } +static void service_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Service *s = ASSERT_PTR(SERVICE(u)); + + assert(ucred); + assert(ts); + + if (s->main_pid.pid == ucred->pid) { + if (s->main_command) + exec_status_handoff(&s->main_command->exec_status, ucred, ts); + + exec_status_handoff(&s->main_exec_status, ucred, ts); + } else if (s->control_pid.pid == ucred->pid && s->control_command) + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + else + return; + + unit_add_to_dbus_queue(u); +} + static int service_get_timeout(Unit *u, usec_t *timeout) { Service *s = ASSERT_PTR(SERVICE(u)); uint64_t t; @@ -5190,6 +5186,7 @@ const UnitVTable service_vtable = { .notify_cgroup_empty = service_notify_cgroup_empty_event, .notify_cgroup_oom = service_notify_cgroup_oom_event, .notify_message = service_notify_message, + .notify_handoff_timestamp = service_handoff_timestamp, .main_pid = service_main_pid, .control_pid = service_control_pid, diff --git a/src/core/socket.c b/src/core/socket.c index 4c8428d1a9a..41147d4bf74 100644 --- a/src/core/socket.c +++ b/src/core/socket.c @@ -3378,6 +3378,22 @@ static void socket_trigger_notify(Unit *u, Unit *other) { socket_set_state(s, SOCKET_RUNNING); } +static void socket_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Socket *s = ASSERT_PTR(SOCKET(u)); + + assert(ucred); + assert(ts); + + if (s->control_pid.pid == ucred->pid && s->control_command) { + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int socket_get_timeout(Unit *u, usec_t *timeout) { Socket *s = ASSERT_PTR(SOCKET(u)); usec_t t; @@ -3580,6 +3596,8 @@ const UnitVTable socket_vtable = { .reset_failed = socket_reset_failed, + .notify_handoff_timestamp = socket_handoff_timestamp, + .control_pid = socket_control_pid, .bus_set_property = bus_socket_set_property, diff --git a/src/core/swap.c b/src/core/swap.c index 3d202607287..222241c9da2 100644 --- a/src/core/swap.c +++ b/src/core/swap.c @@ -1422,6 +1422,22 @@ static void swap_reset_failed(Unit *u) { s->clean_result = SWAP_SUCCESS; } +static void swap_handoff_timestamp( + Unit *u, + const struct ucred *ucred, + const dual_timestamp *ts) { + + Swap *s = ASSERT_PTR(SWAP(u)); + + assert(ucred); + assert(ts); + + if (s->control_pid.pid == ucred->pid && s->control_command) { + exec_status_handoff(&s->control_command->exec_status, ucred, ts); + unit_add_to_dbus_queue(u); + } +} + static int swap_get_timeout(Unit *u, usec_t *timeout) { Swap *s = ASSERT_PTR(SWAP(u)); usec_t t; @@ -1608,6 +1624,8 @@ const UnitVTable swap_vtable = { .reset_failed = swap_reset_failed, + .notify_handoff_timestamp = swap_handoff_timestamp, + .control_pid = swap_control_pid, .bus_set_property = bus_swap_set_property, diff --git a/src/core/unit.c b/src/core/unit.c index 15285a3ac4c..ca5623b5c2f 100644 --- a/src/core/unit.c +++ b/src/core/unit.c @@ -5367,6 +5367,7 @@ int unit_set_exec_params(Unit *u, ExecParameters *p) { } p->user_lookup_fd = u->manager->user_lookup_fds[1]; + p->handoff_timestamp_fd = u->manager->handoff_timestamp_fds[1]; p->cgroup_id = crt ? crt->cgroup_id : 0; p->invocation_id = u->invocation_id; diff --git a/src/core/unit.h b/src/core/unit.h index 74520a33cb3..b135fecc514 100644 --- a/src/core/unit.h +++ b/src/core/unit.h @@ -632,6 +632,9 @@ typedef struct UnitVTable { /* Called whenever a process of this unit sends us a message */ void (*notify_message)(Unit *u, const struct ucred *ucred, char * const *tags, FDSet *fds); + /* Called whenever we learn a handoff timestamp */ + void (*notify_handoff_timestamp)(Unit *u, const struct ucred *ucred, const dual_timestamp *ts); + /* Called whenever a name this Unit registered for comes or goes away. */ void (*bus_name_owner_change)(Unit *u, const char *new_owner); diff --git a/test/units/testsuite-07.exec-timestamps.sh b/test/units/testsuite-07.exec-timestamps.sh index 799bc89e06e..0211166ae3c 100755 --- a/test/units/testsuite-07.exec-timestamps.sh +++ b/test/units/testsuite-07.exec-timestamps.sh @@ -8,10 +8,10 @@ set -o pipefail systemd-run --service-type notify --property NotifyAccess=all --unit notify.service --wait sh -c 'systemd-notify --ready; exit 1' || : start=$(systemctl show --property=ExecMainStartTimestampMonotonic --value notify.service) -handover=$(systemctl show --property=ExecMainHandoverTimestampMonotonic --value notify.service) +handoff=$(systemctl show --property=ExecMainHandoffTimestampMonotonic --value notify.service) active=$(systemctl show --property=ActiveEnterTimestampMonotonic --value notify.service) exit=$(systemctl show --property=ExecMainExitTimestampMonotonic --value notify.service) -[[ $start -le $handover ]] -[[ $handover -le $active ]] +[[ $start -le $handoff ]] +[[ $handoff -le $active ]] [[ $active -le $exit ]]