sd_notify: support AF_VSOCK

Allow sending notifications via AF_VSOCK, so that VMs can communicate
to the hypervisor/VMM that they are finished booting.

Note that if the hypervisor does not support SOCK_DGRAM over AF_VSOCK
(ie: qemu at the time of writing), SOCK_SEQPACKET will be used instead.
This commit is contained in:
Luca Boccassi 2023-01-03 18:08:09 +01:00
parent 747b5d963e
commit 6c94cfcda5
2 changed files with 80 additions and 14 deletions

View file

@ -368,13 +368,26 @@
<xi:include href="libsystemd-pkgconfig.xml" xpointer="pkgconfig-text"/>
<para>These functions send a single datagram with the
state string as payload to the <constant>AF_UNIX</constant> socket
referenced in the <varname>$NOTIFY_SOCKET</varname> environment
variable. If the first character of
<varname>$NOTIFY_SOCKET</varname> is <literal>@</literal>, the
string is understood as Linux abstract namespace socket. The
datagram is accompanied by the process credentials of the sending
service, using SCM_CREDENTIALS.</para>
state string as payload to the socket referenced in the
<varname>$NOTIFY_SOCKET</varname> environment variable. If the
first character of <varname>$NOTIFY_SOCKET</varname> is
<literal>/</literal> or <literal>@</literal>, the string is understood
as an <constant>AF_UNIX</constant> or Linux abstract namespace socket
(respectively), and in both cases the datagram is accompanied by the
process credentials of the sending service, using SCM_CREDENTIALS. If
the string starts with <literal>vsock:</literal> then the string is
understood as an <constant>AF_VSOCK</constant> address, which is useful
for hypervisors/VMMs or other processes on the host to receive a
notification when a virtual machine has finished booting. Note that in
case the hypervisor does not support <constant>SOCK_DGRAM</constant>
over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant>
will be used instead. The address should be in the form:
<literal>vsock:CID:PORT</literal>. Note that unlike other uses of vsock,
the CID is mandatory and cannot be <literal>VMADDR_CID_ANY</literal>.
Note that PID1 will send the VSOCK packets from a privileged port
(i.e.: lower than 1024), as an attempt to address concerns that unprivileged
processes in the guest might try to send malicious notifications to the
host, driving it to make destructive decisions based on them.</para>
</refsect1>
<refsect1>

View file

@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) {
return 1;
}
static int vsock_bind_privileged_port(int fd) {
union sockaddr_union sa = {
.vm.svm_family = AF_VSOCK,
.vm.svm_cid = VMADDR_CID_ANY,
.vm.svm_port = 1023,
};
int r;
assert(fd >= 0);
do
r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm)));
while (r == -EADDRINUSE && --sa.vm.svm_port > 0);
return r;
}
_public_ int sd_pid_notify_with_fds(
pid_t pid,
int unset_environment,
@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds(
const int *fds,
unsigned n_fds) {
union sockaddr_union sockaddr;
SocketAddress address;
struct iovec iovec;
struct msghdr msghdr = {
.msg_iov = &iovec,
.msg_iovlen = 1,
.msg_name = &sockaddr,
.msg_name = &address.sockaddr,
};
_cleanup_close_ int fd = -EBADF;
struct cmsghdr *cmsg = NULL;
@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds(
if (!e)
return 0;
r = sockaddr_un_set_path(&sockaddr.un, e);
/* Allow AF_UNIX and AF_VSOCK, reject the rest. */
r = socket_address_parse_unix(&address, e);
if (r == -EPROTO)
r = socket_address_parse_vsock(&address, e);
if (r < 0)
goto finish;
msghdr.msg_namelen = r;
msghdr.msg_namelen = address.size;
fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
if (fd < 0) {
r = -errno;
/* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out,
* we always require a specific CID. */
if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) {
r = -EINVAL;
goto finish;
}
/* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns
* ENODEV. Fallback to SOCK_SEQPACKET in that case. */
fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0);
if (fd < 0) {
if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) {
r = -errno;
goto finish;
}
fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
if (fd < 0) {
r = -errno;
goto finish;
}
r = vsock_bind_privileged_port(fd);
if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
goto finish;
if (connect(fd, &address.sockaddr.sa, address.size) < 0) {
r = -errno;
goto finish;
}
msghdr.msg_name = NULL;
msghdr.msg_namelen = 0;
} else if (address.sockaddr.sa.sa_family == AF_VSOCK) {
r = vsock_bind_privileged_port(fd);
if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
goto finish;
}
(void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
iovec = IOVEC_MAKE_STRING(state);