/* * Vhost User Bridge * * Copyright (c) 2015 Red Hat, Inc. * * Authors: * Victor Kaplansky * * This work is licensed under the terms of the GNU GPL, version 2 or * later. See the COPYING file in the top-level directory. */ /* * TODO: * - main should get parameters from the command line. * - implement all request handlers. Still not implemented: * vubr_get_queue_num_exec() * vubr_send_rarp_exec() * - test for broken requests and virtqueue. * - implement features defined by Virtio 1.0 spec. * - support mergeable buffers and indirect descriptors. * - implement clean shutdown. * - implement non-blocking writes to UDP backend. * - implement polling strategy. * - implement clean starting/stopping of vq processing * - implement clean starting/stopping of used and buffers * dirty page logging. */ #define _FILE_OFFSET_BITS 64 #include "qemu/osdep.h" #include "qemu/atomic.h" #include "qemu/ctype.h" #include "qemu/iov.h" #include "standard-headers/linux/virtio_net.h" #include "libvhost-user.h" #define VHOST_USER_BRIDGE_DEBUG 1 #define DPRINT(...) \ do { \ if (VHOST_USER_BRIDGE_DEBUG) { \ printf(__VA_ARGS__); \ } \ } while (0) enum { VHOST_USER_BRIDGE_MAX_QUEUES = 8, }; typedef void (*CallbackFunc)(int sock, void *ctx); typedef struct Event { void *ctx; CallbackFunc callback; } Event; typedef struct Dispatcher { int max_sock; fd_set fdset; Event events[FD_SETSIZE]; } Dispatcher; typedef struct VubrDev { VuDev vudev; Dispatcher dispatcher; int backend_udp_sock; struct sockaddr_in backend_udp_dest; int hdrlen; int sock; int ready; int quit; struct { int fd; void *addr; pthread_t thread; } notifier; } VubrDev; static void vubr_die(const char *s) { perror(s); exit(1); } static int dispatcher_init(Dispatcher *dispr) { FD_ZERO(&dispr->fdset); dispr->max_sock = -1; return 0; } static int dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) { if (sock >= FD_SETSIZE) { fprintf(stderr, "Error: Failed to add new event. sock %d should be less than %d\n", sock, FD_SETSIZE); return -1; } dispr->events[sock].ctx = ctx; dispr->events[sock].callback = cb; FD_SET(sock, &dispr->fdset); if (sock > dispr->max_sock) { dispr->max_sock = sock; } DPRINT("Added sock %d for watching. max_sock: %d\n", sock, dispr->max_sock); return 0; } static int dispatcher_remove(Dispatcher *dispr, int sock) { if (sock >= FD_SETSIZE) { fprintf(stderr, "Error: Failed to remove event. sock %d should be less than %d\n", sock, FD_SETSIZE); return -1; } FD_CLR(sock, &dispr->fdset); DPRINT("Sock %d removed from dispatcher watch.\n", sock); return 0; } /* timeout in us */ static int dispatcher_wait(Dispatcher *dispr, uint32_t timeout) { struct timeval tv; tv.tv_sec = timeout / 1000000; tv.tv_usec = timeout % 1000000; fd_set fdset = dispr->fdset; /* wait until some of sockets become readable. */ int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv); if (rc == -1) { vubr_die("select"); } /* Timeout */ if (rc == 0) { return 0; } /* Now call callback for every ready socket. */ int sock; for (sock = 0; sock < dispr->max_sock + 1; sock++) { /* The callback on a socket can remove other sockets from the * dispatcher, thus we have to check that the socket is * still not removed from dispatcher's list */ if (FD_ISSET(sock, &fdset) && FD_ISSET(sock, &dispr->fdset)) { Event *e = &dispr->events[sock]; e->callback(sock, e->ctx); } } return 0; } static void vubr_handle_tx(VuDev *dev, int qidx) { VuVirtq *vq = vu_get_queue(dev, qidx); VubrDev *vubr = container_of(dev, VubrDev, vudev); int hdrlen = vubr->hdrlen; VuVirtqElement *elem = NULL; assert(qidx % 2); for (;;) { ssize_t ret; unsigned int out_num; struct iovec sg[VIRTQUEUE_MAX_SIZE], *out_sg; elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); if (!elem) { break; } out_num = elem->out_num; out_sg = elem->out_sg; if (out_num < 1) { fprintf(stderr, "virtio-net header not in first element\n"); break; } if (VHOST_USER_BRIDGE_DEBUG) { iov_hexdump(out_sg, out_num, stderr, "TX:", 1024); } if (hdrlen) { unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), out_sg, out_num, hdrlen, -1); out_num = sg_num; out_sg = sg; } struct msghdr msg = { .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, .msg_namelen = sizeof(struct sockaddr_in), .msg_iov = out_sg, .msg_iovlen = out_num, }; do { ret = sendmsg(vubr->backend_udp_sock, &msg, 0); } while (ret == -1 && (errno == EAGAIN || errno == EINTR)); if (ret == -1) { vubr_die("sendmsg()"); } vu_queue_push(dev, vq, elem, 0); vu_queue_notify(dev, vq); free(elem); elem = NULL; } free(elem); } /* this function reverse the effect of iov_discard_front() it must be * called with 'front' being the original struct iovec and 'bytes' * being the number of bytes you shaved off */ static void iov_restore_front(struct iovec *front, struct iovec *iov, size_t bytes) { struct iovec *cur; for (cur = front; cur != iov; cur++) { assert(bytes >= cur->iov_len); bytes -= cur->iov_len; } cur->iov_base -= bytes; cur->iov_len += bytes; } static void iov_truncate(struct iovec *iov, unsigned iovc, size_t bytes) { unsigned i; for (i = 0; i < iovc; i++, iov++) { if (bytes < iov->iov_len) { iov->iov_len = bytes; return; } bytes -= iov->iov_len; } assert(!"couldn't truncate iov"); } static void vubr_backend_recv_cb(int sock, void *ctx) { VubrDev *vubr = (VubrDev *) ctx; VuDev *dev = &vubr->vudev; VuVirtq *vq = vu_get_queue(dev, 0); VuVirtqElement *elem = NULL; struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; struct virtio_net_hdr_mrg_rxbuf mhdr; unsigned mhdr_cnt = 0; int hdrlen = vubr->hdrlen; int i = 0; struct virtio_net_hdr hdr = { .flags = 0, .gso_type = VIRTIO_NET_HDR_GSO_NONE }; DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); DPRINT(" hdrlen = %d\n", hdrlen); if (!vu_queue_enabled(dev, vq) || !vu_queue_started(dev, vq) || !vu_queue_avail_bytes(dev, vq, hdrlen, 0)) { DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); return; } while (1) { struct iovec *sg; ssize_t ret, total = 0; unsigned int num; elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement)); if (!elem) { break; } if (elem->in_num < 1) { fprintf(stderr, "virtio-net contains no in buffers\n"); break; } sg = elem->in_sg; num = elem->in_num; if (i == 0) { if (hdrlen == 12) { mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), sg, elem->in_num, offsetof(typeof(mhdr), num_buffers), sizeof(mhdr.num_buffers)); } iov_from_buf(sg, elem->in_num, 0, &hdr, sizeof hdr); total += hdrlen; ret = iov_discard_front(&sg, &num, hdrlen); assert(ret == hdrlen); } struct msghdr msg = { .msg_name = (struct sockaddr *) &vubr->backend_udp_dest, .msg_namelen = sizeof(struct sockaddr_in), .msg_iov = sg, .msg_iovlen = num, .msg_flags = MSG_DONTWAIT, }; do { ret = recvmsg(vubr->backend_udp_sock, &msg, 0); } while (ret == -1 && (errno == EINTR)); if (i == 0) { iov_restore_front(elem->in_sg, sg, hdrlen); } if (ret == -1) { if (errno == EWOULDBLOCK) { vu_queue_rewind(dev, vq, 1); break; } vubr_die("recvmsg()"); } total += ret; iov_truncate(elem->in_sg, elem->in_num, total); vu_queue_fill(dev, vq, elem, total, i++); free(elem); elem = NULL; break; /* could loop if DONTWAIT worked? */ } if (mhdr_cnt) { mhdr.num_buffers = i; iov_from_buf(mhdr_sg, mhdr_cnt, 0, &mhdr.num_buffers, sizeof mhdr.num_buffers); } vu_queue_flush(dev, vq, i); vu_queue_notify(dev, vq); free(elem); } static void vubr_receive_cb(int sock, void *ctx) { VubrDev *vubr = (VubrDev *)ctx; if (!vu_dispatch(&vubr->vudev)) { fprintf(stderr, "Error while dispatching\n"); } } typedef struct WatchData { VuDev *dev; vu_watch_cb cb; void *data; } WatchData; static void watch_cb(int sock, void *ctx) { struct WatchData *wd = ctx; wd->cb(wd->dev, VU_WATCH_IN, wd->data); } static void vubr_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb, void *data) { VubrDev *vubr = container_of(dev, VubrDev, vudev); static WatchData watches[FD_SETSIZE]; struct WatchData *wd = &watches[fd]; wd->cb = cb; wd->data = data; wd->dev = dev; dispatcher_add(&vubr->dispatcher, fd, wd, watch_cb); } static void vubr_remove_watch(VuDev *dev, int fd) { VubrDev *vubr = container_of(dev, VubrDev, vudev); dispatcher_remove(&vubr->dispatcher, fd); } static int vubr_send_rarp_exec(VuDev *dev, VhostUserMsg *vmsg) { DPRINT("Function %s() not implemented yet.\n", __func__); return 0; } static int vubr_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply) { switch (vmsg->request) { case VHOST_USER_SEND_RARP: *do_reply = vubr_send_rarp_exec(dev, vmsg); return 1; default: /* let the library handle the rest */ return 0; } return 0; } static void vubr_set_features(VuDev *dev, uint64_t features) { VubrDev *vubr = container_of(dev, VubrDev, vudev); if ((features & (1ULL << VIRTIO_F_VERSION_1)) || (features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))) { vubr->hdrlen = 12; } else { vubr->hdrlen = 10; } } static uint64_t vubr_get_features(VuDev *dev) { return 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE | 1ULL << VIRTIO_NET_F_MRG_RXBUF | 1ULL << VIRTIO_F_VERSION_1; } static void vubr_queue_set_started(VuDev *dev, int qidx, bool started) { VubrDev *vubr = container_of(dev, VubrDev, vudev); VuVirtq *vq = vu_get_queue(dev, qidx); if (started && vubr->notifier.fd >= 0) { vu_set_queue_host_notifier(dev, vq, vubr->notifier.fd, qemu_real_host_page_size(), qidx * qemu_real_host_page_size()); } if (qidx % 2 == 1) { vu_set_queue_handler(dev, vq, started ? vubr_handle_tx : NULL); } } static void vubr_panic(VuDev *dev, const char *msg) { VubrDev *vubr = container_of(dev, VubrDev, vudev); fprintf(stderr, "PANIC: %s\n", msg); dispatcher_remove(&vubr->dispatcher, dev->sock); vubr->quit = 1; } static bool vubr_queue_is_processed_in_order(VuDev *dev, int qidx) { return true; } static const VuDevIface vuiface = { .get_features = vubr_get_features, .set_features = vubr_set_features, .process_msg = vubr_process_msg, .queue_set_started = vubr_queue_set_started, .queue_is_processed_in_order = vubr_queue_is_processed_in_order, }; static void vubr_accept_cb(int sock, void *ctx) { VubrDev *dev = (VubrDev *)ctx; int conn_fd; struct sockaddr_un un; socklen_t len = sizeof(un); conn_fd = accept(sock, (struct sockaddr *) &un, &len); if (conn_fd == -1) { vubr_die("accept()"); } DPRINT("Got connection from remote peer on sock %d\n", conn_fd); if (!vu_init(&dev->vudev, VHOST_USER_BRIDGE_MAX_QUEUES, conn_fd, vubr_panic, NULL, vubr_set_watch, vubr_remove_watch, &vuiface)) { fprintf(stderr, "Failed to initialize libvhost-user\n"); exit(1); } dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); dispatcher_remove(&dev->dispatcher, sock); } static VubrDev * vubr_new(const char *path, bool client) { VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev)); struct sockaddr_un un; CallbackFunc cb; size_t len; if (strlen(path) >= sizeof(un.sun_path)) { fprintf(stderr, "unix domain socket path '%s' is too long\n", path); exit(1); } /* Get a UNIX socket. */ dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); if (dev->sock == -1) { vubr_die("socket"); } dev->notifier.fd = -1; un.sun_family = AF_UNIX; strcpy(un.sun_path, path); len = sizeof(un.sun_family) + strlen(path); if (!client) { unlink(path); if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) { vubr_die("bind"); } if (listen(dev->sock, 1) == -1) { vubr_die("listen"); } cb = vubr_accept_cb; DPRINT("Waiting for connections on UNIX socket %s ...\n", path); } else { if (connect(dev->sock, (struct sockaddr *)&un, len) == -1) { vubr_die("connect"); } if (!vu_init(&dev->vudev, VHOST_USER_BRIDGE_MAX_QUEUES, dev->sock, vubr_panic, NULL, vubr_set_watch, vubr_remove_watch, &vuiface)) { fprintf(stderr, "Failed to initialize libvhost-user\n"); exit(1); } cb = vubr_receive_cb; } dispatcher_init(&dev->dispatcher); dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, cb); return dev; } static void *notifier_thread(void *arg) { VuDev *dev = (VuDev *)arg; VubrDev *vubr = container_of(dev, VubrDev, vudev); int pagesize = qemu_real_host_page_size(); int qidx; while (true) { for (qidx = 0; qidx < VHOST_USER_BRIDGE_MAX_QUEUES; qidx++) { uint16_t *n = vubr->notifier.addr + pagesize * qidx; if (*n == qidx) { *n = 0xffff; /* We won't miss notifications if we reset * the memory first. */ smp_mb(); DPRINT("Got a notification for queue%d via host notifier.\n", qidx); if (qidx % 2 == 1) { vubr_handle_tx(dev, qidx); } } usleep(1000); } } return NULL; } static void vubr_host_notifier_setup(VubrDev *dev) { pthread_t thread; size_t length; void *addr; int fd; length = qemu_real_host_page_size() * VHOST_USER_BRIDGE_MAX_QUEUES; fd = g_file_open_tmp("vubr-XXXXXX", NULL, NULL); if (fd < 0) { vubr_die("mkstemp()"); } if (posix_fallocate(fd, 0, length) != 0) { vubr_die("posix_fallocate()"); } addr = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) { vubr_die("mmap()"); } memset(addr, 0xff, length); if (pthread_create(&thread, NULL, notifier_thread, &dev->vudev) != 0) { vubr_die("pthread_create()"); } dev->notifier.fd = fd; dev->notifier.addr = addr; dev->notifier.thread = thread; } static void vubr_set_host(struct sockaddr_in *saddr, const char *host) { if (qemu_isdigit(host[0])) { if (!inet_aton(host, &saddr->sin_addr)) { fprintf(stderr, "inet_aton() failed.\n"); exit(1); } } else { struct hostent *he = gethostbyname(host); if (!he) { fprintf(stderr, "gethostbyname() failed.\n"); exit(1); } saddr->sin_addr = *(struct in_addr *)he->h_addr; } } static void vubr_backend_udp_setup(VubrDev *dev, const char *local_host, const char *local_port, const char *remote_host, const char *remote_port) { int sock; const char *r; int lport, rport; lport = strtol(local_port, (char **)&r, 0); if (r == local_port) { fprintf(stderr, "lport parsing failed.\n"); exit(1); } rport = strtol(remote_port, (char **)&r, 0); if (r == remote_port) { fprintf(stderr, "rport parsing failed.\n"); exit(1); } struct sockaddr_in si_local = { .sin_family = AF_INET, .sin_port = htons(lport), }; vubr_set_host(&si_local, local_host); /* setup destination for sends */ dev->backend_udp_dest = (struct sockaddr_in) { .sin_family = AF_INET, .sin_port = htons(rport), }; vubr_set_host(&dev->backend_udp_dest, remote_host); sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); if (sock == -1) { vubr_die("socket"); } if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) { vubr_die("bind"); } dev->backend_udp_sock = sock; dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb); DPRINT("Waiting for data from udp backend on %s:%d...\n", local_host, lport); } static void vubr_run(VubrDev *dev) { while (!dev->quit) { /* timeout 200ms */ dispatcher_wait(&dev->dispatcher, 200000); /* Here one can try polling strategy. */ } } static int vubr_parse_host_port(const char **host, const char **port, const char *buf) { char *p = strchr(buf, ':'); if (!p) { return -1; } *p = '\0'; *host = strdup(buf); *port = strdup(p + 1); return 0; } #define DEFAULT_UD_SOCKET "/tmp/vubr.sock" #define DEFAULT_LHOST "127.0.0.1" #define DEFAULT_LPORT "4444" #define DEFAULT_RHOST "127.0.0.1" #define DEFAULT_RPORT "5555" static const char *ud_socket_path = DEFAULT_UD_SOCKET; static const char *lhost = DEFAULT_LHOST; static const char *lport = DEFAULT_LPORT; static const char *rhost = DEFAULT_RHOST; static const char *rport = DEFAULT_RPORT; int main(int argc, char *argv[]) { VubrDev *dev; int opt; bool client = false; bool host_notifier = false; while ((opt = getopt(argc, argv, "l:r:u:cH")) != -1) { switch (opt) { case 'l': if (vubr_parse_host_port(&lhost, &lport, optarg) < 0) { goto out; } break; case 'r': if (vubr_parse_host_port(&rhost, &rport, optarg) < 0) { goto out; } break; case 'u': ud_socket_path = strdup(optarg); break; case 'c': client = true; break; case 'H': host_notifier = true; break; default: goto out; } } DPRINT("ud socket: %s (%s)\n", ud_socket_path, client ? "client" : "server"); DPRINT("local: %s:%s\n", lhost, lport); DPRINT("remote: %s:%s\n", rhost, rport); dev = vubr_new(ud_socket_path, client); if (!dev) { return 1; } if (host_notifier) { vubr_host_notifier_setup(dev); } vubr_backend_udp_setup(dev, lhost, lport, rhost, rport); vubr_run(dev); vu_deinit(&dev->vudev); return 0; out: fprintf(stderr, "Usage: %s ", argv[0]); fprintf(stderr, "[-c] [-H] [-u ud_socket_path] [-l lhost:lport] [-r rhost:rport]\n"); fprintf(stderr, "\t-u path to unix domain socket. default: %s\n", DEFAULT_UD_SOCKET); fprintf(stderr, "\t-l local host and port. default: %s:%s\n", DEFAULT_LHOST, DEFAULT_LPORT); fprintf(stderr, "\t-r remote host and port. default: %s:%s\n", DEFAULT_RHOST, DEFAULT_RPORT); fprintf(stderr, "\t-c client mode\n"); fprintf(stderr, "\t-H use host notifier\n"); return 1; }