vhost: Shadow virtqueue buffers forwarding

Initial version of the shadow virtqueue that actually forwards buffers.
There is no IOMMU support at the moment; that will be addressed in later
patches of this series. Since all vhost-vdpa devices use a forced IOMMU,
this means that SVQ is not usable on any device at this point of the
series.

For simplicity it only supports modern devices, which expect the vring
in little endian, with a split ring and no event idx or indirect
descriptors. Support for those will not be added in this series.

It reuses the VirtQueue code for the device part. The driver part is
based on Linux's virtio_ring driver, but with stripped functionality
and optimizations so it's easier to review.

However, forwarding buffers has some particular quirks. One of the most
unexpected is that a guest's buffer can span more than one descriptor in
SVQ. While this is handled gracefully by qemu's emulated virtio devices,
it may cause an unexpected SVQ queue-full condition. This patch also
solves it by checking for this condition at both the guest's kicks and
the device's calls. The code may become more elegant in the future if
the SVQ code runs in its own iocontext.

Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
This commit is contained in:
Eugenio Pérez 2022-03-14 18:34:47 +01:00 committed by Jason Wang
parent d96be4c894
commit 100890f7ca
3 changed files with 524 additions and 13 deletions

View file

@ -13,6 +13,8 @@
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qemu/main-loop.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "linux-headers/linux/vhost.h"
/**
@ -59,28 +61,307 @@ bool vhost_svq_valid_features(uint64_t features, Error **errp)
}
/**
* Forward guest notifications.
* Number of descriptors that the SVQ can make available from the guest.
*
* @n: guest kick event notifier, the one that guest set to notify svq.
* @svq: The svq
*/
static void vhost_handle_guest_kick(EventNotifier *n)
static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq)
{
VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick);
event_notifier_test_and_clear(n);
return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx);
}
/**
 * Write a run of scatter-gather entries into the SVQ descriptor table.
 *
 * @svq: Shadow VirtQueue
 * @iovec: Scatter-gather entries to expose to the device
 * @num: Number of entries in @iovec
 * @more_descs: True if the chain continues after this run, so the last
 *              descriptor written here must also carry VRING_DESC_F_NEXT
 * @write: True if the device is allowed to write to these buffers
 *
 * Descriptors are taken from the free list starting at svq->free_head and
 * following each descriptor's 'next' field, which is left untouched. On
 * return svq->free_head points past the last descriptor written.
 */
static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
                                    const struct iovec *iovec, size_t num,
                                    bool more_descs, bool write)
{
    uint16_t i = svq->free_head, last = svq->free_head;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    vring_desc_t *descs = svq->vring.desc;

    if (num == 0) {
        return;
    }

    for (size_t n = 0; n < num; n++) {
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
        } else {
            descs[i].flags = flags;
        }
        /* Go through uintptr_t so the pointer is never sign-extended */
        descs[i].addr = cpu_to_le64((hwaddr)(uintptr_t)iovec[n].iov_base);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);

        last = i;
        /* 'next' is stored little endian in the ring: convert to CPU order */
        i = le16_to_cpu(descs[i].next);
    }

    svq->free_head = le16_to_cpu(descs[last].next);
}
/**
 * Expose one guest element to the device through the split shadow vring.
 *
 * @svq: Shadow VirtQueue
 * @elem: Guest element to forward
 * @head: Output, descriptor index used as head of the chain
 *
 * Returns false if the element carries no descriptors at all.
 */
static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
                                VirtQueueElement *elem, unsigned *head)
{
    vring_avail_t *avail_ring = svq->vring.avail;
    const bool has_in = elem->in_num > 0;
    unsigned slot;

    *head = svq->free_head;

    /* An element without out or in descriptors cannot be forwarded */
    if (unlikely(elem->out_num == 0 && elem->in_num == 0)) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Guest provided element with no descriptors");
        return false;
    }

    /* Device-readable buffers first, then the device-writable ones */
    vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, has_in, false);
    vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);

    /* Store the chain head in the avail ring slot for this entry */
    slot = svq->shadow_avail_idx & (svq->vring.num - 1);
    avail_ring->ring[slot] = cpu_to_le16(*head);
    svq->shadow_avail_idx++;

    /* Descriptors and ring entry must be visible before the new index */
    smp_wmb();
    avail_ring->idx = cpu_to_le16(svq->shadow_avail_idx);

    return true;
}
/**
 * Make a guest element available to the device and remember it.
 *
 * @svq: Shadow VirtQueue
 * @elem: Guest element to forward
 *
 * On success the element is stored in svq->ring_id_maps, indexed by the head
 * descriptor of its chain. Returns false if it could not be added.
 */
static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
{
    unsigned head;

    if (unlikely(!vhost_svq_add_split(svq, elem, &head))) {
        return false;
    }

    svq->ring_id_maps[head] = elem;
    return true;
}
/**
 * Notify the device that new buffers are available, unless it asked not to
 * be notified through VRING_USED_F_NO_NOTIFY.
 *
 * @svq: Shadow VirtQueue
 */
static void vhost_svq_kick(VhostShadowVirtqueue *svq)
{
    /*
     * We need to expose the available array entries before checking the used
     * flags
     */
    smp_mb();

    /* The used flags live in the ring in little endian byte order */
    if (svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY)) {
        return;
    }

    event_notifier_set(&svq->hdev_kick);
}
/**
 * Forward available buffers.
 *
 * @svq: Shadow VirtQueue
 *
 * Note that this function does not guarantee that all guest's available
 * buffers are available to the device in SVQ avail ring. The guest may have
 * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in
 * qemu vaddr.
 *
 * If that happens, guest's kick notifications will be disabled until the
 * device uses some buffers.
 */
static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
{
    /* Clear event notifier */
    event_notifier_test_and_clear(&svq->svq_kick);

    /* Forward to the device as many available buffers as possible */
    do {
        virtio_queue_set_notification(svq->vq, false);

        while (true) {
            VirtQueueElement *elem;
            bool ok;

            /* Retry the element that did not fit last time, if any */
            if (svq->next_guest_avail_elem) {
                elem = g_steal_pointer(&svq->next_guest_avail_elem);
            } else {
                elem = virtqueue_pop(svq->vq, sizeof(*elem));
            }

            if (!elem) {
                break;
            }

            if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) {
                /*
                 * This condition is possible since a contiguous buffer in GPA
                 * does not imply a contiguous buffer in qemu's VA
                 * scatter-gather segments. If that happens, the buffer exposed
                 * to the device needs to be a chain of descriptors at this
                 * moment.
                 *
                 * SVQ cannot hold more available buffers if we are here:
                 * queue the current guest descriptor and ignore further kicks
                 * until some elements are used.
                 */
                svq->next_guest_avail_elem = elem;
                return;
            }

            ok = vhost_svq_add(svq, elem);
            if (unlikely(!ok)) {
                /*
                 * VQ is broken, just return and ignore any other kicks. The
                 * element was not stored in the SVQ, so nobody else owns it:
                 * release it here instead of leaking it.
                 */
                g_free(elem);
                return;
            }
            vhost_svq_kick(svq);
        }

        virtio_queue_set_notification(svq->vq, true);
        /* Re-check to catch buffers added while notifications were off */
    } while (!virtio_queue_empty(svq->vq));
}
/**
 * Event notifier callback for guest kicks.
 *
 * @n: guest kick event notifier, the one that guest set to notify svq.
 *
 * Recovers the owning SVQ from the embedded notifier, clears the event and
 * forwards the available buffers.
 */
static void vhost_handle_guest_kick_notifier(EventNotifier *n)
{
    VhostShadowVirtqueue *svq;

    svq = container_of(n, VhostShadowVirtqueue, svq_kick);

    event_notifier_test_and_clear(n);
    vhost_handle_guest_kick(svq);
}
/**
 * Check whether the device has used buffers that SVQ has not consumed yet.
 *
 * @svq: Shadow VirtQueue
 *
 * The cached used index is checked first; the device's used index is only
 * re-read from the ring when the cache shows nothing pending.
 */
static bool vhost_svq_more_used(VhostShadowVirtqueue *svq)
{
    if (svq->last_used_idx != svq->shadow_used_idx) {
        return true;
    }

    /* used->idx is little endian in the ring: convert it to CPU order */
    svq->shadow_used_idx = le16_to_cpu(svq->vring.used->idx);

    return svq->last_used_idx != svq->shadow_used_idx;
}
/**
 * Re-enable vhost device calls after they were disabled.
 *
 * @svq: The svq
 *
 * Returns false if the vhost device still has pending used buffers, which
 * avoids the race between SVQ checking for more work and enabling callbacks.
 * Returns true if the SVQ used vring has no more pending buffers.
 */
static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq)
{
    const uint16_t no_interrupt = cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
    bool pending;

    svq->vring.avail->flags &= ~no_interrupt;

    /* Make sure the flag is written before the read of used_idx */
    smp_mb();

    pending = vhost_svq_more_used(svq);
    return !pending;
}
/**
 * Ask the vhost device not to send calls by setting
 * VRING_AVAIL_F_NO_INTERRUPT in the avail ring flags.
 *
 * @svq: The svq
 */
static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq)
{
    vring_avail_t *avail = svq->vring.avail;

    avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT);
}
/**
 * Fetch the next buffer used by the device.
 *
 * @svq: Shadow VirtQueue
 * @len: Output, length reported by the device in the used ring entry
 *
 * Returns the guest element that was made available with the used descriptor
 * chain, or NULL if there is nothing pending or the device reported an
 * invalid id. Ownership of the returned element passes to the caller.
 *
 * The whole descriptor chain of the element goes back to the free list, not
 * only its head: an element may take out_num + in_num descriptors.
 */
static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq,
                                           uint32_t *len)
{
    vring_desc_t *descs = svq->vring.desc;
    const vring_used_t *used = svq->vring.used;
    const VirtQueueElement *elem;
    vring_used_elem_t used_elem;
    uint16_t last_used, last_desc;
    unsigned chain_len;

    if (!vhost_svq_more_used(svq)) {
        return NULL;
    }

    /* Only get used array entries after they have been exposed by dev */
    smp_rmb();
    last_used = svq->last_used_idx & (svq->vring.num - 1);
    used_elem.id = le32_to_cpu(used->ring[last_used].id);
    used_elem.len = le32_to_cpu(used->ring[last_used].len);

    svq->last_used_idx++;
    if (unlikely(used_elem.id >= svq->vring.num)) {
        qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used",
                      svq->vdev->name, used_elem.id);
        return NULL;
    }

    elem = svq->ring_id_maps[used_elem.id];
    if (unlikely(!elem)) {
        qemu_log_mask(LOG_GUEST_ERROR,
            "Device %s says index %u is used, but it was not available",
            svq->vdev->name, used_elem.id);
        return NULL;
    }

    /*
     * Walk to the last descriptor of the chain. The 'next' links are the
     * ones of the free list the chain was taken from, since writing the
     * descriptors does not modify them.
     */
    chain_len = elem->out_num + elem->in_num;
    last_desc = used_elem.id;
    for (unsigned n = 1; n < chain_len; n++) {
        last_desc = le16_to_cpu(descs[last_desc].next);
    }

    /* Push the whole chain at the front of the free list */
    descs[last_desc].next = cpu_to_le16(svq->free_head);
    svq->free_head = used_elem.id;

    *len = used_elem.len;
    return g_steal_pointer(&svq->ring_id_maps[used_elem.id]);
}
/**
 * Forward the buffers used by the device to the guest's virtqueue.
 *
 * @svq: Shadow VirtQueue
 * @check_for_avail_queue: If true, retry forwarding a guest element that was
 *                         left pending in svq->next_guest_avail_elem
 *
 * Device calls are disabled while used buffers are drained, and the loop
 * repeats as long as re-enabling them reports more pending buffers.
 */
static void vhost_svq_flush(VhostShadowVirtqueue *svq,
                            bool check_for_avail_queue)
{
    VirtQueue *guest_vq = svq->vq;

    /* Forward as many used buffers as possible. */
    do {
        unsigned filled = 0;

        vhost_svq_disable_notification(svq);

        for (;;) {
            uint32_t used_len;
            g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq,
                                                                  &used_len);

            if (!elem) {
                break;
            }

            /* More used buffers than ring entries: stop forwarding */
            if (unlikely(filled >= svq->vring.num)) {
                qemu_log_mask(LOG_GUEST_ERROR,
                         "More than %u used buffers obtained in a %u size SVQ",
                         filled, svq->vring.num);
                virtqueue_fill(guest_vq, elem, used_len, filled);
                virtqueue_flush(guest_vq, filled);
                return;
            }

            virtqueue_fill(guest_vq, elem, used_len, filled);
            filled++;
        }

        virtqueue_flush(guest_vq, filled);
        event_notifier_set(&svq->svq_call);

        if (check_for_avail_queue && svq->next_guest_avail_elem) {
            /*
             * Avail ring was full when vhost_svq_flush was called, so it's a
             * good moment to make more descriptors available if possible.
             */
            vhost_handle_guest_kick(svq);
        }
    } while (!vhost_svq_enable_notification(svq));
}
/**
 * Forward used buffers.
 *
 * @n: hdev call event notifier, the one that device set to notify svq.
 *
 * Note that we are not making any buffers available in the loop, there is no
 * way that it runs more than virtqueue size times.
 */
static void vhost_svq_handle_call(EventNotifier *n)
{
    VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue,
                                             hdev_call);

    event_notifier_test_and_clear(n);

    /*
     * vhost_svq_flush signals svq_call itself once the used buffers have
     * been forwarded, so the guest is not notified from here.
     */
    vhost_svq_flush(svq, true);
}
/**
@ -161,7 +442,41 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
if (poll_start) {
event_notifier_init_fd(svq_kick, svq_kick_fd);
event_notifier_set(svq_kick);
event_notifier_set_handler(svq_kick, vhost_handle_guest_kick);
event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier);
}
}
/**
 * Start the shadow virtqueue operation.
 *
 * @svq: Shadow Virtqueue
 * @vdev: VirtIO device
 * @vq: Virtqueue to shadow
 *
 * Resets the SVQ indexes, allocates and zeroes the driver area (descriptor
 * table followed by the avail ring) and the device area (used ring), and
 * links all descriptors into a free list starting at descriptor 0.
 */
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
                     VirtQueue *vq)
{
    size_t desc_size, driver_size, device_size;

    svq->next_guest_avail_elem = NULL;
    svq->shadow_avail_idx = 0;
    svq->shadow_used_idx = 0;
    svq->last_used_idx = 0;
    /* The free list is rebuilt below starting at descriptor 0 */
    svq->free_head = 0;
    svq->vdev = vdev;
    svq->vq = vq;

    svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq));
    driver_size = vhost_svq_driver_area_size(svq);
    device_size = vhost_svq_device_area_size(svq);
    svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size);
    desc_size = sizeof(vring_desc_t) * svq->vring.num;
    /* The avail ring is placed right after the descriptor table */
    svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size);
    memset(svq->vring.desc, 0, driver_size);
    svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
    memset(svq->vring.used, 0, device_size);
    svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num);

    /* "i + 1 < num" rather than "i < num - 1": no wrap-around if num is 0 */
    for (unsigned i = 0; i + 1 < svq->vring.num; i++) {
        svq->vring.desc[i].next = cpu_to_le16(i + 1);
    }
}
@ -172,6 +487,31 @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd)
void vhost_svq_stop(VhostShadowVirtqueue *svq)
{
    event_notifier_set_handler(&svq->svq_kick, NULL);
    g_autofree VirtQueueElement *pending_elem = NULL;

    /* Nothing to tear down if the SVQ is not shadowing a virtqueue */
    if (svq->vq == NULL) {
        return;
    }

    /* Send all pending used descriptors to guest */
    vhost_svq_flush(svq, false);

    /* Detach and free every element still tracked by the SVQ */
    for (unsigned idx = 0; idx < svq->vring.num; ++idx) {
        g_autofree VirtQueueElement *inflight =
            g_steal_pointer(&svq->ring_id_maps[idx]);

        if (!inflight) {
            continue;
        }
        virtqueue_detach_element(svq->vq, inflight, 0);
    }

    /* Same for the element that was waiting for free SVQ slots */
    pending_elem = g_steal_pointer(&svq->next_guest_avail_elem);
    if (pending_elem) {
        virtqueue_detach_element(svq->vq, pending_elem, 0);
    }

    svq->vq = NULL;
    g_free(svq->ring_id_maps);
    qemu_vfree(svq->vring.desc);
    qemu_vfree(svq->vring.used);
}
/**

View file

@ -36,6 +36,30 @@ typedef struct VhostShadowVirtqueue {
/* Guest's call notifier, where the SVQ calls guest. */
EventNotifier svq_call;
/* Virtio queue shadowing */
VirtQueue *vq;
/* Virtio device */
VirtIODevice *vdev;
/* Map for use the guest's descriptors */
VirtQueueElement **ring_id_maps;
/* Next VirtQueue element that guest made available */
VirtQueueElement *next_guest_avail_elem;
/* Next head to expose to the device */
uint16_t shadow_avail_idx;
/* Next free descriptor */
uint16_t free_head;
/* Last seen used idx */
uint16_t shadow_used_idx;
/* Next head to consume from the device */
uint16_t last_used_idx;
} VhostShadowVirtqueue;
bool vhost_svq_valid_features(uint64_t features, Error **errp);
@ -47,6 +71,8 @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq,
size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq);
size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq);
void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
VirtQueue *vq);
void vhost_svq_stop(VhostShadowVirtqueue *svq);
VhostShadowVirtqueue *vhost_svq_new(void);

View file

@ -777,9 +777,9 @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
* Note that this function does not rewind kick file descriptor if cannot set
* call one.
*/
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
VhostShadowVirtqueue *svq, unsigned idx,
Error **errp)
static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
VhostShadowVirtqueue *svq, unsigned idx,
Error **errp)
{
struct vhost_vring_file file = {
.index = dev->vq_index + idx,
@ -791,7 +791,7 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
r = vhost_vdpa_set_vring_dev_kick(dev, &file);
if (unlikely(r != 0)) {
error_setg_errno(errp, -r, "Can't set device kick fd");
return false;
return r;
}
event_notifier = &svq->hdev_call;
@ -801,6 +801,95 @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
error_setg_errno(errp, -r, "Can't set device call fd");
}
return r;
}
/**
 * Unmap a SVQ area in the device
 *
 * @v: vhost-vdpa instance
 * @iova: Start address of the mapping
 * @size: Size of the area, rounded up to the host page size before unmapping
 *
 * Returns true if the unmap request succeeded.
 */
static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
                                      hwaddr size)
{
    hwaddr aligned_size = ROUND_UP(size, qemu_real_host_page_size);

    return vhost_vdpa_dma_unmap(v, iova, aligned_size) == 0;
}
/**
 * Unmap the driver and device areas of a shadow virtqueue from the device.
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 *
 * Both areas are always attempted, so a failure on the driver area does not
 * leave the device area mapped. Returns true only if both unmaps succeeded.
 */
static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
                                       const VhostShadowVirtqueue *svq)
{
    struct vhost_vdpa *v = dev->opaque;
    struct vhost_vring_addr svq_addr;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    bool driver_ok, device_ok;

    vhost_svq_get_vring_addr(svq, &svq_addr);

    driver_ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr,
                                          driver_size);
    device_ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr,
                                          device_size);

    return driver_ok && device_ok;
}
/**
 * Map the shadow virtqueue rings in the device
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @addr: Assigned IOVA addresses
 * @errp: Error pointer
 *
 * The driver area is mapped read-only and the device area read-write. The
 * SVQ addresses are used both as IOVA and as host virtual address. If the
 * device area cannot be mapped, the driver area mapping is undone so nothing
 * is left mapped on failure.
 *
 * Returns true on success, false with @errp set otherwise.
 */
static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
                                     const VhostShadowVirtqueue *svq,
                                     struct vhost_vring_addr *addr,
                                     Error **errp)
{
    struct vhost_vdpa *v = dev->opaque;
    size_t device_size = vhost_svq_device_area_size(svq);
    size_t driver_size = vhost_svq_driver_area_size(svq);
    int r;

    ERRP_GUARD();
    vhost_svq_get_vring_addr(svq, addr);

    r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size,
                           (void *)(uintptr_t)addr->desc_user_addr, true);
    if (unlikely(r != 0)) {
        /* error_setg_errno appends ": <strerror>" by itself */
        error_setg_errno(errp, -r, "Cannot create vq driver region");
        return false;
    }

    r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size,
                           (void *)(uintptr_t)addr->used_user_addr, false);
    if (unlikely(r != 0)) {
        error_setg_errno(errp, -r, "Cannot create vq device region");
        /* Do not leak the driver area mapping created above */
        vhost_vdpa_svq_unmap_ring(v, addr->desc_user_addr, driver_size);
        return false;
    }

    return true;
}
/**
 * Prepare the device virtqueue to be used through a shadow virtqueue.
 *
 * @dev: The vhost device
 * @svq: The shadow virtqueue
 * @idx: Index of the virtqueue, relative to dev->vq_index
 * @errp: Error pointer
 *
 * Sets the device vring base from a state that only carries the queue index,
 * then sets the SVQ file descriptors in the device. Returns true on success.
 */
static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
                                 VhostShadowVirtqueue *svq, unsigned idx,
                                 Error **errp)
{
    const uint16_t ring_index = dev->vq_index + idx;
    struct vhost_vring_state state = {
        .index = ring_index,
    };
    int ret = vhost_vdpa_set_dev_vring_base(dev, &state);

    if (unlikely(ret)) {
        error_setg_errno(errp, -ret, "Cannot set vring base");
        return false;
    }

    return vhost_vdpa_svq_set_fds(dev, svq, idx, errp) == 0;
}
@ -815,10 +904,62 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
}
for (i = 0; i < v->shadow_vqs->len; ++i) {
VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
struct vhost_vring_addr addr = {
.index = i,
};
int r;
bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
if (unlikely(!ok)) {
error_reportf_err(err, "Cannot setup SVQ %u: ", i);
goto err;
}
vhost_svq_start(svq, dev->vdev, vq);
ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
if (unlikely(!ok)) {
goto err_map;
}
/* Override vring GPA set by vhost subsystem */
r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
if (unlikely(r != 0)) {
error_setg_errno(&err, -r, "Cannot set device address");
goto err_set_addr;
}
}
return true;
err_set_addr:
vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
err_map:
vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
err:
error_reportf_err(err, "Cannot setup SVQ %u: ", i);
for (unsigned j = 0; j < i; ++j) {
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
vhost_vdpa_svq_unmap_rings(dev, svq);
vhost_svq_stop(svq);
}
return false;
}
static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
{
struct vhost_vdpa *v = dev->opaque;
if (!v->shadow_vqs) {
return true;
}
for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
if (unlikely(!ok)) {
return false;
}
}
@ -840,6 +981,10 @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
}
vhost_vdpa_set_vring_ready(dev);
} else {
ok = vhost_vdpa_svqs_stop(dev);
if (unlikely(!ok)) {
return -1;
}
vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
}