From 1f995a4782d140b16d9b24e787053944fb5c4dfb Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 14:06:06 -0500 Subject: [PATCH 01/16] virtio-blk: enforce iothread-vq-mapping validation Hanna Czenczek noticed that the safety of `vq_aio_context[vq->value] = ctx;` with user-defined vq->value inputs is not obvious. The code is structured in validate() + apply() steps so input validation is there, but it happens way earlier and there is nothing that guarantees apply() can only be called with validated inputs. This patch moves the validate() call inside the apply() function so validation is guaranteed. I also added the bounds checking assertion that Hanna suggested. Signed-off-by: Stefan Hajnoczi Reviewed-by: Manos Pitsidianakis Reviewed-by: Hanna Czenczek Message-ID: <20240206190610.107963-2-stefanha@redhat.com> Reviewed-by: Kevin Wolf Reviewed-by: Michael S. Tsirkin Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 183 +++++++++++++++++++++++------------------- 1 file changed, 102 insertions(+), 81 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 227d83569f..6e3e3a23ee 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1485,68 +1485,6 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f, return 0; } -static bool -validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, - uint16_t num_queues, Error **errp) -{ - g_autofree unsigned long *vqs = bitmap_new(num_queues); - g_autoptr(GHashTable) iothreads = - g_hash_table_new(g_str_hash, g_str_equal); - - for (IOThreadVirtQueueMappingList *node = list; node; node = node->next) { - const char *name = node->value->iothread; - uint16List *vq; - - if (!iothread_by_id(name)) { - error_setg(errp, "IOThread \"%s\" object does not exist", name); - return false; - } - - if (!g_hash_table_add(iothreads, (gpointer)name)) { - error_setg(errp, - "duplicate IOThread name \"%s\" in iothread-vq-mapping", - name); - return false; - } - - if (node != list) { - if (!!node->value->vqs != !!list->value->vqs) { - error_setg(errp, "either all items in iothread-vq-mapping " - "must have vqs or none of them must have it"); - return false; - } - } - - for (vq = node->value->vqs; vq; vq = vq->next) { - if (vq->value >= num_queues) { - error_setg(errp, "vq index %u for IOThread \"%s\" must be " - "less than num_queues %u in iothread-vq-mapping", - vq->value, name, num_queues); - return false; - } - - if (test_and_set_bit(vq->value, vqs)) { - error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " - "because it is already assigned", vq->value, name); - return false; - } - } - } - - if (list->value->vqs) { - for (uint16_t i = 0; i < num_queues; i++) { - if (!test_bit(i, vqs)) { - error_setg(errp, - "missing vq %u IOThread assignment in iothread-vq-mapping", - i); - return false; - } - } - } - - return true; -} - static void virtio_resize_cb(void *opaque) { VirtIODevice *vdev = opaque; @@ -1613,15 +1551,95 @@ static const BlockDevOps virtio_block_ops = { .drained_end = virtio_blk_drained_end, }; -/* Generate vq:AioContext mappings from a validated iothread-vq-mapping list */ -static void -apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, - AioContext **vq_aio_context, uint16_t num_queues) +static bool +validate_iothread_vq_mapping_list(IOThreadVirtQueueMappingList *list, + uint16_t num_queues, Error **errp) +{ + g_autofree unsigned long *vqs = bitmap_new(num_queues); + g_autoptr(GHashTable) iothreads = + g_hash_table_new(g_str_hash, g_str_equal); + + for 
(IOThreadVirtQueueMappingList *node = list; node; node = node->next) { + const char *name = node->value->iothread; + uint16List *vq; + + if (!iothread_by_id(name)) { + error_setg(errp, "IOThread \"%s\" object does not exist", name); + return false; + } + + if (!g_hash_table_add(iothreads, (gpointer)name)) { + error_setg(errp, + "duplicate IOThread name \"%s\" in iothread-vq-mapping", + name); + return false; + } + + if (node != list) { + if (!!node->value->vqs != !!list->value->vqs) { + error_setg(errp, "either all items in iothread-vq-mapping " + "must have vqs or none of them must have it"); + return false; + } + } + + for (vq = node->value->vqs; vq; vq = vq->next) { + if (vq->value >= num_queues) { + error_setg(errp, "vq index %u for IOThread \"%s\" must be " + "less than num_queues %u in iothread-vq-mapping", + vq->value, name, num_queues); + return false; + } + + if (test_and_set_bit(vq->value, vqs)) { + error_setg(errp, "cannot assign vq %u to IOThread \"%s\" " + "because it is already assigned", vq->value, name); + return false; + } + } + } + + if (list->value->vqs) { + for (uint16_t i = 0; i < num_queues; i++) { + if (!test_bit(i, vqs)) { + error_setg(errp, + "missing vq %u IOThread assignment in iothread-vq-mapping", + i); + return false; + } + } + } + + return true; +} + +/** + * apply_iothread_vq_mapping: + * @iothread_vq_mapping_list: The mapping of virtqueues to IOThreads. + * @vq_aio_context: The array of AioContext pointers to fill in. + * @num_queues: The length of @vq_aio_context. + * @errp: If an error occurs, a pointer to the area to store the error. + * + * Fill in the AioContext for each virtqueue in the @vq_aio_context array given + * the iothread-vq-mapping parameter in @iothread_vq_mapping_list. + * + * Returns: %true on success, %false on failure. 
+ **/ +static bool apply_iothread_vq_mapping( + IOThreadVirtQueueMappingList *iothread_vq_mapping_list, + AioContext **vq_aio_context, + uint16_t num_queues, + Error **errp) { IOThreadVirtQueueMappingList *node; size_t num_iothreads = 0; size_t cur_iothread = 0; + if (!validate_iothread_vq_mapping_list(iothread_vq_mapping_list, + num_queues, errp)) { + return false; + } + for (node = iothread_vq_mapping_list; node; node = node->next) { num_iothreads++; } @@ -1638,6 +1656,7 @@ apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, /* Explicit vq:IOThread assignment */ for (vq = node->value->vqs; vq; vq = vq->next) { + assert(vq->value < num_queues); vq_aio_context[vq->value] = ctx; } } else { @@ -1650,6 +1669,8 @@ apply_vq_mapping(IOThreadVirtQueueMappingList *iothread_vq_mapping_list, cur_iothread++; } + + return true; } /* Context: BQL held */ @@ -1660,6 +1681,13 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + if (conf->iothread && conf->iothread_vq_mapping_list) { + error_setg(errp, + "iothread and iothread-vq-mapping properties cannot be set " + "at the same time"); + return false; + } + if (conf->iothread || conf->iothread_vq_mapping_list) { if (!k->set_guest_notifiers || !k->ioeventfd_assign) { error_setg(errp, @@ -1685,8 +1713,14 @@ static bool virtio_blk_vq_aio_context_init(VirtIOBlock *s, Error **errp) s->vq_aio_context = g_new(AioContext *, conf->num_queues); if (conf->iothread_vq_mapping_list) { - apply_vq_mapping(conf->iothread_vq_mapping_list, s->vq_aio_context, - conf->num_queues); + if (!apply_iothread_vq_mapping(conf->iothread_vq_mapping_list, + s->vq_aio_context, + conf->num_queues, + errp)) { + g_free(s->vq_aio_context); + s->vq_aio_context = NULL; + return false; + } } else if (conf->iothread) { AioContext *ctx = iothread_get_aio_context(conf->iothread); for (unsigned i = 0; i < conf->num_queues; i++) { @@ -1996,19 +2030,6 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } - if (conf->iothread_vq_mapping_list) { - if (conf->iothread) { - error_setg(errp, "iothread and iothread-vq-mapping properties " - "cannot be set at the same time"); - return; - } - - if (!validate_iothread_vq_mapping_list(conf->iothread_vq_mapping_list, - conf->num_queues, errp)) { - return; - } - } - s->config_size = virtio_get_config_size(&virtio_blk_cfg_size_params, s->host_features); virtio_init(vdev, VIRTIO_ID_BLOCK, s->config_size); From 5fbcbd50fca67ffe35317f2e86f6822aa22e41c6 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 14:06:07 -0500 Subject: [PATCH 02/16] virtio-blk: clarify that there is at least 1 virtqueue It is not possible to instantiate a virtio-blk device with 0 virtqueues. The following check is located in ->realize(): if (!conf->num_queues) { error_setg(errp, "num-queues property must be larger than 0"); return; } Later on we access s->vq_aio_context[0] under the assumption that there is at least one virtqueue. Hanna Czenczek noted that it would help to show that the array index is already valid. Add an assertion to document that s->vq_aio_context[0] is always safe... and catch future code changes that break this assumption. Suggested-by: Hanna Czenczek Reviewed-by: Manos Pitsidianakis Reviewed-by: Hanna Czenczek Signed-off-by: Stefan Hajnoczi Message-ID: <20240206190610.107963-3-stefanha@redhat.com> Reviewed-by: Kevin Wolf Reviewed-by: Michael S.
Tsirkin Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 6e3e3a23ee..e430ba583c 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1824,6 +1824,7 @@ static int virtio_blk_start_ioeventfd(VirtIODevice *vdev) * Try to change the AioContext so that block jobs and other operations can * co-locate their activity in the same AioContext. If it fails, nevermind. */ + assert(nvqs > 0); /* enforced during ->realize() */ r = blk_set_aio_context(s->conf.conf.blk, s->vq_aio_context[0], &local_err); if (r < 0) { From f2eea93c6b410efccc18f4b47090b4a4e842afbd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 14:06:08 -0500 Subject: [PATCH 03/16] virtio-blk: add vq_rq[] bounds check in virtio_blk_dma_restart_cb() Hanna Czenczek noted that the array index in virtio_blk_dma_restart_cb() is not bounds-checked: g_autofree VirtIOBlockReq **vq_rq = g_new0(VirtIOBlockReq *, num_queues); ... while (rq) { VirtIOBlockReq *next = rq->next; uint16_t idx = virtio_get_queue_index(rq->vq); rq->next = vq_rq[idx]; ^^^^^^^^^^ The code is correct because both rq->vq and vq_rq[] depend on num_queues, but this is indirect and not 100% obvious. Add an assertion. Suggested-by: Hanna Czenczek Reviewed-by: Manos Pitsidianakis Reviewed-by: Hanna Czenczek Signed-off-by: Stefan Hajnoczi Message-ID: <20240206190610.107963-4-stefanha@redhat.com> Reviewed-by: Kevin Wolf Reviewed-by: Michael S. Tsirkin Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index e430ba583c..31212506ca 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -1209,6 +1209,8 @@ static void virtio_blk_dma_restart_cb(void *opaque, bool running, VirtIOBlockReq *next = rq->next; uint16_t idx = virtio_get_queue_index(rq->vq); + /* Only num_queues vqs were created so vq_rq[idx] is within bounds */ + assert(idx < num_queues); rq->next = vq_rq[idx]; vq_rq[idx] = rq; rq = next; From 8802f4f4e3a3e7e0ffc778cab853b14349574a7b Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 14:06:09 -0500 Subject: [PATCH 04/16] virtio-blk: declare VirtIOBlock::rq with a type The VirtIOBlock::rq field has had the type void * since its introduction in commit 869a5c6df19a ("Stop VM on error in virtio-blk. (Gleb Natapov)"). Perhaps this was done to avoid the forward declaration of VirtIOBlockReq. Hanna Czenczek pointed out the missing type. Specify the actual type because there is no need to use void * here. Suggested-by: Hanna Czenczek Reviewed-by: Manos Pitsidianakis Reviewed-by: Hanna Czenczek Signed-off-by: Stefan Hajnoczi Message-ID: <20240206190610.107963-5-stefanha@redhat.com> Reviewed-by: Kevin Wolf Reviewed-by: Michael S. 
Tsirkin Signed-off-by: Kevin Wolf --- include/hw/virtio/virtio-blk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h index 833a9a344f..5c14110c4b 100644 --- a/include/hw/virtio/virtio-blk.h +++ b/include/hw/virtio/virtio-blk.h @@ -55,7 +55,7 @@ struct VirtIOBlock { VirtIODevice parent_obj; BlockBackend *blk; QemuMutex rq_lock; - void *rq; /* protected by rq_lock */ + struct VirtIOBlockReq *rq; /* protected by rq_lock */ VirtIOBlkConf conf; unsigned short sector_mask; bool original_wce; From 1f25c172f83704e350c0829438d832384084a74d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 14:06:10 -0500 Subject: [PATCH 05/16] monitor: use aio_co_reschedule_self() The aio_co_reschedule_self() API is designed to avoid the race condition between scheduling the coroutine in another AioContext and yielding. The QMP dispatch code uses the open-coded version that appears susceptible to the race condition at first glance: aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); qemu_coroutine_yield(); The code is actually safe because the iohandler and qemu_aio_context AioContext run under the Big QEMU Lock. Nevertheless, set a good example and use aio_co_reschedule_self() so it's obvious that there is no race. Suggested-by: Hanna Reitz Reviewed-by: Manos Pitsidianakis Reviewed-by: Hanna Czenczek Signed-off-by: Stefan Hajnoczi Message-ID: <20240206190610.107963-6-stefanha@redhat.com> Reviewed-by: Kevin Wolf Acked-by: Markus Armbruster Reviewed-by: Michael S. Tsirkin Signed-off-by: Kevin Wolf --- qapi/qmp-dispatch.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c index 176b549473..f3488afeef 100644 --- a/qapi/qmp-dispatch.c +++ b/qapi/qmp-dispatch.c @@ -212,8 +212,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * executing the command handler so that it can make progress if it * involves an AIO_WAIT_WHILE(). */ - aio_co_schedule(qemu_get_aio_context(), qemu_coroutine_self()); - qemu_coroutine_yield(); + aio_co_reschedule_self(qemu_get_aio_context()); } monitor_set_cur(qemu_coroutine_self(), cur_mon); @@ -227,9 +226,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ * Move back to iohandler_ctx so that nested event loops for * qemu_aio_context don't start new monitor commands. */ - aio_co_schedule(iohandler_get_aio_context(), - qemu_coroutine_self()); - qemu_coroutine_yield(); + aio_co_reschedule_self(iohandler_get_aio_context()); } } else { /* From ad893672027ffe26db498947d70cde6d4f58a111 Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Fri, 2 Feb 2024 15:47:54 +0100 Subject: [PATCH 06/16] block-backend: Allow concurrent context changes Since AioContext locks have been removed, a BlockBackend's AioContext may really change at any time (only exception is that it is often confined to a drained section, as noted in this patch). Therefore, blk_get_aio_context() cannot rely on its root node's context always matching that of the BlockBackend. In practice, whether they match does not matter anymore anyway: Requests can be sent to BDSs from any context, so anyone who requests the BB's context should have no reason to require the root node to have the same context. Therefore, we can and should remove the assertion to that effect. In addition, because the context can be set and queried from different threads concurrently, it has to be accessed with atomic operations. 
Buglink: https://issues.redhat.com/browse/RHEL-19381 Suggested-by: Kevin Wolf Signed-off-by: Hanna Czenczek Message-ID: <20240202144755.671354-2-hreitz@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- block/block-backend.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/block/block-backend.c b/block/block-backend.c index 209eb07528..9c4de79e6b 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -44,7 +44,7 @@ struct BlockBackend { char *name; int refcnt; BdrvChild *root; - AioContext *ctx; + AioContext *ctx; /* access with atomic operations only */ DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */ QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */ @@ -2414,22 +2414,22 @@ void blk_op_unblock_all(BlockBackend *blk, Error *reason) } } +/** + * Return BB's current AioContext. Note that this context may change + * concurrently at any time, with one exception: If the BB has a root node + * attached, its context will only change through bdrv_try_change_aio_context(), + * which creates a drained section. Therefore, incrementing such a BB's + * in-flight counter will prevent its context from changing. + */ AioContext *blk_get_aio_context(BlockBackend *blk) { - BlockDriverState *bs; IO_CODE(); if (!blk) { return qemu_get_aio_context(); } - bs = blk_bs(blk); - if (bs) { - AioContext *ctx = bdrv_get_aio_context(blk_bs(blk)); - assert(ctx == blk->ctx); - } - - return blk->ctx; + return qatomic_read(&blk->ctx); } int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, @@ -2442,7 +2442,7 @@ int blk_set_aio_context(BlockBackend *blk, AioContext *new_context, GLOBAL_STATE_CODE(); if (!bs) { - blk->ctx = new_context; + qatomic_set(&blk->ctx, new_context); return 0; } @@ -2471,7 +2471,7 @@ static void blk_root_set_aio_ctx_commit(void *opaque) AioContext *new_context = s->new_ctx; ThrottleGroupMember *tgm = &blk->public.throttle_group_member; - blk->ctx = new_context; + qatomic_set(&blk->ctx, new_context); if (tgm->throttle_state) { throttle_group_detach_aio_context(tgm); throttle_group_attach_aio_context(tgm, new_context); From 1604c0493193273e4eac547f86fbd2845e7f9af4 Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Fri, 2 Feb 2024 15:47:55 +0100 Subject: [PATCH 07/16] scsi: Await request purging scsi_device_for_each_req_async() currently does not provide any way to be awaited. One of its callers is scsi_device_purge_requests(), which therefore currently does not guarantee that all requests are fully settled when it returns. We want all requests to be settled, because scsi_device_purge_requests() is called through the unrealize path, including the one invoked by virtio_scsi_hotunplug() through qdev_simple_device_unplug_cb(), which most likely assumes that all SCSI requests are done then. In fact, scsi_device_purge_requests() already contains a blk_drain(), but this will not fully await scsi_device_for_each_req_async(), only the I/O requests it potentially cancels (not the non-I/O requests). However, we can have scsi_device_for_each_req_async() increment the BB in-flight counter, and have scsi_device_for_each_req_async_bh() decrement it when it is done. This way, the blk_drain() will fully await all SCSI requests to be purged. 
This also removes the need for scsi_device_for_each_req_async_bh() to double-check the current context and potentially re-schedule itself, should it now differ from the BB's context: Changing a BB's AioContext with a root node is done through bdrv_try_change_aio_context(), which creates a drained section. With this patch, we keep the BB in-flight counter elevated throughout, so we know the BB's context cannot change. Signed-off-by: Hanna Czenczek Message-ID: <20240202144755.671354-3-hreitz@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- hw/scsi/scsi-bus.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index 0a2eb11c56..230313022c 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -120,17 +120,13 @@ static void scsi_device_for_each_req_async_bh(void *opaque) SCSIRequest *next; /* - * If the AioContext changed before this BH was called then reschedule into - * the new AioContext before accessing ->requests. This can happen when - * scsi_device_for_each_req_async() is called and then the AioContext is - * changed before BHs are run. + * The BB cannot have changed contexts between this BH being scheduled and + * now: BBs' AioContexts, when they have a node attached, can only be + * changed via bdrv_try_change_aio_context(), in a drained section. While + * we have the in-flight counter incremented, that drain must block. */ ctx = blk_get_aio_context(s->conf.blk); - if (ctx != qemu_get_current_aio_context()) { - aio_bh_schedule_oneshot(ctx, scsi_device_for_each_req_async_bh, - g_steal_pointer(&data)); - return; - } + assert(ctx == qemu_get_current_aio_context()); QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) { data->fn(req, data->fn_opaque); @@ -138,11 +134,16 @@ static void scsi_device_for_each_req_async_bh(void *opaque) /* Drop the reference taken by scsi_device_for_each_req_async() */ object_unref(OBJECT(s)); + + /* Paired with blk_inc_in_flight() in scsi_device_for_each_req_async() */ + blk_dec_in_flight(s->conf.blk); } /* * Schedule @fn() to be invoked for each enqueued request in device @s. @fn() * runs in the AioContext that is executing the request. + * Keeps the BlockBackend's in-flight counter incremented until everything is + * done, so draining it will settle all scheduled @fn() calls. */ static void scsi_device_for_each_req_async(SCSIDevice *s, void (*fn)(SCSIRequest *, void *), @@ -163,6 +164,8 @@ static void scsi_device_for_each_req_async(SCSIDevice *s, */ object_ref(OBJECT(s)); + /* Paired with blk_dec_in_flight() in scsi_device_for_each_req_async_bh() */ + blk_inc_in_flight(s->conf.blk); aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk), scsi_device_for_each_req_async_bh, data); @@ -1728,11 +1731,20 @@ static void scsi_device_purge_one_req(SCSIRequest *req, void *opaque) scsi_req_cancel_async(req, NULL); } +/** + * Cancel all requests, and block until they are deleted. + */ void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense) { scsi_device_for_each_req_async(sdev, scsi_device_purge_one_req, NULL); + /* + * Await all the scsi_device_purge_one_req() calls scheduled by + * scsi_device_for_each_req_async(), and all I/O requests that were + * cancelled this way, but may still take a bit of time to settle. 
+ */ blk_drain(sdev->conf.blk); + scsi_device_set_ua(sdev, sense); } From c645bac4e06bf9642cc8e339d027a5d6ec54d811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= Date: Mon, 5 Feb 2024 15:40:19 +0000 Subject: [PATCH 08/16] iotests: fix leak of tmpdir in dry-run mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creating an instance of the 'TestEnv' class will create a temporary directory. This dir is only deleted, however, in the __exit__ handler invoked by a context manager. In dry-run mode, we don't use the TestEnv via a context manager, so we were leaking the temporary directory. Since meson invokes 'check' 5 times on each configure run, developers' /tmp was filling up with empty temporary directories. Signed-off-by: Daniel P. Berrangé Message-ID: <20240205154019.1841037-1-berrange@redhat.com> Reviewed-by: Michael Tokarev Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- tests/qemu-iotests/check | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check index f2e9d27dcf..56d88ca423 100755 --- a/tests/qemu-iotests/check +++ b/tests/qemu-iotests/check @@ -184,7 +184,8 @@ if __name__ == '__main__': sys.exit(str(e)) if args.dry_run: - print('\n'.join([os.path.basename(t) for t in tests])) + with env: + print('\n'.join([os.path.basename(t) for t in tests])) else: with TestRunner(env, tap=args.tap, color=args.color) as tr: From 7d2faf0ce2ccc896ac56bc5ed2cdf4a55056a8bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20P=2E=20Berrang=C3=A9?= Date: Mon, 5 Feb 2024 15:51:58 +0000 Subject: [PATCH 09/16] iotests: give tempdir an identifying name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If something goes wrong causing the iotests not to clean up their temporary directory, it is useful if the dir has an identifying name to show what is to blame. Signed-off-by: Daniel P. Berrangé Message-ID: <20240205155158.1843304-1-berrange@redhat.com> Reviewed-by: Michael Tokarev Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- tests/qemu-iotests/testenv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py index 3ff38f2661..588f30a4f1 100644 --- a/tests/qemu-iotests/testenv.py +++ b/tests/qemu-iotests/testenv.py @@ -126,7 +126,7 @@ def init_directories(self) -> None: self.tmp_sock_dir = False Path(self.sock_dir).mkdir(parents=True, exist_ok=True) except KeyError: - self.sock_dir = tempfile.mkdtemp() + self.sock_dir = tempfile.mkdtemp(prefix="qemu-iotests-") self.tmp_sock_dir = True self.sample_img_dir = os.getenv('SAMPLE_IMG_DIR', From b3d9bb9a560316b125dae5fb18924523d004d3dd Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 6 Feb 2024 09:04:09 -0500 Subject: [PATCH 10/16] virtio-blk: do not use C99 mixed declarations QEMU's coding style generally forbids C99 mixed declarations. Signed-off-by: Stefan Hajnoczi Message-ID: <20240206140410.65650-1-stefanha@redhat.com> Reviewed-by: Hanna Czenczek Reviewed-by: Kevin Wolf Acked-by: Michael S.
Tsirkin Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 31212506ca..bda5c117d4 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -661,6 +661,9 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret) int64_t zrp_size, n, j = 0; int64_t nz = data->zone_report_data.nr_zones; int8_t err_status = VIRTIO_BLK_S_OK; + struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { + .nr_zones = cpu_to_le64(nz), + }; trace_virtio_blk_zone_report_complete(vdev, req, nz, ret); if (ret) { @@ -668,9 +671,6 @@ static void virtio_blk_zone_report_complete(void *opaque, int ret) goto out; } - struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) { - .nr_zones = cpu_to_le64(nz), - }; zrp_size = sizeof(struct virtio_blk_zone_report) + sizeof(struct virtio_blk_zone_descriptor) * nz; n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr)); @@ -898,13 +898,14 @@ static int virtio_blk_handle_zone_append(VirtIOBlockReq *req, int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS; int64_t len = iov_size(out_iov, out_num); + ZoneCmdData *data; trace_virtio_blk_handle_zone_append(vdev, req, offset >> BDRV_SECTOR_BITS); if (!check_zoned_request(s, offset, len, true, &err_status)) { goto out; } - ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData)); + data = g_malloc(sizeof(ZoneCmdData)); data->req = req; data->in_iov = in_iov; data->in_num = in_num; @@ -1191,14 +1192,15 @@ static void virtio_blk_dma_restart_cb(void *opaque, bool running, { VirtIOBlock *s = opaque; uint16_t num_queues = s->conf.num_queues; + g_autofree VirtIOBlockReq **vq_rq = NULL; + VirtIOBlockReq *rq; if (!running) { return; } /* Split the device-wide s->rq request list into per-vq request lists */ - g_autofree VirtIOBlockReq **vq_rq = g_new0(VirtIOBlockReq *, num_queues); - VirtIOBlockReq *rq; + vq_rq = g_new0(VirtIOBlockReq *, num_queues); WITH_QEMU_LOCK_GUARD(&s->rq_lock) { rq = s->rq; @@ -1961,6 +1963,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIOBlock *s = VIRTIO_BLK(dev); VirtIOBlkConf *conf = &s->conf; + BlockDriverState *bs; Error *err = NULL; unsigned i; @@ -2006,7 +2009,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp) return; } - BlockDriverState *bs = blk_bs(conf->conf.blk); + bs = blk_bs(conf->conf.blk); if (bs->bl.zoned != BLK_Z_NONE) { virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED); if (bs->bl.zoned == BLK_Z_HM) { From 3089637461693837cafd2709ef36d0cf6a4a8ed8 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Jan 2024 14:06:07 +0100 Subject: [PATCH 11/16] scsi: Don't ignore most usb-storage properties usb-storage is for the most part just a wrapper around an internally created scsi-disk device. It uses DEFINE_BLOCK_PROPERTIES() to offer all of the usual block device properties to the user, but then only forwards a few select properties to the internal device while the rest is silently ignored. This changes scsi_bus_legacy_add_drive() to accept a whole BlockConf instead of some individual values inside of it so that usb-storage can now pass the whole configuration to the internal scsi-disk. This enables the remaining block device properties, e.g. logical/physical_block_size or discard_granularity. 
Buglink: https://issues.redhat.com/browse/RHEL-22375 Signed-off-by: Kevin Wolf Message-ID: <20240131130607.24117-1-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- hw/scsi/scsi-bus.c | 33 +++++++++++++-------------------- hw/usb/dev-storage-classic.c | 5 +---- include/hw/scsi/scsi.h | 5 +---- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c index 230313022c..9e40b0c920 100644 --- a/hw/scsi/scsi-bus.c +++ b/hw/scsi/scsi-bus.c @@ -376,15 +376,13 @@ static void scsi_qdev_unrealize(DeviceState *qdev) /* handle legacy '-drive if=scsi,...' cmd line args */ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, - int unit, bool removable, int bootindex, - bool share_rw, - BlockdevOnError rerror, - BlockdevOnError werror, + int unit, bool removable, BlockConf *conf, const char *serial, Error **errp) { const char *driver; char *name; DeviceState *dev; + SCSIDevice *s; DriveInfo *dinfo; if (blk_is_sg(blk)) { @@ -402,11 +400,10 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, object_property_add_child(OBJECT(bus), name, OBJECT(dev)); g_free(name); + s = SCSI_DEVICE(dev); + s->conf = *conf; + qdev_prop_set_uint32(dev, "scsi-id", unit); - if (bootindex >= 0) { - object_property_set_int(OBJECT(dev), "bootindex", bootindex, - &error_abort); - } if (object_property_find(OBJECT(dev), "removable")) { qdev_prop_set_bit(dev, "removable", removable); } @@ -417,19 +414,12 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, object_unparent(OBJECT(dev)); return NULL; } - if (!object_property_set_bool(OBJECT(dev), "share-rw", share_rw, errp)) { - object_unparent(OBJECT(dev)); - return NULL; - } - - qdev_prop_set_enum(dev, "rerror", rerror); - qdev_prop_set_enum(dev, "werror", werror); if (!qdev_realize_and_unref(dev, &bus->qbus, errp)) { object_unparent(OBJECT(dev)); return NULL; } - return SCSI_DEVICE(dev); + return s; } void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) @@ -437,6 +427,12 @@ void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) Location loc; DriveInfo *dinfo; int unit; + BlockConf conf = { + .bootindex = -1, + .share_rw = false, + .rerror = BLOCKDEV_ON_ERROR_AUTO, + .werror = BLOCKDEV_ON_ERROR_AUTO, + }; loc_push_none(&loc); for (unit = 0; unit <= bus->info->max_target; unit++) { @@ -446,10 +442,7 @@ void scsi_bus_legacy_handle_cmdline(SCSIBus *bus) } qemu_opts_loc_restore(dinfo->opts); scsi_bus_legacy_add_drive(bus, blk_by_legacy_dinfo(dinfo), - unit, false, -1, false, - BLOCKDEV_ON_ERROR_AUTO, - BLOCKDEV_ON_ERROR_AUTO, - NULL, &error_fatal); + unit, false, &conf, NULL, &error_fatal); } loc_pop(&loc); } diff --git a/hw/usb/dev-storage-classic.c b/hw/usb/dev-storage-classic.c index 84d19752b5..50a3ad6285 100644 --- a/hw/usb/dev-storage-classic.c +++ b/hw/usb/dev-storage-classic.c @@ -67,10 +67,7 @@ static void usb_msd_storage_realize(USBDevice *dev, Error **errp) scsi_bus_init(&s->bus, sizeof(s->bus), DEVICE(dev), &usb_msd_scsi_info_storage); scsi_dev = scsi_bus_legacy_add_drive(&s->bus, blk, 0, !!s->removable, - s->conf.bootindex, s->conf.share_rw, - s->conf.rerror, s->conf.werror, - dev->serial, - errp); + &s->conf, dev->serial, errp); blk_unref(blk); if (!scsi_dev) { return; diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h index 10c4e8288d..c3d5e17e38 100644 --- a/include/hw/scsi/scsi.h +++ b/include/hw/scsi/scsi.h @@ -199,10 +199,7 @@ static inline SCSIBus *scsi_bus_from_device(SCSIDevice *d) } SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk, - int 
unit, bool removable, int bootindex, - bool share_rw, - BlockdevOnError rerror, - BlockdevOnError werror, + int unit, bool removable, BlockConf *conf, const char *serial, Error **errp); void scsi_bus_set_ua(SCSIBus *bus, SCSISense sense); void scsi_bus_legacy_handle_cmdline(SCSIBus *bus); From 10b2393e5e7f4c1d633f1ac8578465681c333efb Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 31 Jan 2024 18:31:40 +0100 Subject: [PATCH 12/16] blkio: Respect memory-alignment for bounce buffer allocations blkio_alloc_mem_region() requires that the requested buffer size is a multiple of the memory-alignment property. If it isn't, the allocation fails with a return value of -EINVAL. Fix the call in blkio_resize_bounce_pool() to make sure the requested size is properly aligned. I observed this problem with vhost-vdpa, which requires page aligned memory. As the virtio-blk device behind it still had 512 byte blocks, we got bs->bl.request_alignment = 512, but actually any request that needed a bounce buffer and was not aligned to 4k would fail without this fix. Suggested-by: Stefano Garzarella Signed-off-by: Kevin Wolf Message-ID: <20240131173140.42398-1-kwolf@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Stefano Garzarella Signed-off-by: Kevin Wolf --- block/blkio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blkio.c b/block/blkio.c index bc2f21784c..882e1c297b 100644 --- a/block/blkio.c +++ b/block/blkio.c @@ -89,6 +89,9 @@ static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes) /* Pad size to reduce frequency of resize calls */ bytes += 128 * 1024; + /* Align the pool size to avoid blkio_alloc_mem_region() failure */ + bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment); + WITH_QEMU_LOCK_GUARD(&s->blkio_lock) { int ret; From c42c3833e0cfdf2b80fb3ca410acfd392b6874ab Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Fri, 2 Feb 2024 16:31:56 +0100 Subject: [PATCH 13/16] virtio-scsi: Attach event vq notifier with no_poll As of commit 38738f7dbbda90fbc161757b7f4be35b52205552 ("virtio-scsi: don't waste CPU polling the event virtqueue"), we only attach an io_read notifier for the virtio-scsi event virtqueue instead, and no polling notifiers. During operation, the event virtqueue is typically non-empty, but none of the buffers are intended to be used immediately. Instead, they only get used when certain events occur. Therefore, it makes no sense to continuously poll it when non-empty, because it is supposed to be and stay non-empty. We do this by using virtio_queue_aio_attach_host_notifier_no_poll() instead of virtio_queue_aio_attach_host_notifier() for the event virtqueue. Commit 766aa2de0f29b657148e04599320d771c36fd126 ("virtio-scsi: implement BlockDevOps->drained_begin()") however has virtio_scsi_drained_end() use virtio_queue_aio_attach_host_notifier() for all virtqueues, including the event virtqueue. This can lead to it being polled again, undoing the benefit of commit 38738f7dbbda90fbc161757b7f4be35b52205552. Fix it by using virtio_queue_aio_attach_host_notifier_no_poll() for the event virtqueue. 
Reported-by: Fiona Ebner Fixes: 766aa2de0f29b657148e04599320d771c36fd126 ("virtio-scsi: implement BlockDevOps->drained_begin()") Reviewed-by: Stefan Hajnoczi Tested-by: Fiona Ebner Reviewed-by: Fiona Ebner Signed-off-by: Hanna Czenczek Message-ID: <20240202153158.788922-2-hreitz@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- hw/scsi/virtio-scsi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 690aceec45..9f02ceea09 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -1149,6 +1149,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus) static void virtio_scsi_drained_end(SCSIBus *bus) { VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus); + VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s); VirtIODevice *vdev = VIRTIO_DEVICE(s); uint32_t total_queues = VIRTIO_SCSI_VQ_NUM_FIXED + s->parent_obj.conf.num_queues; @@ -1166,7 +1167,11 @@ static void virtio_scsi_drained_end(SCSIBus *bus) for (uint32_t i = 0; i < total_queues; i++) { VirtQueue *vq = virtio_get_queue(vdev, i); - virtio_queue_aio_attach_host_notifier(vq, s->ctx); + if (vq == vs->event_vq) { + virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx); + } else { + virtio_queue_aio_attach_host_notifier(vq, s->ctx); + } } } From 5bdbaebcce18fe6a627cafad2043ec08f3de5744 Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Fri, 2 Feb 2024 16:31:57 +0100 Subject: [PATCH 14/16] virtio: Re-enable notifications after drain During drain, we do not care about virtqueue notifications, which is why we remove the handlers on it. When removing those handlers, whether vq notifications are enabled or not depends on whether we were in polling mode or not; if not, they are enabled (by default); if so, they have been disabled by the io_poll_start callback. Because we do not care about those notifications after removing the handlers, this is fine. However, we have to explicitly ensure they are enabled when re-attaching the handlers, so we will resume receiving notifications. We do this in virtio_queue_aio_attach_host_notifier*(). If such a function is called while we are in a polling section, attaching the notifiers will then invoke the io_poll_start callback, re-disabling notifications. Because we will always miss virtqueue updates in the drained section, we also need to poll the virtqueue once after attaching the notifiers. Buglink: https://issues.redhat.com/browse/RHEL-3934 Signed-off-by: Hanna Czenczek Message-ID: <20240202153158.788922-3-hreitz@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- hw/virtio/virtio.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/block/aio.h | 7 ++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 7549094154..d229755eae 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -3556,6 +3556,17 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n) void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) { + /* + * virtio_queue_aio_detach_host_notifier() can leave notifications disabled. + * Re-enable them. (And if detach has not been used before, notifications + * being enabled is still the default state while a notifier is attached; + * see virtio_queue_host_notifier_aio_poll_end(), which will always leave + * notifications enabled once the polling section is left.) 
+ */ + if (!virtio_queue_get_notification(vq)) { + virtio_queue_set_notification(vq, 1); + } + aio_set_event_notifier(ctx, &vq->host_notifier, virtio_queue_host_notifier_read, virtio_queue_host_notifier_aio_poll, @@ -3563,6 +3574,13 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) aio_set_event_notifier_poll(ctx, &vq->host_notifier, virtio_queue_host_notifier_aio_poll_begin, virtio_queue_host_notifier_aio_poll_end); + + /* + * We will have ignored notifications about new requests from the guest + * while no notifiers were attached, so "kick" the virt queue to process + * those requests now. + */ + event_notifier_set(&vq->host_notifier); } /* @@ -3573,14 +3591,38 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx) */ void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx) { + /* See virtio_queue_aio_attach_host_notifier() */ + if (!virtio_queue_get_notification(vq)) { + virtio_queue_set_notification(vq, 1); + } + aio_set_event_notifier(ctx, &vq->host_notifier, virtio_queue_host_notifier_read, NULL, NULL); + + /* + * See virtio_queue_aio_attach_host_notifier(). + * Note that this may be unnecessary for the type of virtqueues this + * function is used for. Still, it will not hurt to have a quick look into + * whether we can/should process any of the virtqueue elements. + */ + event_notifier_set(&vq->host_notifier); } void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx) { aio_set_event_notifier(ctx, &vq->host_notifier, NULL, NULL, NULL); + + /* + * aio_set_event_notifier_poll() does not guarantee whether io_poll_end() + * will run after io_poll_begin(), so by removing the notifier, we do not + * know whether virtio_queue_host_notifier_aio_poll_end() has run after a + * previous virtio_queue_host_notifier_aio_poll_begin(), i.e. whether + * notifications are enabled or disabled. It does not really matter anyway; + * we just removed the notifier, so we do not care about notifications until + * we potentially re-attach it. The attach_host_notifier functions will + * ensure that notifications are enabled again when they are needed. + */ } void virtio_queue_host_notifier_read(EventNotifier *n) diff --git a/include/block/aio.h b/include/block/aio.h index 5d0a114988..8378553eb9 100644 --- a/include/block/aio.h +++ b/include/block/aio.h @@ -480,9 +480,14 @@ void aio_set_event_notifier(AioContext *ctx, AioPollFn *io_poll, EventNotifierHandler *io_poll_ready); -/* Set polling begin/end callbacks for an event notifier that has already been +/* + * Set polling begin/end callbacks for an event notifier that has already been * registered with aio_set_event_notifier. Do nothing if the event notifier is * not registered. + * + * Note that if the io_poll_end() callback (or the entire notifier) is removed + * during polling, it will not be called, so an io_poll_begin() is not + * necessarily always followed by an io_poll_end(). */ void aio_set_event_notifier_poll(AioContext *ctx, EventNotifier *notifier, From 52bff01f64eec017ffb0d5903a0ee1d67ca7a548 Mon Sep 17 00:00:00 2001 From: Hanna Czenczek Date: Fri, 2 Feb 2024 16:31:58 +0100 Subject: [PATCH 15/16] virtio-blk: Use ioeventfd_attach in start_ioeventfd Commit d3f6f294aeadd5f88caf0155e4360808c95b3146 ("virtio-blk: always set ioeventfd during startup") has made virtio_blk_start_ioeventfd() always kick the virtqueue (set the ioeventfd), regardless of whether the BB is drained. 
That is no longer necessary, because attaching the host notifier will now set the ioeventfd, too; this happens either immediately right here in virtio_blk_start_ioeventfd(), or later when the drain ends, in virtio_blk_ioeventfd_attach(). With event_notifier_set() removed, the code becomes the same as the one in virtio_blk_ioeventfd_attach(), so we can reuse that function. Signed-off-by: Hanna Czenczek Message-ID: <20240202153158.788922-4-hreitz@redhat.com> Reviewed-by: Stefan Hajnoczi Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index bda5c117d4..4ca5e632ea 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -37,6 +37,8 @@ #include "hw/virtio/virtio-blk-common.h" #include "qemu/coroutine.h" +static void virtio_blk_ioeventfd_attach(VirtIOBlock *s); + static void virtio_blk_init_request(VirtIOBlock *s, VirtQueue *vq, VirtIOBlockReq *req) { @@ -1847,17 +1849,14 @@ static int virtio_blk_start_ioeventfd(VirtIODevice *vdev) s->ioeventfd_started = true; smp_wmb(); /* paired with aio_notify_accept() on the read side */ - /* Get this show started by hooking up our callbacks */ - for (i = 0; i < nvqs; i++) { - VirtQueue *vq = virtio_get_queue(vdev, i); - AioContext *ctx = s->vq_aio_context[i]; - - /* Kick right away to begin processing requests already in vring */ - event_notifier_set(virtio_queue_get_host_notifier(vq)); - - if (!blk_in_drain(s->conf.conf.blk)) { - virtio_queue_aio_attach_host_notifier(vq, ctx); - } + /* + * Get this show started by hooking up our callbacks. If drained now, + * virtio_blk_drained_end() will do this later. + * Attaching the notifier also kicks the virtqueues, processing any requests + * they may already have. + */ + if (!blk_in_drain(s->conf.conf.blk)) { + virtio_blk_ioeventfd_attach(s); } return 0; From bfa36802d1704fc413c590ebdcc4e5ae0eacf439 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 22 Jan 2024 12:26:25 -0500 Subject: [PATCH 16/16] virtio-blk: avoid using ioeventfd state in irqfd conditional Requests that complete in an IOThread use irqfd to notify the guest while requests that complete in the main loop thread use the traditional qdev irq code path. The reason for this conditional is that the irq code path requires the BQL: if (s->ioeventfd_started && !s->ioeventfd_disabled) { virtio_notify_irqfd(vdev, req->vq); } else { virtio_notify(vdev, req->vq); } There is a corner case where the conditional invokes the irq code path instead of the irqfd code path: static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev) { ... /* * Set ->ioeventfd_started to false before draining so that host notifiers * are not detached/attached anymore. */ s->ioeventfd_started = false; /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */ blk_drain(s->conf.conf.blk); During blk_drain() the conditional produces the wrong result because ioeventfd_started is false. Use qemu_in_iothread() instead of checking the ioeventfd state. 
Cc: qemu-stable@nongnu.org Buglink: https://issues.redhat.com/browse/RHEL-15394 Signed-off-by: Stefan Hajnoczi Message-ID: <20240122172625.415386-1-stefanha@redhat.com> Reviewed-by: Kevin Wolf Signed-off-by: Kevin Wolf --- hw/block/virtio-blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 4ca5e632ea..738cb2ac36 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -66,7 +66,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status) iov_discard_undo(&req->inhdr_undo); iov_discard_undo(&req->outhdr_undo); virtqueue_push(req->vq, &req->elem, req->in_len); - if (s->ioeventfd_started && !s->ioeventfd_disabled) { + if (qemu_in_iothread()) { virtio_notify_irqfd(vdev, req->vq); } else { virtio_notify(vdev, req->vq);
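
The validate-then-apply structure that PATCH 01 enforces is a general defensive pattern: apply() performs the validation itself, so it can no longer be reached with unchecked input, and an assertion restates the invariant at the point of use. Below is a minimal, self-contained sketch of that pattern in plain C. It is not QEMU code; all names here (Mapping, validate, apply, vq_thread) are hypothetical stand-ins, whereas the real code operates on IOThreadVirtQueueMappingList and AioContext.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_QUEUES 8

/* One mapping entry: a worker thread and the queue indices assigned to it. */
typedef struct {
    unsigned thread;
    uint16_t vqs[MAX_QUEUES];
    size_t nvqs;
} Mapping;

/* Reject out-of-range and duplicate queue indices up front. */
static bool validate(const Mapping *map, size_t n, uint16_t num_queues)
{
    bool seen[MAX_QUEUES] = { false };

    assert(num_queues <= MAX_QUEUES);
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < map[i].nvqs; j++) {
            uint16_t vq = map[i].vqs[j];
            if (vq >= num_queues || seen[vq]) {
                return false; /* out of range, or assigned twice */
            }
            seen[vq] = true;
        }
    }
    return true;
}

/* Validation happens inside apply(), so it cannot see unchecked input. */
static bool apply(const Mapping *map, size_t n,
                  unsigned vq_thread[], uint16_t num_queues)
{
    if (!validate(map, n, num_queues)) {
        return false;
    }
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < map[i].nvqs; j++) {
            assert(map[i].vqs[j] < num_queues); /* guaranteed by validate() */
            vq_thread[map[i].vqs[j]] = map[i].thread;
        }
    }
    return true;
}

int main(void)
{
    Mapping map[] = {
        { .thread = 0, .vqs = { 0, 2 }, .nvqs = 2 },
        { .thread = 1, .vqs = { 1, 3 }, .nvqs = 2 },
    };
    unsigned vq_thread[4];

    if (apply(map, sizeof(map) / sizeof(map[0]), vq_thread, 4)) {
        for (int i = 0; i < 4; i++) {
            printf("vq %d -> thread %u\n", i, vq_thread[i]);
        }
    }
    return 0;
}

The assert() is deliberately redundant with validate(); as in the patch, its job is to document the bounds invariant where the array is indexed and to catch future changes that decouple the two steps.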
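
PATCH 06 replaces plain reads and writes of BlockBackend::ctx with qatomic_read()/qatomic_set() because the field can now be accessed from multiple threads concurrently. The following is a sketch of the same discipline using standard C11 atomics; it is not the QEMU implementation (which uses QEMU's own qatomic_* wrappers), and the types here are simplified stand-ins.

#include <stdatomic.h>
#include <stdio.h>

typedef struct AioContext AioContext;   /* opaque here; never dereferenced */

typedef struct {
    _Atomic(AioContext *) ctx;          /* access with atomic operations only */
} BlockBackend;

/* Readers may run in any thread; the value may change again at any time. */
static AioContext *blk_get_ctx(BlockBackend *blk)
{
    return atomic_load(&blk->ctx);
}

/* Writers publish the new context with a single atomic store. */
static void blk_set_ctx(BlockBackend *blk, AioContext *ctx)
{
    atomic_store(&blk->ctx, ctx);
}

int main(void)
{
    static int storage;                 /* stand-in object for a context */
    BlockBackend blk = { NULL };

    blk_set_ctx(&blk, (AioContext *)(void *)&storage);
    printf("ctx is %s\n", blk_get_ctx(&blk) ? "set" : "unset");
    return 0;
}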
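
PATCH 07's fix rests on a pairing discipline: increment an in-flight counter before deferring work, decrement it when the deferred callback has finished, and make drain wait for the counter to reach zero by turning the event loop. A stripped-down, single-threaded sketch of that discipline follows; schedule_bh(), drain(), and purge_async() are hypothetical stand-ins for the QEMU primitives (aio_bh_schedule_oneshot(), blk_drain(), scsi_device_for_each_req_async()), not their actual implementations.

#include <stdbool.h>
#include <stdio.h>

#define MAX_BH 16

typedef void (*BHFunc)(void *);

static struct {
    BHFunc fn;
    void *opaque;
} pending[MAX_BH];
static int npending;
static int in_flight;

/* Schedule a callback to run on a later event-loop iteration. */
static void schedule_bh(BHFunc fn, void *opaque)
{
    pending[npending].fn = fn;
    pending[npending].opaque = opaque;
    npending++;
}

/* Run one pending callback; returns false when there is nothing to do. */
static bool run_one(void)
{
    if (npending == 0) {
        return false;
    }
    npending--;
    pending[npending].fn(pending[npending].opaque);
    return true;
}

/* Like blk_drain(): keep the loop turning until nothing is in flight. */
static void drain(void)
{
    while (in_flight > 0) {
        run_one();
    }
}

static void purge_bh(void *opaque)
{
    printf("purging requests for %s\n", (const char *)opaque);
    in_flight--; /* paired with the increment in purge_async() */
}

static void purge_async(const char *dev)
{
    in_flight++; /* keeps drain() waiting until purge_bh() has run */
    schedule_bh(purge_bh, (void *)dev);
}

int main(void)
{
    purge_async("scsi0");
    drain();     /* returns only after purge_bh() has completed */
    return 0;
}

Without the increment before schedule_bh(), drain() could observe in_flight == 0 and return while purge_bh() is still queued, which is exactly the window the patch closes for scsi_device_purge_requests().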