From f187609f27b261702a17f79d20bf252ee0d4f9cd Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Mon, 31 Jul 2023 17:33:38 -0300 Subject: [PATCH 1/5] block-migration: Ensure we don't crash during migration cleanup We can fail the blk_insert_bs() at init_blk_migration(), leaving the BlkMigDevState without a dirty_bitmap and BlockDriverState. Account for the possibly missing elements when doing cleanup. Fix the following crashes: Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. 0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359 359 BlockDriverState *bs = bitmap->bs; #0 0x0000555555ec83ef in bdrv_release_dirty_bitmap (bitmap=0x0) at ../block/dirty-bitmap.c:359 #1 0x0000555555bba331 in unset_dirty_tracking () at ../migration/block.c:371 #2 0x0000555555bbad98 in block_migration_cleanup_bmds () at ../migration/block.c:681 Thread 1 "qemu-system-x86" received signal SIGSEGV, Segmentation fault. 0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073 7073 QLIST_FOREACH_SAFE(blocker, &bs->op_blockers[op], list, next) { #0 0x0000555555e971ff in bdrv_op_unblock (bs=0x0, op=BLOCK_OP_TYPE_BACKUP_SOURCE, reason=0x0) at ../block.c:7073 #1 0x0000555555e9734a in bdrv_op_unblock_all (bs=0x0, reason=0x0) at ../block.c:7095 #2 0x0000555555bbae13 in block_migration_cleanup_bmds () at ../migration/block.c:690 Signed-off-by: Fabiano Rosas Message-id: 20230731203338.27581-1-farosas@suse.de Signed-off-by: Stefan Hajnoczi --- migration/block.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/migration/block.c b/migration/block.c index b9580a6c7e..86c2256a2b 100644 --- a/migration/block.c +++ b/migration/block.c @@ -368,7 +368,9 @@ static void unset_dirty_tracking(void) BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - bdrv_release_dirty_bitmap(bmds->dirty_bitmap); + if (bmds->dirty_bitmap) { + bdrv_release_dirty_bitmap(bmds->dirty_bitmap); + } } } @@ -676,13 +678,18 @@ static int64_t get_remaining_dirty(void) static void block_migration_cleanup_bmds(void) { BlkMigDevState *bmds; + BlockDriverState *bs; AioContext *ctx; unset_dirty_tracking(); while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); - bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker); + + bs = blk_bs(bmds->blk); + if (bs) { + bdrv_op_unblock_all(bs, bmds->blocker); + } error_free(bmds->blocker); /* Save ctx, because bmds->blk can disappear during blk_unref. */ From c54483b6f45a0c84e230728a1a9e7adf90345e15 Mon Sep 17 00:00:00 2001 From: Andrey Drobyshev Date: Tue, 11 Jul 2023 20:25:51 +0300 Subject: [PATCH 2/5] block: add subcluster_size field to BlockDriverInfo This is going to be used in the subsequent commit as requests alignment (in particular, during copy-on-read). This value only makes sense for the formats which support subclusters (currently QCOW2 only). If this field isn't set by driver's own bdrv_get_info() implementation, we simply set it equal to the cluster size thus treating each cluster as having a single subcluster. Reviewed-by: Eric Blake Reviewed-by: Denis V. 
Lunev Signed-off-by: Andrey Drobyshev Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Stefan Hajnoczi Message-ID: <20230711172553.234055-2-andrey.drobyshev@virtuozzo.com> --- block.c | 7 +++++++ block/qcow2.c | 1 + include/block/block-common.h | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/block.c b/block.c index a307c151a8..0af890f647 100644 --- a/block.c +++ b/block.c @@ -6480,6 +6480,13 @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) } memset(bdi, 0, sizeof(*bdi)); ret = drv->bdrv_co_get_info(bs, bdi); + if (bdi->subcluster_size == 0) { + /* + * If the driver left this unset, subclusters are not supported. + * Then it is safe to treat each cluster as having only one subcluster. + */ + bdi->subcluster_size = bdi->cluster_size; + } if (ret < 0) { return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index c51388e99d..b48cd9ce63 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -5197,6 +5197,7 @@ qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVQcow2State *s = bs->opaque; bdi->cluster_size = s->cluster_size; + bdi->subcluster_size = s->subcluster_size; bdi->vm_state_offset = qcow2_vm_state_offset(s); bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY; return 0; diff --git a/include/block/block-common.h b/include/block/block-common.h index e15395f2cb..df5ffc8d09 100644 --- a/include/block/block-common.h +++ b/include/block/block-common.h @@ -132,6 +132,11 @@ typedef struct BlockZoneWps { typedef struct BlockDriverInfo { /* in bytes, 0 if irrelevant */ int cluster_size; + /* + * A fraction of cluster_size, if supported (currently QCOW2 only); if + * disabled or unsupported, set equal to cluster_size. + */ + int subcluster_size; /* offset at which the VM state can be saved (0 if not possible) */ int64_t vm_state_offset; bool is_dirty; From fc6b211f920a68f4c9c8db91ccc35ca95fe6683a Mon Sep 17 00:00:00 2001 From: Andrey Drobyshev Date: Tue, 11 Jul 2023 20:25:52 +0300 Subject: [PATCH 3/5] block/io: align requests to subcluster_size When target image is using subclusters, and we align the request during copy-on-read, it makes sense to align to subcluster_size rather than cluster_size. Otherwise we end up with unnecessary allocations. This commit renames bdrv_round_to_clusters() to bdrv_round_to_subclusters() and utilizes subcluster_size field of BlockDriverInfo to make necessary alignments. It affects copy-on-read as well as mirror job (which is using bdrv_round_to_clusters()). This change also fixes the following bug with failing assert (covered by the test in the subsequent commit): qemu-img create -f qcow2 base.qcow2 64K qemu-img create -f qcow2 -o extended_l2=on,backing_file=base.qcow2,backing_fmt=qcow2 img.qcow2 64K qemu-io -c "write -P 0xaa 0 2K" img.qcow2 qemu-io -C -c "read -P 0x00 2K 62K" img.qcow2 qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. Reviewed-by: Eric Blake Reviewed-by: Denis V. 
Lunev Signed-off-by: Andrey Drobyshev Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Stefan Hajnoczi Message-ID: <20230711172553.234055-3-andrey.drobyshev@virtuozzo.com> --- block/io.c | 50 ++++++++++++++++++++-------------------- block/mirror.c | 8 +++---- include/block/block-io.h | 8 +++---- 3 files changed, 33 insertions(+), 33 deletions(-) diff --git a/block/io.c b/block/io.c index 055fcf7438..76e7df18d8 100644 --- a/block/io.c +++ b/block/io.c @@ -728,21 +728,21 @@ BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs) } /** - * Round a region to cluster boundaries + * Round a region to subcluster (if supported) or cluster boundaries */ void coroutine_fn GRAPH_RDLOCK -bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes, - int64_t *cluster_offset, int64_t *cluster_bytes) +bdrv_round_to_subclusters(BlockDriverState *bs, int64_t offset, int64_t bytes, + int64_t *align_offset, int64_t *align_bytes) { BlockDriverInfo bdi; IO_CODE(); - if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_offset = offset; - *cluster_bytes = bytes; + if (bdrv_co_get_info(bs, &bdi) < 0 || bdi.subcluster_size == 0) { + *align_offset = offset; + *align_bytes = bytes; } else { - int64_t c = bdi.cluster_size; - *cluster_offset = QEMU_ALIGN_DOWN(offset, c); - *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c); + int64_t c = bdi.subcluster_size; + *align_offset = QEMU_ALIGN_DOWN(offset, c); + *align_bytes = QEMU_ALIGN_UP(offset - *align_offset + bytes, c); } } @@ -1168,8 +1168,8 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, void *bounce_buffer = NULL; BlockDriver *drv = bs->drv; - int64_t cluster_offset; - int64_t cluster_bytes; + int64_t align_offset; + int64_t align_bytes; int64_t skip_bytes; int ret; int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, @@ -1203,28 +1203,28 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which * is one reason we loop rather than doing it all at once. */ - bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes); - skip_bytes = offset - cluster_offset; + bdrv_round_to_subclusters(bs, offset, bytes, &align_offset, &align_bytes); + skip_bytes = offset - align_offset; trace_bdrv_co_do_copy_on_readv(bs, offset, bytes, - cluster_offset, cluster_bytes); + align_offset, align_bytes); - while (cluster_bytes) { + while (align_bytes) { int64_t pnum; if (skip_write) { ret = 1; /* "already allocated", so nothing will be copied */ - pnum = MIN(cluster_bytes, max_transfer); + pnum = MIN(align_bytes, max_transfer); } else { - ret = bdrv_is_allocated(bs, cluster_offset, - MIN(cluster_bytes, max_transfer), &pnum); + ret = bdrv_is_allocated(bs, align_offset, + MIN(align_bytes, max_transfer), &pnum); if (ret < 0) { /* * Safe to treat errors in querying allocation as if * unallocated; we'll probably fail again soon on the * read, but at least that will set a decent errno. 
*/ - pnum = MIN(cluster_bytes, max_transfer); + pnum = MIN(align_bytes, max_transfer); } /* Stop at EOF if the image ends in the middle of the cluster */ @@ -1242,7 +1242,7 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, /* Must copy-on-read; use the bounce buffer */ pnum = MIN(pnum, MAX_BOUNCE_BUFFER); if (!bounce_buffer) { - int64_t max_we_need = MAX(pnum, cluster_bytes - pnum); + int64_t max_we_need = MAX(pnum, align_bytes - pnum); int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER); int64_t bounce_buffer_len = MIN(max_we_need, max_allowed); @@ -1254,7 +1254,7 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, } qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum); - ret = bdrv_driver_preadv(bs, cluster_offset, pnum, + ret = bdrv_driver_preadv(bs, align_offset, pnum, &local_qiov, 0, 0); if (ret < 0) { goto err; @@ -1266,13 +1266,13 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, /* FIXME: Should we (perhaps conditionally) be setting * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy * that still correctly reads as zero? */ - ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, + ret = bdrv_co_do_pwrite_zeroes(bs, align_offset, pnum, BDRV_REQ_WRITE_UNCHANGED); } else { /* This does not change the data on the disk, it is not * necessary to flush even in cache=writethrough mode. */ - ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, + ret = bdrv_driver_pwritev(bs, align_offset, pnum, &local_qiov, 0, BDRV_REQ_WRITE_UNCHANGED); } @@ -1301,8 +1301,8 @@ bdrv_co_do_copy_on_readv(BdrvChild *child, int64_t offset, int64_t bytes, } } - cluster_offset += pnum; - cluster_bytes -= pnum; + align_offset += pnum; + align_bytes -= pnum; progress += pnum - skip_bytes; skip_bytes = 0; } diff --git a/block/mirror.c b/block/mirror.c index d3cacd1708..e213a892db 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -283,8 +283,8 @@ static int coroutine_fn mirror_cow_align(MirrorBlockJob *s, int64_t *offset, need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity, s->cow_bitmap); if (need_cow) { - bdrv_round_to_clusters(blk_bs(s->target), *offset, *bytes, - &align_offset, &align_bytes); + bdrv_round_to_subclusters(blk_bs(s->target), *offset, *bytes, + &align_offset, &align_bytes); } if (align_bytes > max_bytes) { @@ -576,8 +576,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) int64_t target_offset; int64_t target_bytes; WITH_GRAPH_RDLOCK_GUARD() { - bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes, - &target_offset, &target_bytes); + bdrv_round_to_subclusters(blk_bs(s->target), offset, io_bytes, + &target_offset, &target_bytes); } if (target_offset == offset && target_bytes == io_bytes) { diff --git a/include/block/block-io.h b/include/block/block-io.h index 4415506e40..6db48f2d35 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -189,10 +189,10 @@ bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, Error **errp); BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t offset, int64_t bytes, - int64_t *cluster_offset, - int64_t *cluster_bytes); +void bdrv_round_to_subclusters(BlockDriverState *bs, + int64_t offset, int64_t bytes, + int64_t *cluster_offset, + int64_t *cluster_bytes); void bdrv_get_backing_filename(BlockDriverState *bs, char *filename, int filename_size); From e2f938265e0aa8486303d260f3cb13d5bb9e9d6a 
Mon Sep 17 00:00:00 2001 From: Andrey Drobyshev Date: Tue, 11 Jul 2023 20:25:53 +0300 Subject: [PATCH 4/5] tests/qemu-iotests/197: add testcase for CoR with subclusters Add testcase which checks that allocations during copy-on-read are performed on the subcluster basis when subclusters are enabled in target image. This testcase also triggers the following assert with previous commit not being applied, so we check that as well: qemu-io: ../block/io.c:1236: bdrv_co_do_copy_on_readv: Assertion `skip_bytes < pnum' failed. Reviewed-by: Eric Blake Reviewed-by: Denis V. Lunev Signed-off-by: Andrey Drobyshev Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Stefan Hajnoczi Message-ID: <20230711172553.234055-4-andrey.drobyshev@virtuozzo.com> --- tests/qemu-iotests/197 | 29 +++++++++++++++++++++++++++++ tests/qemu-iotests/197.out | 24 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 index a2547bc280..f07a9da136 100755 --- a/tests/qemu-iotests/197 +++ b/tests/qemu-iotests/197 @@ -122,6 +122,35 @@ $QEMU_IO -f qcow2 -C -c 'read 0 1024' "$TEST_WRAP" | _filter_qemu_io $QEMU_IO -f qcow2 -c map "$TEST_WRAP" _check_test_img +echo +echo '=== Copy-on-read with subclusters ===' +echo + +# Create base and top images 64K (1 cluster) each. Make subclusters enabled +# for the top image +_make_test_img 64K +IMGPROTO=file IMGFMT=qcow2 TEST_IMG_FILE="$TEST_WRAP" \ + _make_test_img --no-opts -o extended_l2=true -F "$IMGFMT" -b "$TEST_IMG" \ + 64K | _filter_img_create + +$QEMU_IO -c "write -P 0xaa 0 64k" "$TEST_IMG" | _filter_qemu_io + +# Allocate individual subclusters in the top image, and not the whole cluster +$QEMU_IO -c "write -P 0xbb 28K 2K" -c "write -P 0xcc 34K 2K" "$TEST_WRAP" \ + | _filter_qemu_io + +# Only 2 subclusters should be allocated in the top image at this point +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map + +# Actual copy-on-read operation +$QEMU_IO -C -c "read -P 0xaa 30K 4K" "$TEST_WRAP" | _filter_qemu_io + +# And here we should have 4 subclusters allocated right in the middle of the +# top image. Make sure the whole cluster remains unallocated +$QEMU_IMG map "$TEST_WRAP" | _filter_qemu_img_map + +_check_test_img + # success, all done echo '*** done' status=0 diff --git a/tests/qemu-iotests/197.out b/tests/qemu-iotests/197.out index ad414c3b0e..8f34a30afe 100644 --- a/tests/qemu-iotests/197.out +++ b/tests/qemu-iotests/197.out @@ -31,4 +31,28 @@ read 1024/1024 bytes at offset 0 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) No errors were found on the image. 
+ +=== Copy-on-read with subclusters === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=65536 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 28672 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2048/2048 bytes at offset 34816 +2 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x7000 TEST_DIR/t.IMGFMT +0x7000 0x800 TEST_DIR/t.wrap.IMGFMT +0x7800 0x1000 TEST_DIR/t.IMGFMT +0x8800 0x800 TEST_DIR/t.wrap.IMGFMT +0x9000 0x7000 TEST_DIR/t.IMGFMT +read 4096/4096 bytes at offset 30720 +4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Offset Length File +0 0x7000 TEST_DIR/t.IMGFMT +0x7000 0x2000 TEST_DIR/t.wrap.IMGFMT +0x9000 0x7000 TEST_DIR/t.IMGFMT +No errors were found on the image. *** done From 87ec6f55af38e29be5b2b65a8acf84da73e06d06 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 26 Apr 2023 17:26:39 -0400 Subject: [PATCH 5/5] aio-posix: zero out io_uring sqe user_data liburing does not clear sqe->user_data. We must do it ourselves to avoid undefined behavior in process_cqe() when user_data is used. Note that fdmon-io_uring is currently disabled, so this is a latent bug that does not affect users. Let's merge this fix now to make it easier to enable fdmon-io_uring in the future (and I'm working on that). Signed-off-by: Stefan Hajnoczi Message-ID: <20230426212639.82310-1-stefanha@redhat.com> --- util/fdmon-io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c index 17ec18b7bd..16054c5ede 100644 --- a/util/fdmon-io_uring.c +++ b/util/fdmon-io_uring.c @@ -184,6 +184,7 @@ static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) #else io_uring_prep_poll_remove(sqe, node); #endif + io_uring_sqe_set_data(sqe, NULL); } /* Add a timeout that self-cancels when another cqe becomes ready */ @@ -197,6 +198,7 @@ static void add_timeout_sqe(AioContext *ctx, int64_t ns) sqe = get_sqe(ctx); io_uring_prep_timeout(sqe, &ts, 1, 0); + io_uring_sqe_set_data(sqe, NULL); } /* Add sqes from ctx->submit_list for submission */
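
As a quick standalone illustration of why patches 2-4 switch copy-on-read rounding from cluster to subcluster granularity, the sketch below re-derives the rounding arithmetic of bdrv_round_to_subclusters() with plain macros (the round_region() helper is hypothetical, not QEMU code) and feeds it the request from the iotest above: a 4 KiB read at offset 30 KiB against a 64 KiB cluster with 2 KiB subclusters.

```c
/* Standalone sketch, not part of the patch series. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define ALIGN_DOWN(x, a) ((x) / (a) * (a))
#define ALIGN_UP(x, a)   (((x) + (a) - 1) / (a) * (a))

/* Same arithmetic as bdrv_round_to_subclusters(): widen [offset, offset+bytes)
 * to the enclosing region aligned to 'align'. */
static void round_region(int64_t offset, int64_t bytes, int64_t align,
                         int64_t *out_offset, int64_t *out_bytes)
{
    *out_offset = ALIGN_DOWN(offset, align);
    *out_bytes  = ALIGN_UP(offset - *out_offset + bytes, align);
}

int main(void)
{
    int64_t off, len;

    /* The 4 KiB copy-on-read request at 30 KiB from iotest 197 */
    round_region(30 * 1024, 4 * 1024, 64 * 1024, &off, &len);
    printf("cluster rounding:    offset=%" PRId64 " bytes=%" PRId64 "\n",
           off, len);   /* 0, 65536: the whole cluster gets allocated */

    round_region(30 * 1024, 4 * 1024, 2 * 1024, &off, &len);
    printf("subcluster rounding: offset=%" PRId64 " bytes=%" PRId64 "\n",
           off, len);   /* 30720, 4096: only the touched subclusters */
    return 0;
}
```

With cluster rounding the 4 KiB request balloons into a 64 KiB allocation in the top image; with subcluster rounding only the 2 KiB granules actually read are allocated, which is exactly what the qemu-img map output in 197.out verifies.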
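
And a minimal self-contained liburing sketch (not part of the series; assumes liburing and its headers are installed, build with `gcc demo.c -luring`) showing the behaviour the final patch compensates for: the io_uring_prep_*() helpers leave sqe->user_data untouched, so any consumer that later reads cqe->user_data, as QEMU's process_cqe() does, must set it explicitly on every sqe.

```c
#include <liburing.h>
#include <stdio.h>

int main(void)
{
    struct io_uring ring;
    struct __kernel_timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };

    if (io_uring_queue_init(8, &ring, 0) < 0) {
        return 1;
    }

    struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
    io_uring_prep_timeout(sqe, &ts, 0, 0);
    /* Without this call, cqe->user_data is whatever was left in the ring
     * memory from an earlier submission; process_cqe() would interpret
     * that stale value as a pointer. */
    io_uring_sqe_set_data(sqe, NULL);

    io_uring_submit(&ring);

    struct io_uring_cqe *cqe;
    if (io_uring_wait_cqe(&ring, &cqe) == 0) {
        printf("user_data = %p\n", io_uring_cqe_get_data(cqe));
        io_uring_cqe_seen(&ring, cqe);
    }

    io_uring_queue_exit(&ring);
    return 0;
}
```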