From d0dd594bedc57f9be2af2af170bf56f9c3f2376e Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 15 Jun 2023 02:49:03 -0700 Subject: [PATCH 01/25] nvme: Print capabilities changes just once This current dev_info() could be very verbose and being printed very frequently depending on some userspace application sending some specific commands. Just print this message once and skip it until the controller resets. Use a controller flag (NVME_CTRL_DIRTY_CAPABILITY) to track if the capability needs a reset. Signed-off-by: Breno Leitao Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/nvme.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 76e8f8b4098e..4b7f9edab5e8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1125,8 +1125,11 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, mutex_unlock(&ctrl->scan_lock); } if (effects & NVME_CMD_EFFECTS_CCC) { - dev_info(ctrl->device, + if (!test_and_set_bit(NVME_CTRL_DIRTY_CAPABILITY, + &ctrl->flags)) { + dev_info(ctrl->device, "controller capabilities changed, reset may be required to take effect.\n"); + } } if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) { nvme_queue_scan(ctrl); @@ -3280,6 +3283,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended) return ret; } + clear_bit(NVME_CTRL_DIRTY_CAPABILITY, &ctrl->flags); ctrl->identified = true; return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 78308f15e090..f3182134487a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -243,6 +243,7 @@ enum nvme_ctrl_flags { NVME_CTRL_STARTED_ONCE = 2, NVME_CTRL_STOPPED = 3, NVME_CTRL_SKIP_ID_CNS_CS = 4, + NVME_CTRL_DIRTY_CAPABILITY = 5, }; struct nvme_ctrl { From 9d16d264775b9a10f3f5b5db768d7f51294b2a63 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: 
Thu, 27 Apr 2023 21:47:45 +0200 Subject: [PATCH 02/25] nvmet: Reorder fields in 'struct nvmet_ns' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvmet_ns' from 520 to 512 bytes. When such a structure is allocated in nvmet_ns_alloc(), because of the way memory allocation works, when 520 bytes were requested, 1024 bytes were allocated. So, on x86_64, this change saves 512 bytes per allocation. Signed-off-by: Christophe JAILLET Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Jens Axboe Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 6cf723bc664e..8cfd60f3b564 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -79,8 +79,8 @@ struct nvmet_ns { struct completion disable_done; mempool_t *bvec_pool; - int use_p2pmem; struct pci_dev *p2p_dev; + int use_p2pmem; int pi_type; int metadata_size; u8 csi; From 125bfc7cd750e68c99f1d446e2c22abea08c237f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 9 Jun 2023 17:43:20 +0800 Subject: [PATCH 03/25] md/raid10: fix the condition to call bio_end_io_acct() /sys/block/[device]/queue/iostats is used to control whether to count io stat. Writing 0 to it will clear queue_flags QUEUE_FLAG_IO_STAT which means iostats is disabled. If we disable iostats and later enable it, the io issued during this period will be counted incorrectly, inflight will be decreased to -1. //T1 set iostats echo 0 > /sys/block/md0/queue/iostats clear QUEUE_FLAG_IO_STAT //T2 issue io if (QUEUE_FLAG_IO_STAT) -> false bio_start_io_acct inflight++ echo 1 > /sys/block/md0/queue/iostats set QUEUE_FLAG_IO_STAT //T3 io end if (QUEUE_FLAG_IO_STAT) -> true bio_end_io_acct inflight-- -> -1 Also, if iostats is enabled while issuing io but disabled when the io ends, inflight will never be decreased. Fix it by checking start_time when the io ends.
If start_time is not 0, call bio_end_io_acct(). Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230609094320.2397604-1-linan666@huaweicloud.com --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d0de8c9fb3cf..79067769e44b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -325,7 +325,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; - if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + if (r10_bio->start_time) bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* From b5a99602b74bbfa655be509c615181dd95b0719e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Jun 2023 09:21:36 +0800 Subject: [PATCH 04/25] md/raid1-10: fix casting from randomized structure in raid1_submit_write() The following build error is triggered while building with clang version 17.0.0 with W=1 (this can't be reproduced with gcc 13.1.0): drivers/md/raid1-10.c:117:25: error: casting from randomized structure pointer type 'struct block_device *' to 'struct md_rdev *' 117 | struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; | ^ Fix this by casting 'bio->bi_bdev' to 'void *', as it used to be.
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306142042.fmjfmTF8-lkp@intel.com/ Fixes: 8295efbe68c0 ("md/raid1-10: factor out a helper to submit normal write") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230616012136.3047071-1-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 169ebe296f2d..3f22edec70e7 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -116,7 +116,7 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, static inline void raid1_submit_write(struct bio *bio) { - struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + struct md_rdev *rdev = (void *)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); From a1d7671910965ca9f8f0377e7e3bfd1179fba4d8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 16 Jun 2023 22:24:04 -0700 Subject: [PATCH 05/25] md: use mddev->external to select holder in export_rdev() mdadm test "10ddf-create-fail-rebuild" triggers warnings like the following [ 215.526357] ------------[ cut here ]------------ [ 215.527243] WARNING: CPU: 18 PID: 1264 at block/bdev.c:617 blkdev_put+0x269/0x350 [ 215.528334] Modules linked in: [ 215.528806] CPU: 18 PID: 1264 Comm: mdmon Not tainted 6.4.0-rc2+ #768 [ 215.529863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS [ 215.531464] RIP: 0010:blkdev_put+0x269/0x350 [ 215.532167] Code: ff ff 49 8d 7d 10 e8 56 bf b8 ff 4d 8b 65 10 49 8d bc 24 58 05 00 00 e8 05 be b8 ff 41 83 ac 24 58 05 00 00 01 e9 44 ff ff ff <0f> 0b e9 52 fe ff ff 0f 0b e9 6b fe ff ff1 [ 215.534780] RSP: 0018:ffffc900040bfbf0 EFLAGS: 00010283 [ 215.535635] RAX: ffff888174001000 RBX: ffff88810b1c3b00 RCX: ffffffff819a4061 [ 215.536645] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88810b1c3ba0 [ 215.537657] RBP: ffff88810dbde800 R08: fffffbfff0fca983 R09: fffffbfff0fca983 
[ 215.538674] R10: ffffc900040bfbf0 R11: fffffbfff0fca982 R12: ffff88810b1c3b38 [ 215.539687] R13: ffff88810b1c3b10 R14: ffff88810dbdecb8 R15: ffff88810b1c3b00 [ 215.540833] FS: 00007f2aabdff700(0000) GS:ffff888dfb400000(0000) knlGS:0000000000000000 [ 215.541961] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 215.542775] CR2: 00007fa19a85d934 CR3: 000000010c076006 CR4: 0000000000370ee0 [ 215.543814] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 215.544840] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 215.545885] Call Trace: [ 215.546257] [ 215.546608] export_rdev.isra.63+0x71/0xe0 [ 215.547338] mddev_unlock+0x1b1/0x2d0 [ 215.547898] array_state_store+0x28d/0x450 [ 215.548519] md_attr_store+0xd7/0x150 [ 215.549059] ? __pfx_sysfs_kf_write+0x10/0x10 [ 215.549702] kernfs_fop_write_iter+0x1b9/0x260 [ 215.550351] vfs_write+0x491/0x760 [ 215.550863] ? __pfx_vfs_write+0x10/0x10 [ 215.551445] ? __fget_files+0x156/0x230 [ 215.552053] ksys_write+0xc0/0x160 [ 215.552570] ? __pfx_ksys_write+0x10/0x10 [ 215.553141] ? 
ktime_get_coarse_real_ts64+0xec/0x100 [ 215.553878] do_syscall_64+0x3a/0x90 [ 215.554403] entry_SYSCALL_64_after_hwframe+0x72/0xdc [ 215.555125] RIP: 0033:0x7f2aade11847 [ 215.555696] Code: c3 66 90 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec 10 e8 1b fd ff ff 4c 89 e2 48 89 ee 89 df 41 89 c0 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 448 [ 215.558398] RSP: 002b:00007f2aabdfeba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 [ 215.559516] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f2aade11847 [ 215.560515] RDX: 0000000000000005 RSI: 0000000000438b8b RDI: 0000000000000010 [ 215.561512] RBP: 0000000000438b8b R08: 0000000000000000 R09: 00007f2aaecf0060 [ 215.562511] R10: 000000000e3ba40b R11: 0000000000000293 R12: 0000000000000005 [ 215.563647] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000c70750 [ 215.564693] [ 215.565029] irq event stamp: 15979 [ 215.565584] hardirqs last enabled at (15991): [] __up_console_sem+0x52/0x60 [ 215.566806] hardirqs last disabled at (16000): [] __up_console_sem+0x37/0x60 [ 215.568022] softirqs last enabled at (15716): [] __do_softirq+0x3eb/0x531 [ 215.569239] softirqs last disabled at (15711): [] irq_exit_rcu+0x115/0x160 [ 215.570434] ---[ end trace 0000000000000000 ]--- This means export_rdev() calls blkdev_put with a different holder than the one used by blkdev_get_by_dev(). This is because mddev->major_version == -2 is not a good check for external metadata. Fix this by using mddev->external instead. Also, do not clear mddev->external in md_clean(), as the flag might be used later in export_rdev(). 
Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Cc: Christoph Hellwig Cc: Jens Axboe Signed-off-by: Song Liu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230617052405.305871-1-song@kernel.org --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index cf3733c90c47..8e7cc2e69bc9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2458,7 +2458,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev); + blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -6140,7 +6140,7 @@ static void md_clean(struct mddev *mddev) mddev->resync_min = 0; mddev->resync_max = MaxSector; mddev->reshape_position = MaxSector; - mddev->external = 0; + /* we still need mddev->external in export_rdev, do not clear it yet */ mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; From 4934b6401a812f9fe368e7d2d091cd1d120ea262 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Jun 2023 22:29:33 +0800 Subject: [PATCH 06/25] md: fix 'delete_mutex' deadlock Commit 3ce94ce5d05a ("md: fix duplicate filename for rdev") introduce a new lock 'delete_mutex', and trigger a new deadlock: t1: remove rdev t2: sysfs writer rdev_attr_store rdev_attr_store mddev_lock state_store md_kick_rdev_from_array lock delete_mutex list_add mddev->deleting unlock delete_mutex mddev_unlock mddev_lock ... lock delete_mutex kobject_del // wait for sysfs writers to be done mddev_unlock lock delete_mutex // wait for delete_mutex, deadlock 'delete_mutex' is used to protect the list 'mddev->deleting', turns out that this list can be protected by 'reconfig_mutex' directly, and this lock can be removed. 
Fix this problem by removing the lock, and use 'reconfig_mutex' to protect the list. mddev_unlock() will move this list to a local list to be handled after 'reconfig_mutex' is dropped. Fixes: 3ce94ce5d05a ("md: fix duplicate filename for rdev") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621142933.1395629-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 28 +++++++++------------------- drivers/md/md.h | 4 +--- 2 files changed, 10 insertions(+), 22 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e7cc2e69bc9..2e38ef421d69 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -643,7 +643,6 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); - mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); @@ -749,26 +748,15 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; -static void md_free_rdev(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { struct md_rdev *rdev; struct md_rdev *tmp; + LIST_HEAD(delete); - mutex_lock(&mddev->delete_mutex); - if (list_empty(&mddev->deleting)) - goto out; + if (!list_empty(&mddev->deleting)) + list_splice_init(&mddev->deleting, &delete); - list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { - list_del_init(&rdev->same_set); - kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); - } -out: - mutex_unlock(&mddev->delete_mutex); -} - -void mddev_unlock(struct mddev *mddev) -{ if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as * an access to the files will try to take reconfig_mutex @@ -808,7 +796,11 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); - md_free_rdev(mddev); + list_for_each_entry_safe(rdev, tmp, &delete, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + 
export_rdev(rdev, mddev); + } md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); @@ -2488,9 +2480,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) * reconfig_mutex is held, hence it can't be called under * reconfig_mutex and it's delayed to mddev_unlock(). */ - mutex_lock(&mddev->delete_mutex); list_add(&rdev->same_set, &mddev->deleting); - mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) diff --git a/drivers/md/md.h b/drivers/md/md.h index bfd2306bc750..1aef86bf3fc3 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -531,11 +531,9 @@ struct mddev { /* * Temporarily store rdev that will be finally removed when - * reconfig_mutex is unlocked. + * reconfig_mutex is unlocked, protected by reconfig_mutex. */ struct list_head deleting; - /* Protect the deleting list */ - struct mutex delete_mutex; bool has_superblocks:1; bool fail_last_dev:1; From a8d5fdd4d2702d0c7ec125bd3bbce3fc589afa67 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 21 Jun 2023 18:57:28 +0800 Subject: [PATCH 07/25] raid10: avoid spin_lock from fastpath from raid10_unplug() Commit 0c0be98bbe67 ("md/raid10: prevent unnecessary calls to wake_up() in fast path") missed one place, for example, with: fio -direct=1 -rw=write/randwrite -iodepth=1 ... Plug and unplug are called for each io, then wake_up() from raid10_unplug() will cause lock contention as well. Avoid this contention by using wake_up_barrier() instead of wake_up(), where spin_lock is not held if waitqueue is empty. 
Fio test script: [global] name=random reads and writes ioengine=libaio direct=1 readwrite=randrw rwmixread=70 iodepth=64 buffered=0 filename=/dev/md0 size=1G runtime=30 time_based randrepeat=0 norandommap refill_buffers ramp_time=10 bs=4k numjobs=400 group_reporting=1 [job1] Test result with ramdisk raid10(By Ali): Before this patch With this patch READ IOPS=2033k IOPS=3642k WRITE IOPS=871k IOPS=1561K By the way, in this scenario, blk_plug_cb() will be allocated and freed for each io; this seems to need optimizing as well. Reported-and-tested-by: Ali Gholami Rudi Closes: https://lore.kernel.org/all/20231606122233@laper.mirepesht/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 79067769e44b..5051149e27bb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1118,7 +1118,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1127,7 +1127,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); raid1_prepare_flush_writes(mddev->bitmap); - wake_up(&conf->wait_barrier); + wake_up_barrier(conf); while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; From abcc0cbd49283fccd20420e86416b2475b00819c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2023 18:46:54 +0200 Subject: [PATCH 08/25] bcache: Alloc holder object before async registration Allocate holder object (cache or cached_dev) before offloading the rest of the startup to async work.
This will allow us to open the block device with the proper holder. Signed-off-by: Jan Kara Acked-by: Coly Li Reviewed-by: Kent Overstreet Link: https://lore.kernel.org/r/20230622164658.12861-1-jack@suse.cz Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 66 +++++++++++++++------------------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e2a803683105..913dd94353b6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2448,6 +2448,7 @@ struct async_reg_args { struct cache_sb *sb; struct cache_sb_disk *sb_disk; struct block_device *bdev; + void *holder; }; static void register_bdev_worker(struct work_struct *work) @@ -2455,22 +2456,13 @@ static void register_bdev_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cached_dev *dc; - - dc = kzalloc(sizeof(*dc), GFP_KERNEL); - if (!dc) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, bcache_kobj); - goto out; - } mutex_lock(&bch_register_lock); - if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0) + if (register_bdev(args->sb, args->sb_disk, args->bdev, args->holder) + < 0) fail = true; mutex_unlock(&bch_register_lock); -out: if (fail) pr_info("error %s: fail to register backing device\n", args->path); @@ -2485,21 +2477,11 @@ static void register_cache_worker(struct work_struct *work) int fail = false; struct async_reg_args *args = container_of(work, struct async_reg_args, reg_work.work); - struct cache *ca; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) { - fail = true; - put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, bcache_kobj); - goto out; - } /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0) + if (register_cache(args->sb, args->sb_disk, args->bdev, args->holder))
fail = true; -out: if (fail) pr_info("error %s: fail to register cache device\n", args->path); @@ -2520,6 +2502,13 @@ static void register_device_async(struct async_reg_args *args) queue_delayed_work(system_wq, &args->reg_work, 10); } +static void *alloc_holder_object(struct cache_sb *sb) +{ + if (SB_IS_BDEV(sb)) + return kzalloc(sizeof(struct cached_dev), GFP_KERNEL); + return kzalloc(sizeof(struct cache), GFP_KERNEL); +} + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { @@ -2528,6 +2517,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, struct cache_sb *sb; struct cache_sb_disk *sb_disk; struct block_device *bdev; + void *holder; ssize_t ret; bool async_registration = false; @@ -2585,6 +2575,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (err) goto out_blkdev_put; + holder = alloc_holder_object(sb); + if (!holder) { + ret = -ENOMEM; + err = "cannot allocate memory"; + goto out_put_sb_page; + } + err = "failed to register device"; if (async_registration) { @@ -2595,44 +2592,29 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!args) { ret = -ENOMEM; err = "cannot allocate memory"; - goto out_put_sb_page; + goto out_free_holder; } args->path = path; args->sb = sb; args->sb_disk = sb_disk; args->bdev = bdev; + args->holder = holder; register_device_async(args); /* No wait and returns to user space */ goto async_done; } if (SB_IS_BDEV(sb)) { - struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); - - if (!dc) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - mutex_lock(&bch_register_lock); - ret = register_bdev(sb, sb_disk, bdev, dc); + ret = register_bdev(sb, sb_disk, bdev, holder); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ if (ret < 0) goto out_free_sb; } else { - struct cache *ca = kzalloc(sizeof(*ca), 
GFP_KERNEL); - - if (!ca) { - ret = -ENOMEM; - err = "cannot allocate memory"; - goto out_put_sb_page; - } - /* blkdev_put() will be called in bch_cache_release() */ - ret = register_cache(sb, sb_disk, bdev, ca); + ret = register_cache(sb, sb_disk, bdev, holder); if (ret) goto out_free_sb; } @@ -2644,6 +2626,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, async_done: return size; +out_free_holder: + kfree(holder); out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: From 2c5555983bd27d24162534b682b10654639a5576 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2023 18:46:55 +0200 Subject: [PATCH 09/25] bcache: Fix bcache device claiming Commit 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") introduced a change that blkdev_put() has to get exclusive holder of the bdev as an argument. However it overlooked that register_bdev() and register_cache() overwrite the bdev->bd_holder field in the block device to point to the real owning object which was not available at the time we called blkdev_get_by_path(). Messing with bdev internals like this is a layering violation and it also causes blkdev_put() to issue warning about mismatching holders. Fix bcache to reopen the block device with appropriate holder once it is available which also restores the behavior that multiple bcache caches cannot claim the same device which was broken by commit 29499ab060fe ("bcache: don't pass a stack address to blkdev_get_by_path"). 
Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Signed-off-by: Jan Kara Reviewed-by: Kent Overstreet Acked-by: Coly Li Link: https://lore.kernel.org/r/20230622164658.12861-2-jack@suse.cz Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 65 +++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 913dd94353b6..0ae2b3676293 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl) put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, bcache_kobj); + blkdev_put(dc->bdev, dc); wake_up(&unregister_wait); @@ -1453,7 +1453,6 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&dc->sb, sb, sizeof(struct cache_sb)); dc->bdev = bdev; - dc->bdev->bd_holder = dc; dc->sb_disk = sb_disk; if (cached_dev_init(dc, sb->block_size << 9)) @@ -2218,7 +2217,7 @@ void bch_cache_release(struct kobject *kobj) put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, bcache_kobj); + blkdev_put(ca->bdev, ca); kfree(ca); module_put(THIS_MODULE); @@ -2345,7 +2344,6 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, memcpy(&ca->sb, sb, sizeof(struct cache_sb)); ca->bdev = bdev; - ca->bdev->bd_holder = ca; ca->sb_disk = sb_disk; if (bdev_max_discard_sectors((bdev))) @@ -2359,7 +2357,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, * call blkdev_put() to bdev in bch_cache_release(). So we * explicitly call blkdev_put() here. 
*/ - blkdev_put(bdev, bcache_kobj); + blkdev_put(bdev, ca); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) @@ -2516,10 +2514,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, char *path = NULL; struct cache_sb *sb; struct cache_sb_disk *sb_disk; - struct block_device *bdev; - void *holder; + struct block_device *bdev, *bdev2; + void *holder = NULL; ssize_t ret; bool async_registration = false; + bool quiet = false; #ifdef CONFIG_BCACHE_ASYNC_REGISTRATION async_registration = true; @@ -2548,24 +2547,9 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE, - bcache_kobj, NULL); - if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) { - dev_t dev; - - mutex_lock(&bch_register_lock); - if (lookup_bdev(strim(path), &dev) == 0 && - bch_is_open(dev)) - err = "device already registered"; - else - err = "device busy"; - mutex_unlock(&bch_register_lock); - if (attr == &ksysfs_register_quiet) - goto done; - } + bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(bdev)) goto out_free_sb; - } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) @@ -2582,6 +2566,32 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_put_sb_page; } + /* Now reopen in exclusive mode with proper holder */ + bdev2 = blkdev_get_by_dev(bdev->bd_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + holder, NULL); + blkdev_put(bdev, NULL); + bdev = bdev2; + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + bdev = NULL; + if (ret == -EBUSY) { + dev_t dev; + + mutex_lock(&bch_register_lock); + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) + err = "device already registered"; + else + err = "device busy"; + mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) { + quiet = true; + ret = size; + } + } + goto 
out_free_holder; + } + err = "failed to register device"; if (async_registration) { @@ -2619,7 +2629,6 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, goto out_free_sb; } -done: kfree(sb); kfree(path); module_put(THIS_MODULE); @@ -2631,7 +2640,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - blkdev_put(bdev, register_bcache); + if (bdev) + blkdev_put(bdev, holder); out_free_sb: kfree(sb); out_free_path: @@ -2640,7 +2650,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, out_module_put: module_put(THIS_MODULE); out: - pr_info("error %s: %s\n", path?path:"", err); + if (!quiet) + pr_info("error %s: %s\n", path?path:"", err); return ret; } From ad7c3b41e86b59943a903d23c7b037d820e6270c Mon Sep 17 00:00:00 2001 From: Jinke Han Date: Mon, 8 May 2023 01:06:31 +0800 Subject: [PATCH 10/25] blk-throttle: Fix io statistics for cgroup v1 After commit f382fb0bcef4 ("block: remove legacy IO schedulers"), blkio.throttle.io_serviced and blkio.throttle.io_service_bytes become the only stable io stats interface of cgroup v1, and these statistics are done in the blk-throttle code. But the current code only counts the bios that are actually throttled. When the user does not add the throttle limit, the io stats for cgroup v1 has nothing. I fix it according to the statistical method of v2, and made it count all ios accurately. 
Fixes: a7b36ee6ba29 ("block: move blk-throtl fast path inline") Tested-by: Andrea Righi Signed-off-by: Jinke Han Acked-by: Muchun Song Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230507170631.89607-1-hanjinke.666@bytedance.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 6 ++++-- block/blk-throttle.c | 6 ------ block/blk-throttle.h | 9 +++++++++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index cab33bd4f252..c8b28ec5dde9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2062,6 +2062,9 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags; + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; @@ -2093,8 +2096,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9d010d867fbf..7397ff199d66 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2178,12 +2178,6 @@ bool __blk_throtl_bio(struct bio *bio) rcu_read_lock(); - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); - } - spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index ef4b7a4de987..d1ccbfe9f797 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -185,6 +185,15 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); int rw = bio_data_dir(bio); + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, 
BIO_CGROUP_ACCT); + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + } + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; From c6b7a3a26e809c9d2a51ae303764c1d2994f31cf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 24 Jun 2023 21:01:05 +0800 Subject: [PATCH 11/25] blk-mq: fix two misuses on RQF_USE_SCHED Request allocated from sched tags can't be issued via ->queue_rqs() directly, since driver tag isn't allocated yet. This is the 1st misuse of RQF_USE_SCHED for figuring out plug->has_elevator. Request allocated from sched tags can't be ended by blk_mq_end_request_batch() too, fix the 2nd RQF_USE_SCHED misuse in blk_mq_add_to_batch(). Without this patch, NVMe uring cmd passthrough IO workload can run into hang easily with real io scheduler. Fixes: dd6216bb16e8 ("blk-mq: make sure elevator callbacks aren't called for passthrough request") Reported-by: Guangwu Zhang Closes: https://lore.kernel.org/linux-block/CAGS2=YrBjpLPOKa-gzcKuuOG60AGth5794PNCDwatdnnscB9ug@mail.gmail.com/ Cc: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230624130105.1443879-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- include/linux/blk-mq.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 720b5061ffe8..32e50bc0cbb0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1280,7 +1280,11 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED)) + /* + * Any request allocated from sched tags can't be issued to + * ->queue_rqs() directly + */ + if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq->rq_next = NULL; rq_list_add(&plug->mq_list, rq); diff --git 
a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f401067ac03a..aaed687a454c 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -852,7 +852,11 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_USE_SCHED) || ioerror || + /* + * blk_mq_end_request_batch() can't end request allocated from + * sched tags + */ + if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || (req->end_io && !blk_rq_is_passthrough(req))) return false; From 645a829e03384a235b3760959d4ebe420a0f2027 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:40 +0800 Subject: [PATCH 12/25] blk-wbt: don't create wbt sysfs entry if CONFIG_BLK_WBT is disabled sysfs entry /sys/block/[device]/queue/wbt_lat_usec will be created even if CONFIG_BLK_WBT is disabled, while read and write will always fail. It doesn't make sense to create a sysfs entry that can't be accessed, so don't create such entry. 
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 143 ++++++++++++++++++++++++---------------------- block/blk-wbt.h | 19 ------ 2 files changed, 74 insertions(+), 88 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a64208583853..6c1c4ba66bc0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -47,19 +47,6 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_var_store64(s64 *var, const char *page) -{ - int err; - s64 v; - - err = kstrtos64(page, 10, &v); - if (err < 0) - return err; - - *var = v; - return 0; -} - static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, page); @@ -451,61 +438,6 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, return count; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) -{ - if (!wbt_rq_qos(q)) - return -EINVAL; - - if (wbt_disabled(q)) - return sprintf(page, "0\n"); - - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); -} - -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, - size_t count) -{ - struct rq_qos *rqos; - ssize_t ret; - s64 val; - - ret = queue_var_store64(&val, page); - if (ret < 0) - return ret; - if (val < -1) - return -EINVAL; - - rqos = wbt_rq_qos(q); - if (!rqos) { - ret = wbt_init(q->disk); - if (ret) - return ret; - } - - if (val == -1) - val = wbt_default_latency_nsec(q); - else if (val >= 0) - val *= 1000ULL; - - if (wbt_get_min_lat(q) == val) - return count; - - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. 
- */ - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - wbt_set_min_lat(q, val); - - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - return count; -} - static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -598,7 +530,6 @@ QUEUE_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); @@ -617,6 +548,78 @@ QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RW_ENTRY(queue_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +#ifdef CONFIG_BLK_WBT +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!wbt_rq_qos(q)) + return -EINVAL; + + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_qos *rqos; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rqos = wbt_rq_qos(q); + if (!rqos) { + ret = wbt_init(q->disk); + if (ret) + return ret; + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + return count; + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. 
+ */ + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + wbt_set_min_lat(q, val); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return count; +} + +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +#endif + static struct attribute *queue_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -655,7 +658,9 @@ static struct attribute *queue_attrs[] = { &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, +#ifdef CONFIG_BLK_WBT &queue_wb_lat_entry.attr, +#endif &queue_poll_delay_entry.attr, &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-wbt.h b/block/blk-wbt.h index ba6cca5849a6..8a029e138f7a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -18,10 +18,6 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline int wbt_init(struct gendisk *disk) -{ - return -EINVAL; -} static inline void wbt_disable_default(struct gendisk *disk) { } @@ -31,21 +27,6 @@ static inline void wbt_enable_default(struct gendisk *disk) static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline u64 wbt_get_min_lat(struct request_queue *q) -{ - return 0; -} -static inline void wbt_set_min_lat(struct request_queue *q, u64 val) -{ -} -static inline u64 wbt_default_latency_nsec(struct request_queue *q) -{ - return 0; -} -static inline bool wbt_disabled(struct request_queue *q) -{ - return true; -} #endif /* CONFIG_BLK_WBT */ From 71b8642e79f277459555629f2bea1a8d1fed307e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:41 +0800 Subject: [PATCH 13/25] blk-wbt: remove dead code to handle wbt enable/disable with io inflight enable or disable wbt is always called with queue freezed, so that wbt can never be enabled or disabled while io is still inflight, and this behaviour should always hold to avoid io hang(There have been reported several times). 
Therefore, the code to handle wbt enable/disable with io inflight is not and never will be used, hence remove such dead code. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 53bf5aa6f9ad..21bbeb31a444 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -200,15 +200,6 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, inflight = atomic_dec_return(&rqw->inflight); - /* - * wbt got disabled with IO in flight. Wake up any potential - * waiters, we don't have to do more than that. - */ - if (unlikely(!rwb_enabled(rwb))) { - rwb_wake_all(rwb); - return; - } - /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we @@ -545,13 +536,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; - /* - * If we got disabled, just return UINT_MAX. This ensures that - * we'll properly inc a new IO, and dec+wakeup at the end. - */ - if (!rwb_enabled(rwb)) - return UINT_MAX; - if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; From 06257fda83ebfd1c33fb992e41dba7be4e1184d4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:42 +0800 Subject: [PATCH 14/25] blk-wbt: cleanup rwb_enabled() and wbt_disabled() 'wb_normal' will be set to 0 if 'min_lat_nsec' is 0, and 'min_lat_nsec' can only be set to 0 through sysfs configuration where 'WBT_STATE_OFF_MANUAL' is set together, in the meantime, they can only be cleared together through sysfs afterwards. Hence 'wb_normal != 0' is the same as 'rwb->enable_state != WBT_STATE_OFF_MANUAL'. The code is redundant, hence replace the check of 'wb_normal' with 'enable_state' in rwb_enabled() and reuse rwb_enabled() for wbt_disabled().
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 21bbeb31a444..9f7c99c025f3 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -146,7 +146,7 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && - rwb->wb_normal != 0; + rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -494,8 +494,7 @@ bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); - return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || - RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; + return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) From eebc21d12f56c1e09a163abf91e351fa2a55a938 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:43 +0800 Subject: [PATCH 15/25] blk-iocost: move wbt_enable/disable_default() out of spinlock There is the following smatch warning: block/blk-wbt.c:843 wbt_init() warn: sleeping in atomic context ioc_qos_write() <- disables preempt -> wbt_enable_default() -> wbt_init() wbt_init() will be called from wbt_enable_default() if wbt is not initialized, currently this is only possible in blk_register_queue(), hence wbt_init() will never be called from iocost and this warning is a false positive. However, we might support rq_qos destruction dynamically in the future, and it's better to prevent that, hence move wbt_enable_default() outside 'ioc->lock'. This is safe because the queue is still frozen.
Reported-by: Dan Carpenter Link: https://lore.kernel.org/lkml/Y+Ja5SRs886CEz7a@kadam/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6084a9519883..9dfcf540f400 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3301,11 +3301,9 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - wbt_disable_default(disk); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; - wbt_enable_default(disk); } if (user) { @@ -3318,6 +3316,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + if (enable) + wbt_disable_default(disk); + else + wbt_enable_default(disk); + blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); From 6d85ebf95c44e52337ca1d07f0db4b435d1e6762 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 27 May 2023 09:06:44 +0800 Subject: [PATCH 16/25] blk-sysfs: add a new attr_group for blk_mq Currently wbt sysfs entry is created for bio based device, and wbt can be enabled for such device through sysfs while it doesn't make sense because wbt can only work for rq based device. In the meantime, there are other similar sysfs entries. Fix this by adding a new attr_group for blk_mq, and sysfs entries will only be created when the device is rq based. 
Suggested-by: Christoph Hellwig Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230527010644.647900-6-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6c1c4ba66bc0..afc797fb0dfc 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -621,7 +621,6 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif static struct attribute *queue_attrs[] = { - &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -629,7 +628,6 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, - &elv_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -650,7 +648,6 @@ static struct attribute *queue_attrs[] = { &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, - &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_random_entry.attr, @@ -658,11 +655,7 @@ static struct attribute *queue_attrs[] = { &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, -#ifdef CONFIG_BLK_WBT - &queue_wb_lat_entry.attr, -#endif &queue_poll_delay_entry.attr, - &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif @@ -671,16 +664,23 @@ static struct attribute *queue_attrs[] = { NULL, }; +static struct attribute *blk_mq_queue_attrs[] = { + &queue_requests_entry.attr, + &elv_iosched_entry.attr, + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif + NULL, +}; + static umode_t queue_attr_visible(struct kobject *kobj, struct 
attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - if (attr == &queue_io_timeout_entry.attr && - (!q->mq_ops || !q->mq_ops->timeout)) - return 0; - if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) @@ -689,11 +689,30 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, return attr->mode; } +static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; + + if (!queue_is_mq(q)) + return 0; + + if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) + return 0; + + return attr->mode; +} + static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; +static struct attribute_group blk_mq_queue_attr_group = { + .attrs = blk_mq_queue_attrs, + .is_visible = blk_mq_queue_attr_visible, +}; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) @@ -738,6 +757,7 @@ static const struct sysfs_ops queue_sysfs_ops = { static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, + &blk_mq_queue_attr_group, NULL }; From 86da1bae4c64ab3dcbdda0c77ce37c9bf47a501f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 22 Jun 2023 16:53:07 +0900 Subject: [PATCH 17/25] nvme: host: fix command name spelling Correctly spell "Zeroes" in nvme_cmd_write_zeroes command name. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/constants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c index bc523ca02254..44de4f6a4f4c 100644 --- a/drivers/nvme/host/constants.c +++ b/drivers/nvme/host/constants.c @@ -12,7 +12,7 @@ static const char * const nvme_ops[] = { [nvme_cmd_read] = "Read", [nvme_cmd_write_uncor] = "Write Uncorrectable", [nvme_cmd_compare] = "Compare", - [nvme_cmd_write_zeroes] = "Write Zeros", + [nvme_cmd_write_zeroes] = "Write Zeroes", [nvme_cmd_dsm] = "Dataset Management", [nvme_cmd_verify] = "Verify", [nvme_cmd_resv_register] = "Reservation Register", From 99160af413b4ff1c3b4741e8a7583f8e7197f201 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 20 Jun 2023 16:07:36 +0300 Subject: [PATCH 18/25] nvme-mpath: fix I/O failure with EAGAIN when failing over I/O It is possible that the next available path we fail over to happens to be frozen (for example if it is during connection establishment). If the original I/O was set with NOWAIT, this causes the I/O to unnecessarily fail because the request queue cannot be entered, hence the I/O fails with EAGAIN. The NOWAIT restriction that was originally set for the I/O is no longer relevant or needed because this is the nvme requeue context. Hence we clear the REQ_NOWAIT flag when failing over I/O. This fixes a simple test case of nvme controller reset during I/O when the multipath device has only a single path and I/O fails with "Resource temporarily unavailable" errno. Note that this reproduces with io_uring, which sets IOCB_NOWAIT by default.
Signed-off-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 91a9a55227fa..3acb47760e24 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -106,6 +106,14 @@ void nvme_failover_req(struct request *req) bio->bi_opf &= ~REQ_POLLED; bio->bi_cookie = BLK_QC_T_NONE; } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. + */ + bio->bi_opf &= ~REQ_NOWAIT; } blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); From f6c80cffcd47a2d41943e3a41fbe9034d9f6d7b0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 12 Jun 2023 12:03:42 -0700 Subject: [PATCH 19/25] block: add request polling helper Provide a direct request polling helper for drivers. The interface does not require a bio, and can skip the overhead associated with polling those. The biggest gain comes from skipping the relatively expensive xarray lookup, which is unnecessary when you already have the request. With this, the simple rq/qc conversion functions have only one caller each, so open code this and remove the helpers.
Signed-off-by: Keith Busch Reviewed-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230612190343.2087040-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 48 ++++++++++++++++++++++++++++-------------- include/linux/blk-mq.h | 2 ++ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 98eb31ff914d..5504719b970d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -49,17 +49,8 @@ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); - -static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, - blk_qc_t qc) -{ - return xa_load(&q->hctx_table, qc); -} - -static inline blk_qc_t blk_rq_to_qc(struct request *rq) -{ - return rq->mq_hctx->queue_num; -} +static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator @@ -1248,7 +1239,7 @@ void blk_mq_start_request(struct request *rq) q->integrity.profile->prepare_fn(rq); #endif if (rq->bio && rq->bio->bi_opf & REQ_POLLED) - WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); + WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); @@ -1354,7 +1345,7 @@ EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0); + blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } @@ -4749,10 +4740,9 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, - unsigned int flags) +static int blk_hctx_poll(struct 
request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); int ret; @@ -4777,6 +4767,32 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch * return 0; } +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) +{ + struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); + + return blk_hctx_poll(q, hctx, iob, flags); +} + +int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, + unsigned int poll_flags) +{ + struct request_queue *q = rq->q; + int ret; + + if (!blk_rq_is_poll(rq)) + return 0; + if (!percpu_ref_tryget(&q->q_usage_counter)) + return 0; + + ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); + blk_queue_exit(q); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_rq_poll); + unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index aaed687a454c..2b7fb8e87793 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -715,6 +715,8 @@ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_request(struct request *rq); +int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, + unsigned int poll_flags); bool blk_mq_queue_inflight(struct request_queue *q); From 9408d8a37e6cce8803681ab816383450a056c3a9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 12 Jun 2023 12:03:43 -0700 Subject: [PATCH 20/25] nvme: improved uring polling Drivers can poll requests directly, so use that. We just need to ensure the driver's request was allocated from a polled hctx, so a special driver flag is added to struct io_uring_cmd. The allows unshared and multipath namespaces to use the same polling callback, and multipath is guaranteed to get the same queue as the command was submitted on. 
Previously multipath polling might check a different path and poll the wrong info. The other bonus is we don't need a bio payload in order to poll, allowing commands like 'flush' and 'write zeroes' to be submitted on the same high priority queue as read and write commands. Finally, using the request based polling skips the unnecessary bio overhead. Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230612190343.2087040-3-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 70 ++++++++++------------------------- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 2 - include/uapi/linux/io_uring.h | 2 + 4 files changed, 22 insertions(+), 54 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 2130ad65b58c..5c3250f36ce7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -505,7 +505,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; if (nvme_req(req)->flags & NVME_REQ_CANCELLED) @@ -518,10 +517,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. 
*/ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } @@ -531,7 +532,6 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, { struct io_uring_cmd *ioucmd = req->end_io_data; struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - void *cookie = READ_ONCE(ioucmd->cookie); req->bio = pdu->bio; pdu->req = req; @@ -540,10 +540,12 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req, * For iopoll, complete it directly. * Otherwise, move the completion to task work. */ - if (cookie != NULL && blk_rq_is_poll(req)) + if (blk_rq_is_poll(req)) { + WRITE_ONCE(ioucmd->cookie, NULL); nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED); - else + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb); + } return RQ_END_IO_NONE; } @@ -599,7 +601,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (issue_flags & IO_URING_F_IOPOLL) rq_flags |= REQ_POLLED; -retry: req = nvme_alloc_user_request(q, &c, rq_flags, blk_flags); if (IS_ERR(req)) return PTR_ERR(req); @@ -613,17 +614,11 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return ret; } - if (issue_flags & IO_URING_F_IOPOLL && rq_flags & REQ_POLLED) { - if (unlikely(!req->bio)) { - /* we can't poll this, so alloc regular req instead */ - blk_mq_free_request(req); - rq_flags &= ~REQ_POLLED; - goto retry; - } else { - WRITE_ONCE(ioucmd->cookie, req->bio); - req->bio->bi_opf |= REQ_POLLED; - } + if (blk_rq_is_poll(req)) { + ioucmd->flags |= IORING_URING_CMD_POLLED; + WRITE_ONCE(ioucmd->cookie, req); } + /* to free bio on completion, as req->bio will be null at that time */ pdu->bio = req->bio; pdu->meta_len = d.metadata_len; @@ -785,18 +780,16 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd 
*ioucmd, struct io_comp_batch *iob, unsigned int poll_flags) { - struct bio *bio; + struct request *req; int ret = 0; - struct nvme_ns *ns; - struct request_queue *q; + + if (!(ioucmd->flags & IORING_URING_CMD_POLLED)) + return 0; rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - ns = container_of(file_inode(ioucmd->file)->i_cdev, - struct nvme_ns, cdev); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); + req = READ_ONCE(ioucmd->cookie); + if (req && blk_rq_is_poll(req)) + ret = blk_rq_poll(req, iob, poll_flags); rcu_read_unlock(); return ret; } @@ -890,31 +883,6 @@ int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, srcu_read_unlock(&head->srcu, srcu_idx); return ret; } - -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, - unsigned int poll_flags) -{ - struct cdev *cdev = file_inode(ioucmd->file)->i_cdev; - struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); - int srcu_idx = srcu_read_lock(&head->srcu); - struct nvme_ns *ns = nvme_find_path(head); - struct bio *bio; - int ret = 0; - struct request_queue *q; - - if (ns) { - rcu_read_lock(); - bio = READ_ONCE(ioucmd->cookie); - q = ns->queue; - if (test_bit(QUEUE_FLAG_POLL, &q->queue_flags) && bio - && bio->bi_bdev) - ret = bio_poll(bio, iob, poll_flags); - rcu_read_unlock(); - } - srcu_read_unlock(&head->srcu, srcu_idx); - return ret; -} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 98001eebd275..5aa1592849af 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -470,7 +470,7 @@ static const struct file_operations nvme_ns_head_chr_fops = { .unlocked_ioctl = nvme_ns_head_chr_ioctl, .compat_ioctl = compat_ptr_ioctl, .uring_cmd = nvme_ns_head_chr_uring_cmd, - .uring_cmd_iopoll = 
nvme_ns_head_chr_uring_cmd_iopoll, + .uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll, }; static int nvme_add_ns_head_cdev(struct nvme_ns_head *head) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a98c14c552a..791cafd9910a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -854,8 +854,6 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, struct io_comp_batch *iob, unsigned int poll_flags); -int nvme_ns_head_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, - struct io_comp_batch *iob, unsigned int poll_flags); int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd, diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f222d263bc55..08720c7bd92f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -244,8 +244,10 @@ enum io_uring_op { * sqe->uring_cmd_flags * IORING_URING_CMD_FIXED use registered buffer; pass this flag * along with setting sqe->buf_index. + * IORING_URING_CMD_POLLED driver use only */ #define IORING_URING_CMD_FIXED (1U << 0) +#define IORING_URING_CMD_POLLED (1U << 31) /* From a587b046ce921cc1805de6f0f000209b3644cadd Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 29 Jun 2023 01:30:47 -0700 Subject: [PATCH 21/25] cdrom/gdrom: Fix build error Commit 7ae24fcee992 ("cdrom: remove the unused mode argument to cdrom_release") was supposed to remove an unused argument from cdrom_release(). but instead removed a used argument from cdrom_open(). This results in the following build error. drivers/cdrom/gdrom.c: In function 'gdrom_bdops_open': drivers/cdrom/gdrom.c:484:15: error: too few arguments to function 'cdrom_open' drivers/cdrom/gdrom.c: In function 'gdrom_bdops_release': drivers/cdrom/gdrom.c:492:35: error: 'mode' undeclared Fix it up. 
Fixes: 7ae24fcee992 ("cdrom: remove the unused mode argument to cdrom_release") Cc: Christoph Hellwig Cc: Phillip Potter Cc: Hannes Reinecke Cc: Christian Brauner Cc: Jens Axboe Signed-off-by: Guenter Roeck Link: https://lore.kernel.org/r/20230629083047.3487172-1-linux@roeck-us.net Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 3a46e27479ff..d668b174ace9 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode) disk_check_media_change(disk); mutex_lock(&gdrom_mutex); - ret = cdrom_open(gd.cd_info); + ret = cdrom_open(gd.cd_info, mode); mutex_unlock(&gdrom_mutex); return ret; } @@ -489,7 +489,7 @@ static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode) static void gdrom_bdops_release(struct gendisk *disk) { mutex_lock(&gdrom_mutex); - cdrom_release(gd.cd_info, mode); + cdrom_release(gd.cd_info); mutex_unlock(&gdrom_mutex); } From 2ab4e5f44a869eaf61d7520ad6296b91f67efeed Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:46:56 -0700 Subject: [PATCH 22/25] nvme: ensure unquiesce on teardown The reset work is called on quiesced IO queues, so ensure these are unquiesced after a failed reset to flush out any pending requests. 
Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b027e5e3f4ac..8eaa954aa6ed 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2778,6 +2778,7 @@ static void nvme_reset_work(struct work_struct *work) nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_dev_disable(dev, true); nvme_mark_namespaces_dead(&dev->ctrl); + nvme_unquiesce_io_queues(&dev->ctrl); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); } From a2b5d5443fa7a0e9f26b31598bcc38c2b66300d9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:48:15 -0700 Subject: [PATCH 23/25] nvme: sync timeout work on failed reset Timeouts during reset will set the controller for failure, preventing the state change to LIVE. Ensure all timeout work is synced after the controller disabling completes to ensure we don't have any other tasks messing with any namespace request_queue's. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8eaa954aa6ed..bfeadecf9e15 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2777,6 +2777,7 @@ static void nvme_reset_work(struct work_struct *work) result); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); nvme_dev_disable(dev, true); + nvme_sync_queues(&dev->ctrl); nvme_mark_namespaces_dead(&dev->ctrl); nvme_unquiesce_io_queues(&dev->ctrl); nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); From 4e69d4dabd2379af57b0b8fb9b0d62c23f9cd3b8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 28 Jun 2023 07:51:02 -0700 Subject: [PATCH 24/25] nvme: disable controller on reset state failure If the controller is not in a RESETTING state at the point of reset work, we have to conclude the controller is being deleted. 
Go to the cleanup on this condition to ensure proper pairing of request_queue quiesce state. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bfeadecf9e15..c9224d39195e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2690,7 +2690,8 @@ static void nvme_reset_work(struct work_struct *work) if (dev->ctrl.state != NVME_CTRL_RESETTING) { dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n", dev->ctrl.state); - return; + result = -ENODEV; + goto out; } /* From e836007089ba8fdf24e636ef2b007651fb4582e6 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 23 Jun 2023 14:05:23 -0400 Subject: [PATCH 25/25] md/raid0: add discard support for the 'original' layout We've found that using raid0 with the 'original' layout and discard enabled with different disk sizes (such that at least two zones are created) can result in data corruption. This is due to the fact that the discard handling in 'raid0_handle_discard()' assumes the 'alternate' layout. We've seen this corruption using ext4 but other filesystems are likely susceptible as well. More specifically, while multiple zones are necessary to create the corruption, the corruption may not occur with multiple zones if they are laid out in such a way that the layout matches what the 'alternate' layout would have produced. Thus, not all raid0 devices with the 'original' layout, different size disks and discard enabled will encounter this corruption. The 3.14 kernel inadvertently changed the raid0 disk layout for different size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the same raid0 array could corrupt data. 
This led to the creation of the 'original' layout (to match the pre-3.14 layout) and the 'alternate' layout (to match the post-3.14 layout) in the 5.4 kernel time frame and an option to tell the kernel which layout to use (since it couldn't be autodetected). However, when the 'original' layout was added back in 5.4, discard support for the 'original' layout was not added, leading to this issue. I've been able to reliably reproduce the corruption with the following test case: 1. create raid0 array with different size disks using original layout 2. mkfs 3. mount -o discard 4. create lots of files 5. remove 1/2 the files 6. fstrim -a (or just the mount point for the raid0 array) 7. umount 8. fsck -fn /dev/md0 (spews all sorts of corruptions) Let's fix this by adding proper discard support to the 'original' layout. The fix 'maps' the 'original' layout disks to the order in which they are read/written such that we can compare the disks in the same way that the current 'alternate' layout does. A 'disk_shift' field is added to 'struct strip_zone'. This could be computed on the fly in raid0_handle_discard() but by adding this field, we save some computation in the discard path. Note we could also potentially fix this by re-ordering the disks in the zones that follow the first one, and then always reading/writing them using the 'alternate' layout. However, that is seen as a more substantial change, and we are attempting the least invasive fix at this time to remedy the corruption. I've verified the change using the reproducer mentioned above. Typically, the corruption is seen after fewer than 3 iterations, while with the patch applied the reproducer has run 500+ iterations without corruption. 
Cc: NeilBrown Cc: Song Liu Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.") Cc: stable@vger.kernel.org Signed-off-by: Jason Baron Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com --- drivers/md/raid0.c | 62 ++++++++++++++++++++++++++++++++++++++++------ drivers/md/raid0.h | 1 + 2 files changed, 55 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8ee9a95e25d..d1ac73fcd852 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -270,6 +270,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) goto abort; } + if (conf->layout == RAID0_ORIG_LAYOUT) { + for (i = 1; i < conf->nr_strip_zones; i++) { + sector_t first_sector = conf->strip_zone[i-1].zone_end; + + sector_div(first_sector, mddev->chunk_sectors); + zone = conf->strip_zone + i; + /* disk_shift is first disk index used in the zone */ + zone->disk_shift = sector_div(first_sector, + zone->nb_dev); + } + } + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -431,6 +443,20 @@ static int raid0_run(struct mddev *mddev) return ret; } +/* + * Convert disk_index to the disk order in which it is read/written. + * For example, if we have 4 disks, they are numbered 0,1,2,3. If we + * write the disks starting at disk 3, then the read/write order would + * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() + * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map + * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in + * that 'output' space to understand the read/write disk ordering. 
+ */ +static int map_disk_shift(int disk_index, int num_disks, int disk_shift) +{ + return ((disk_index + num_disks - disk_shift) % num_disks); +} + static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; @@ -444,7 +470,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) sector_t end_disk_offset; unsigned int end_disk_index; unsigned int disk; + sector_t orig_start, orig_end; + orig_start = start; zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { @@ -458,6 +486,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) } else end = bio_end_sector(bio); + orig_end = end; if (zone != conf->strip_zone) end = end - zone[-1].zone_end; @@ -469,13 +498,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) last_stripe_index = end; sector_div(last_stripe_index, stripe_size); - start_disk_index = (int)(start - first_stripe_index * stripe_size) / - mddev->chunk_sectors; + /* In the first zone the original and alternate layouts are the same */ + if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { + sector_div(orig_start, mddev->chunk_sectors); + start_disk_index = sector_div(orig_start, zone->nb_dev); + start_disk_index = map_disk_shift(start_disk_index, + zone->nb_dev, + zone->disk_shift); + sector_div(orig_end, mddev->chunk_sectors); + end_disk_index = sector_div(orig_end, zone->nb_dev); + end_disk_index = map_disk_shift(end_disk_index, + zone->nb_dev, zone->disk_shift); + } else { + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + } start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % mddev->chunk_sectors) + first_stripe_index * mddev->chunk_sectors; - end_disk_index = (int)(end - last_stripe_index * stripe_size) / - mddev->chunk_sectors; end_disk_offset = 
((int)(end - last_stripe_index * stripe_size) % mddev->chunk_sectors) + last_stripe_index * mddev->chunk_sectors; @@ -483,18 +525,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; struct md_rdev *rdev; + int compare_disk; - if (disk < start_disk_index) + compare_disk = map_disk_shift(disk, zone->nb_dev, + zone->disk_shift); + + if (compare_disk < start_disk_index) dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > start_disk_index) + else if (compare_disk > start_disk_index) dev_start = first_stripe_index * mddev->chunk_sectors; else dev_start = start_disk_offset; - if (disk < end_disk_index) + if (compare_disk < end_disk_index) dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > end_disk_index) + else if (compare_disk > end_disk_index) dev_end = last_stripe_index * mddev->chunk_sectors; else dev_end = end_disk_offset; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 3816e5477db1..8cc761ca7423 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -6,6 +6,7 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ int nb_dev; /* # of devices attached to the zone */ + int disk_shift; /* start disk for the original layout */ }; /* Linux 3.14 (20d0189b101) made an unintended change to