From 83794367dcc6749662b17a1e4b8ec085023fc53b Mon Sep 17 00:00:00 2001
From: Damien Le Moal
Date: Mon, 24 Apr 2023 22:13:18 +0900
Subject: [PATCH 01/17] block: Cleanup set_capacity()/bdev_set_nr_sectors()

The code for setting a block device capacity (bd_nr_sectors field of
struct block_device) is duplicated in set_capacity() and
bdev_set_nr_sectors(). Clean this up by making bdev_set_nr_sectors() a
block layer internal function defined in block/bdev.c instead of having
this function statically defined in block/partitions/core.c.

With this change, set_capacity() implementation can be simplified to
only calling bdev_set_nr_sectors().

Signed-off-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20230424131318.79935-1-dlemoal@kernel.org
Signed-off-by: Jens Axboe
---
 block/bdev.c            | 8 ++++++++
 block/blk.h             | 2 ++
 block/genhd.c           | 7 +------
 block/partitions/core.c | 8 --------
 4 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index 850852fe4b78..717089a5726f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -428,6 +428,14 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	return bdev;
 }
 
+void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
+{
+	spin_lock(&bdev->bd_size_lock);
+	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+	bdev->bd_nr_sectors = sectors;
+	spin_unlock(&bdev->bd_size_lock);
+}
+
 void bdev_add(struct block_device *bdev, dev_t dev)
 {
 	bdev->bd_dev = dev;
diff --git a/block/blk.h b/block/blk.h
index 2da831103471..564119a76bc5 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -419,6 +419,8 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
 		sector_t length);
 void blk_drop_partitions(struct gendisk *disk);
 
+void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
+
 struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 		struct lock_class_key *lkclass);
 
diff --git a/block/genhd.c b/block/genhd.c
index cadcb472dfc3..8d56fb5f08b2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -57,12 +57,7 @@ static DEFINE_IDA(ext_devt_ida);
 
 void set_capacity(struct gendisk *disk, sector_t sectors)
 {
-	struct block_device *bdev = disk->part0;
-
-	spin_lock(&bdev->bd_size_lock);
-	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
-	bdev->bd_nr_sectors = sectors;
-	spin_unlock(&bdev->bd_size_lock);
+	bdev_set_nr_sectors(disk->part0, sectors);
 }
 EXPORT_SYMBOL(set_capacity);
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 7b8ef6296abd..49e0496ff23c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -85,14 +85,6 @@ static int (*check_part[])(struct parsed_partitions *) = {
 	NULL
 };
 
-static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
-{
-	spin_lock(&bdev->bd_size_lock);
-	i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
-	bdev->bd_nr_sectors = sectors;
-	spin_unlock(&bdev->bd_size_lock);
-}
-
 static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
 {
 	struct parsed_partitions *state;

From 38c8e3dfb2a1be863b7f5aad7755d5e9727da8a5 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 25 Apr 2023 11:41:54 +0800
Subject: [PATCH 02/17] block: sync part's ->bd_has_submit_bio with disk's

submit_bio() always uses bio->bi_bdev->bd_has_submit_bio to decide if
disk's ->submit_bio() is called, and bio->bi_bdev could point to one
partition device. So we have to sync part bdev's ->bd_has_submit_bio
with disk's.
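For context, bio submission dispatches on this flag per bio->bi_bdev, so a
partition bdev left at the default value of false would be routed to blk-mq
even when the underlying disk is bio-based. A simplified sketch of the
dispatch logic in block/blk-core.c (illustrative only, details elided):

	static void __submit_bio(struct bio *bio)
	{
		if (!bio->bi_bdev->bd_has_submit_bio) {
			blk_mq_submit_bio(bio);		/* request-based (blk-mq) path */
		} else if (likely(bio_queue_enter(bio) == 0)) {
			struct gendisk *disk = bio->bi_bdev->bd_disk;

			disk->fops->submit_bio(bio);	/* bio-based driver path */
			blk_queue_exit(disk->queue);
		}
	}

Hence a partition's bdev must inherit the flag from disk->part0 at allocation
time, which is what the hunk below does.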
Reported-by: Changhui Zhong Link: https://lore.kernel.org/linux-block/ZEdItaPqif8fp85H@ovpn-8-24.pek2.redhat.com/T/#t Fixes: 9f4107b07b17 ("block: store bdev->bd_disk->fops->submit_bio state in bdev") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230425034154.110099-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bdev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index 717089a5726f..21c63bfef323 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -418,8 +418,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_partno = partno; bdev->bd_inode = inode; bdev->bd_queue = disk->queue; + if (partno) + bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio; + else + bdev->bd_has_submit_bio = false; bdev->bd_stats = alloc_percpu(struct disk_stats); - bdev->bd_has_submit_bio = false; if (!bdev->bd_stats) { iput(inode); return NULL; From 3f89ac587baa0c0460c977d1596e16f950815f05 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Apr 2023 16:46:28 -0700 Subject: [PATCH 03/17] block/drivers: remove dead clear of random flag QUEUE_FLAG_ADD_RANDOM is not set before we clear it for "null_blk", "brd", "nbd", "zram", and "bcache" since by default we don't set "QUEUE_FLAG_ADD_RANDOM" to MQ ops. Remove dead clear of QUEUE_FLAG_ADD_RANDOM in above listed drivers. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sergey Senozhatsky #zram Link: https://lore.kernel.org/r/20230424234628.45544-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 1 - drivers/block/nbd.c | 1 - drivers/block/null_blk/main.c | 1 - drivers/block/zram/zram_drv.c | 1 - drivers/md/bcache/super.c | 1 - 5 files changed, 5 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 34177f1bd97d..bcad9b926b0c 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -404,7 +404,6 @@ static int brd_alloc(int i) /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue); err = add_disk(disk); if (err) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index d445fd0934bd..7c96ec4e99df 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1805,7 +1805,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) * Tell the block layer that we are not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 0; blk_queue_max_discard_sectors(disk->queue, 0); blk_queue_max_segment_size(disk->queue, UINT_MAX); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index fcb61f1d5437..a60671a42de9 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2145,7 +2145,6 @@ static int null_add_dev(struct nullb_device *dev) nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index aa490da3cef2..f7d4c0d5ad0d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2323,7 +2323,6 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational 
disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); /* * To ensure that we always get PAGE_SIZE aligned diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index ba3909bb6bea..7e9d19fd21dd 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -971,7 +971,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, } blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue); blk_queue_write_cache(q, true, true); From 3315e169b446249c1b61ff988d157238f4b2c5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 Mar 2023 17:36:23 +0000 Subject: [PATCH 04/17] blk-integrity: use sysfs_emit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The correct way to emit data into sysfs is via sysfs_emit(), use it. Also perform some trivial syntactic cleanups. Signed-off-by: Thomas Weißschuh Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20230309-kobj_release-gendisk_integrity-v3-1-ceccb4493c46@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-integrity.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 8f01d786f5cb..aca8c783d749 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -248,20 +248,19 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { if (bi->profile && bi->profile->name) - return sprintf(page, "%s\n", bi->profile->name); - else - return sprintf(page, "none\n"); + return sysfs_emit(page, "%s\n", bi->profile->name); + return sysfs_emit(page, "none\n"); } static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%u\n", bi->tag_size); + return sysfs_emit(page, "%u\n", bi->tag_size); } static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%u\n", - bi->interval_exp ? 1 << bi->interval_exp : 0); + return sysfs_emit(page, "%u\n", + bi->interval_exp ? 
1 << bi->interval_exp : 0); } static ssize_t integrity_verify_store(struct blk_integrity *bi, @@ -280,7 +279,7 @@ static ssize_t integrity_verify_store(struct blk_integrity *bi, static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0); + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); } static ssize_t integrity_generate_store(struct blk_integrity *bi, @@ -299,13 +298,13 @@ static ssize_t integrity_generate_store(struct blk_integrity *bi, static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); } static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) { - return sprintf(page, "%u\n", - (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0); + return sysfs_emit(page, "%u\n", + !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)); } static struct integrity_sysfs_entry integrity_format_entry = { From 76b8c319f02715e14abdbbbdd6508e83a1059bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 Mar 2023 17:36:24 +0000 Subject: [PATCH 05/17] blk-integrity: convert to struct device_attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An upcoming patch will register the integrity attributes directly with the struct device kobject. For this the attributes have to be implemented in terms of struct device_attribute. Signed-off-by: Thomas Weißschuh Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20230309-kobj_release-gendisk_integrity-v3-2-ceccb4493c46@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-integrity.c | 129 +++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 66 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index aca8c783d749..1cbfdea88c72 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -212,21 +212,15 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, return true; } -struct integrity_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_integrity *, char *); - ssize_t (*store)(struct blk_integrity *, const char *, size_t); -}; - static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->queue->integrity; - struct integrity_sysfs_entry *entry = - container_of(attr, struct integrity_sysfs_entry, attr); + struct device *dev = disk_to_dev(disk); + struct device_attribute *dev_attr = + container_of(attr, struct device_attribute, attr); - return entry->show(bi, page); + return dev_attr->show(dev, dev_attr, page); } static ssize_t integrity_attr_store(struct kobject *kobj, @@ -234,38 +228,53 @@ static ssize_t integrity_attr_store(struct kobject *kobj, size_t count) { struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->queue->integrity; - struct integrity_sysfs_entry *entry = - container_of(attr, struct integrity_sysfs_entry, attr); - ssize_t ret = 0; + struct device *dev = disk_to_dev(disk); + struct device_attribute *dev_attr = + container_of(attr, struct device_attribute, attr); - if (entry->store) - ret = entry->store(bi, page, count); - - return ret; + if 
(!dev_attr->store) + return 0; + return dev_attr->store(dev, dev_attr, page, count); } -static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +static inline struct blk_integrity *dev_to_bi(struct device *dev) { + return &dev_to_disk(dev)->queue->integrity; +} + +static ssize_t format_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct blk_integrity *bi = dev_to_bi(dev); + if (bi->profile && bi->profile->name) return sysfs_emit(page, "%s\n", bi->profile->name); return sysfs_emit(page, "none\n"); } -static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, + char *page) { + struct blk_integrity *bi = dev_to_bi(dev); + return sysfs_emit(page, "%u\n", bi->tag_size); } -static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +static ssize_t protection_interval_bytes_show(struct device *dev, + struct device_attribute *attr, + char *page) { + struct blk_integrity *bi = dev_to_bi(dev); + return sysfs_emit(page, "%u\n", bi->interval_exp ? 1 << bi->interval_exp : 0); } -static ssize_t integrity_verify_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t read_verify_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t count) { + struct blk_integrity *bi = dev_to_bi(dev); char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); @@ -277,14 +286,20 @@ static ssize_t integrity_verify_store(struct blk_integrity *bi, return count; } -static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) +static ssize_t read_verify_show(struct device *dev, + struct device_attribute *attr, char *page) { + struct blk_integrity *bi = dev_to_bi(dev); + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); } -static ssize_t integrity_generate_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t write_generate_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t count) { + struct blk_integrity *bi = dev_to_bi(dev); + char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); @@ -296,57 +311,39 @@ static ssize_t integrity_generate_store(struct blk_integrity *bi, return count; } -static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) +static ssize_t write_generate_show(struct device *dev, + struct device_attribute *attr, char *page) { + struct blk_integrity *bi = dev_to_bi(dev); + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); } -static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) +static ssize_t device_is_integrity_capable_show(struct device *dev, + struct device_attribute *attr, + char *page) { + struct blk_integrity *bi = dev_to_bi(dev); + return sysfs_emit(page, "%u\n", !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)); } -static struct integrity_sysfs_entry integrity_format_entry = { - .attr = { .name = "format", .mode = 0444 }, - .show = integrity_format_show, -}; - -static struct integrity_sysfs_entry integrity_tag_size_entry = { - .attr = { .name = "tag_size", .mode = 0444 }, - .show = integrity_tag_size_show, -}; - -static struct integrity_sysfs_entry integrity_interval_entry = { - .attr = { .name = "protection_interval_bytes", .mode = 0444 }, - .show = integrity_interval_show, -}; - -static struct integrity_sysfs_entry integrity_verify_entry = { - .attr = { .name = "read_verify", .mode = 0644 }, - 
.show = integrity_verify_show, - .store = integrity_verify_store, -}; - -static struct integrity_sysfs_entry integrity_generate_entry = { - .attr = { .name = "write_generate", .mode = 0644 }, - .show = integrity_generate_show, - .store = integrity_generate_store, -}; - -static struct integrity_sysfs_entry integrity_device_entry = { - .attr = { .name = "device_is_integrity_capable", .mode = 0444 }, - .show = integrity_device_show, -}; +static DEVICE_ATTR_RO(format); +static DEVICE_ATTR_RO(tag_size); +static DEVICE_ATTR_RO(protection_interval_bytes); +static DEVICE_ATTR_RW(read_verify); +static DEVICE_ATTR_RW(write_generate); +static DEVICE_ATTR_RO(device_is_integrity_capable); static struct attribute *integrity_attrs[] = { - &integrity_format_entry.attr, - &integrity_tag_size_entry.attr, - &integrity_interval_entry.attr, - &integrity_verify_entry.attr, - &integrity_generate_entry.attr, - &integrity_device_entry.attr, - NULL, + &dev_attr_format.attr, + &dev_attr_tag_size.attr, + &dev_attr_protection_interval_bytes.attr, + &dev_attr_read_verify.attr, + &dev_attr_write_generate.attr, + &dev_attr_device_is_integrity_capable.attr, + NULL }; ATTRIBUTE_GROUPS(integrity); From ff53cd52d9bdbf4074d2bbe9b591729997780bd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 Mar 2023 17:36:25 +0000 Subject: [PATCH 06/17] blk-integrity: register sysfs attributes on struct device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "integrity" kobject only acted as a holder for static sysfs entries. It also was embedded into struct gendisk without managing it, violating assumptions of the driver core. Instead register the sysfs entries directly onto the struct device. Also drop the now unused member integrity_kobj from struct gendisk. Suggested-by: Christoph Hellwig Signed-off-by: Thomas Weißschuh Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Link: https://lore.kernel.org/r/20230309-kobj_release-gendisk_integrity-v3-3-ceccb4493c46@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-integrity.c | 55 +++--------------------------------------- block/blk.h | 10 +------- block/genhd.c | 12 +++------ include/linux/blkdev.h | 3 --- 4 files changed, 8 insertions(+), 72 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 1cbfdea88c72..d4e9b4556d14 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -212,31 +212,6 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, return true; } -static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct device *dev = disk_to_dev(disk); - struct device_attribute *dev_attr = - container_of(attr, struct device_attribute, attr); - - return dev_attr->show(dev, dev_attr, page); -} - -static ssize_t integrity_attr_store(struct kobject *kobj, - struct attribute *attr, const char *page, - size_t count) -{ - struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct device *dev = disk_to_dev(disk); - struct device_attribute *dev_attr = - container_of(attr, struct device_attribute, attr); - - if (!dev_attr->store) - return 0; - return dev_attr->store(dev, dev_attr, page, count); -} - static inline struct blk_integrity *dev_to_bi(struct device *dev) { return &dev_to_disk(dev)->queue->integrity; @@ -345,16 +320,10 @@ static struct attribute *integrity_attrs[] = { &dev_attr_device_is_integrity_capable.attr, NULL }; -ATTRIBUTE_GROUPS(integrity); -static const struct sysfs_ops integrity_ops = { - .show = &integrity_attr_show, - .store = &integrity_attr_store, -}; - -static const struct kobj_type integrity_ktype = { - .default_groups = integrity_groups, - .sysfs_ops = &integrity_ops, +const struct attribute_group blk_integrity_attr_group = { + .name = "integrity", + .attrs = integrity_attrs, }; static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) @@ -433,21 +402,3 @@ void blk_integrity_unregister(struct gendisk *disk) memset(bi, 0, sizeof(*bi)); } EXPORT_SYMBOL(blk_integrity_unregister); - -int blk_integrity_add(struct gendisk *disk) -{ - int ret; - - ret = kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity"); - if (!ret) - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); - return ret; -} - -void blk_integrity_del(struct gendisk *disk) -{ - kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); - kobject_del(&disk->integrity_kobj); - kobject_put(&disk->integrity_kobj); -} diff --git a/block/blk.h b/block/blk.h index 564119a76bc5..45547bcf1119 100644 --- a/block/blk.h +++ b/block/blk.h @@ -214,8 +214,7 @@ static inline bool integrity_req_gap_front_merge(struct request *req, bip_next->bip_vec[0].bv_offset); } -int blk_integrity_add(struct gendisk *disk); -void blk_integrity_del(struct gendisk *); +extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) @@ -248,13 +247,6 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline int blk_integrity_add(struct gendisk *disk) -{ - return 0; -} -static inline void blk_integrity_del(struct gendisk *disk) -{ -} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long 
blk_rq_timeout(unsigned long timeout); diff --git a/block/genhd.c b/block/genhd.c index 8d56fb5f08b2..9fa4a7cd978c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -478,15 +478,11 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - ret = blk_integrity_add(disk); - if (ret) - goto out_del_block_link; - disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; - goto out_del_integrity; + goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { @@ -549,8 +545,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); -out_del_integrity: - blk_integrity_del(disk); out_del_block_link: if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(ddev)); @@ -613,7 +607,6 @@ void del_gendisk(struct gendisk *disk) if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; - blk_integrity_del(disk); disk_del_events(disk); mutex_lock(&disk->open_mutex); @@ -1150,6 +1143,9 @@ static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + &blk_integrity_attr_group, #endif NULL }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6ede578dfbc6..aac5c8d7a9ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -157,9 +157,6 @@ struct gendisk { struct timer_rand_state *random; atomic_t sync_io; /* RAID */ struct disk_events *ev; -#ifdef CONFIG_BLK_DEV_INTEGRITY - struct kobject integrity_kobj; -#endif /* CONFIG_BLK_DEV_INTEGRITY */ #ifdef CONFIG_BLK_DEV_ZONED /* From daf376a366fd2d469d66ab83dfdc074777462bab Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:08 -0500 Subject: [PATCH 07/17] uapi nbd: improve doc links to userspace spec The uapi header intentionally documents only the NBD server features that the kernel module will utilize as a client. But while it already had one mention of skipped bits due to userspace extensions, it did not actually direct the reader to the canonical source to learn about those extensions. While touching comments, fix an outdated reference that listed only READ and WRITE as commands. Signed-off-by: Eric Blake Reviewed-by: Ming Lei Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-2-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 20d6cc91435d..8797387caaf7 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -11,6 +11,8 @@ * Cleanup PARANOIA usage & code. * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments + * 2023 Copyright Red Hat + * Link to userspace extensions. */ #ifndef _UAPILINUX_NBD_H @@ -30,12 +32,18 @@ #define NBD_SET_TIMEOUT _IO( 0xab, 9 ) #define NBD_SET_FLAGS _IO( 0xab, 10) +/* + * See also https://github.com/NetworkBlockDevice/nbd/blob/master/doc/proto.md + * for additional userspace extensions not yet utilized in the kernel module. 
+ */ + enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, NBD_CMD_TRIM = 4 + /* userspace defines additional extension commands */ }; /* values for flags field, these are server interaction specific. */ @@ -64,14 +72,15 @@ enum { #define NBD_REQUEST_MAGIC 0x25609513 #define NBD_REPLY_MAGIC 0x67446698 /* Do *not* use magics: 0x12560953 0x96744668. */ +/* magic 0x668e33ef for structured reply not supported by kernel yet */ /* * This is the packet used for communication between client and * server. All data are in network byte order. */ struct nbd_request { - __be32 magic; - __be32 type; /* == READ || == WRITE */ + __be32 magic; /* NBD_REQUEST_MAGIC */ + __be32 type; /* See NBD_CMD_* */ char handle[8]; __be64 from; __be32 len; @@ -82,7 +91,7 @@ struct nbd_request { * it has completed an I/O request (or an error occurs). */ struct nbd_reply { - __be32 magic; + __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ char handle[8]; /* handle you got from request */ }; From 2686eb845da7762ee98b17e578b0c081aafb77b9 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:09 -0500 Subject: [PATCH 08/17] uapi nbd: add cookie alias to handle The uapi header declares a 'char handle[8]' per request; which is overloaded in English (are you referring to "handle" the verb, such as handling a signal or writing a callback handler, or "handle" the noun, the value used in a lookup table to correlate a response back to the request). Many user-space NBD implementations (both servers and clients) have instead used 'uint64_t cookie' or similar, as it is easier to directly assign an integer than to futz around with memcpy. In fact, upstream documentation is now encouraging this shift in terminology: https://github.com/NetworkBlockDevice/nbd/commit/ca4392eb2b Accomplish this by use of an anonymous union to provide the alias for anyone getting the definition from the uapi; this does not break existing clients, while exposing the nicer name for those who prefer it. Note that block/nbd.c still uses the term handle (in fact, it actually combines a 32-bit cookie and a 32-bit tag into the 64-bit handle), but that internal usage is not changed by the public uapi, since no compliant NBD server has any reason to inspect or alter the 64 bits sent over the socket. Signed-off-by: Eric Blake Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-3-eblake@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/nbd.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index 8797387caaf7..80ce0ef43afd 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -12,7 +12,7 @@ * 2004/02/19 Paul Clements * Removed PARANOIA, plus various cleanup and comments * 2023 Copyright Red Hat - * Link to userspace extensions. + * Link to userspace extensions, favor cookie over handle. 
*/ #ifndef _UAPILINUX_NBD_H @@ -81,7 +81,10 @@ enum { struct nbd_request { __be32 magic; /* NBD_REQUEST_MAGIC */ __be32 type; /* See NBD_CMD_* */ - char handle[8]; + union { + __be64 cookie; /* Opaque identifier for request */ + char handle[8]; /* older spelling of cookie */ + }; __be64 from; __be32 len; } __attribute__((packed)); @@ -93,6 +96,9 @@ struct nbd_request { struct nbd_reply { __be32 magic; /* NBD_REPLY_MAGIC */ __be32 error; /* 0 = ok, else error */ - char handle[8]; /* handle you got from request */ + union { + __be64 cookie; /* Opaque identifier from request */ + char handle[8]; /* older spelling of cookie */ + }; }; #endif /* _UAPILINUX_NBD_H */ From bd9e9916c32fd4b4fb4e879e05bd1568ee02ec93 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:10 -0500 Subject: [PATCH 09/17] block nbd: use req.cookie instead of req.handle The NBD spec was recently changed [1] to refer to the opaque client identifier as a 'cookie' rather than a 'handle', but has for a much longer time listed it as a 64-bit value, and declares that all values in the NBD protocol are sent in network byte order (big-endian). Because the value is opaque to the server, it doesn't usually matter what endianness we send as the client - as long as we are consistent that either we byte-swap on both write and read, or on neither, then we can match server replies back to our requests. That said, our internal use of the cookie is as a 64-bit number (well, as two 32-bit numbers concatenated together), rather than as 8 individual bytes; so prior to this commit, we ARE leaking the native endianness of our internals as a client out to the server. We don't know of any server that will actually inspect the opaque value and behave differently depending on whether a little-endian or big-endian client is sending requests, but since we DO log the cookie value, a wireshark capture of the network traffic is easier to correlate back to the kernel traffic of a big-endian host (where the u64 and char[8] representations are the same) than of a little-endian host (where if wireshark honors the NBD spec and displays a u64 in network byte order, it is byte-swapped from what the kernel logged). The fix in this patch is thus two-part: it now consistently uses network byte order for the opaque value (no difference to a big-endian machine, but an extra byteswap on a little-endian machine; probably in the noise compared to the overhead of network traffic in general), and now uses a 64-bit integer instead of char[8] as its preferred access to the opaque value (direct assignment instead of memcpy()). 
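As a minimal illustration of the change (the packing shown in the comment is
approximate; see nbd_cmd_handle() for the real layout):

	u64 handle = nbd_cmd_handle(cmd);	/* roughly (cmd_cookie << 32) | blk_mq_unique_tag(req) */

	/* before: raw memcpy() leaks host endianness onto the wire */
	memcpy(request.handle, &handle, sizeof(handle));

	/* after: direct assignment, always big-endian on the wire */
	request.cookie = cpu_to_be64(handle);

On the reply side, be64_to_cpu(reply->cookie) recovers the same 64-bit value
on both little- and big-endian hosts.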
Signed-off-by: Eric Blake Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-4-eblake@redhat.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 7c96ec4e99df..9c35c958f2c8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -609,7 +609,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) request.len = htonl(size); } handle = nbd_cmd_handle(cmd); - memcpy(request.handle, &handle, sizeof(handle)); + request.cookie = cpu_to_be64(handle); trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd)); @@ -621,7 +621,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index) trace_nbd_header_sent(req, handle); if (result < 0) { if (was_interrupted(result)) { - /* If we havne't sent anything we can just return BUSY, + /* If we haven't sent anything we can just return BUSY, * however if we have sent something we need to make * sure we only allow this req to be sent until we are * completely done. @@ -735,7 +735,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index, u32 tag; int ret = 0; - memcpy(&handle, reply->handle, sizeof(handle)); + handle = be64_to_cpu(reply->cookie); tag = nbd_handle_to_tag(handle); hwq = blk_mq_unique_tag_to_hwq(tag); if (hwq < nbd->tag_set.nr_hw_queues) From 952aa344bf4305ab6fa0d9962ef8c2caa2afef4c Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Mon, 10 Apr 2023 13:06:11 -0500 Subject: [PATCH 10/17] docs nbd: userspace NBD now favors github over sourceforge While the sourceforge site for userspace NBD still exists, the code repository moved to github several years ago. Then with a recent patch[1], the github landing page contains just as much information as the sourceforge page, so we might as well point to a single location that also provides the code. [1] https://lists.debian.org/nbd/2023/03/msg00051.html Signed-off-by: Eric Blake Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20230410180611.1051618-5-eblake@redhat.com Signed-off-by: Jens Axboe --- Documentation/admin-guide/blockdev/nbd.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/blockdev/nbd.rst b/Documentation/admin-guide/blockdev/nbd.rst index d78dfe559dcf..faf2ac4b1509 100644 --- a/Documentation/admin-guide/blockdev/nbd.rst +++ b/Documentation/admin-guide/blockdev/nbd.rst @@ -14,7 +14,7 @@ to borrow disk space from another computer. Unlike NFS, it is possible to put any filesystem on it, etc. For more information, or to download the nbd-client and nbd-server -tools, go to http://nbd.sf.net/. +tools, go to https://github.com/NetworkBlockDevice/nbd. The nbd kernel module need only be installed on the client system, as the nbd-server is completely in userspace. In fact, From fc05e06e6098ca2c28f7a10da0e00aeea20fa59e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Apr 2023 19:15:37 +0200 Subject: [PATCH 11/17] md/raid5: Improve performance for sequential IO Commit 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()") changed the order in which requests for underlying disks are created. Since for large sequential IO adding of requests frequently races with md_raid5 thread submitting bios to underlying disks, this results in a change in IO pattern because intermediate states of new order of request creation result in more smaller discontiguous requests. 
For RAID5 on top of three rotational disks our performance testing revealed this results in regression in write throughput: iozone -a -s 131072000 -y 4 -q 8 -i 0 -i 1 -R before 7e55c60acfbb: KB reclen write rewrite read reread 131072000 4 493670 525964 524575 513384 131072000 8 540467 532880 512028 513703 after 7e55c60acfbb: KB reclen write rewrite read reread 131072000 4 421785 456184 531278 509248 131072000 8 459283 456354 528449 543834 To reduce the amount of discontiguous requests we can start generating requests with the stripe with the lowest chunk offset as that has the best chance of being adjacent to IO queued previously. This improves the performance to: KB reclen write rewrite read reread 131072000 4 497682 506317 518043 514559 131072000 8 514048 501886 506453 504319 restoring big part of the regression. Fixes: 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Jan Kara Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230417171537.17899-1-jack@suse.cz --- drivers/md/raid5.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 812a12e3e41a..4739ed891e75 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6079,6 +6079,38 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, return ret; } +/* + * If the bio covers multiple data disks, find sector within the bio that has + * the lowest chunk offset in the first chunk. + */ +static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf, + struct bio *bi) +{ + int sectors_per_chunk = conf->chunk_sectors; + int raid_disks = conf->raid_disks; + int dd_idx; + struct stripe_head sh; + unsigned int chunk_offset; + sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); + sector_t sector; + + /* We pass in fake stripe_head to get back parity disk numbers */ + sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh); + chunk_offset = sector_div(sector, sectors_per_chunk); + if (sectors_per_chunk - chunk_offset >= bio_sectors(bi)) + return r_sector; + /* + * Bio crosses to the next data disk. Check whether it's in the same + * chunk. + */ + dd_idx++; + while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx) + dd_idx++; + if (dd_idx >= raid_disks) + return r_sector; + return r_sector + sectors_per_chunk - chunk_offset; +} + static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); @@ -6150,6 +6182,17 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } md_account_bio(mddev, &bi); + /* + * Lets start with the stripe with the lowest chunk offset in the first + * chunk. That has the best chances of creating IOs adjacent to + * previous IOs in case of sequential IO and thus creates the most + * sequential IO pattern. We don't bother with the optimization when + * reshaping as the performance benefit is not worth the complexity. 
+ */ + if (likely(conf->reshape_progress == MaxSector)) + logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); + s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); + add_wait_queue(&conf->wait_for_overlap, &wait); while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, @@ -6178,7 +6221,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; } - s = find_first_bit(ctx.sectors_to_do, stripe_cnt); + s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); if (s == stripe_cnt) break; From b1211978ecf19bceb63a04f53fea4b5d73832a4a Mon Sep 17 00:00:00 2001 From: Jonathan Derrick Date: Mon, 24 Apr 2023 19:14:38 -0600 Subject: [PATCH 12/17] md: Fix bitmap offset type in sb writer Bitmap offset is allowed to be negative, indicating that bitmap precedes metadata. Change the type back from sector_t to loff_t to satisfy conditionals and calculations. Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-raid/CAPhsuW6HuaUJ5WcyPajVgUfkQFYp2D_cy1g6qxN4CU_gP2=z7g@mail.gmail.com/ Fixes: 10172f200b67 ("md: Fix types in sb writer") Signed-off-by: Jonathan Derrick Suggested-by: Yu Kuai Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230425011438.71046-1-jonathan.derrick@linux.dev --- drivers/md/md-bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 920bb68156d2..bc8d7565171d 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -219,7 +219,7 @@ static unsigned int optimal_io_size(struct block_device *bdev, } static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size, - sector_t start, sector_t boundary) + loff_t start, loff_t boundary) { if (io_size != opt_size && start + opt_size / SECTOR_SIZE <= boundary) @@ -237,8 +237,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, struct block_device *bdev; struct mddev *mddev = bitmap->mddev; struct bitmap_storage *store = &bitmap->storage; - sector_t offset = mddev->bitmap_info.offset; - sector_t ps, sboff, doff; + loff_t sboff, offset = mddev->bitmap_info.offset; + sector_t ps, doff; unsigned int size = PAGE_SIZE; unsigned int opt_size = PAGE_SIZE; From 3e46c89c74f2c38e5337d2cf44b0b551adff1cb4 Mon Sep 17 00:00:00 2001 From: Maxim Korotkov Date: Thu, 19 Jan 2023 13:44:43 +0300 Subject: [PATCH 13/17] writeback: fix call of incorrect macro the variable 'history' is of type u16, it may be an error that the hweight32 macro was used for it I guess macro hweight16 should be used Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 2a81490811d0 ("writeback: implement foreign cgroup inode detection") Signed-off-by: Maxim Korotkov Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230119104443.3002-1-korotkov.maxim.s@gmail.com Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 195dc23e0d83..6faeb45234ed 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -829,7 +829,7 @@ void wbc_detach_inode(struct writeback_control *wbc) * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. 
*/ - if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + if (hweight16(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } From 8176080d59e6d4ff9fc97ae534063073b4f7a715 Mon Sep 17 00:00:00 2001 From: Tao Su Date: Fri, 28 Apr 2023 12:51:49 +0800 Subject: [PATCH 14/17] block: Skip destroyed blkg when restart in blkg_destroy_all() Kernel hang in blkg_destroy_all() when total blkg greater than BLKG_DESTROY_BATCH_SIZE, because of not removing destroyed blkg in blkg_list. So the size of blkg_list is same after destroying a batch of blkg, and the infinite 'restart' occurs. Since blkg should stay on the queue list until blkg_free_workfn(), skip destroyed blkg when restart a new round, which will solve this kernel hang issue and satisfy the previous will to restart. Reported-by: Xiangfei Ma Tested-by: Xiangfei Ma Tested-by: Farrah Chen Signed-off-by: Tao Su Fixes: f1c006f1c685 ("blk-cgroup: synchronize pd_free_fn() from blkg_free_workfn() and blkcg_deactivate_policy()") Suggested-and-reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230428045149.1310073-1-tao1.su@linux.intel.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1c1ebeb51003..0ecb4cce8af2 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -527,6 +527,9 @@ static void blkg_destroy_all(struct gendisk *disk) list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + if (hlist_unhashed(&blkg->blkcg_node)) + continue; + spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); From 7949aa46bb4b5c5e43ba3ba8298ffcb48bebf0ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 30 Apr 2023 16:15:08 -0600 Subject: [PATCH 15/17] mailmap: add mailmap entries for Jens Axboe There's an old entry in there already, but update it and add fb/meta and oracle entries as well. Signed-off-by: Jens Axboe --- .mailmap | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.mailmap b/.mailmap index 424564f40733..77fb9cebb8d3 100644 --- a/.mailmap +++ b/.mailmap @@ -209,7 +209,10 @@ Jeff Garzik Jeff Layton Jeff Layton Jeff Layton -Jens Axboe +Jens Axboe +Jens Axboe +Jens Axboe +Jens Axboe Jens Osterkamp Jernej Skrabec Jessica Zhang From 3899d94e3831ee07ea6821c032dc297aec80586a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Wed, 3 May 2023 14:19:37 +0200 Subject: [PATCH 16/17] drbd: correctly submit flush bio on barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we receive a flush command (or "barrier" in DRBD), we currently use a REQ_OP_FLUSH with the REQ_PREFLUSH flag set. The correct way to submit a flush bio is by using a REQ_OP_WRITE without any data, and set the REQ_PREFLUSH flag. Since commit b4a6bb3a67aa ("block: add a sanity check for non-write flush/fua bios"), this triggers a warning in the block layer, but this has been broken for quite some time before that. So use the correct set of flags to actually make the flush happen. 
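For reference, a minimal sketch of issuing a standalone flush with the
corrected flags (names are illustrative; the completion handler is
hypothetical):

	struct bio *bio = bio_alloc(bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);

	bio->bi_end_io = my_flush_endio;	/* hypothetical completion callback */
	submit_bio(bio);

An empty REQ_OP_WRITE bio with REQ_PREFLUSH set is what the sanity check added
in commit b4a6bb3a67aa expects, whereas REQ_OP_FLUSH is generally reserved for
the request-based flush machinery inside the block layer.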
Cc: Christoph Hellwig Cc: stable@vger.kernel.org Fixes: f9ff0da56437 ("drbd: allow parallel flushes for multi-volume resources") Reported-by: Thomas Voegtle Signed-off-by: Christoph Böhmwalder Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230503121937.17232-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index e54404c632e7..34b112752ab1 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1283,7 +1283,7 @@ static void one_flush_endio(struct bio *bio) static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) { struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0, - REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO); + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO); struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); if (!octx) { From c0b79b0ff53be5b05be98e3caaa6a39de1fe9520 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 May 2023 10:42:31 +0800 Subject: [PATCH 17/17] ublk: add timeout handler Add timeout handler, so that we can provide forward progress guarantee for unprivileged ublk, which can't be trusted. One thing is that sync() calls sync_bdevs(wait) for all block devices after running sync_bdevs(no_wait), and if one device can't move on, the sync() won't return any more. Add timeout for unprivileged ublk to avoid such affect for other users which call sync() syscall. Meantime clear UBLK_F_USER_RECOVERY_REISSUE for unprivileged ublk since that feature may cause IO hang too. Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230502024231.888498-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 253008b2091d..e96309f2e1ad 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -129,6 +129,7 @@ struct ublk_queue { unsigned long io_addr; /* mapped vm address */ unsigned int max_io_sz; bool force_abort; + bool timeout; unsigned short nr_io_ready; /* how many ios setup */ struct ublk_device *dev; struct ublk_io ios[]; @@ -894,6 +895,22 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) } } +static enum blk_eh_timer_return ublk_timeout(struct request *rq) +{ + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) { + if (!ubq->timeout) { + send_sig(SIGKILL, ubq->ubq_daemon, 0); + ubq->timeout = true; + } + + return BLK_EH_DONE; + } + + return BLK_EH_RESET_TIMER; +} + static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -953,6 +970,7 @@ static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, .init_request = ublk_init_rq, + .timeout = ublk_timeout, }; static int ublk_ch_open(struct inode *inode, struct file *filp) @@ -1713,6 +1731,18 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) return -EPERM; + /* + * unprivileged device can't be trusted, but RECOVERY and + * RECOVERY_REISSUE still may hang error handling, so can't + * support recovery features for unprivileged ublk now + * + * TODO: provide forward progress for 
RECOVERY handler, so that + * unprivileged device can benefit from it + */ + if (info.flags & UBLK_F_UNPRIVILEGED_DEV) + info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE | + UBLK_F_USER_RECOVERY); + /* the created device is always owned by current user */ ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid); @@ -1981,6 +2011,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq) put_task_struct(ubq->ubq_daemon); /* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */ ubq->ubq_daemon = NULL; + ubq->timeout = false; for (i = 0; i < ubq->q_depth; i++) { struct ublk_io *io = &ubq->ios[i];