From 2e29be2e491595407087ab36a5e5a159be693f7b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 May 2021 10:24:26 -0700 Subject: [PATCH 001/187] fs/fuse: Remove unneeded kaddr parameter fuse_dax_mem_range_init() does not need the address or the pfn of the memory requested in dax_direct_access(). It is only calling direct access to get the number of pages. Remove the unused variables and stop requesting the kaddr and pfn from dax_direct_access(). Reviewed-by: Dan Williams Signed-off-by: Ira Weiny Reviewed-by: Vivek Goyal Link: https://lore.kernel.org/r/20210525172428.3634316-2-ira.weiny@intel.com Signed-off-by: Dan Williams --- fs/fuse/dax.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index ff99ab2a3c43..34f8a5635c7f 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -1234,8 +1234,6 @@ void fuse_dax_conn_free(struct fuse_conn *fc) static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) { long nr_pages, nr_ranges; - void *kaddr; - pfn_t pfn; struct fuse_dax_mapping *range; int ret, id; size_t dax_size = -1; @@ -1247,8 +1245,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr, - &pfn); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, + NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); From 44788591c3cfb81d9315b8ee5c2076e51bfe8a39 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 May 2021 10:24:27 -0700 Subject: [PATCH 002/187] fs/dax: Clarify nr_pages to dax_direct_access() dax_direct_access() takes a number of pages. PHYS_PFN(PAGE_SIZE) is a very round about way to specify '1'. Change the nr_pages parameter to the explicit value of '1'. Reviewed-by: Dan Williams Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20210525172428.3634316-3-ira.weiny@intel.com Signed-off-by: Dan Williams --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 62352cbcf0f4..8ce3a7e147a8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -721,7 +721,7 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; From b05d4c576b697b9f462b9c532c997171d5c3b067 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 May 2021 10:24:28 -0700 Subject: [PATCH 003/187] dax: Ensure errno is returned from dax_direct_access If the caller specifies a negative nr_pages that is an invalid parameter. Return -EINVAL to ensure callers get an errno if they want to check it. Reviewed-by: Dan Williams Signed-off-by: Ira Weiny Link: https://lore.kernel.org/r/20210525172428.3634316-4-ira.weiny@intel.com Signed-off-by: Dan Williams --- drivers/dax/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 5fa6ae9dbc8b..44736cbd446e 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -313,7 +313,7 @@ long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, return -ENXIO; if (nr_pages < 0) - return nr_pages; + return -EINVAL; avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn); From 4377d9ab1f162e58e0e5ae89c9a5fd7b4d8a6bdb Mon Sep 17 00:00:00 2001 From: Sean Nyekjaer Date: Fri, 9 Jul 2021 09:17:27 +0200 Subject: [PATCH 004/187] iio: accel: fxls8962af: fix potential use of uninitialized symbol Fix this warning from kernel test robot: smatch warnings: drivers/iio/accel/fxls8962af-core.c:640 fxls8962af_i2c_raw_read_errata3() error: uninitialized symbol 'ret'. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Sean Nyekjaer Fixes: af959b7b96b8 ("iio: accel: fxls8962af: fix errata bug E3 - I2C burst reads") Link: https://lore.kernel.org/r/20210709071727.2453536-1-sean@geanix.com Signed-off-by: Jonathan Cameron --- drivers/iio/accel/fxls8962af-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/accel/fxls8962af-core.c b/drivers/iio/accel/fxls8962af-core.c index 078d87865fde..0019f1ea7df2 100644 --- a/drivers/iio/accel/fxls8962af-core.c +++ b/drivers/iio/accel/fxls8962af-core.c @@ -637,7 +637,7 @@ static int fxls8962af_i2c_raw_read_errata3(struct fxls8962af_data *data, return ret; } - return ret; + return 0; } static int fxls8962af_fifo_transfer(struct fxls8962af_data *data, From 2b6d2833cd1d8a43a837a45da65860ef086443dc Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Tue, 15 Jun 2021 17:39:05 +0800 Subject: [PATCH 005/187] mtd: mtd_blkdevs: Initialize rq.limits.discard_granularity Since commit b35fd7422c2f8("block: check queue's limits.discard_granularity in __blkdev_issue_discard()") checks rq.limits.discard_granularity in __blkdev_issue_discard(), we may get following warnings on formatted ftl: WARNING: CPU: 2 PID: 7313 at block/blk-lib.c:51 __blkdev_issue_discard+0x2a7/0x390 Reproducer: 1. ftl_format /dev/mtd0 2. modprobe ftl 3. mkfs.vfat /dev/ftla 4. mount -odiscard /dev/ftla temp 5. dd if=/dev/zero of=temp/tst bs=1M count=10 oflag=direct 6. dd if=/dev/zero of=temp/tst bs=1M count=10 oflag=direct Fix it by initializing rq.limits.discard_granularity if device supports discard operation. Signed-off-by: Zhihao Cheng Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210615093905.3473709-1-chengzhihao1@huawei.com --- drivers/mtd/mtd_blkdevs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 6ce4bc57f919..8f97a4fe6e93 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -419,6 +419,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (tr->discard) { blk_queue_flag_set(QUEUE_FLAG_DISCARD, new->rq); blk_queue_max_discard_sectors(new->rq, UINT_MAX); + new->rq->limits.discard_granularity = tr->blksize; } gd->queue = new->rq; From 14f97f0b8e2b9950c028d0cb7311ffe26a3cc1c0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Jun 2021 16:37:25 +0300 Subject: [PATCH 006/187] mtd: rawnand: Add a check in of_get_nand_secure_regions() Check for whether of_property_count_elems_of_size() returns a negative error code. Fixes: 13b89768275d ("mtd: rawnand: Add support for secure regions in NAND memory") Signed-off-by: Dan Carpenter Reviewed-by: Manivannan Sadhasivam Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/YMtQFXE0F1w7mUh+@mwanda --- drivers/mtd/nand/raw/nand_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 57a583149cc0..cbba46432e39 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5231,8 +5231,8 @@ static int of_get_nand_secure_regions(struct nand_chip *chip) int nr_elem, i, j; nr_elem = of_property_count_elems_of_size(dn, "secure-regions", sizeof(u64)); - if (!nr_elem) - return 0; + if (nr_elem <= 0) + return nr_elem; chip->nr_secure_regions = nr_elem / 2; chip->secure_regions = kcalloc(chip->nr_secure_regions, sizeof(*chip->secure_regions), From 962bf783ef65d15b0f8ca9c33342cf3b20bf0d2e Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Fri, 18 Jun 2021 00:09:04 +0800 Subject: [PATCH 007/187] mtd: break circular locks in register_mtd_blktrans Syzbot reported a circular locking dependency: https://syzkaller.appspot.com/bug?id=7bd106c28e846d1023d4ca915718b1a0905444cb This happens because of the following lock dependencies: 1. loop_ctl_mutex -> bdev->bd_mutex (when loop_control_ioctl calls loop_remove, which then calls del_gendisk; this also happens in loop_exit which eventually calls loop_remove) 2. bdev->bd_mutex -> mtd_table_mutex (when blkdev_get_by_dev calls __blkdev_get, which then calls blktrans_open) 3. mtd_table_mutex -> major_names_lock (when register_mtd_blktrans calls __register_blkdev) 4. major_names_lock -> loop_ctl_mutex (when blk_request_module calls loop_probe) Hence there's an overall dependency of: loop_ctl_mutex ----------> bdev->bd_mutex ^ | | | | v major_names_lock <--------- mtd_table_mutex We can break this circular dependency by holding mtd_table_mutex only for the required critical section in register_mtd_blktrans. This avoids the mtd_table_mutex -> major_names_lock dependency. Reported-and-tested-by: syzbot+6a8a0d93c91e8fbf2e80@syzkaller.appspotmail.com Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210617160904.570111-1-desmondcheongzx@gmail.com --- drivers/mtd/mtd_blkdevs.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8f97a4fe6e93..31b208fb225e 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -526,14 +526,10 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) if (!blktrans_notifier.list.next) register_mtd_user(&blktrans_notifier); - - mutex_lock(&mtd_table_mutex); - ret = register_blkdev(tr->major, tr->name); if (ret < 0) { printk(KERN_WARNING "Unable to register %s block device on major %d: %d\n", tr->name, tr->major, ret); - mutex_unlock(&mtd_table_mutex); return ret; } @@ -543,12 +539,12 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) tr->blkshift = ffs(tr->blksize) - 1; INIT_LIST_HEAD(&tr->devs); - list_add(&tr->list, &blktrans_majors); + mutex_lock(&mtd_table_mutex); + list_add(&tr->list, &blktrans_majors); mtd_for_each_device(mtd) if (mtd->type != MTD_ABSENT) tr->add_mtd(tr, mtd); - mutex_unlock(&mtd_table_mutex); return 0; } From e83862ee1b9b1668826683f432b041875ec0c819 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Jun 2021 16:42:07 +0300 Subject: [PATCH 008/187] mtd: mchp48l640: silence some uninitialized variable warnings Smatch complains that zero length read/writes will lead to an uninitalized return value. I don't know if that's possible, but it's nicer to return a zero literal anyway so let's do that. Fixes: 88d125026753 ("mtd: devices: add support for microchip 48l640 EERAM") Signed-off-by: Dan Carpenter Reviewed-by: Fabio Estevam Reviewed-by: Heiko Schocher Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/YMyir961W28TX5dT@mwanda --- drivers/mtd/devices/mchp48l640.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/devices/mchp48l640.c b/drivers/mtd/devices/mchp48l640.c index efc2003bd13a..ad66b5aaf4e9 100644 --- a/drivers/mtd/devices/mchp48l640.c +++ b/drivers/mtd/devices/mchp48l640.c @@ -229,7 +229,7 @@ static int mchp48l640_write(struct mtd_info *mtd, loff_t to, size_t len, woff += ws; } - return ret; + return 0; } static int mchp48l640_read_page(struct mtd_info *mtd, loff_t from, size_t len, @@ -286,7 +286,7 @@ static int mchp48l640_read(struct mtd_info *mtd, loff_t from, size_t len, woff += ws; } - return ret; + return 0; }; static const struct mchp48_caps mchp48l640_caps = { From 45bb1faa29effbd4ca4d581b32373f2eda309b95 Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Wed, 7 Jul 2021 15:53:59 +0200 Subject: [PATCH 009/187] mtd: core: handle flashes without OTP gracefully There are flash drivers which registers the OTP callbacks although the flash doesn't support OTP regions and return -ENODATA for these callbacks if there is no OTP. If this happens, the probe of the whole flash will fail. Fix it by handling the ENODATA return code and skip the OTP region nvmem setup. Fixes: 4b361cfa8624 ("mtd: core: add OTP nvmem provider support") Reported-by: Guenter Roeck Signed-off-by: Michael Walle Tested-by: Guenter Roeck Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210707135359.32398-1-michael@walle.cc --- drivers/mtd/mtdcore.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index b5ccd3037788..c8fd7f758938 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -806,7 +806,9 @@ static ssize_t mtd_otp_size(struct mtd_info *mtd, bool is_user) err: kfree(info); - return ret; + + /* ENODATA means there is no OTP region. */ + return ret == -ENODATA ? 0 : ret; } static struct nvmem_device *mtd_otp_nvmem_register(struct mtd_info *mtd, From 2394e628738933aa014093d93093030f6232946d Mon Sep 17 00:00:00 2001 From: Andreas Persson Date: Mon, 12 Jul 2021 09:54:52 +0200 Subject: [PATCH 010/187] mtd: cfi_cmdset_0002: fix crash when erasing/writing AMD cards Erasing an AMD linear flash card (AM29F016D) crashes after the first sector has been erased. Likewise, writing to it crashes after two bytes have been written. The reason is a missing check for a null pointer - the cmdset_priv field is not set for this type of card. Fixes: 4844ef80305d ("mtd: cfi_cmdset_0002: Add support for polling status register") Signed-off-by: Andreas Persson Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/DB6P189MB05830B3530B8087476C5CFE4C1159@DB6P189MB0583.EURP189.PROD.OUTLOOK.COM --- drivers/mtd/chips/cfi_cmdset_0002.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index 3097e93787f7..a761134fd3be 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -119,7 +119,7 @@ static int cfi_use_status_reg(struct cfi_private *cfi) struct cfi_pri_amdstd *extp = cfi->cmdset_priv; u8 poll_mask = CFI_POLL_STATUS_REG | CFI_POLL_DQ; - return extp->MinorVersion >= '5' && + return extp && extp->MinorVersion >= '5' && (extp->SoftwareFeatures & poll_mask) == CFI_POLL_STATUS_REG; } From 5261cdf457ce3635bf18d393a3c1991dcfaf9d02 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 30 Jun 2021 12:32:52 +0200 Subject: [PATCH 011/187] crypto: drbg - select SHA512 With the swtich to use HMAC(SHA-512) as the default DRBG type, the configuration must now also select SHA-512. Fixes: 9b7b94683a9b "crypto: DRBG - switch to HMAC SHA512 DRBG as default DRBG" Reported-by: Sachin Sant Signed-off-by: Stephan Mueller Tested-by: Sachin Sant Signed-off-by: Herbert Xu --- crypto/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index ca3b02dcbbfa..64b772c5d1c9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1768,7 +1768,7 @@ config CRYPTO_DRBG_HMAC bool default y select CRYPTO_HMAC - select CRYPTO_SHA256 + select CRYPTO_SHA512 config CRYPTO_DRBG_HASH bool "Enable Hash DRBG" From 9898cb24e454602beb6e17bacf9f97b26c85c955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 9 Jul 2021 12:11:10 +0200 Subject: [PATCH 012/187] iio: adc: ti-ads7950: Ensure CS is deasserted after reading channels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ADS7950 requires that CS is deasserted after each SPI word. Before commit e2540da86ef8 ("iio: adc: ti-ads7950: use SPI_CS_WORD to reduce CPU usage") the driver used a message with one spi transfer per channel where each but the last one had .cs_change set to enforce a CS toggle. This was wrongly translated into a message with a single transfer and .cs_change set which results in a CS toggle after each word but the last which corrupts the first adc conversion of all readouts after the first readout. Fixes: e2540da86ef8 ("iio: adc: ti-ads7950: use SPI_CS_WORD to reduce CPU usage") Signed-off-by: Uwe Kleine-König Reviewed-by: David Lechner Tested-by: David Lechner Cc: Link: https://lore.kernel.org/r/20210709101110.1814294-1-u.kleine-koenig@pengutronix.de Signed-off-by: Jonathan Cameron --- drivers/iio/adc/ti-ads7950.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iio/adc/ti-ads7950.c b/drivers/iio/adc/ti-ads7950.c index 2383eacada87..a2b83f0bd526 100644 --- a/drivers/iio/adc/ti-ads7950.c +++ b/drivers/iio/adc/ti-ads7950.c @@ -568,7 +568,6 @@ static int ti_ads7950_probe(struct spi_device *spi) st->ring_xfer.tx_buf = &st->tx_buf[0]; st->ring_xfer.rx_buf = &st->rx_buf[0]; /* len will be set later */ - st->ring_xfer.cs_change = true; spi_message_add_tail(&st->ring_xfer, &st->ring_msg); From 7e77ef8b8d600cf8448a2bbd32f682c28884551f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antti=20Ker=C3=A4nen?= Date: Thu, 8 Jul 2021 12:54:29 +0300 Subject: [PATCH 013/187] iio: adis: set GPIO reset pin direction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set reset pin direction to output as the reset pin needs to be an active low output pin. Co-developed-by: Hannu Hartikainen Signed-off-by: Hannu Hartikainen Signed-off-by: Antti Keränen Reviewed-by: Nuno Sá Fixes: ecb010d44108 ("iio: imu: adis: Refactor adis_initial_startup") Link: https://lore.kernel.org/r/20210708095425.13295-1-detegr@rbx.email Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/imu/adis.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/iio/imu/adis.c b/drivers/iio/imu/adis.c index a5b421f42287..b9a06ca29bee 100644 --- a/drivers/iio/imu/adis.c +++ b/drivers/iio/imu/adis.c @@ -411,12 +411,11 @@ int __adis_initial_startup(struct adis *adis) int ret; /* check if the device has rst pin low */ - gpio = devm_gpiod_get_optional(&adis->spi->dev, "reset", GPIOD_ASIS); + gpio = devm_gpiod_get_optional(&adis->spi->dev, "reset", GPIOD_OUT_HIGH); if (IS_ERR(gpio)) return PTR_ERR(gpio); if (gpio) { - gpiod_set_value_cansleep(gpio, 1); msleep(10); /* bring device out of reset */ gpiod_set_value_cansleep(gpio, 0); From 4152433c397697acc4b02c4a10d17d5859c2730d Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 20 Jul 2021 21:14:05 +1000 Subject: [PATCH 014/187] arm64: efi: kaslr: Fix occasional random alloc (and boot) failure The EFI stub random allocator used for kaslr on arm64 has a subtle bug. In function get_entry_num_slots() which counts the number of possible allocation "slots" for the image in a given chunk of free EFI memory, "last_slot" can become negative if the chunk is smaller than the requested allocation size. The test "if (first_slot > last_slot)" doesn't catch it because both first_slot and last_slot are unsigned. I chose not to make them signed to avoid problems if this is ever used on architectures where there are meaningful addresses with the top bit set. Instead, fix it with an additional test against the allocation size. This can cause a boot failure in addition to a loss of randomisation due to another bug in the arm64 stub fixed separately. Signed-off-by: Benjamin Herrenschmidt Fixes: 2ddbfc81eac8 ("efi: stub: add implementation of efi_random_alloc()") Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/libstub/randomalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/libstub/randomalloc.c b/drivers/firmware/efi/libstub/randomalloc.c index a408df474d83..724155b9e10d 100644 --- a/drivers/firmware/efi/libstub/randomalloc.c +++ b/drivers/firmware/efi/libstub/randomalloc.c @@ -30,6 +30,8 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md, region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1, (u64)ULONG_MAX); + if (region_end < size) + return 0; first_slot = round_up(md->phys_addr, align); last_slot = round_down(region_end - size + 1, align); From 83f877a09516bcb82e34df621cc3a794509a11a3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 21 Jul 2021 12:40:10 +0100 Subject: [PATCH 015/187] xen/events: remove redundant initialization of variable irq The variable irq is being initialized with a value that is never read, it is being updated later on. The assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210721114010.108648-1-colin.king@canonical.com Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index d7e361fb0548..154daddbdcb4 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1009,7 +1009,7 @@ static void __unbind_from_irq(unsigned int irq) int xen_bind_pirq_gsi_to_irq(unsigned gsi, unsigned pirq, int shareable, char *name) { - int irq = -1; + int irq; struct physdev_irq irq_op; int ret; From 32ec3960175e58a914fc242b66dfe33e9059568f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 23 Jul 2021 11:13:52 +0200 Subject: [PATCH 016/187] pinctrl: qcom: fix GPIOLIB dependencies Enabling the PINCTRL_SM8350 symbol without GPIOLIB or SCM causes a build failure: WARNING: unmet direct dependencies detected for PINCTRL_MSM Depends on [m]: PINCTRL [=y] && (ARCH_QCOM [=y] || COMPILE_TEST [=y]) && GPIOLIB [=y] && (QCOM_SCM [=m] || !QCOM_SCM [=m]) Selected by [y]: - PINCTRL_SM8350 [=y] && PINCTRL [=y] && (ARCH_QCOM [=y] || COMPILE_TEST [=y]) && GPIOLIB [=y] && OF [=y] aarch64-linux-ld: drivers/pinctrl/qcom/pinctrl-msm.o: in function `msm_gpio_irq_set_type': pinctrl-msm.c:(.text.msm_gpio_irq_set_type+0x1c8): undefined reference to `qcom_scm_io_readl' The main problem here is the 'select PINCTRL_MSM', which needs to be a 'depends on' as it is for all the other front-ends. As the GPIOLIB dependency is now implied by that, symbol, remove the duplicate dependencies in the process. Fixes: d5d348a3271f ("pinctrl: qcom: Add SM8350 pinctrl driver") Fixes: 376f9e34c10f ("drivers: pinctrl: qcom: fix Kconfig dependency on GPIOLIB") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210723091400.1669716-1-arnd@kernel.org Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/Kconfig | 63 ++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/drivers/pinctrl/qcom/Kconfig b/drivers/pinctrl/qcom/Kconfig index 2f51b4f99393..cad4e60df618 100644 --- a/drivers/pinctrl/qcom/Kconfig +++ b/drivers/pinctrl/qcom/Kconfig @@ -13,7 +13,7 @@ config PINCTRL_MSM config PINCTRL_APQ8064 tristate "Qualcomm APQ8064 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -21,7 +21,7 @@ config PINCTRL_APQ8064 config PINCTRL_APQ8084 tristate "Qualcomm APQ8084 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -29,7 +29,7 @@ config PINCTRL_APQ8084 config PINCTRL_IPQ4019 tristate "Qualcomm IPQ4019 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -37,7 +37,7 @@ config PINCTRL_IPQ4019 config PINCTRL_IPQ8064 tristate "Qualcomm IPQ8064 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -45,7 +45,7 @@ config PINCTRL_IPQ8064 config PINCTRL_IPQ8074 tristate "Qualcomm Technologies, Inc. IPQ8074 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for @@ -55,7 +55,7 @@ config PINCTRL_IPQ8074 config PINCTRL_IPQ6018 tristate "Qualcomm Technologies, Inc. IPQ6018 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for @@ -65,7 +65,7 @@ config PINCTRL_IPQ6018 config PINCTRL_MSM8226 tristate "Qualcomm 8226 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -74,7 +74,7 @@ config PINCTRL_MSM8226 config PINCTRL_MSM8660 tristate "Qualcomm 8660 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -82,7 +82,7 @@ config PINCTRL_MSM8660 config PINCTRL_MSM8960 tristate "Qualcomm 8960 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -90,7 +90,7 @@ config PINCTRL_MSM8960 config PINCTRL_MDM9615 tristate "Qualcomm 9615 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -98,7 +98,7 @@ config PINCTRL_MDM9615 config PINCTRL_MSM8X74 tristate "Qualcomm 8x74 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -106,7 +106,7 @@ config PINCTRL_MSM8X74 config PINCTRL_MSM8916 tristate "Qualcomm 8916 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -114,7 +114,7 @@ config PINCTRL_MSM8916 config PINCTRL_MSM8953 tristate "Qualcomm 8953 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -124,7 +124,7 @@ config PINCTRL_MSM8953 config PINCTRL_MSM8976 tristate "Qualcomm 8976 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -134,7 +134,7 @@ config PINCTRL_MSM8976 config PINCTRL_MSM8994 tristate "Qualcomm 8994 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -143,7 +143,7 @@ config PINCTRL_MSM8994 config PINCTRL_MSM8996 tristate "Qualcomm MSM8996 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -151,7 +151,7 @@ config PINCTRL_MSM8996 config PINCTRL_MSM8998 tristate "Qualcomm MSM8998 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -159,7 +159,7 @@ config PINCTRL_MSM8998 config PINCTRL_QCS404 tristate "Qualcomm QCS404 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -167,7 +167,7 @@ config PINCTRL_QCS404 config PINCTRL_QDF2XXX tristate "Qualcomm Technologies QDF2xxx pin controller driver" - depends on GPIOLIB && ACPI + depends on ACPI depends on PINCTRL_MSM help This is the GPIO driver for the TLMM block found on the @@ -175,7 +175,7 @@ config PINCTRL_QDF2XXX config PINCTRL_QCOM_SPMI_PMIC tristate "Qualcomm SPMI PMIC pin controller driver" - depends on GPIOLIB && OF && SPMI + depends on OF && SPMI select REGMAP_SPMI select PINMUX select PINCONF @@ -190,7 +190,7 @@ config PINCTRL_QCOM_SPMI_PMIC config PINCTRL_QCOM_SSBI_PMIC tristate "Qualcomm SSBI PMIC pin controller driver" - depends on GPIOLIB && OF + depends on OF select PINMUX select PINCONF select GENERIC_PINCONF @@ -204,7 +204,7 @@ config PINCTRL_QCOM_SSBI_PMIC config PINCTRL_SC7180 tristate "Qualcomm Technologies Inc SC7180 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -213,7 +213,7 @@ config PINCTRL_SC7180 config PINCTRL_SC7280 tristate "Qualcomm Technologies Inc SC7280 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -222,7 +222,7 @@ config PINCTRL_SC7280 config PINCTRL_SC8180X tristate "Qualcomm Technologies Inc SC8180x pin controller driver" - depends on GPIOLIB && (OF || ACPI) + depends on (OF || ACPI) depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -231,7 +231,7 @@ config PINCTRL_SC8180X config PINCTRL_SDM660 tristate "Qualcomm Technologies Inc SDM660 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -240,7 +240,7 @@ config PINCTRL_SDM660 config PINCTRL_SDM845 tristate "Qualcomm Technologies Inc SDM845 pin controller driver" - depends on GPIOLIB && (OF || ACPI) + depends on (OF || ACPI) depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -249,7 +249,7 @@ config PINCTRL_SDM845 config PINCTRL_SDX55 tristate "Qualcomm Technologies Inc SDX55 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -258,7 +258,7 @@ config PINCTRL_SDX55 config PINCTRL_SM6125 tristate "Qualcomm Technologies Inc SM6125 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -267,7 +267,7 @@ config PINCTRL_SM6125 config PINCTRL_SM8150 tristate "Qualcomm Technologies Inc SM8150 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -276,7 +276,7 @@ config PINCTRL_SM8150 config PINCTRL_SM8250 tristate "Qualcomm Technologies Inc SM8250 pin controller driver" - depends on GPIOLIB && OF + depends on OF depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the @@ -285,8 +285,7 @@ config PINCTRL_SM8250 config PINCTRL_SM8350 tristate "Qualcomm Technologies Inc SM8350 pin controller driver" - depends on GPIOLIB && OF - select PINCTRL_MSM + depends on PINCTRL_MSM help This is the pinctrl, pinmux, pinconf and gpiolib driver for the Qualcomm Technologies Inc TLMM block found on the Qualcomm From 798a315fc359aa6dbe48e09d802aa59b7e158ffc Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Thu, 1 Jul 2021 16:09:55 +0800 Subject: [PATCH 017/187] pinctrl: mediatek: Fix fallback behavior for bias_set_combo Some pin doesn't support PUPD register, if it fails and fallbacks with bias_set_combo case, it will call mtk_pinconf_bias_set_pupd_r1_r0() to modify the PUPD pin again. Since the general bias set are either PU/PD or PULLSEL/PULLEN, try bias_set or bias_set_rev1 for the other fallback case. If the pin doesn't support neither PU/PD nor PULLSEL/PULLEN, it will return -ENOTSUPP. Fixes: 81bd1579b43e ("pinctrl: mediatek: Fix fallback call path") Signed-off-by: Hsin-Yi Wang Reviewed-by: Chen-Yu Tsai Reviewed-by: Zhiyong Tao Link: https://lore.kernel.org/r/20210701080955.2660294-1-hsinyi@chromium.org Signed-off-by: Linus Walleij --- drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c index 5b3b048725cc..45ebdeba985a 100644 --- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c +++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c @@ -925,12 +925,10 @@ int mtk_pinconf_adv_pull_set(struct mtk_pinctrl *hw, err = hw->soc->bias_set(hw, desc, pullup); if (err) return err; - } else if (hw->soc->bias_set_combo) { - err = hw->soc->bias_set_combo(hw, desc, pullup, arg); - if (err) - return err; } else { - return -ENOTSUPP; + err = mtk_pinconf_bias_set_rev1(hw, desc, pullup); + if (err) + err = mtk_pinconf_bias_set(hw, desc, pullup); } } From 9f9decdb64c5cc05b66f7a6ede226dd90684570b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 17:13:20 +0200 Subject: [PATCH 018/187] iio: accel: fxls8962af: fix i2c dependency With CONFIG_SPI=y and CONFIG_I2C=m, building fxls8962af into vmlinux causes a link error against the I2C module: aarch64-linux-ld: drivers/iio/accel/fxls8962af-core.o: in function `fxls8962af_fifo_flush': fxls8962af-core.c:(.text+0x3a0): undefined reference to `i2c_verify_client' Work around it by adding a Kconfig dependency that forces the SPI driver to be a loadable module whenever I2C is a module. Fixes: af959b7b96b8 ("iio: accel: fxls8962af: fix errata bug E3 - I2C burst reads") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20210721151330.2176653-1-arnd@kernel.org Signed-off-by: Jonathan Cameron --- drivers/iio/accel/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/accel/Kconfig b/drivers/iio/accel/Kconfig index 0e56ace61103..8d8b1ba42ff8 100644 --- a/drivers/iio/accel/Kconfig +++ b/drivers/iio/accel/Kconfig @@ -231,6 +231,7 @@ config DMARD10 config FXLS8962AF tristate + depends on I2C || !I2C # cannot be built-in for modular I2C config FXLS8962AF_I2C tristate "NXP FXLS8962AF/FXLS8964AF Accelerometer I2C Driver" @@ -247,6 +248,7 @@ config FXLS8962AF_I2C config FXLS8962AF_SPI tristate "NXP FXLS8962AF/FXLS8964AF Accelerometer SPI Driver" depends on SPI + depends on I2C || !I2C select FXLS8962AF select REGMAP_SPI help From 14a30238ecb8dcf52a9e2be514414e3ec443b536 Mon Sep 17 00:00:00 2001 From: Maxime Ripard Date: Wed, 21 Jul 2021 16:03:45 +0200 Subject: [PATCH 019/187] dt-bindings: iio: st: Remove wrong items length check The original bindings was listing the length of the interrupts as either 1 or 2, depending on the setup. This is also what is enforced by the top level schema. However, that is further constrained with an if clause that require exactly two interrupts, even though it might not make sense on those devices or in some setups. Let's remove the clause entirely. Cc: Denis Ciocca Cc: Lars-Peter Clausen Fixes: 0cd71145803d ("iio: st-sensors: Update ST Sensor bindings") Signed-off-by: Maxime Ripard Reviewed-by: Rob Herring Reviewed-by: Linus Walleij Link: https://lore.kernel.org/r/20210721140424.725744-16-maxime@cerno.tech Signed-off-by: Jonathan Cameron --- .../bindings/iio/st,st-sensors.yaml | 41 ------------------- 1 file changed, 41 deletions(-) diff --git a/Documentation/devicetree/bindings/iio/st,st-sensors.yaml b/Documentation/devicetree/bindings/iio/st,st-sensors.yaml index b2a1e42c56fa..71de5631ebae 100644 --- a/Documentation/devicetree/bindings/iio/st,st-sensors.yaml +++ b/Documentation/devicetree/bindings/iio/st,st-sensors.yaml @@ -152,47 +152,6 @@ allOf: maxItems: 1 st,drdy-int-pin: false - - if: - properties: - compatible: - enum: - # Two intertial interrupts i.e. accelerometer/gyro interrupts - - st,h3lis331dl-accel - - st,l3g4200d-gyro - - st,l3g4is-gyro - - st,l3gd20-gyro - - st,l3gd20h-gyro - - st,lis2de12 - - st,lis2dw12 - - st,lis2hh12 - - st,lis2dh12-accel - - st,lis331dl-accel - - st,lis331dlh-accel - - st,lis3de - - st,lis3dh-accel - - st,lis3dhh - - st,lis3mdl-magn - - st,lng2dm-accel - - st,lps331ap-press - - st,lsm303agr-accel - - st,lsm303dlh-accel - - st,lsm303dlhc-accel - - st,lsm303dlm-accel - - st,lsm330-accel - - st,lsm330-gyro - - st,lsm330d-accel - - st,lsm330d-gyro - - st,lsm330dl-accel - - st,lsm330dl-gyro - - st,lsm330dlc-accel - - st,lsm330dlc-gyro - - st,lsm9ds0-gyro - - st,lsm9ds1-magn - then: - properties: - interrupts: - maxItems: 2 - required: - compatible - reg From 84edec86f449adea9ee0b4912a79ab8d9d65abb7 Mon Sep 17 00:00:00 2001 From: Chris Lesiak Date: Mon, 14 Jun 2021 09:18:20 -0500 Subject: [PATCH 020/187] iio: humidity: hdc100x: Add margin to the conversion time The datasheets have the following note for the conversion time specification: "This parameter is specified by design and/or characterization and it is not tested in production." Parts have been seen that require more time to do 14-bit conversions for the relative humidity channel. The result is ENXIO due to the address phase of a transfer not getting an ACK. Delay an additional 1 ms per conversion to allow for additional margin. Fixes: 4839367d99e3 ("iio: humidity: add HDC100x support") Signed-off-by: Chris Lesiak Acked-by: Matt Ranostay Link: https://lore.kernel.org/r/20210614141820.2034827-1-chris.lesiak@licor.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/humidity/hdc100x.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/humidity/hdc100x.c b/drivers/iio/humidity/hdc100x.c index 2a957f19048e..9e0fce917ce4 100644 --- a/drivers/iio/humidity/hdc100x.c +++ b/drivers/iio/humidity/hdc100x.c @@ -25,6 +25,8 @@ #include #include +#include + #define HDC100X_REG_TEMP 0x00 #define HDC100X_REG_HUMIDITY 0x01 @@ -166,7 +168,7 @@ static int hdc100x_get_measurement(struct hdc100x_data *data, struct iio_chan_spec const *chan) { struct i2c_client *client = data->client; - int delay = data->adc_int_us[chan->address]; + int delay = data->adc_int_us[chan->address] + 1*USEC_PER_MSEC; int ret; __be16 val; @@ -316,7 +318,7 @@ static irqreturn_t hdc100x_trigger_handler(int irq, void *p) struct iio_dev *indio_dev = pf->indio_dev; struct hdc100x_data *data = iio_priv(indio_dev); struct i2c_client *client = data->client; - int delay = data->adc_int_us[0] + data->adc_int_us[1]; + int delay = data->adc_int_us[0] + data->adc_int_us[1] + 2*USEC_PER_MSEC; int ret; /* dual read starts at temp register */ From 0f673c16c850250db386537a422c11d248fb123c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Jul 2021 14:01:56 +0200 Subject: [PATCH 021/187] iwlwifi: pnvm: accept multiple HW-type TLVs Some products (So) may have two different types of products with different mac-type that are otherwise equivalent, and have the same PNVM data, so the PNVM file will contain two (or perhaps later more) HW-type TLVs. Accept the file and use the data section that contains any matching entry. Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210719140154.a6a86e903035.Ic0b1b75c45d386698859f251518e8a5144431938@changeid --- drivers/net/wireless/intel/iwlwifi/fw/pnvm.c | 25 +++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c index 2403490cbc26..b4b1f75b9c2a 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c @@ -37,6 +37,7 @@ static int iwl_pnvm_handle_section(struct iwl_trans *trans, const u8 *data, u32 sha1 = 0; u16 mac_type = 0, rf_id = 0; u8 *pnvm_data = NULL, *tmp; + bool hw_match = false; u32 size = 0; int ret; @@ -83,6 +84,9 @@ static int iwl_pnvm_handle_section(struct iwl_trans *trans, const u8 *data, break; } + if (hw_match) + break; + mac_type = le16_to_cpup((__le16 *)data); rf_id = le16_to_cpup((__le16 *)(data + sizeof(__le16))); @@ -90,15 +94,9 @@ static int iwl_pnvm_handle_section(struct iwl_trans *trans, const u8 *data, "Got IWL_UCODE_TLV_HW_TYPE mac_type 0x%0x rf_id 0x%0x\n", mac_type, rf_id); - if (mac_type != CSR_HW_REV_TYPE(trans->hw_rev) || - rf_id != CSR_HW_RFID_TYPE(trans->hw_rf_id)) { - IWL_DEBUG_FW(trans, - "HW mismatch, skipping PNVM section, mac_type 0x%0x, rf_id 0x%0x.\n", - CSR_HW_REV_TYPE(trans->hw_rev), trans->hw_rf_id); - ret = -ENOENT; - goto out; - } - + if (mac_type == CSR_HW_REV_TYPE(trans->hw_rev) && + rf_id == CSR_HW_RFID_TYPE(trans->hw_rf_id)) + hw_match = true; break; case IWL_UCODE_TLV_SEC_RT: { struct iwl_pnvm_section *section = (void *)data; @@ -149,6 +147,15 @@ static int iwl_pnvm_handle_section(struct iwl_trans *trans, const u8 *data, } done: + if (!hw_match) { + IWL_DEBUG_FW(trans, + "HW mismatch, skipping PNVM section (need mac_type 0x%x rf_id 0x%x)\n", + CSR_HW_REV_TYPE(trans->hw_rev), + CSR_HW_RFID_TYPE(trans->hw_rf_id)); + ret = -ENOENT; + goto out; + } + if (!size) { IWL_DEBUG_FW(trans, "Empty PNVM, skipping.\n"); ret = -ENOENT; From a5bf1d4434b93394fa37494d78fe9f3513557185 Mon Sep 17 00:00:00 2001 From: Yaara Baruch Date: Mon, 19 Jul 2021 14:45:38 +0200 Subject: [PATCH 022/187] iwlwifi: add new SoF with JF devices Add new SoF JF devices to the driver. Signed-off-by: Yaara Baruch Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210719144523.0545d8964ff2.I3498879d8c184e42b1578a64aa7b7c99a18b75fb@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 16baee3d52ae..e8693be51fb1 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1110,6 +1110,40 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_NO_CDB, iwl_cfg_bz_a0_mr_a0, iwl_ax211_name), +/* SoF with JF2 */ + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9560_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9560_name), + +/* SoF with JF */ + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9461_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9462_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9461_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SOF, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9462_name), + /* So with GF */ _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, From 891332f697e14bfb2002f56e21d9bbd4800a7098 Mon Sep 17 00:00:00 2001 From: Yaara Baruch Date: Mon, 19 Jul 2021 14:45:39 +0200 Subject: [PATCH 023/187] iwlwifi: add new so-jf devices Add new so-jf devices to the driver. Signed-off-by: Yaara Baruch Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210719144523.1c9a59fd2760.If5aef1942007828210f0f2c4a17985f63050bb45@changeid --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index e8693be51fb1..0b8a0cd3b652 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1149,7 +1149,41 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, IWL_CFG_RF_TYPE_GF, IWL_CFG_ANY, IWL_CFG_160, IWL_CFG_ANY, IWL_CFG_NO_CDB, - iwlax211_2ax_cfg_so_gf_a0, iwl_ax211_name) + iwlax211_2ax_cfg_so_gf_a0, iwl_ax211_name), + +/* So with JF2 */ + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9560_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF2, IWL_CFG_RF_ID_JF, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9560_name), + +/* So with JF */ + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9461_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, + IWL_CFG_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9462_160_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9461_name), + _IWL_DEV_INFO(IWL_CFG_ANY, IWL_CFG_ANY, + IWL_CFG_MAC_TYPE_SO, IWL_CFG_ANY, + IWL_CFG_RF_TYPE_JF1, IWL_CFG_RF_ID_JF1_DIV, + IWL_CFG_NO_160, IWL_CFG_CORES_BT, IWL_CFG_NO_CDB, + iwlax210_2ax_cfg_so_jf_b0, iwl9462_name) #endif /* CONFIG_IWLMVM */ }; From facee1be7689f8cf573b9ffee6a5c28ee193615e Mon Sep 17 00:00:00 2001 From: David Brazdil Date: Wed, 28 Jul 2021 15:32:31 +0000 Subject: [PATCH 024/187] KVM: arm64: Fix off-by-one in range_is_memory Hyp checks whether an address range only covers RAM by checking the start/endpoints against a list of memblock_region structs. However, the endpoint here is exclusive but internally is treated as inclusive. Fix the off-by-one error that caused valid address ranges to be rejected. Cc: Quentin Perret Fixes: 90134ac9cabb6 ("KVM: arm64: Protect the .hyp sections from the host") Signed-off-by: David Brazdil Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210728153232.1018911-2-dbrazdil@google.com --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index d938ce95d3bd..a6ce991b1467 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -193,7 +193,7 @@ static bool range_is_memory(u64 start, u64 end) { struct kvm_mem_range r1, r2; - if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2)) + if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) return false; if (r1.start != r2.start) return false; From c4d7c51845af9542d42cd18a25c570583abf2768 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Thu, 29 Jul 2021 17:00:36 +0100 Subject: [PATCH 025/187] KVM: arm64: Fix race when enabling KVM_ARM_CAP_MTE When enabling KVM_CAP_ARM_MTE the ioctl checks that there are no VCPUs created to ensure that the capability is enabled before the VM is running. However no locks are held at that point so it is (theoretically) possible for another thread in the VMM to create VCPUs between the check and actually setting mte_enabled. Close the race by taking kvm->lock. Reported-by: Alexandru Elisei Fixes: 673638f434ee ("KVM: arm64: Expose KVM_ARM_CAP_MTE") Signed-off-by: Steven Price Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210729160036.20433-1-steven.price@arm.com --- arch/arm64/kvm/arm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e9a2b8f27792..0ca72f5cda41 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -94,10 +94,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.return_nisv_io_abort_to_user = true; break; case KVM_CAP_ARM_MTE: - if (!system_supports_mte() || kvm->created_vcpus) - return -EINVAL; - r = 0; - kvm->arch.mte_enabled = true; + mutex_lock(&kvm->lock); + if (!system_supports_mte() || kvm->created_vcpus) { + r = -EINVAL; + } else { + r = 0; + kvm->arch.mte_enabled = true; + } + mutex_unlock(&kvm->lock); break; default: r = -EINVAL; From 567c39047dbee341244fe3bf79fea24ee0897ff9 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 5 Jul 2021 08:09:21 +0300 Subject: [PATCH 026/187] selftests/sgx: Fix Q1 and Q2 calculation in sigstruct.c Q1 and Q2 are numbers with *maximum* length of 384 bytes. If the calculated length of Q1 and Q2 is less than 384 bytes, things will go wrong. E.g. if Q2 is 383 bytes, then 1. The bytes of q2 are copied to sigstruct->q2 in calc_q1q2(). 2. The entire sigstruct->q2 is reversed, which results it being 256 * Q2, given that the last byte of sigstruct->q2 is added to before the bytes given by calc_q1q2(). Either change in key or measurement can trigger the bug. E.g. an unmeasured heap could cause a devastating change in Q1 or Q2. Reverse exactly the bytes of Q1 and Q2 in calc_q1q2() before returning to the caller. Fixes: 2adcba79e69d ("selftests/x86: Add a selftest for SGX") Link: https://lore.kernel.org/linux-sgx/20210301051836.30738-1-tianjia.zhang@linux.alibaba.com/ Signed-off-by: Tianjia Zhang Signed-off-by: Jarkko Sakkinen Signed-off-by: Shuah Khan --- tools/testing/selftests/sgx/sigstruct.c | 41 +++++++++++++------------ 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index dee7a3d6c5a5..92bbc5a15c39 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -55,10 +55,27 @@ static bool alloc_q1q2_ctx(const uint8_t *s, const uint8_t *m, return true; } +static void reverse_bytes(void *data, int length) +{ + int i = 0; + int j = length - 1; + uint8_t temp; + uint8_t *ptr = data; + + while (i < j) { + temp = ptr[i]; + ptr[i] = ptr[j]; + ptr[j] = temp; + i++; + j--; + } +} + static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, uint8_t *q2) { struct q1q2_ctx ctx; + int len; if (!alloc_q1q2_ctx(s, m, &ctx)) { fprintf(stderr, "Not enough memory for Q1Q2 calculation\n"); @@ -89,8 +106,10 @@ static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, goto out; } - BN_bn2bin(ctx.q1, q1); - BN_bn2bin(ctx.q2, q2); + len = BN_bn2bin(ctx.q1, q1); + reverse_bytes(q1, len); + len = BN_bn2bin(ctx.q2, q2); + reverse_bytes(q2, len); free_q1q2_ctx(&ctx); return true; @@ -152,22 +171,6 @@ static RSA *gen_sign_key(void) return key; } -static void reverse_bytes(void *data, int length) -{ - int i = 0; - int j = length - 1; - uint8_t temp; - uint8_t *ptr = data; - - while (i < j) { - temp = ptr[i]; - ptr[i] = ptr[j]; - ptr[j] = temp; - i++; - j--; - } -} - enum mrtags { MRECREATE = 0x0045544145524345, MREADD = 0x0000000044444145, @@ -367,8 +370,6 @@ bool encl_measure(struct encl *encl) /* BE -> LE */ reverse_bytes(sigstruct->signature, SGX_MODULUS_SIZE); reverse_bytes(sigstruct->modulus, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q1, SGX_MODULUS_SIZE); - reverse_bytes(sigstruct->q2, SGX_MODULUS_SIZE); EVP_MD_CTX_destroy(ctx); RSA_free(key); From 5afc1540f13804a31bb704b763308e17688369c5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 30 Jul 2021 08:16:51 +0100 Subject: [PATCH 027/187] iio: adc: Fix incorrect exit of for-loop Currently the for-loop that scans for the optimial adc_period iterates through all the possible adc_period levels because the exit logic in the loop is inverted. I believe the comparison should be swapped and the continue replaced with a break to exit the loop at the correct point. Addresses-Coverity: ("Continue has no effect") Fixes: e08e19c331fb ("iio:adc: add iio driver for Palmas (twl6035/7) gpadc") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20210730071651.17394-1-colin.king@canonical.com Cc: Signed-off-by: Jonathan Cameron --- drivers/iio/adc/palmas_gpadc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/palmas_gpadc.c b/drivers/iio/adc/palmas_gpadc.c index 6ef09609be9f..f9c8385c72d3 100644 --- a/drivers/iio/adc/palmas_gpadc.c +++ b/drivers/iio/adc/palmas_gpadc.c @@ -664,8 +664,8 @@ static int palmas_adc_wakeup_configure(struct palmas_gpadc *adc) adc_period = adc->auto_conversion_period; for (i = 0; i < 16; ++i) { - if (((1000 * (1 << i)) / 32) < adc_period) - continue; + if (((1000 * (1 << i)) / 32) >= adc_period) + break; } if (i > 0) i--; From 5b94046efb4706b3429c9c8e7377bd8d1621d588 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 26 Jul 2021 11:38:41 +0200 Subject: [PATCH 028/187] efi/libstub: arm64: Force Image reallocation if BSS was not reserved Distro versions of GRUB replace the usual LoadImage/StartImage calls used to load the kernel image with some local code that fails to honor the allocation requirements described in the PE/COFF header, as it does not account for the image's BSS section at all: it fails to allocate space for it, and fails to zero initialize it. Since the EFI stub itself is allocated in the .init segment, which is in the middle of the image, its BSS section is not impacted by this, and the main consequence of this omission is that the BSS section may overlap with memory regions that are already used by the firmware. So let's warn about this condition, and force image reallocation to occur in this case, which works around the problem. Fixes: 82046702e288 ("efi/libstub/arm64: Replace 'preferred' offset with alignment check") Signed-off-by: Ard Biesheuvel Tested-by: Benjamin Herrenschmidt --- drivers/firmware/efi/libstub/arm64-stub.c | 49 ++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 7bf0a7acae5e..3698c1ce2940 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -34,6 +34,51 @@ efi_status_t check_platform_features(void) return EFI_SUCCESS; } +/* + * Distro versions of GRUB may ignore the BSS allocation entirely (i.e., fail + * to provide space, and fail to zero it). Check for this condition by double + * checking that the first and the last byte of the image are covered by the + * same EFI memory map entry. + */ +static bool check_image_region(u64 base, u64 size) +{ + unsigned long map_size, desc_size, buff_size; + efi_memory_desc_t *memory_map; + struct efi_boot_memmap map; + efi_status_t status; + bool ret = false; + int map_offset; + + map.map = &memory_map; + map.map_size = &map_size; + map.desc_size = &desc_size; + map.desc_ver = NULL; + map.key_ptr = NULL; + map.buff_size = &buff_size; + + status = efi_get_memory_map(&map); + if (status != EFI_SUCCESS) + return false; + + for (map_offset = 0; map_offset < map_size; map_offset += desc_size) { + efi_memory_desc_t *md = (void *)memory_map + map_offset; + u64 end = md->phys_addr + md->num_pages * EFI_PAGE_SIZE; + + /* + * Find the region that covers base, and return whether + * it covers base+size bytes. + */ + if (base >= md->phys_addr && base < end) { + ret = (base + size) <= end; + break; + } + } + + efi_bs_call(free_pool, memory_map); + + return ret; +} + /* * Although relocatable kernels can fix up the misalignment with respect to * MIN_KIMG_ALIGN, the resulting virtual text addresses are subtly out of @@ -92,7 +137,9 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } if (status != EFI_SUCCESS) { - if (IS_ALIGNED((u64)_text, min_kimg_align())) { + if (!check_image_region((u64)_text, kernel_memsize)) { + efi_err("FIRMWARE BUG: Image BSS overlaps adjacent EFI memory region\n"); + } else if (IS_ALIGNED((u64)_text, min_kimg_align())) { /* * Just execute from wherever we were loaded by the * UEFI PE/COFF loader if the alignment is suitable. From 3a262423755b83a5f85009ace415d6e7f572dfe8 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 22 Jul 2021 12:10:31 +0200 Subject: [PATCH 029/187] efi/libstub: arm64: Relax 2M alignment again for relocatable kernels Commit 82046702e288 ("efi/libstub/arm64: Replace 'preferred' offset with alignment check") simplified the way the stub moves the kernel image around in memory before booting it, given that a relocatable image does not need to be copied to a 2M aligned offset if it was loaded on a 64k boundary by EFI. Commit d32de9130f6c ("efi/arm64: libstub: Deal gracefully with EFI_RNG_PROTOCOL failure") inadvertently defeated this logic by overriding the value of efi_nokaslr if EFI_RNG_PROTOCOL is not available, which was mistaken by the loader logic as an explicit request on the part of the user to disable KASLR and any associated relocation of an Image not loaded on a 2M boundary. So let's reinstate this functionality, by capturing the value of efi_nokaslr at function entry to choose the minimum alignment. Fixes: d32de9130f6c ("efi/arm64: libstub: Deal gracefully with EFI_RNG_PROTOCOL failure") Signed-off-by: Ard Biesheuvel Tested-by: Benjamin Herrenschmidt --- drivers/firmware/efi/libstub/arm64-stub.c | 28 +++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 3698c1ce2940..6f214c9c303e 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -79,18 +79,6 @@ static bool check_image_region(u64 base, u64 size) return ret; } -/* - * Although relocatable kernels can fix up the misalignment with respect to - * MIN_KIMG_ALIGN, the resulting virtual text addresses are subtly out of - * sync with those recorded in the vmlinux when kaslr is disabled but the - * image required relocation anyway. Therefore retain 2M alignment unless - * KASLR is in use. - */ -static u64 min_kimg_align(void) -{ - return efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; -} - efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, @@ -101,6 +89,16 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long kernel_size, kernel_memsize = 0; u32 phys_seed = 0; + /* + * Although relocatable kernels can fix up the misalignment with + * respect to MIN_KIMG_ALIGN, the resulting virtual text addresses are + * subtly out of sync with those recorded in the vmlinux when kaslr is + * disabled but the image required relocation anyway. Therefore retain + * 2M alignment if KASLR was explicitly disabled, even if it was not + * going to be activated to begin with. + */ + u64 min_kimg_align = efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { if (!efi_nokaslr) { status = efi_get_random_bytes(sizeof(phys_seed), @@ -130,7 +128,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, * If KASLR is enabled, and we have some randomness available, * locate the kernel at a randomized offset in physical memory. */ - status = efi_random_alloc(*reserve_size, min_kimg_align(), + status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed); } else { status = EFI_OUT_OF_RESOURCES; @@ -139,7 +137,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (status != EFI_SUCCESS) { if (!check_image_region((u64)_text, kernel_memsize)) { efi_err("FIRMWARE BUG: Image BSS overlaps adjacent EFI memory region\n"); - } else if (IS_ALIGNED((u64)_text, min_kimg_align())) { + } else if (IS_ALIGNED((u64)_text, min_kimg_align)) { /* * Just execute from wherever we were loaded by the * UEFI PE/COFF loader if the alignment is suitable. @@ -150,7 +148,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } status = efi_allocate_pages_aligned(*reserve_size, reserve_addr, - ULONG_MAX, min_kimg_align()); + ULONG_MAX, min_kimg_align); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); From ff80ef5bf5bd59e5eab82d1d846acc613ebbf6c4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 26 Jul 2021 16:24:01 +0200 Subject: [PATCH 030/187] efi/libstub: arm64: Warn when efi_random_alloc() fails Randomization of the physical load address of the kernel image relies on efi_random_alloc() returning successfully, and currently, we ignore any failures and just carry on, using the ordinary, non-randomized page allocator routine. This means we never find out if a failure occurs, which could harm security, so let's at least warn about this condition. Signed-off-by: Ard Biesheuvel Tested-by: Benjamin Herrenschmidt --- drivers/firmware/efi/libstub/arm64-stub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 6f214c9c303e..010564f8bbc4 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -130,6 +130,8 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, */ status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed); + if (status != EFI_SUCCESS) + efi_warn("efi_random_alloc() failed: 0x%lx\n", status); } else { status = EFI_OUT_OF_RESOURCES; } From c32ac11da3f83bb42b986702a9b92f0a14ed4182 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 26 Jul 2021 16:31:44 +0200 Subject: [PATCH 031/187] efi/libstub: arm64: Double check image alignment at entry On arm64, the stub only moves the kernel image around in memory if needed, which is typically only for KASLR, given that relocatable kernels (which is the default) can run from any 64k aligned address, which is also the minimum alignment communicated to EFI via the PE/COFF header. Unfortunately, some loaders appear to ignore this header, and load the kernel at some arbitrary offset in memory. We can deal with this, but let's check for this condition anyway, so non-compliant code can be spotted and fixed. Cc: # v5.10+ Signed-off-by: Ard Biesheuvel Tested-by: Benjamin Herrenschmidt --- drivers/firmware/efi/libstub/arm64-stub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 010564f8bbc4..2363fee9211c 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -119,6 +119,10 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (image->image_base != _text) efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); + if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN)) + efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n", + EFI_KIMG_ALIGN >> 10); + kernel_size = _edata - _text; kernel_memsize = kernel_size + (_end - _edata); *reserve_size = kernel_memsize; From 2f658f7a3953f6d70bab90e117aff8d0ad44e200 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Aug 2021 14:21:41 +0300 Subject: [PATCH 032/187] pinctrl: tigerlake: Fix GPIO mapping for newer version of software The software mapping for GPIO, which initially comes from Microsoft, is subject to change by respective Windows and firmware developers. Due to the above the driver had been written and published way ahead of the schedule, and thus the numbering schema used in it is outdated. Fix the numbering schema in accordance with the real products on market. Fixes: 653d96455e1e ("pinctrl: tigerlake: Add support for Tiger Lake-H") Reported-and-tested-by: Kai-Heng Feng Reported-by: Riccardo Mori Reported-and-tested-by: Lovesh BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=213463 BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=213579 BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=213857 Signed-off-by: Andy Shevchenko Acked-by: Mika Westerberg --- drivers/pinctrl/intel/pinctrl-tigerlake.c | 26 +++++++++++------------ 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/intel/pinctrl-tigerlake.c b/drivers/pinctrl/intel/pinctrl-tigerlake.c index 3e4ef2b87526..0bcd19597e4a 100644 --- a/drivers/pinctrl/intel/pinctrl-tigerlake.c +++ b/drivers/pinctrl/intel/pinctrl-tigerlake.c @@ -701,32 +701,32 @@ static const struct pinctrl_pin_desc tglh_pins[] = { static const struct intel_padgroup tglh_community0_gpps[] = { TGL_GPP(0, 0, 24, 0), /* GPP_A */ - TGL_GPP(1, 25, 44, 128), /* GPP_R */ - TGL_GPP(2, 45, 70, 32), /* GPP_B */ - TGL_GPP(3, 71, 78, INTEL_GPIO_BASE_NOMAP), /* vGPIO_0 */ + TGL_GPP(1, 25, 44, 32), /* GPP_R */ + TGL_GPP(2, 45, 70, 64), /* GPP_B */ + TGL_GPP(3, 71, 78, 96), /* vGPIO_0 */ }; static const struct intel_padgroup tglh_community1_gpps[] = { - TGL_GPP(0, 79, 104, 96), /* GPP_D */ - TGL_GPP(1, 105, 128, 64), /* GPP_C */ - TGL_GPP(2, 129, 136, 160), /* GPP_S */ - TGL_GPP(3, 137, 153, 192), /* GPP_G */ - TGL_GPP(4, 154, 180, 224), /* vGPIO */ + TGL_GPP(0, 79, 104, 128), /* GPP_D */ + TGL_GPP(1, 105, 128, 160), /* GPP_C */ + TGL_GPP(2, 129, 136, 192), /* GPP_S */ + TGL_GPP(3, 137, 153, 224), /* GPP_G */ + TGL_GPP(4, 154, 180, 256), /* vGPIO */ }; static const struct intel_padgroup tglh_community3_gpps[] = { - TGL_GPP(0, 181, 193, 256), /* GPP_E */ - TGL_GPP(1, 194, 217, 288), /* GPP_F */ + TGL_GPP(0, 181, 193, 288), /* GPP_E */ + TGL_GPP(1, 194, 217, 320), /* GPP_F */ }; static const struct intel_padgroup tglh_community4_gpps[] = { - TGL_GPP(0, 218, 241, 320), /* GPP_H */ + TGL_GPP(0, 218, 241, 352), /* GPP_H */ TGL_GPP(1, 242, 251, 384), /* GPP_J */ - TGL_GPP(2, 252, 266, 352), /* GPP_K */ + TGL_GPP(2, 252, 266, 416), /* GPP_K */ }; static const struct intel_padgroup tglh_community5_gpps[] = { - TGL_GPP(0, 267, 281, 416), /* GPP_I */ + TGL_GPP(0, 267, 281, 448), /* GPP_I */ TGL_GPP(1, 282, 290, INTEL_GPIO_BASE_NOMAP), /* JTAG */ }; From 7d3fc01796fc895e5fcce45c994c5a8db8120a8d Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Wed, 4 Aug 2021 18:37:22 +0000 Subject: [PATCH 033/187] cifs: create sd context must be a multiple of 8 We used to follow the rule earlier that the create SD context always be a multiple of 8. However, with the change: cifs: refactor create_sd_buf() and and avoid corrupting the buffer ...we recompute the length, and we failed that rule. Fixing that with this change. Cc: # v5.10+ Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 781d14e5f2af..b6d2e3591927 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2426,7 +2426,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) memcpy(aclptr, &acl, sizeof(struct cifs_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = ptr - (__u8 *)buf; + *len = roundup(ptr - (__u8 *)buf, 8); return buf; } From abf3d98dee7c4038152ce88833ddc2189f68cbd4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 21 Jul 2021 17:06:56 +0200 Subject: [PATCH 034/187] mt76: fix enum type mismatch There is no 'NONE' version of 'enum mcu_cipher_type', and returning 'MT_CIPHER_NONE' causes a warning: drivers/net/wireless/mediatek/mt76/mt7921/mcu.c: In function 'mt7921_mcu_get_cipher': drivers/net/wireless/mediatek/mt76/mt7921/mcu.c:114:24: error: implicit conversion from 'enum mt76_cipher_type' to 'enum mcu_cipher_type' [-Werror=enum-conversion] 114 | return MT_CIPHER_NONE; | ^~~~~~~~~~~~~~ Add the missing MCU_CIPHER_NONE defintion that fits in here with the same value. Fixes: c368362c36d3 ("mt76: fix iv and CCMP header insertion") Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20210721150745.1914829-1-arnd@kernel.org --- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/mcu.h | 3 ++- drivers/net/wireless/mediatek/mt76/mt7921/mcu.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7921/mcu.h | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 863aa18b3024..43960770a9af 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -111,7 +111,7 @@ mt7915_mcu_get_cipher(int cipher) case WLAN_CIPHER_SUITE_SMS4: return MCU_CIPHER_WAPI; default: - return MT_CIPHER_NONE; + return MCU_CIPHER_NONE; } } diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h index edd3ba3a0c2d..e68a562cc5b4 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.h @@ -1073,7 +1073,8 @@ enum { }; enum mcu_cipher_type { - MCU_CIPHER_WEP40 = 1, + MCU_CIPHER_NONE = 0, + MCU_CIPHER_WEP40, MCU_CIPHER_WEP104, MCU_CIPHER_WEP128, MCU_CIPHER_TKIP, diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c index cd690c64f65b..9fbaacc67cfa 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.c @@ -111,7 +111,7 @@ mt7921_mcu_get_cipher(int cipher) case WLAN_CIPHER_SUITE_SMS4: return MCU_CIPHER_WAPI; default: - return MT_CIPHER_NONE; + return MCU_CIPHER_NONE; } } diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.h b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.h index d76cf8f8dfdf..de3c091f6736 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/mcu.h +++ b/drivers/net/wireless/mediatek/mt76/mt7921/mcu.h @@ -199,7 +199,8 @@ struct sta_rec_sec { } __packed; enum mcu_cipher_type { - MCU_CIPHER_WEP40 = 1, + MCU_CIPHER_NONE = 0, + MCU_CIPHER_WEP40, MCU_CIPHER_WEP104, MCU_CIPHER_WEP128, MCU_CIPHER_TKIP, From 99dc4ad992bf156692b088fab4d98deab7cbd3e6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 12 Jul 2021 15:52:14 +0100 Subject: [PATCH 035/187] mtd: devices: mchp48l640: Fix memory leak on cmd The allocation for cmd is not being kfree'd on the return leading to a memory leak. Fix this by kfree'ing it. Addresses-Coverity: ("Resource leak") Fixes: 88d125026753 ("mtd: devices: add support for microchip 48l640 EERAM") Signed-off-by: Colin Ian King Reviewed-by: Heiko Schocher Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210712145214.101377-1-colin.king@canonical.com --- drivers/mtd/devices/mchp48l640.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/devices/mchp48l640.c b/drivers/mtd/devices/mchp48l640.c index ad66b5aaf4e9..99400d0fb8c1 100644 --- a/drivers/mtd/devices/mchp48l640.c +++ b/drivers/mtd/devices/mchp48l640.c @@ -255,6 +255,7 @@ static int mchp48l640_read_page(struct mtd_info *mtd, loff_t from, size_t len, if (!ret) *retlen += len; + kfree(cmd); return ret; fail: From b7abb051682263e51866bc78762fd0083d64c5ed Mon Sep 17 00:00:00 2001 From: Desmond Cheong Zhi Xi Date: Sat, 17 Jul 2021 18:07:19 +0800 Subject: [PATCH 036/187] mtd: fix lock hierarchy in deregister_mtd_blktrans There is a lock hierarchy of major_names_lock --> mtd_table_mutex. One existing chain is as follows: 1. major_names_lock --> loop_ctl_mutex (when blk_request_module calls loop_probe) 2. loop_ctl_mutex --> bdev->bd_mutex (when loop_control_ioctl calls loop_remove, which then calls del_gendisk) 3. bdev->bd_mutex --> mtd_table_mutex (when blkdev_get_by_dev calls __blkdev_get, which then calls blktrans_open) Since unregister_blkdev grabs the major_names_lock, we need to call it outside the critical section for mtd_table_mutex, otherwise we invert the lock hierarchy. Reported-by: Hillf Danton Signed-off-by: Desmond Cheong Zhi Xi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210717100719.728829-1-desmondcheongzx@gmail.com --- drivers/mtd/mtd_blkdevs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 31b208fb225e..44bea3f65060 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -561,8 +561,8 @@ int deregister_mtd_blktrans(struct mtd_blktrans_ops *tr) list_for_each_entry_safe(dev, next, &tr->devs, list) tr->remove_dev(dev); - unregister_blkdev(tr->major, tr->name); mutex_unlock(&mtd_table_mutex); + unregister_blkdev(tr->major, tr->name); BUG_ON(!list_empty(&tr->devs)); return 0; From b48027083a78b13356695555a05b0e085e378687 Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Tue, 27 Jul 2021 11:58:13 +0530 Subject: [PATCH 037/187] mtd: rawnand: Fix probe failure due to of_get_nand_secure_regions() Due to 14f97f0b8e2b, the rawnand platforms without "secure-regions" property defined in DT fails to probe. The issue is, of_get_nand_secure_regions() errors out if of_property_count_elems_of_size() returns a negative error code. If the "secure-regions" property is not present in DT, then also we'll get -EINVAL from of_property_count_elems_of_size() but it should not be treated as an error for platforms not declaring "secure-regions" in DT. So fix this behaviour by checking for the existence of that property in DT and return 0 if it is not present. Fixes: 14f97f0b8e2b ("mtd: rawnand: Add a check in of_get_nand_secure_regions()") Reported-by: Martin Kaiser Signed-off-by: Manivannan Sadhasivam Reviewed-by: Martin Kaiser Tested-by: Martin Kaiser Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20210727062813.32619-1-manivannan.sadhasivam@linaro.org --- drivers/mtd/nand/raw/nand_base.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index cbba46432e39..3d6c6e880520 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5228,8 +5228,14 @@ static bool of_get_nand_on_flash_bbt(struct device_node *np) static int of_get_nand_secure_regions(struct nand_chip *chip) { struct device_node *dn = nand_get_flash_node(chip); + struct property *prop; int nr_elem, i, j; + /* Only proceed if the "secure-regions" property is present in DT */ + prop = of_find_property(dn, "secure-regions", NULL); + if (!prop) + return 0; + nr_elem = of_property_count_elems_of_size(dn, "secure-regions", sizeof(u64)); if (nr_elem <= 0) return nr_elem; From 62376365048878f770d8b7d11b89b8b3e18018f1 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 3 Aug 2021 15:14:27 +0000 Subject: [PATCH 038/187] powerpc/32s: Fix napping restore in data storage interrupt (DSI) When a DSI (Data Storage Interrupt) is taken while in NAP mode, r11 doesn't survive the call to power_save_ppc32_restore(). So use r1 instead of r11 as they both contain the virtual stack pointer at that point. Fixes: 4c0104a83fc3 ("powerpc/32: Dismantle EXC_XFER_STD/LITE/TEMPLATE") Cc: stable@vger.kernel.org # v5.13+ Reported-by: Finn Thain Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/731694e0885271f6ee9ffc179eb4bcee78313682.1628003562.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_book3s_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 764edd860ed4..68e5c0a7e99d 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -300,7 +300,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_HPTE_TABLE) EXCEPTION_PROLOG_1 EXCEPTION_PROLOG_2 INTERRUPT_DATA_STORAGE DataAccess handle_dar_dsisr=1 prepare_transfer_to_handler - lwz r5, _DSISR(r11) + lwz r5, _DSISR(r1) andis. r0, r5, DSISR_DABRMATCH@h bne- 1f bl do_page_fault From b5cfc9cd7b0426e94ffd9e9ed79d1b00ace7780a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 7 Jul 2021 05:55:07 +0000 Subject: [PATCH 039/187] powerpc/32: Fix critical and debug interrupts on BOOKE 32 bits BOOKE have special interrupts for debug and other critical events. When handling those interrupts, dedicated registers are saved in the stack frame in addition to the standard registers, leading to a shift of the pt_regs struct. Since commit db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry"), the pt_regs struct is expected to be at the same place all the time. Instead of handling a special struct in addition to pt_regs, just add those special registers to struct pt_regs. Fixes: db297c3b07af ("powerpc/32: Don't save thread.regs on interrupt entry") Cc: stable@vger.kernel.org Reported-by: Radu Rendec Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/028d5483b4851b01ea4334d0751e7f260419092b.1625637264.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 16 ++++++++++++++++ arch/powerpc/kernel/asm-offsets.c | 31 ++++++++++++++----------------- arch/powerpc/kernel/head_booke.h | 27 +++------------------------ 3 files changed, 33 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..14422e851494 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -70,6 +70,22 @@ struct pt_regs unsigned long __pad[4]; /* Maintain 16 byte interrupt stack alignment */ }; #endif +#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) + struct { /* Must be a multiple of 16 bytes */ + unsigned long mas0; + unsigned long mas1; + unsigned long mas2; + unsigned long mas3; + unsigned long mas6; + unsigned long mas7; + unsigned long srr0; + unsigned long srr1; + unsigned long csrr0; + unsigned long csrr1; + unsigned long dsrr0; + unsigned long dsrr1; + }; +#endif }; #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a47eefa09bcb..5bee245d832b 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -309,24 +309,21 @@ int main(void) STACK_PT_REGS_OFFSET(STACK_REGS_IAMR, iamr); #endif -#if defined(CONFIG_PPC32) -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) - DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); - DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); +#if defined(CONFIG_PPC32) && defined(CONFIG_BOOKE) + STACK_PT_REGS_OFFSET(MAS0, mas0); /* we overload MMUCR for 44x on MAS0 since they are mutually exclusive */ - DEFINE(MMUCR, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); - DEFINE(MAS1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas1)); - DEFINE(MAS2, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas2)); - DEFINE(MAS3, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas3)); - DEFINE(MAS6, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas6)); - DEFINE(MAS7, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas7)); - DEFINE(_SRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr0)); - DEFINE(_SRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, srr1)); - DEFINE(_CSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr0)); - DEFINE(_CSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, csrr1)); - DEFINE(_DSRR0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr0)); - DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); -#endif + STACK_PT_REGS_OFFSET(MMUCR, mas0); + STACK_PT_REGS_OFFSET(MAS1, mas1); + STACK_PT_REGS_OFFSET(MAS2, mas2); + STACK_PT_REGS_OFFSET(MAS3, mas3); + STACK_PT_REGS_OFFSET(MAS6, mas6); + STACK_PT_REGS_OFFSET(MAS7, mas7); + STACK_PT_REGS_OFFSET(_SRR0, srr0); + STACK_PT_REGS_OFFSET(_SRR1, srr1); + STACK_PT_REGS_OFFSET(_CSRR0, csrr0); + STACK_PT_REGS_OFFSET(_CSRR1, csrr1); + STACK_PT_REGS_OFFSET(_DSRR0, dsrr0); + STACK_PT_REGS_OFFSET(_DSRR1, dsrr1); #endif /* About the CPU features table */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 87b806e8eded..e5503420b6c6 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -168,20 +168,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) /* only on e500mc */ #define DBG_STACK_BASE dbgirq_ctx -#define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) - #ifdef CONFIG_SMP #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ mfspr r8,SPRN_PIR; \ slwi r8,r8,2; \ addis r8,r8,level##_STACK_BASE@ha; \ lwz r8,level##_STACK_BASE@l(r8); \ - addi r8,r8,EXC_LVL_FRAME_OVERHEAD; + addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; #else #define BOOKE_LOAD_EXC_LEVEL_STACK(level) \ lis r8,level##_STACK_BASE@ha; \ lwz r8,level##_STACK_BASE@l(r8); \ - addi r8,r8,EXC_LVL_FRAME_OVERHEAD; + addi r8,r8,THREAD_SIZE - INT_FRAME_SIZE; #endif /* @@ -208,7 +206,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) mtmsr r11; \ mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11, TASK_STACK - THREAD(r11); /* this thread's kernel stack */\ - addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ + addi r11,r11,THREAD_SIZE - INT_FRAME_SIZE; /* allocate stack frame */\ beq 1f; \ /* COMING FROM USER MODE */ \ stw r9,_CCR(r11); /* save CR */\ @@ -516,24 +514,5 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_EMB_HV) bl kernel_fp_unavailable_exception; \ b interrupt_return -#else /* __ASSEMBLY__ */ -struct exception_regs { - unsigned long mas0; - unsigned long mas1; - unsigned long mas2; - unsigned long mas3; - unsigned long mas6; - unsigned long mas7; - unsigned long srr0; - unsigned long srr1; - unsigned long csrr0; - unsigned long csrr1; - unsigned long dsrr0; - unsigned long dsrr1; -}; - -/* ensure this structure is always sized to a multiple of the stack alignment */ -#define STACK_EXC_LVL_FRAME_SIZE ALIGN(sizeof (struct exception_regs), 16) - #endif /* __ASSEMBLY__ */ #endif /* __HEAD_BOOKE_H__ */ From 8241461536f21bbe51308a6916d1c9fb2e6b75a7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 4 Aug 2021 18:24:10 +0000 Subject: [PATCH 040/187] powerpc/smp: Fix OOPS in topology_init() Running an SMP kernel on an UP platform not prepared for it, I encountered the following OOPS: BUG: Kernel NULL pointer dereference on read at 0x00000034 Faulting instruction address: 0xc0a04110 Oops: Kernel access of bad area, sig: 11 [#1] BE PAGE_SIZE=4K SMP NR_CPUS=2 CMPCPRO Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-pmac-00001-g230fedfaad21 #5234 NIP: c0a04110 LR: c0a040d8 CTR: c0a04084 REGS: e100dda0 TRAP: 0300 Not tainted (5.13.0-pmac-00001-g230fedfaad21) MSR: 00009032 CR: 84000284 XER: 00000000 DAR: 00000034 DSISR: 20000000 GPR00: c0006bd4 e100de60 c1033320 00000000 00000000 c0942274 00000000 00000000 GPR08: 00000000 00000000 00000001 00000063 00000007 00000000 c0006f30 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000005 GPR24: c0c67d74 c0c67f1c c0c60000 c0c67d70 c0c0c558 1efdf000 c0c00020 00000000 NIP [c0a04110] topology_init+0x8c/0x138 LR [c0a040d8] topology_init+0x54/0x138 Call Trace: [e100de60] [80808080] 0x80808080 (unreliable) [e100de90] [c0006bd4] do_one_initcall+0x48/0x1bc [e100def0] [c0a0150c] kernel_init_freeable+0x1c8/0x278 [e100df20] [c0006f44] kernel_init+0x14/0x10c [e100df30] [c00190fc] ret_from_kernel_thread+0x14/0x1c Instruction dump: 7c692e70 7d290194 7c035040 7c7f1b78 5529103a 546706fe 5468103a 39400001 7c641b78 40800054 80c690b4 7fb9402e <81060034> 7fbeea14 2c080000 7fa3eb78 ---[ end trace b246ffbc6bbbb6fb ]--- Fix it by checking smp_ops before using it, as already done in several other places in the arch/powerpc/kernel/smp.c Fixes: 39f87561454d ("powerpc/smp: Move ppc_md.cpu_die() to smp_ops.cpu_offline_self()") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/75287841cbb8740edd44880fe60be66d489160d9.1628097995.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 5ff0e55d0db1..defecb3b1b15 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -1167,7 +1167,7 @@ static int __init topology_init(void) * CPU. For instance, the boot cpu might never be valid * for hotplugging. */ - if (smp_ops->cpu_offline_self) + if (smp_ops && smp_ops->cpu_offline_self) c->hotpluggable = 1; #endif From c18956e6e0b95f78dad2773ecc8c61a9e41f6405 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 5 Aug 2021 17:23:08 +0200 Subject: [PATCH 041/187] powerpc/pseries: Fix update of LPAR security flavor after LPM After LPM, when migrating from a system with security mitigation enabled to a system with mitigation disabled, the security flavor exposed in /proc is not correctly set back to 0. Do not assume the value of the security flavor is set to 0 when entering init_cpu_char_feature_flags(), so when called after a LPM, the value is set correctly even if the mitigation are not turned off. Fixes: 6ce56e1ac380 ("powerpc/pseries: export LPAR security flavor in lparcfg") Cc: stable@vger.kernel.org # v5.13+ Signed-off-by: Laurent Dufour Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805152308.33988-1-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/setup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 6b0886668465..0dfaa6ab44cc 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -539,9 +539,10 @@ static void init_cpu_char_feature_flags(struct h_cpu_char_result *result) * H_CPU_BEHAV_FAVOUR_SECURITY_H could be set only if * H_CPU_BEHAV_FAVOUR_SECURITY is. */ - if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) + if (!(result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY)) { security_ftr_clear(SEC_FTR_FAVOUR_SECURITY); - else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H) + pseries_security_flavor = 0; + } else if (result->behaviour & H_CPU_BEHAV_FAVOUR_SECURITY_H) pseries_security_flavor = 1; else pseries_security_flavor = 2; From ef98eb0409c31c39ab55ff46b2721c3b4f84c122 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 7 Aug 2021 17:13:41 -0700 Subject: [PATCH 042/187] io_uring: clear TIF_NOTIFY_SIGNAL when running task work When using SQPOLL, the submission queue polling thread calls task_work_run() to run queued work. However, when work is added with TWA_SIGNAL - as done by io_uring itself - the TIF_NOTIFY_SIGNAL remains set afterwards and is never cleared. Consequently, when the submission queue polling thread checks whether signal_pending(), it may always find a pending signal, if task_work_add() was ever called before. The impact of this bug might be different on different kernel versions. It appears that on 5.14 it would only cause unnecessary calculation and prevent the polling thread from sleeping. On 5.13, where the bug was found, it stops the polling thread from finding newly submitted work. Instead of task_work_run(), use tracehook_notify_signal() that clears TIF_NOTIFY_SIGNAL. Test for TIF_NOTIFY_SIGNAL in addition to current->task_works to avoid a race in which task_works is cleared but the TIF_NOTIFY_SIGNAL is set. Fixes: 685fe7feedb96 ("io-wq: eliminate the need for a manager thread") Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210808001342.964634-2-namit@vmware.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index bf548af0426c..1093df3977b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2222,9 +2223,9 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) static inline bool io_run_task_work(void) { - if (current->task_works) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { __set_current_state(TASK_RUNNING); - task_work_run(); + tracehook_notify_signal(); return true; } From 20c0b380f971e7d48f5d978bc27d827f7eabb21a Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 7 Aug 2021 17:13:42 -0700 Subject: [PATCH 043/187] io_uring: Use WRITE_ONCE() when writing to sq_flags The compiler should be forbidden from any strange optimization for async writes to user visible data-structures. Without proper protection, the compiler can cause write-tearing or invent writes that would confuse the userspace. However, there are writes to sq_flags which are not protected by WRITE_ONCE(). Use WRITE_ONCE() for these writes. This is purely a theoretical issue. Presumably, any compiler is very unlikely to do such optimizations. Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Nadav Amit Link: https://lore.kernel.org/r/20210808001342.964634-3-namit@vmware.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 1093df3977b8..ca064486cb41 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1500,7 +1500,8 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { clear_bit(0, &ctx->check_cq_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); } if (posted) @@ -1579,7 +1580,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->check_cq_overflow); - ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; @@ -6804,14 +6807,16 @@ static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); spin_unlock_irq(&ctx->completion_lock); } static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) { spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); spin_unlock_irq(&ctx->completion_lock); } From 43e8f76006592cb1573a959aa287c45421066f9c Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Mon, 9 Aug 2021 10:36:58 +0800 Subject: [PATCH 044/187] powerpc/kprobes: Fix kprobe Oops happens in booke When using kprobe on powerpc booke series processor, Oops happens as show bellow: / # echo "p:myprobe do_nanosleep" > /sys/kernel/debug/tracing/kprobe_events / # echo 1 > /sys/kernel/debug/tracing/events/kprobes/myprobe/enable / # sleep 1 [ 50.076730] Oops: Exception in kernel mode, sig: 5 [#1] [ 50.077017] BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 [ 50.077221] Modules linked in: [ 50.077462] CPU: 0 PID: 77 Comm: sleep Not tainted 5.14.0-rc4-00022-g251a1524293d #21 [ 50.077887] NIP: c0b9c4e0 LR: c00ebecc CTR: 00000000 [ 50.078067] REGS: c3883de0 TRAP: 0700 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.078349] MSR: 00029000 CR: 24000228 XER: 20000000 [ 50.078675] [ 50.078675] GPR00: c00ebdf0 c3883e90 c313e300 c3883ea0 00000001 00000000 c3883ecc 00000001 [ 50.078675] GPR08: c100598c c00ea250 00000004 00000000 24000222 102490c2 bff4180c 101e60d4 [ 50.078675] GPR16: 00000000 102454ac 00000040 10240000 10241100 102410f8 10240000 00500000 [ 50.078675] GPR24: 00000002 00000000 c3883ea0 00000001 00000000 0000c350 3b9b8d50 00000000 [ 50.080151] NIP [c0b9c4e0] do_nanosleep+0x0/0x190 [ 50.080352] LR [c00ebecc] hrtimer_nanosleep+0x14c/0x1e0 [ 50.080638] Call Trace: [ 50.080801] [c3883e90] [c00ebdf0] hrtimer_nanosleep+0x70/0x1e0 (unreliable) [ 50.081110] [c3883f00] [c00ec004] sys_nanosleep_time32+0xa4/0x110 [ 50.081336] [c3883f40] [c001509c] ret_from_syscall+0x0/0x28 [ 50.081541] --- interrupt: c00 at 0x100a4d08 [ 50.081749] NIP: 100a4d08 LR: 101b5234 CTR: 00000003 [ 50.081931] REGS: c3883f50 TRAP: 0c00 Not tainted (5.14.0-rc4-00022-g251a1524293d) [ 50.082183] MSR: 0002f902 CR: 24000222 XER: 00000000 [ 50.082457] [ 50.082457] GPR00: 000000a2 bf980040 1024b4d0 bf980084 bf980084 64000000 00555345 fefefeff [ 50.082457] GPR08: 7f7f7f7f 101e0000 00000069 00000003 28000422 102490c2 bff4180c 101e60d4 [ 50.082457] GPR16: 00000000 102454ac 00000040 10240000 10241100 102410f8 10240000 00500000 [ 50.082457] GPR24: 00000002 bf9803f4 10240000 00000000 00000000 100039e0 00000000 102444e8 [ 50.083789] NIP [100a4d08] 0x100a4d08 [ 50.083917] LR [101b5234] 0x101b5234 [ 50.084042] --- interrupt: c00 [ 50.084238] Instruction dump: [ 50.084483] 4bfffc40 60000000 60000000 60000000 9421fff0 39400402 914200c0 38210010 [ 50.084841] 4bfffc20 00000000 00000000 00000000 <7fe00008> 7c0802a6 7c892378 93c10048 [ 50.085487] ---[ end trace f6fffe98e2fa8f3e ]--- [ 50.085678] Trace/breakpoint trap There is no real mode for booke arch and the MMU translation is always on. The corresponding MSR_IS/MSR_DS bit in booke is used to switch the address space, but not for real mode judgment. Fixes: 21f8b2fa3ca5 ("powerpc/kprobes: Ignore traps that happened in real mode") Signed-off-by: Pu Lehui Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210809023658.218915-1-pulehui@huawei.com --- arch/powerpc/kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index cbc28d1a2e1b..7a7cd6bda53e 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -292,7 +292,8 @@ int kprobe_handler(struct pt_regs *regs) if (user_mode(regs)) return 0; - if (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR)) + if (!IS_ENABLED(CONFIG_BOOKE) && + (!(regs->msr & MSR_IR) || !(regs->msr & MSR_DR))) return 0; /* From 769f52676756b8c5feb302d2d95af59577fc69ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 4 Aug 2021 21:35:01 -0700 Subject: [PATCH 045/187] configfs: restore the kernel v5.13 text attribute write behavior Instead of appending new text attribute data at the offset specified by the write() system call, only pass the newly written data to the .store() callback. Reported-by: Bodo Stroesser Tested-by: Bodo Stroesser Signed-off-by: Bart Van Assche Signed-off-by: Christoph Hellwig --- fs/configfs/file.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 5a0be9985bae..0ad32150611e 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -177,28 +177,22 @@ static ssize_t configfs_bin_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval; } -/* Fill [buffer, buffer + pos) with data coming from @from. */ -static int fill_write_buffer(struct configfs_buffer *buffer, loff_t pos, +/* Fill @buffer with data coming from @from. */ +static int fill_write_buffer(struct configfs_buffer *buffer, struct iov_iter *from) { - loff_t to_copy; int copied; - u8 *to; if (!buffer->page) buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); if (!buffer->page) return -ENOMEM; - to_copy = SIMPLE_ATTR_SIZE - 1 - pos; - if (to_copy <= 0) - return 0; - to = buffer->page + pos; - copied = copy_from_iter(to, to_copy, from); + copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, * so e.g. sscanf() can scan the string easily */ - to[copied] = 0; + buffer->page[copied] = 0; return copied ? : -EFAULT; } @@ -227,10 +221,10 @@ static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct configfs_buffer *buffer = file->private_data; - ssize_t len; + int len; mutex_lock(&buffer->mutex); - len = fill_write_buffer(buffer, iocb->ki_pos, from); + len = fill_write_buffer(buffer, from); if (len > 0) len = flush_write_buffer(file, buffer, len); if (len > 0) From 4956b9eaad456a88b0d56947bef036e086250beb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Aug 2021 07:49:41 -0600 Subject: [PATCH 046/187] io_uring: rsrc ref lock needs to be IRQ safe Nadav reports running into the below splat on re-enabling softirqs: WARNING: CPU: 2 PID: 1777 at kernel/softirq.c:364 __local_bh_enable_ip+0xaa/0xe0 Modules linked in: CPU: 2 PID: 1777 Comm: umem Not tainted 5.13.1+ #161 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/22/2020 RIP: 0010:__local_bh_enable_ip+0xaa/0xe0 Code: a9 00 ff ff 00 74 38 65 ff 0d a2 21 8c 7a e8 ed 1a 20 00 fb 66 0f 1f 44 00 00 5b 41 5c 5d c3 65 8b 05 e6 2d 8c 7a 85 c0 75 9a <0f> 0b eb 96 e8 2d 1f 20 00 eb a5 4c 89 e7 e8 73 4f 0c 00 eb ae 65 RSP: 0018:ffff88812e58fcc8 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000201 RCX: dffffc0000000000 RDX: 0000000000000007 RSI: 0000000000000201 RDI: ffffffff8898c5ac RBP: ffff88812e58fcd8 R08: ffffffff8575dbbf R09: ffffed1028ef14f9 R10: ffff88814778a7c3 R11: ffffed1028ef14f8 R12: ffffffff85c9e9ae R13: ffff88814778a000 R14: ffff88814778a7b0 R15: ffff8881086db890 FS: 00007fbcfee17700(0000) GS:ffff8881e0300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c0402a5008 CR3: 000000011c1ac003 CR4: 00000000003706e0 Call Trace: _raw_spin_unlock_bh+0x31/0x40 io_rsrc_node_ref_zero+0x13e/0x190 io_dismantle_req+0x215/0x220 io_req_complete_post+0x1b8/0x720 __io_complete_rw.isra.0+0x16b/0x1f0 io_complete_rw+0x10/0x20 where it's clear we end up calling the percpu count release directly from the completion path, as it's in atomic mode and we drop the last ref. For file/block IO, this can be from IRQ context already, and the softirq locking for rsrc isn't enough. Just make the lock fully IRQ safe, and ensure we correctly safe state from the release path as we don't know the full context there. Reported-by: Nadav Amit Tested-by: Nadav Amit Link: https://lore.kernel.org/io-uring/C187C836-E78B-4A31-B24C-D16919ACA093@gmail.com/ Signed-off-by: Jens Axboe --- fs/io_uring.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index ca064486cb41..6a8257233061 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7138,16 +7138,6 @@ static void **io_alloc_page_table(size_t size) return table; } -static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) -{ - spin_lock_bh(&ctx->rsrc_ref_lock); -} - -static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) -{ - spin_unlock_bh(&ctx->rsrc_ref_lock); -} - static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { percpu_ref_exit(&ref_node->refs); @@ -7164,9 +7154,9 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - io_rsrc_ref_lock(ctx); + spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - io_rsrc_ref_unlock(ctx); + spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); percpu_ref_kill(&rsrc_node->refs); @@ -7674,9 +7664,10 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; bool first_add = false; - io_rsrc_ref_lock(ctx); + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; while (!list_empty(&ctx->rsrc_ref_list)) { @@ -7688,7 +7679,7 @@ static void io_rsrc_node_ref_zero(struct percpu_ref *ref) list_del(&node->node); first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } - io_rsrc_ref_unlock(ctx); + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (first_add) mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); From 49e7f0c789add1330b111af0b7caeb0e87df063e Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sun, 8 Aug 2021 21:54:33 +0800 Subject: [PATCH 047/187] io-wq: fix bug of creating io-wokers unconditionally The former patch to add check between nr_workers and max_workers has a bug, which will cause unconditionally creating io-workers. That's because the result of the check doesn't affect the call of create_io_worker(), fix it by bringing in a boolean value for it. Fixes: 21698274da5b ("io-wq: fix lack of acct->nr_workers < acct->max_workers judgement") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210808135434.68667-2-haoxu@linux.alibaba.com [axboe: drop hunk that isn't strictly needed] Signed-off-by: Jens Axboe --- fs/io-wq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 12fc19353bb0..ac8604a35169 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -282,16 +282,24 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; + bool do_create = false; cwd = container_of(cb, struct create_worker_data, work); wqe = cwd->wqe; wq = wqe->wq; acct = &wqe->acct[cwd->index]; raw_spin_lock_irq(&wqe->lock); - if (acct->nr_workers < acct->max_workers) + if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; + do_create = true; + } raw_spin_unlock_irq(&wqe->lock); - create_io_worker(wq, cwd->wqe, cwd->index); + if (do_create) { + create_io_worker(wq, cwd->wqe, cwd->index); + } else { + atomic_dec(&acct->nr_running); + io_worker_ref_put(wq); + } kfree(cwd); } From 47cae0c71f7a126903f930191e6e9f103674aca1 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Sun, 8 Aug 2021 21:54:34 +0800 Subject: [PATCH 048/187] io-wq: fix IO_WORKER_F_FIXED issue in create_io_worker() There may be cases like: A B spin_lock(wqe->lock) nr_workers is 0 nr_workers++ spin_unlock(wqe->lock) spin_lock(wqe->lock) nr_wokers is 1 nr_workers++ spin_unlock(wqe->lock) create_io_worker() acct->worker is 1 create_io_worker() acct->worker is 1 There should be one worker marked IO_WORKER_F_FIXED, but no one is. Fix this by introduce a new agrument for create_io_worker() to indicate if it is the first worker. Fixes: 3d4e4face9c1 ("io-wq: fix no lock protection of acct->nr_worker") Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20210808135434.68667-3-haoxu@linux.alibaba.com Signed-off-by: Jens Axboe --- fs/io-wq.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index ac8604a35169..7d2ed8c7dd31 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -129,7 +129,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first); static void io_wqe_dec_running(struct io_worker *worker); static bool io_worker_get(struct io_worker *worker) @@ -248,18 +248,20 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) rcu_read_unlock(); if (!ret) { - bool do_create = false; + bool do_create = false, first = false; raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); + if (!acct->nr_workers) + first = true; acct->nr_workers++; do_create = true; } raw_spin_unlock_irq(&wqe->lock); if (do_create) - create_io_worker(wqe->wq, wqe, acct->index); + create_io_worker(wqe->wq, wqe, acct->index, first); } } @@ -282,7 +284,7 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; - bool do_create = false; + bool do_create = false, first = false; cwd = container_of(cb, struct create_worker_data, work); wqe = cwd->wqe; @@ -290,12 +292,14 @@ static void create_worker_cb(struct callback_head *cb) acct = &wqe->acct[cwd->index]; raw_spin_lock_irq(&wqe->lock); if (acct->nr_workers < acct->max_workers) { + if (!acct->nr_workers) + first = true; acct->nr_workers++; do_create = true; } raw_spin_unlock_irq(&wqe->lock); if (do_create) { - create_io_worker(wq, cwd->wqe, cwd->index); + create_io_worker(wq, wqe, cwd->index, first); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -637,7 +641,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) raw_spin_unlock_irq(&worker->wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) { struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; @@ -678,7 +682,7 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) worker->flags |= IO_WORKER_F_FREE; if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - if ((acct->nr_workers == 1) && (worker->flags & IO_WORKER_F_BOUND)) + if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; raw_spin_unlock_irq(&wqe->lock); wake_up_new_task(tsk); From c018db4a57f3e31a9cb24d528e9f094eda89a499 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 9 Aug 2021 08:15:50 -0600 Subject: [PATCH 049/187] io_uring: drop ctx->uring_lock before flushing work item Ammar reports that he's seeing a lockdep splat on running test/rsrc_tags from the regression suite: ====================================================== WARNING: possible circular locking dependency detected 5.14.0-rc3-bluetea-test-00249-gc7d102232649 #5 Tainted: G OE ------------------------------------------------------ kworker/2:4/2684 is trying to acquire lock: ffff88814bb1c0a8 (&ctx->uring_lock){+.+.}-{3:3}, at: io_rsrc_put_work+0x13d/0x1a0 but task is already holding lock: ffffc90001c6be70 ((work_completion)(&(&ctx->rsrc_put_work)->work)){+.+.}-{0:0}, at: process_one_work+0x1bc/0x530 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 ((work_completion)(&(&ctx->rsrc_put_work)->work)){+.+.}-{0:0}: __flush_work+0x31b/0x490 io_rsrc_ref_quiesce.part.0.constprop.0+0x35/0xb0 __do_sys_io_uring_register+0x45b/0x1060 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #0 (&ctx->uring_lock){+.+.}-{3:3}: __lock_acquire+0x119a/0x1e10 lock_acquire+0xc8/0x2f0 __mutex_lock+0x86/0x740 io_rsrc_put_work+0x13d/0x1a0 process_one_work+0x236/0x530 worker_thread+0x52/0x3b0 kthread+0x135/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&(&ctx->rsrc_put_work)->work)); lock(&ctx->uring_lock); lock((work_completion)(&(&ctx->rsrc_put_work)->work)); lock(&ctx->uring_lock); *** DEADLOCK *** 2 locks held by kworker/2:4/2684: #0: ffff88810004d938 ((wq_completion)events){+.+.}-{0:0}, at: process_one_work+0x1bc/0x530 #1: ffffc90001c6be70 ((work_completion)(&(&ctx->rsrc_put_work)->work)){+.+.}-{0:0}, at: process_one_work+0x1bc/0x530 stack backtrace: CPU: 2 PID: 2684 Comm: kworker/2:4 Tainted: G OE 5.14.0-rc3-bluetea-test-00249-gc7d102232649 #5 Hardware name: Acer Aspire ES1-421/OLVIA_BE, BIOS V1.05 07/02/2015 Workqueue: events io_rsrc_put_work Call Trace: dump_stack_lvl+0x6a/0x9a check_noncircular+0xfe/0x110 __lock_acquire+0x119a/0x1e10 lock_acquire+0xc8/0x2f0 ? io_rsrc_put_work+0x13d/0x1a0 __mutex_lock+0x86/0x740 ? io_rsrc_put_work+0x13d/0x1a0 ? io_rsrc_put_work+0x13d/0x1a0 ? io_rsrc_put_work+0x13d/0x1a0 ? process_one_work+0x1ce/0x530 io_rsrc_put_work+0x13d/0x1a0 process_one_work+0x236/0x530 worker_thread+0x52/0x3b0 ? process_one_work+0x530/0x530 kthread+0x135/0x160 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 which is due to holding the ctx->uring_lock when flushing existing pending work, while the pending work flushing may need to grab the uring lock if we're using IOPOLL. Fix this by dropping the uring_lock a bit earlier as part of the flush. Cc: stable@vger.kernel.org Link: https://github.com/axboe/liburing/issues/404 Tested-by: Ammar Faizi Signed-off-by: Jens Axboe --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6a8257233061..0c3e90f974e9 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7195,17 +7195,19 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct /* kill initial ref, already quiesced if zero */ if (atomic_dec_and_test(&data->refs)) break; + mutex_unlock(&ctx->uring_lock); flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); - if (!ret) + if (!ret) { + mutex_lock(&ctx->uring_lock); break; + } atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); mutex_lock(&ctx->uring_lock); } while (ret >= 0); From 43597aac1f87230cb565ab354d331682f13d3c7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 10 Aug 2021 02:44:23 +0100 Subject: [PATCH 050/187] io_uring: fix ctx-exit io_rsrc_put_work() deadlock __io_rsrc_put_work() might need ->uring_lock, so nobody should wait for rsrc nodes holding the mutex. However, that's exactly what io_ring_ctx_free() does with io_wait_rsrc_data(). Split it into rsrc wait + dealloc, and move the first one out of the lock. Cc: stable@vger.kernel.org Fixes: b60c8dce33895 ("io_uring: preparation for rsrc tagging") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0130c5c2693468173ec1afab714e0885d2c9c363.1628559783.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 0c3e90f974e9..04c6d059ea94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8652,13 +8652,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static bool io_wait_rsrc_data(struct io_rsrc_data *data) +static void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (!data) - return false; - if (!atomic_dec_and_test(&data->refs)) + if (data && !atomic_dec_and_test(&data->refs)) wait_for_completion(&data->done); - return true; } static void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -8670,10 +8667,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } + /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ + io_wait_rsrc_data(ctx->buf_data); + io_wait_rsrc_data(ctx->file_data); + mutex_lock(&ctx->uring_lock); - if (io_wait_rsrc_data(ctx->buf_data)) + if (ctx->buf_data) __io_sqe_buffers_unregister(ctx); - if (io_wait_rsrc_data(ctx->file_data)) + if (ctx->file_data) __io_sqe_files_unregister(ctx); if (ctx->rings) __io_cqring_overflow_flush(ctx, true); From 11431e26c9c43fa26f6b33ee1a90989f57b86024 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 3 Aug 2021 15:06:08 +0800 Subject: [PATCH 051/187] blk-iocost: fix lockdep warning on blkcg->lock blkcg->lock depends on q->queue_lock which may depend on another driver lock required in irq context, one example is dm-thin: Chain exists of: &pool->lock#3 --> &q->queue_lock --> &blkcg->lock Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&blkcg->lock); local_irq_disable(); lock(&pool->lock#3); lock(&q->queue_lock); lock(&pool->lock#3); Fix the issue by using spin_lock_irq(&blkcg->lock) in ioc_weight_write(). Cc: Tejun Heo Reported-by: Bruno Goncalves Link: https://lore.kernel.org/linux-block/CA+QYu4rzz6079ighEanS3Qq_Dmnczcf45ZoJoHKVLVATTo1e4Q@mail.gmail.com/T/#u Signed-off-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210803070608.1766400-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5fac3757e6e0..0e56557cacf2 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3061,19 +3061,19 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; - spin_lock(&blkcg->lock); + spin_lock_irq(&blkcg->lock); iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { - spin_lock_irq(&iocg->ioc->lock); + spin_lock(&iocg->ioc->lock); ioc_now(iocg->ioc, &now); weight_updated(iocg, &now); - spin_unlock_irq(&iocg->ioc->lock); + spin_unlock(&iocg->ioc->lock); } } - spin_unlock(&blkcg->lock); + spin_unlock_irq(&blkcg->lock); return nbytes; } From 9977d880f7a3c233db9165a75a3a14defc2a4aee Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 9 Aug 2021 11:09:47 -0400 Subject: [PATCH 052/187] scsi: lpfc: Move initialization of phba->poll_list earlier to avoid crash The phba->poll_list is traversed in case of an error in lpfc_sli4_hba_setup(), so it must be initialized earlier in case the error path is taken. [ 490.030738] lpfc 0000:65:00.0: 0:1413 Failed to init iocb list. [ 490.036661] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000 [ 490.044485] PGD 0 P4D 0 [ 490.047027] Oops: 0000 [#1] SMP PTI [ 490.050518] CPU: 0 PID: 7 Comm: kworker/0:1 Kdump: loaded Tainted: G I --------- - - 4.18. [ 490.060511] Hardware name: Dell Inc. PowerEdge R440/0WKGTH, BIOS 1.4.8 05/22/2018 [ 490.067994] Workqueue: events work_for_cpu_fn [ 490.072371] RIP: 0010:lpfc_sli4_cleanup_poll_list+0x20/0xb0 [lpfc] [ 490.078546] Code: cf e9 04 f7 fe ff 0f 1f 40 00 0f 1f 44 00 00 41 57 49 89 ff 41 56 41 55 41 54 4d 8d a79 [ 490.097291] RSP: 0018:ffffbd1a463dbcc8 EFLAGS: 00010246 [ 490.102518] RAX: 0000000000008200 RBX: ffff945cdb8c0000 RCX: 0000000000000000 [ 490.109649] RDX: 0000000000018200 RSI: ffff9468d0e16818 RDI: 0000000000000000 [ 490.116783] RBP: ffff945cdb8c1740 R08: 00000000000015c5 R09: 0000000000000042 [ 490.123915] R10: 0000000000000000 R11: ffffbd1a463dbab0 R12: ffff945cdb8c25c0 [ 490.131049] R13: 00000000fffffff4 R14: 0000000000001800 R15: ffff945cdb8c0000 [ 490.138182] FS: 0000000000000000(0000) GS:ffff9468d0e00000(0000) knlGS:0000000000000000 [ 490.146267] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 490.152013] CR2: 0000000000000000 CR3: 000000042ca10002 CR4: 00000000007706f0 [ 490.159146] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 490.166277] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 490.173409] PKRU: 55555554 [ 490.176123] Call Trace: [ 490.178598] lpfc_sli4_queue_destroy+0x7f/0x3c0 [lpfc] [ 490.183745] lpfc_sli4_hba_setup+0x1bc7/0x23e0 [lpfc] [ 490.188797] ? kernfs_activate+0x63/0x80 [ 490.192721] ? kernfs_add_one+0xe7/0x130 [ 490.196647] ? __kernfs_create_file+0x80/0xb0 [ 490.201020] ? lpfc_pci_probe_one_s4.isra.48+0x46f/0x9e0 [lpfc] [ 490.206944] lpfc_pci_probe_one_s4.isra.48+0x46f/0x9e0 [lpfc] [ 490.212697] lpfc_pci_probe_one+0x179/0xb70 [lpfc] [ 490.217492] local_pci_probe+0x41/0x90 [ 490.221246] work_for_cpu_fn+0x16/0x20 [ 490.224994] process_one_work+0x1a7/0x360 [ 490.229009] ? create_worker+0x1a0/0x1a0 [ 490.232933] worker_thread+0x1cf/0x390 [ 490.236687] ? create_worker+0x1a0/0x1a0 [ 490.240612] kthread+0x116/0x130 [ 490.243846] ? kthread_flush_work_fn+0x10/0x10 [ 490.248293] ret_from_fork+0x35/0x40 [ 490.251869] Modules linked in: lpfc(+) xt_CHECKSUM ipt_MASQUERADE xt_conntrack ipt_REJECT nf_reject_ipv4i [ 490.332609] CR2: 0000000000000000 Link: https://lore.kernel.org/r/20210809150947.18104-1-emilne@redhat.com Fixes: 93a4d6f40198 ("scsi: lpfc: Add registration for CPU Offline/Online events") Cc: stable@vger.kernel.org Reviewed-by: James Smart Signed-off-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 5983e05b648f..e29523a1b530 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -13193,6 +13193,8 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) if (!phba) return -ENOMEM; + INIT_LIST_HEAD(&phba->poll_list); + /* Perform generic PCI device enabling operation */ error = lpfc_enable_pci_dev(phba); if (error) @@ -13327,7 +13329,6 @@ lpfc_pci_probe_one_s4(struct pci_dev *pdev, const struct pci_device_id *pid) /* Enable RAS FW log support */ lpfc_sli4_ras_setup(phba); - INIT_LIST_HEAD(&phba->poll_list); timer_setup(&phba->cpuhp_poll_timer, lpfc_sli4_poll_hbtimer, 0); cpuhp_state_add_instance_nocalls(lpfc_cpuhp_state, &phba->cpuhp); From dbe7633c394be4a500b887fe8f9ad486dcba9d77 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 6 Aug 2021 10:12:50 -0700 Subject: [PATCH 053/187] scsi: storvsc: Log TEST_UNIT_READY errors as warnings Commit 08f76547f08d ("scsi: storvsc: Update error logging") added more robust logging of errors, particularly those reported as Hyper-V errors. But this change produces extra logging noise in that TEST_UNIT_READY may report errors during the normal course of detecting device adds and removes. Fix this by logging TEST_UNIT_READY errors as warnings, so that log lines are produced only if the storvsc log level is changed to WARN level on the kernel boot line. Link: https://lore.kernel.org/r/1628269970-87876-1-git-send-email-mikelley@microsoft.com Fixes: 08f76547f08d ("scsi: storvsc: Update error logging") Signed-off-by: Michael Kelley Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 328bb961c281..37506b3fe5a9 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1199,14 +1199,24 @@ static void storvsc_on_io_completion(struct storvsc_device *stor_device, vstor_packet->vm_srb.sense_info_length); if (vstor_packet->vm_srb.scsi_status != 0 || - vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS) - storvsc_log(device, STORVSC_LOGGING_ERROR, + vstor_packet->vm_srb.srb_status != SRB_STATUS_SUCCESS) { + + /* + * Log TEST_UNIT_READY errors only as warnings. Hyper-V can + * return errors when detecting devices using TEST_UNIT_READY, + * and logging these as errors produces unhelpful noise. + */ + int loglevel = (stor_pkt->vm_srb.cdb[0] == TEST_UNIT_READY) ? + STORVSC_LOGGING_WARN : STORVSC_LOGGING_ERROR; + + storvsc_log(device, loglevel, "tag#%d cmd 0x%x status: scsi 0x%x srb 0x%x hv 0x%x\n", request->cmd->request->tag, stor_pkt->vm_srb.cdb[0], vstor_packet->vm_srb.scsi_status, vstor_packet->vm_srb.srb_status, vstor_packet->status); + } if (vstor_packet->vm_srb.scsi_status == SAM_STAT_CHECK_CONDITION && (vstor_packet->vm_srb.srb_status & SRB_STATUS_AUTOSENSE_VALID)) From 40d32727931cee82cdc5aaca25ce725d1f3ac864 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Aug 2021 14:49:40 +0100 Subject: [PATCH 054/187] scsi: mpt3sas: Fix incorrectly assigned error return and check Currently the call to _base_static_config_pages() is assigning the error return to variable 'rc' but checking the error return in error 'r'. Fix this by assigning the error return to variable 'r' instead of 'rc'. Link: https://lore.kernel.org/r/20210804134940.114011-1-colin.king@canonical.com Fixes: 19a622c39a9d ("scsi: mpt3sas: Handle firmware faults during first half of IOC init") Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen Addresses-Coverity: ("Unused value") --- drivers/scsi/mpt3sas/mpt3sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 19b1c0cf5f2a..cf4a3a2c22ad 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -7851,7 +7851,7 @@ _base_make_ioc_operational(struct MPT3SAS_ADAPTER *ioc) return r; } - rc = _base_static_config_pages(ioc); + r = _base_static_config_pages(ioc); if (r) return r; From 07d25971b220e477eb019fcb520a9f2e3ac966af Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 31 Jul 2021 20:30:11 +0800 Subject: [PATCH 055/187] locking/rtmutex: Use the correct rtmutex debugging config option It's CONFIG_DEBUG_RT_MUTEXES not CONFIG_DEBUG_RT_MUTEX. Fixes: f7efc4799f81 ("locking/rtmutex: Inline chainwalk depth check") Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Acked-by: Boqun Feng Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210731123011.4555-1-thunder.leizhen@huawei.com --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index b5d9bb5202c6..ad0db322ed3b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,7 +343,7 @@ static __always_inline bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk chwalk) { - if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEX)) + if (IS_ENABLED(CONFIG_DEBUG_RT_MUTEXES)) return waiter != NULL; return chwalk == RT_MUTEX_FULL_CHAINWALK; } From 664cc971fb259007e49cc8a3ac43b0787d89443f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 10 Aug 2021 09:10:15 +0200 Subject: [PATCH 056/187] Revert "usb: dwc3: gadget: Use list_replace_init() before traversing lists" This reverts commit d25d85061bd856d6be221626605319154f9b5043 as it is reported to cause problems on many different types of boards. Reported-by: Thinh Nguyen Reported-by: John Stultz Cc: Ray Chi Link: https://lore.kernel.org/r/CANcMJZCEVxVLyFgLwK98hqBEdc0_n4P0x_K6Gih8zNH3ouzbJQ@mail.gmail.com Fixes: d25d85061bd8 ("usb: dwc3: gadget: Use list_replace_init() before traversing lists") Cc: stable Cc: Felipe Balbi Cc: Wesley Cheng Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index b8d4b2d327b2..84fe57ef5a49 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1741,13 +1741,9 @@ static void dwc3_gadget_ep_cleanup_cancelled_requests(struct dwc3_ep *dep) { struct dwc3_request *req; struct dwc3_request *tmp; - struct list_head local; struct dwc3 *dwc = dep->dwc; -restart: - list_replace_init(&dep->cancelled_list, &local); - - list_for_each_entry_safe(req, tmp, &local, list) { + list_for_each_entry_safe(req, tmp, &dep->cancelled_list, list) { dwc3_gadget_ep_skip_trbs(dep, req); switch (req->status) { case DWC3_REQUEST_STATUS_DISCONNECTED: @@ -1765,9 +1761,6 @@ static void dwc3_gadget_ep_cleanup_cancelled_requests(struct dwc3_ep *dep) break; } } - - if (!list_empty(&dep->cancelled_list)) - goto restart; } static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, @@ -2976,12 +2969,8 @@ static void dwc3_gadget_ep_cleanup_completed_requests(struct dwc3_ep *dep, { struct dwc3_request *req; struct dwc3_request *tmp; - struct list_head local; -restart: - list_replace_init(&dep->started_list, &local); - - list_for_each_entry_safe(req, tmp, &local, list) { + list_for_each_entry_safe(req, tmp, &dep->started_list, list) { int ret; ret = dwc3_gadget_ep_cleanup_completed_request(dep, event, @@ -2989,9 +2978,6 @@ static void dwc3_gadget_ep_cleanup_completed_requests(struct dwc3_ep *dep, if (ret) break; } - - if (!list_empty(&dep->started_list)) - goto restart; } static bool dwc3_gadget_ep_should_continue(struct dwc3_ep *dep) From 438553958ba19296663c6d6583d208dfb6792830 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:40 +0200 Subject: [PATCH 057/187] PCI/MSI: Enable and mask MSI-X early The ordering of MSI-X enable in hardware is dysfunctional: 1) MSI-X is disabled in the control register 2) Various setup functions 3) pci_msi_setup_msi_irqs() is invoked which ends up accessing the MSI-X table entries 4) MSI-X is enabled and masked in the control register with the comment that enabling is required for some hardware to access the MSI-X table Step #4 obviously contradicts #3. The history of this is an issue with the NIU hardware. When #4 was introduced the table access actually happened in msix_program_entries() which was invoked after enabling and masking MSI-X. This was changed in commit d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") which removed the table write from msix_program_entries(). Interestingly enough nobody noticed and either NIU still works or it did not get any testing with a kernel 3.19 or later. Nevertheless this is inconsistent and there is no reason why MSI-X can't be enabled and masked in the control register early on, i.e. move step #4 above to step #1. This preserves the NIU workaround and has no side effects on other hardware. Fixes: d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Ashok Raj Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.344136412@linutronix.de --- drivers/pci/msi.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 9232255c8515..5d39ed828085 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -772,18 +772,25 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, u16 control; void __iomem *base; - /* Ensure MSI-X is disabled while it is set up */ - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + /* + * Some devices require MSI-X to be enabled before the MSI-X + * registers can be accessed. Mask all the vectors to prevent + * interrupts coming in before they're fully set up. + */ + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | + PCI_MSIX_FLAGS_ENABLE); pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ base = msix_map_region(dev, msix_table_size(control)); - if (!base) - return -ENOMEM; + if (!base) { + ret = -ENOMEM; + goto out_disable; + } ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) - return ret; + goto out_disable; ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) @@ -794,14 +801,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free; - /* - * Some devices require MSI-X to be enabled before we can touch the - * MSI-X registers. We need to mask all the vectors to prevent - * interrupts coming in before they're fully set up. - */ - pci_msix_clear_and_set_ctrl(dev, 0, - PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE); - msix_program_entries(dev, entries); ret = populate_msi_sysfs(dev); @@ -836,6 +835,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, out_free: free_msi_irqs(dev); +out_disable: + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + return ret; } From 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:41 +0200 Subject: [PATCH 058/187] PCI/MSI: Mask all unused MSI-X entries When MSI-X is enabled the ordering of calls is: msix_map_region(); msix_setup_entries(); pci_msi_setup_msi_irqs(); msix_program_entries(); This has a few interesting issues: 1) msix_setup_entries() allocates the MSI descriptors and initializes them except for the msi_desc:masked member which is left zero initialized. 2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets up the MSI interrupts which ends up in pci_write_msi_msg() unless the interrupt chip provides its own irq_write_msi_msg() function. 3) msix_program_entries() does not do what the name suggests. It solely updates the entries array (if not NULL) and initializes the masked member for each MSI descriptor by reading the hardware state and then masks the entry. Obviously this has some issues: 1) The uninitialized masked member of msi_desc prevents the enforcement of masking the entry in pci_write_msi_msg() depending on the cached masked bit. Aside of that half initialized data is a NONO in general 2) msix_program_entries() only ensures that the actually allocated entries are masked. This is wrong as experimentation with crash testing and crash kernel kexec has shown. This limited testing unearthed that when the production kernel had more entries in use and unmasked when it crashed and the crash kernel allocated a smaller amount of entries, then a full scan of all entries found unmasked entries which were in use in the production kernel. This is obviously a device or emulation issue as the device reset should mask all MSI-X table entries, but obviously that's just part of the paper specification. Cure this by: 1) Masking all table entries in hardware 2) Initializing msi_desc::masked in msix_setup_entries() 3) Removing the mask dance in msix_program_entries() 4) Renaming msix_program_entries() to msix_update_entries() to reflect the purpose of that function. As the masking of unused entries has never been done the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de --- drivers/pci/msi.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 5d39ed828085..57c9ec9b976b 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -691,6 +691,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, { struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; + void __iomem *addr; int ret, i; int vec_count = pci_msix_vec_count(dev); @@ -711,6 +712,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; + if (entries) entry->msi_attrib.entry_nr = entries[i].entry; else @@ -722,6 +724,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; + addr = pci_msix_desc_addr(entry); + if (addr) + entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); if (masks) curmsk++; @@ -732,28 +738,27 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, return ret; } -static void msix_program_entries(struct pci_dev *dev, - struct msix_entry *entries) +static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) { struct msi_desc *entry; - int i = 0; - void __iomem *desc_addr; for_each_pci_msi_entry(entry, dev) { - if (entries) - entries[i++].vector = entry->irq; - - desc_addr = pci_msix_desc_addr(entry); - if (desc_addr) - entry->masked = readl(desc_addr + - PCI_MSIX_ENTRY_VECTOR_CTRL); - else - entry->masked = 0; - - msix_mask_irq(entry, 1); + if (entries) { + entries->vector = entry->irq; + entries++; + } } } +static void msix_mask_all(void __iomem *base, int tsize) +{ + u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; + int i; + + for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) + writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); +} + /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -768,9 +773,9 @@ static void msix_program_entries(struct pci_dev *dev, static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, struct irq_affinity *affd) { - int ret; - u16 control; void __iomem *base; + int ret, tsize; + u16 control; /* * Some devices require MSI-X to be enabled before the MSI-X @@ -782,12 +787,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ - base = msix_map_region(dev, msix_table_size(control)); + tsize = msix_table_size(control); + base = msix_map_region(dev, tsize); if (!base) { ret = -ENOMEM; goto out_disable; } + /* Ensure that all table entries are masked. */ + msix_mask_all(base, tsize); + ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) goto out_disable; @@ -801,7 +810,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free; - msix_program_entries(dev, entries); + msix_update_entries(dev, entries); ret = populate_msi_sysfs(dev); if (ret) From da181dc974ad667579baece33c2c8d2d1e4558d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:42 +0200 Subject: [PATCH 059/187] PCI/MSI: Enforce that MSI-X table entry is masked for update The specification (PCIe r5.0, sec 6.1.4.5) states: For MSI-X, a function is permitted to cache Address and Data values from unmasked MSI-X Table entries. However, anytime software unmasks a currently masked MSI-X Table entry either by clearing its Mask bit or by clearing the Function Mask bit, the function must update any Address or Data values that it cached from that entry. If software changes the Address or Data value of an entry while the entry is unmasked, the result is undefined. The Linux kernel's MSI-X support never enforced that the entry is masked before the entry is modified hence the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Enforce the entry to be masked across the update. There is no point in enforcing this to be handled at all possible call sites as this is just pointless code duplication and the common update function is the obvious place to enforce this. Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Reported-by: Kevin Tian Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de --- drivers/pci/msi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 57c9ec9b976b..7ee1ac47caa7 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -289,13 +289,28 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT); if (!base) goto skip; + /* + * The specification mandates that the entry is masked + * when the message is modified: + * + * "If software changes the Address or Data value of an + * entry while the entry is unmasked, the result is + * undefined." + */ + if (unmasked) + __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT); + writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); + + if (unmasked) + __pci_msix_desc_mask_irq(entry, 0); } else { int pos = dev->msi_cap; u16 msgctl; From b9255a7cb51754e8d2645b65dd31805e282b4f3e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:43 +0200 Subject: [PATCH 060/187] PCI/MSI: Enforce MSI[X] entry updates to be visible Nothing enforces the posted writes to be visible when the function returns. Flush them even if the flush might be redundant when the entry is masked already as the unmask will flush as well. This is either setup or a rare affinity change event so the extra flush is not the end of the world. While this is more a theoretical issue especially the logic in the X86 specific msi_set_affinity() function relies on the assumption that the update has reached the hardware when the function returns. Again, as this never has been enforced the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Acked-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de --- drivers/pci/msi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 7ee1ac47caa7..434c7041b176 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -311,6 +311,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (unmasked) __pci_msix_desc_mask_irq(entry, 0); + + /* Ensure that the writes are visible in the device */ + readl(base + PCI_MSIX_ENTRY_DATA); } else { int pos = dev->msi_cap; u16 msgctl; @@ -331,6 +334,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) pci_write_config_word(dev, pos + PCI_MSI_DATA_32, msg->data); } + /* Ensure that the writes are visible in the device */ + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); } skip: From 361fd37397f77578735907341579397d5bed0a2d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:44 +0200 Subject: [PATCH 061/187] PCI/MSI: Do not set invalid bits in MSI mask msi_mask_irq() takes a mask and a flags argument. The mask argument is used to mask out bits from the cached mask and the flags argument to set bits. Some places invoke it with a flags argument which sets bits which are not used by the device, i.e. when the device supports up to 8 vectors a full unmask in some places sets the mask to 0xFFFFFF00. While devices probably do not care, it's still bad practice. Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de --- drivers/pci/msi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 434c7041b176..e27ac6b5b8d5 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -656,21 +656,21 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } ret = msi_verify_entries(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } ret = populate_msi_sysfs(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } @@ -962,7 +962,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); /* Keep cached state to be restored */ - __pci_msi_desc_mask_irq(desc, mask, ~mask); + __pci_msi_desc_mask_irq(desc, mask, 0); /* Restore dev->irq to its default pin-assertion IRQ */ dev->irq = desc->msi_attrib.default_irq; From 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:45 +0200 Subject: [PATCH 062/187] PCI/MSI: Correct misleading comments The comments about preserving the cached state in pci_msi[x]_shutdown() are misleading as the MSI descriptors are freed right after those functions return. So there is nothing to restore. Preparatory change. Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.621609423@linutronix.de --- drivers/pci/msi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index e27ac6b5b8d5..b3f58074e8d4 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -961,7 +961,6 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); - /* Keep cached state to be restored */ __pci_msi_desc_mask_irq(desc, mask, 0); /* Restore dev->irq to its default pin-assertion IRQ */ @@ -1047,10 +1046,8 @@ static void pci_msix_shutdown(struct pci_dev *dev) } /* Return the device with MSI-X masked as initial states */ - for_each_pci_msi_entry(entry, dev) { - /* Keep cached states to be restored */ + for_each_pci_msi_entry(entry, dev) __pci_msix_desc_mask_irq(entry, 1); - } pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); pci_intx_for_msi(dev, 1); From d28d4ad2a1aef27458b3383725bb179beb8d015c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:46 +0200 Subject: [PATCH 063/187] PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown() No point in using the raw write function from shutdown. Preparatory change to introduce proper serialization for the msi_desc::masked cache. Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.674391354@linutronix.de --- drivers/pci/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b3f58074e8d4..f0f7026b7ac0 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -961,7 +961,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); - __pci_msi_desc_mask_irq(desc, mask, 0); + msi_mask_irq(desc, mask, 0); /* Restore dev->irq to its default pin-assertion IRQ */ dev->irq = desc->msi_attrib.default_irq; From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:47 +0200 Subject: [PATCH 064/187] PCI/MSI: Protect msi_desc::masked for multi-MSI Multi-MSI uses a single MSI descriptor and there is a single mask register when the device supports per vector masking. To avoid reading back the mask register the value is cached in the MSI descriptor and updates are done by clearing and setting bits in the cache and writing it to the device. But nothing protects msi_desc::masked and the mask register from being modified concurrently on two different CPUs for two different Linux interrupts which belong to the same multi-MSI descriptor. Add a lock to struct device and protect any operation on the mask and the mask register with it. This makes the update of msi_desc::masked unconditional, but there is no place which requires a modification of the hardware register without updating the masked cache. msi_mask_irq() is now an empty wrapper which will be cleaned up in follow up changes. The problem goes way back to the initial support of multi-MSI, but picking the commit which introduced the mask cache is a valid cut off point (2.6.30). Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de --- drivers/base/core.c | 1 + drivers/pci/msi.c | 19 ++++++++++--------- include/linux/device.h | 1 + include/linux/msi.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index f6360490a4a3..6c0ef9d55a34 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2837,6 +2837,7 @@ void device_initialize(struct device *dev) device_pm_init(dev); set_dev_node(dev, -1); #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); #endif INIT_LIST_HEAD(&dev->links.consumers); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index f0f7026b7ac0..e5e75331b415 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -143,24 +143,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) * reliably as devices without an INTx disable bit will then generate a * level IRQ which will never be cleared. */ -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - u32 mask_bits = desc->masked; + raw_spinlock_t *lock = &desc->dev->msi_lock; + unsigned long flags; if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) - return 0; + return; - mask_bits &= ~mask; - mask_bits |= flag; + raw_spin_lock_irqsave(lock, flags); + desc->masked &= ~mask; + desc->masked |= flag; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - mask_bits); - - return mask_bits; + desc->masked); + raw_spin_unlock_irqrestore(lock, flags); } static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); + __pci_msi_desc_mask_irq(desc, mask, flag); } static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..e53aa5065f58 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -506,6 +506,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spinlock_t msi_lock; struct list_head msi_list; #endif #ifdef CONFIG_DMA_OPS diff --git a/include/linux/msi.h b/include/linux/msi.h index 6aff469e511d..e8bdcb83172b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:48 +0200 Subject: [PATCH 065/187] genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this. This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e9a9ae471a6..c8293c817646 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -569,6 +569,7 @@ struct irq_chip { * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 7f04c7d8296e..a98bcfc4be7b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); From 0c0e37dc11671384e53ba6ede53a4d91162a2cc5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:49 +0200 Subject: [PATCH 066/187] x86/ioapic: Force affinity setup before startup The IO/APIC cannot handle interrupt affinity changes safely after startup other than from an interrupt handler. The startup sequence in the generic interrupt code violates that assumption. Mark the irq chip with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.832143400@linutronix.de --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d5c691a3208b..39224e035e47 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1986,7 +1986,8 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct irq_chip ioapic_ir_chip __read_mostly = { @@ -1999,7 +2000,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static inline void init_IO_APIC_traps(void) From ff363f480e5997051dd1de949121ffda3b753741 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:50 +0200 Subject: [PATCH 067/187] x86/msi: Force affinity setup before startup The X86 MSI mechanism cannot handle interrupt affinity changes safely after startup other than from an interrupt handler, unless interrupt remapping is enabled. The startup sequence in the generic interrupt code violates that assumption. Mark the irq chips with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time. While the interrupt remapping MSI chip does not require this, there is no point in treating it differently as this might spare an interrupt to a CPU which is not in the default affinity mask. For the non-remapping case go to the direct write path when the interrupt is not yet started similar to the not yet activated case. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.886722080@linutronix.de --- arch/x86/kernel/apic/msi.c | 11 ++++++++--- arch/x86/kernel/hpet.c | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 44ebe25e7703..dbacb9ec8843 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -58,11 +58,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -150,7 +152,8 @@ static struct irq_chip pci_msi_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, @@ -219,7 +222,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_mask = pci_msi_mask_irq, .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static struct msi_domain_info pci_msi_ir_domain_info = { @@ -273,7 +277,8 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = dmar_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, }; static int dmar_msi_init(struct irq_domain *domain, diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 08651a4e6aa0..42fc41dd0e1f 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -508,7 +508,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_set_affinity = msi_domain_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, }; static int hpet_msi_init(struct irq_domain *domain, From d1dee814168538eba166ae4150b37f0d88257884 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 22 Jul 2021 14:25:48 +0100 Subject: [PATCH 068/187] pinctrl: sunxi: Don't underestimate number of functions When we are building all the various pinctrl structures for the Allwinner pinctrl devices, we do some estimation about the maximum number of distinct function (names) that we will need. So far we take the number of pins as an upper bound, even though we can actually have up to four special functions per pin. This wasn't a problem until now, since we indeed have typically far more pins than functions, and most pins share common functions. However the H616 "-r" pin controller has only two pins, but four functions, so we run over the end of the array when we are looking for a matching function name in sunxi_pinctrl_add_function - there is no NULL sentinel left that would terminate the loop: [ 8.200648] Unable to handle kernel paging request at virtual address fffdff7efbefaff5 [ 8.209179] Mem abort info: .... [ 8.368456] Call trace: [ 8.370925] __pi_strcmp+0x90/0xf0 [ 8.374559] sun50i_h616_r_pinctrl_probe+0x1c/0x28 [ 8.379557] platform_probe+0x68/0xd8 Do an actual worst case allocation (4 functions per pin, three common functions and the sentinel) for the initial array allocation. This is now heavily overestimating the number of functions in the common case, but we will reallocate this array later with the actual number of functions, so it's only temporarily. Fixes: 561c1cf17c46 ("pinctrl: sunxi: Add support for the Allwinner H616-R pin controller") Signed-off-by: Andre Przywara Acked-by: Maxime Ripard Link: https://lore.kernel.org/r/20210722132548.22121-1-andre.przywara@arm.com Signed-off-by: Linus Walleij --- drivers/pinctrl/sunxi/pinctrl-sunxi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/sunxi/pinctrl-sunxi.c b/drivers/pinctrl/sunxi/pinctrl-sunxi.c index dc8d39ae045b..9c7679c06dca 100644 --- a/drivers/pinctrl/sunxi/pinctrl-sunxi.c +++ b/drivers/pinctrl/sunxi/pinctrl-sunxi.c @@ -1219,10 +1219,12 @@ static int sunxi_pinctrl_build_state(struct platform_device *pdev) } /* - * We suppose that we won't have any more functions than pins, - * we'll reallocate that later anyway + * Find an upper bound for the maximum number of functions: in + * the worst case we have gpio_in, gpio_out, irq and up to four + * special functions per pin, plus one entry for the sentinel. + * We'll reallocate that later anyway. */ - pctl->functions = kcalloc(pctl->ngroups, + pctl->functions = kcalloc(4 * pctl->ngroups + 4, sizeof(*pctl->functions), GFP_KERNEL); if (!pctl->functions) From b9cc7d8a4656a6e815852c27ab50365009cb69c1 Mon Sep 17 00:00:00 2001 From: Ben Dai Date: Sun, 25 Apr 2021 23:09:03 +0800 Subject: [PATCH 069/187] genirq/timings: Prevent potential array overflow in __irq_timings_store() When the interrupt interval is greater than 2 ^ PREDICTION_BUFFER_SIZE * PREDICTION_FACTOR us and less than 1s, the calculated index will be greater than the length of irqs->ema_time[]. Check the calculated index before using it to prevent array overflow. Fixes: 23aa3b9a6b7d ("genirq/timings: Encapsulate storing function") Signed-off-by: Ben Dai Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210425150903.25456-1-ben.dai9703@gmail.com --- kernel/irq/timings.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index d309d6fbf5bd..4d2a702d7aa9 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -453,6 +453,11 @@ static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, */ index = irq_timings_interval_index(interval); + if (index > PREDICTION_BUFFER_SIZE - 1) { + irqs->count = 0; + return; + } + /* * Store the index as an element of the pattern in another * circular array. From dbbc93576e03fbe24b365fab0e901eb442237a8a Mon Sep 17 00:00:00 2001 From: Bixuan Cui Date: Tue, 18 May 2021 11:31:17 +0800 Subject: [PATCH 070/187] genirq/msi: Ensure deactivation on teardown msi_domain_alloc_irqs() invokes irq_domain_activate_irq(), but msi_domain_free_irqs() does not enforce deactivation before tearing down the interrupts. This happens when PCI/MSI interrupts are set up and never used before being torn down again, e.g. in error handling pathes. The only place which cleans that up is the error handling path in msi_domain_alloc_irqs(). Move the cleanup from msi_domain_alloc_irqs() into msi_domain_free_irqs() to cure that. Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Signed-off-by: Bixuan Cui Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210518033117.78104-1-cuibixuan@huawei.com --- kernel/irq/msi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c41965e348b5..85df3ca03efe 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -476,11 +476,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0; cleanup: - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } msi_domain_free_irqs(domain, dev); return ret; } @@ -505,7 +500,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct irq_data *irq_data; struct msi_desc *desc; + int i; + + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + } for_each_msi_entry(desc, dev) { /* From 981567bd965329df7e64b13e92a54da816c1e0a4 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 10 Aug 2021 16:33:55 +1000 Subject: [PATCH 071/187] cifs: use the correct max-length for dentry_path_raw() RHBZ: 1972502 PATH_MAX is 4096 but PAGE_SIZE can be >4096 on some architectures such as ppc and would thus write beyond the end of the actual object. Cc: Reported-by: Xiaoli Feng Suggested-by: Brian foster Reviewed-by: Paulo Alcantara (SUSE) Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 79402ca0ddfa..5f8a302ffcb2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -100,7 +100,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; - s = dentry_path_raw(direntry, page, PAGE_SIZE); + s = dentry_path_raw(direntry, page, PATH_MAX); if (IS_ERR(s)) return s; if (!s[1]) // for root we want "", not "/" From 60f0779862e4ab943810187752c462e85f5fa371 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:45 +0300 Subject: [PATCH 072/187] virtio: Improve vq->broken access to avoid any compiler optimization Currently vq->broken field is read by virtqueue_is_broken() in busy loop in one context by virtnet_send_command(). vq->broken is set to true in other process context by virtio_break_device(). Reader and writer are accessing it without any synchronization. This may lead to a compiler optimization which may result to optimize reading vq->broken only once. Hence, force reading vq->broken on each invocation of virtqueue_is_broken() and also force writing it so that such update is visible to the readers. It is a theoretical fix that isn't yet encountered in the field. Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-2-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 89bfe46a8a7f..e179c7c7622c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2373,7 +2373,7 @@ bool virtqueue_is_broken(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->broken; + return READ_ONCE(vq->broken); } EXPORT_SYMBOL_GPL(virtqueue_is_broken); @@ -2387,7 +2387,9 @@ void virtio_break_device(struct virtio_device *dev) list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); - vq->broken = true; + + /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ + WRITE_ONCE(vq->broken, true); } } EXPORT_SYMBOL_GPL(virtio_break_device); From 249f255476328e597a598ccdbd4414e51a5b6d6e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:46 +0300 Subject: [PATCH 073/187] virtio: Keep vring_del_virtqueue() mirror of VQ create Keep the vring_del_virtqueue() mirror of the create routines. i.e. to delete list entry first as it is added last during the create routine. Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-3-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index e179c7c7622c..d5934c2e5a89 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2291,6 +2291,8 @@ void vring_del_virtqueue(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + list_del(&_vq->list); + if (vq->we_own_ring) { if (vq->packed_ring) { vring_free_queue(vq->vq.vdev, @@ -2321,7 +2323,6 @@ void vring_del_virtqueue(struct virtqueue *_vq) kfree(vq->split.desc_state); kfree(vq->split.desc_extra); } - list_del(&_vq->list); kfree(vq); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); From 0e566c8f0f2e8325e35f6f97e13cde5356b41814 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:47 +0300 Subject: [PATCH 074/187] virtio: Protect vqs list access VQs may be accessed to mark the device broken while they are created/destroyed. Hence protect the access to the vqs list. Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues broken.") Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-4-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 1 + drivers/virtio/virtio_ring.c | 8 ++++++++ include/linux/virtio.h | 1 + 3 files changed, 10 insertions(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 4b15c00c0a0a..49984d2cba24 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -355,6 +355,7 @@ int register_virtio_device(struct virtio_device *dev) virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); INIT_LIST_HEAD(&dev->vqs); + spin_lock_init(&dev->vqs_list_lock); /* * device_add() causes the bus infrastructure to look for a matching diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d5934c2e5a89..c2aaa0eff6df 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1755,7 +1755,9 @@ static struct virtqueue *vring_create_virtqueue_packed( cpu_to_le16(vq->packed.event_flags_shadow); } + spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); return &vq->vq; err_desc_extra: @@ -2229,7 +2231,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); + spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); return &vq->vq; err_extra: @@ -2291,7 +2295,9 @@ void vring_del_virtqueue(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + spin_lock(&vq->vq.vdev->vqs_list_lock); list_del(&_vq->list); + spin_unlock(&vq->vq.vdev->vqs_list_lock); if (vq->we_own_ring) { if (vq->packed_ring) { @@ -2386,12 +2392,14 @@ void virtio_break_device(struct virtio_device *dev) { struct virtqueue *_vq; + spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } + spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(virtio_break_device); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b1894e0323fa..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -110,6 +110,7 @@ struct virtio_device { bool config_enabled; bool config_change_pending; spinlock_t config_lock; + spinlock_t vqs_list_lock; /* Protects VQs list access */ struct device dev; struct virtio_device_id id; const struct virtio_config_ops *config; From 43bb40c5b92659966bdf4bfe584fde0a3575a049 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:48 +0300 Subject: [PATCH 075/187] virtio_pci: Support surprise removal of virtio pci device When a virtio pci device undergo surprise removal (aka async removal in PCIe spec), mark the device as broken so that any upper layer drivers can abort any outstanding operation. When a virtio net pci device undergo surprise removal which is used by a NetworkManager, a below call trace was observed. kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059] watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059] CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S W I L 5.13.0-hotplug+ #8 Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020 Workqueue: events linkwatch_event RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net] Call Trace: virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net] ? __hw_addr_create_ex+0x85/0xc0 __dev_mc_add+0x72/0x80 igmp6_group_added+0xa7/0xd0 ipv6_mc_up+0x3c/0x60 ipv6_find_idev+0x36/0x80 addrconf_add_dev+0x1e/0xa0 addrconf_dev_config+0x71/0x130 addrconf_notify+0x1f5/0xb40 ? rtnl_is_locked+0x11/0x20 ? __switch_to_asm+0x42/0x70 ? finish_task_switch+0xaf/0x2c0 ? raw_notifier_call_chain+0x3e/0x50 raw_notifier_call_chain+0x3e/0x50 netdev_state_change+0x67/0x90 linkwatch_do_dev+0x3c/0x50 __linkwatch_run_queue+0xd2/0x220 linkwatch_event+0x21/0x30 process_one_work+0x1c8/0x370 worker_thread+0x30/0x380 ? process_one_work+0x370/0x370 kthread+0x118/0x140 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x1f/0x30 Hence, add the ability to abort the command on surprise removal which prevents infinite loop and system lockup. Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_common.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 222d630c41fc..b35bb2d57f62 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -576,6 +576,13 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); + /* + * Device is marked broken on surprise removal so that virtio upper + * layers can abort any ongoing operation. + */ + if (!pci_device_is_present(pci_dev)) + virtio_break_device(&vp_dev->vdev); + pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); From 0e398290cff997610b66e73573faaee70c9a700e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 28 Jul 2021 21:07:55 +0800 Subject: [PATCH 076/187] vhost-vdpa: Fix integer overflow in vhost_vdpa_process_iotlb_update() The "msg->iova + msg->size" addition can have an integer overflow if the iotlb message is from a malicious user space application. So let's fix it. Fixes: 1b48dc03e575 ("vhost: vdpa: report iova range") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210728130756.97-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 210ab35a7ebf..9479f7f79217 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -614,7 +614,8 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, long pinned; int ret = 0; - if (msg->iova < v->range.first || + if (msg->iova < v->range.first || !msg->size || + msg->iova > U64_MAX - msg->size + 1 || msg->iova + msg->size - 1 > v->range.last) return -EINVAL; From 7b9cae027ba3aaac295ae23a62f47876ed97da73 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 10 Aug 2021 10:19:49 -0700 Subject: [PATCH 077/187] KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation Use the secondary_exec_controls_get() accessor in vmx_has_waitpkg() to effectively get the controls for the current VMCS, as opposed to using vmx->secondary_exec_controls, which is the cached value of KVM's desired controls for vmcs01 and truly not reflective of any particular VMCS. While the waitpkg control is not dynamic, i.e. vmcs01 will always hold the same waitpkg configuration as vmx->secondary_exec_controls, the same does not hold true for vmcs02 if the L1 VMM hides the feature from L2. If L1 hides the feature _and_ does not intercept MSR_IA32_UMWAIT_CONTROL, L2 could incorrectly read/write L1's virtual MSR instead of taking a #GP. Fixes: 6e3ba4abcea5 ("KVM: vmx: Emulate MSR IA32_UMWAIT_CONTROL") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20210810171952.2758100-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index db88ed4f2121..17a1cb4b059d 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -522,7 +522,7 @@ static inline struct vmcs *alloc_vmcs(bool shadow) static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) { - return vmx->secondary_exec_control & + return secondary_exec_controls_get(vmx) & SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } From bba676cc0b6122a74fa2e246f38a6b05c6f95b36 Mon Sep 17 00:00:00 2001 From: Dhananjay Phadke Date: Thu, 5 Aug 2021 14:49:05 -0700 Subject: [PATCH 078/187] i2c: iproc: fix race between client unreg and tasklet Similar NULL deref was originally fixed by graceful teardown sequence - https://lore.kernel.org/linux-i2c/1597106560-79693-1-git-send-email-dphadke@linux.microsoft.com After this, a tasklet was added to take care of FIFO full condition for large i2c transaction. https://lore.kernel.org/linux-arm-kernel/20201102035433.6774-1-rayagonda.kokatanur@broadcom.com/ This introduced regression, a new race condition between tasklet enabling interrupts and client unreg teardown sequence. Kill tasklet before unreg_slave() masks bits in IE_OFFSET. Updated teardown sequence - (1) disable_irq() (2) Kill tasklet (3) Mask event enable bits in control reg (4) Erase slave address (avoid further writes to rx fifo) (5) Flush tx and rx FIFOs (6) Clear pending event (interrupt) bits in status reg (7) Set client pointer to NULL (8) enable_irq() -- Unable to handle kernel read from unreadable memory at virtual address 0000000000000320 Mem abort info: ESR = 0x96000004 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000004 CM = 0, WnR = 0 user pgtable: 4k pages, 48-bit VAs, pgdp=000000009212a000 [0000000000000320] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 96000004 [#1] SMP CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O Hardware name: Overlake (DT) pstate: 40400085 (nZcv daIf +PAN -UAO -TCO BTYPE=--) pc : bcm_iproc_i2c_slave_isr+0x2b8/0x8e4 lr : bcm_iproc_i2c_slave_isr+0x1c8/0x8e4 sp : ffff800010003e70 x29: ffff800010003e80 x28: ffffda017acdc000 x27: ffffda017b0ae000 x26: ffff800010004000 x25: ffff800010000000 x24: ffffda017af4a168 x23: 0000000000000073 x22: 0000000000000000 x21: 0000000001400000 x20: 0000000001000000 x19: ffff06f09583f880 x18: 00000000fa83b2da x17: 000000000000b67e x16: 0000000002edb2f3 x15: 00000000000002c7 x14: 00000000000002c7 x13: 0000000000000006 x12: 0000000000000033 x11: 0000000000000000 x10: 0000000001000000 x9 : 0000000003289312 x8 : 0000000003289311 x7 : 02d0cd03a303adbc x6 : 02d18e7f0a4dfc6c x5 : 02edb2f33f76ea68 x4 : 00000000fa83b2da x3 : ffffda017af43cd0 x2 : ffff800010003e74 x1 : 0000000001400000 x0 : 0000000000000000 Call trace: bcm_iproc_i2c_slave_isr+0x2b8/0x8e4 bcm_iproc_i2c_isr+0x178/0x290 __handle_irq_event_percpu+0xd0/0x200 handle_irq_event+0x60/0x1a0 handle_fasteoi_irq+0x130/0x220 __handle_domain_irq+0x8c/0xcc gic_handle_irq+0xc0/0x120 el1_irq+0xcc/0x180 finish_task_switch+0x100/0x1d8 __schedule+0x61c/0x7a0 schedule_idle+0x28/0x44 do_idle+0x254/0x28c cpu_startup_entry+0x28/0x2c rest_init+0xc4/0xd0 arch_call_rest_init+0x14/0x1c start_kernel+0x33c/0x3b8 Code: f9423260 910013e2 11000509 b9047a69 (f9419009) ---[ end trace 4781455b2a7bec15 ]--- Fixes: 4d658451c9d6 ("i2c: iproc: handle rx fifo full interrupt") Signed-off-by: Dhananjay Phadke Acked-by: Ray Jui Acked-by: Rayagonda Kokatanur Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-bcm-iproc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index cceaf69279a9..6304d1dd2dd6 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -1224,14 +1224,14 @@ static int bcm_iproc_i2c_unreg_slave(struct i2c_client *slave) disable_irq(iproc_i2c->irq); + tasklet_kill(&iproc_i2c->slave_rx_tasklet); + /* disable all slave interrupts */ tmp = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET); tmp &= ~(IE_S_ALL_INTERRUPT_MASK << IE_S_ALL_INTERRUPT_SHIFT); iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, tmp); - tasklet_kill(&iproc_i2c->slave_rx_tasklet); - /* Erase the slave address programmed */ tmp = iproc_i2c_rd_reg(iproc_i2c, S_CFG_SMBUS_ADDR_OFFSET); tmp &= ~BIT(S_CFG_EN_NIC_SMB_ADDR3_SHIFT); From 86ff25ed6cd8240d18df58930bd8848b19fce308 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 29 Jul 2021 16:35:32 +0200 Subject: [PATCH 079/187] i2c: dev: zero out array used for i2c reads from userspace If an i2c driver happens to not provide the full amount of data that a user asks for, it is possible that some uninitialized data could be sent to userspace. While all in-kernel drivers look to be safe, just be sure by initializing the buffer to zero before it is passed to the i2c driver so that any future drivers will not have this issue. Also properly copy the amount of data recvieved to the userspace buffer, as pointed out by Dan Carpenter. Reported-by: Eric Dumazet Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/i2c-dev.c b/drivers/i2c/i2c-dev.c index cb64fe649390..77f576e51652 100644 --- a/drivers/i2c/i2c-dev.c +++ b/drivers/i2c/i2c-dev.c @@ -141,7 +141,7 @@ static ssize_t i2cdev_read(struct file *file, char __user *buf, size_t count, if (count > 8192) count = 8192; - tmp = kmalloc(count, GFP_KERNEL); + tmp = kzalloc(count, GFP_KERNEL); if (tmp == NULL) return -ENOMEM; @@ -150,7 +150,8 @@ static ssize_t i2cdev_read(struct file *file, char __user *buf, size_t count, ret = i2c_master_recv(client, tmp, count); if (ret >= 0) - ret = copy_to_user(buf, tmp, count) ? -EFAULT : ret; + if (copy_to_user(buf, tmp, ret)) + ret = -EFAULT; kfree(tmp); return ret; } From 3f12cc4bb0a4d7b542af43b6f1b7175f13015629 Mon Sep 17 00:00:00 2001 From: Hu Haowen Date: Wed, 28 Jul 2021 23:53:46 +0800 Subject: [PATCH 080/187] Documentation: i2c: add i2c-sysfs into index Append i2c-sysfs to toctree in order to get rid of building warnings. Fixes: 31df7195b100 ("Documentation: i2c: Add doc for I2C sysfs") Signed-off-by: Hu Haowen Signed-off-by: Wolfram Sang --- Documentation/i2c/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/i2c/index.rst b/Documentation/i2c/index.rst index 8b76217e370a..6270f1fd7d4e 100644 --- a/Documentation/i2c/index.rst +++ b/Documentation/i2c/index.rst @@ -17,6 +17,7 @@ Introduction busses/index i2c-topology muxes/i2c-mux-gpio + i2c-sysfs Writing device drivers ====================== From f7ad318ea0ad58ebe0e595e59aed270bb643b29b Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Wed, 28 Jul 2021 21:07:56 +0800 Subject: [PATCH 081/187] vhost: Fix the calculation in vhost_overflow() This fixes the incorrect calculation for integer overflow when the last address of iova range is 0xffffffff. Fixes: ec33d031a14b ("vhost: detect 32 bit integer wrap around") Reported-by: Jason Wang Signed-off-by: Xie Yongji Acked-by: Jason Wang Link: https://lore.kernel.org/r/20210728130756.97-2-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index b9e853e6094d..59edb5a1ffe2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -735,10 +735,16 @@ static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); } +/* Make sure 64 bit math will not overflow. */ static bool vhost_overflow(u64 uaddr, u64 size) { - /* Make sure 64 bit math will not overflow. */ - return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size; + if (uaddr > ULONG_MAX || size > ULONG_MAX) + return true; + + if (!size) + return false; + + return uaddr > ULONG_MAX - size + 1; } /* Caller should have vq mutex and device mutex. */ From 2b847f21145d84e2e1dde99d3e2c00a5468f02e4 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:23 +0800 Subject: [PATCH 082/187] vdpa_sim: Fix return value check for vdpa_alloc_device() The vdpa_alloc_device() returns an error pointer upon failure, not NULL. To handle the failure correctly, this replaces NULL check with IS_ERR() check and propagate the error upwards. Fixes: 2c53d0f64c06 ("vdpasim: vDPA device simulator") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 14e024de5cbf..c621cf7feec0 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -251,8 +251,10 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, dev_attr->name); - if (!vdpasim) + if (IS_ERR(vdpasim)) { + ret = PTR_ERR(vdpasim); goto err_alloc; + } vdpasim->dev_attr = *dev_attr; INIT_WORK(&vdpasim->work, dev_attr->work_fn); From 9632e78e82648aa98340df78eab9106f63da151e Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:24 +0800 Subject: [PATCH 083/187] vp_vdpa: Fix return value check for vdpa_alloc_device() The vdpa_alloc_device() returns an error pointer upon failure, not NULL. To handle the failure correctly, this replaces NULL check with IS_ERR() check and propagate the error upwards. Fixes: 64b9f64f80a6 ("vdpa: introduce virtio pci driver") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-2-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/vdpa/virtio_pci/vp_vdpa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 7b4a6396c553..fe0527329857 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -436,9 +436,9 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, dev, &vp_vdpa_ops, NULL); - if (vp_vdpa == NULL) { + if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); - return -ENOMEM; + return PTR_ERR(vp_vdpa); } mdev = &vp_vdpa->mdev; From 1057afa0121db8bd3ca4718c8e0ca12388ab7759 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:25 +0800 Subject: [PATCH 084/187] vDPA/ifcvf: Fix return value check for vdpa_alloc_device() The vdpa_alloc_device() returns an error pointer upon failure, not NULL. To handle the failure correctly, this replaces NULL check with IS_ERR() check and propagate the error upwards. Fixes: 5a2414bc454e ("virtio: Intel IFC VF driver for VDPA") Reported-by: Dan Carpenter Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-3-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/vdpa/ifcvf/ifcvf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 21b78f1cd521..351c6cfb24c3 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -493,9 +493,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, dev, &ifc_vdpa_ops, NULL); - if (adapter == NULL) { + if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); - return -ENOMEM; + return PTR_ERR(adapter); } pci_set_master(pdev); From c8d182bd387a09a8b95303c8086238e8bf61fcfc Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:26 +0800 Subject: [PATCH 085/187] vdpa: Add documentation for vdpa_alloc_device() macro The return value of vdpa_alloc_device() macro is not very clear, so that most of callers did the wrong check. Let's add some comments to better document it. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-4-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- include/linux/vdpa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 3357ac98878d..8cfe49d201dd 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -277,6 +277,17 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, size_t size, const char *name); +/** + * vdpa_alloc_device - allocate and initilaize a vDPA device + * + * @dev_struct: the type of the parent structure + * @member: the name of struct vdpa_device within the @dev_struct + * @parent: the parent device + * @config: the bus operations that is supported by this device + * @name: name of the vdpa device + * + * Return allocated data structure or ERR_PTR upon error + */ #define vdpa_alloc_device(dev_struct, member, parent, config, name) \ container_of(__vdpa_alloc_device( \ parent, config, \ From cb5d2c1f6cc0e5769099a7d44b9d08cf58cae206 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Thu, 1 Jul 2021 13:46:52 +0200 Subject: [PATCH 086/187] virtio_vdpa: reject invalid vq indices Do not call vDPA drivers' callbacks with vq indicies larger than what the drivers indicate that they support. vDPA drivers do not bounds check the indices. Signed-off-by: Vincent Whitchurch Link: https://lore.kernel.org/r/20210701114652.21956-1-vincent.whitchurch@axis.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- drivers/virtio/virtio_vdpa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index e1a141135992..72eaef2caeb1 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -151,6 +151,9 @@ virtio_vdpa_setup_vq(struct virtio_device *vdev, unsigned int index, if (!name) return NULL; + if (index >= vdpa->nvqs) + return ERR_PTR(-ENOENT); + /* Queue shouldn't already be set up. */ if (ops->get_vq_ready(vdpa, index)) return ERR_PTR(-ENOENT); From e74cfa91f42c50f7f649b0eca46aa049754ccdbd Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Fri, 25 Jun 2021 08:55:02 +0530 Subject: [PATCH 087/187] vringh: Use wiov->used to check for read/write desc order As __vringh_iov() traverses a descriptor chain, it populates each descriptor entry into either read or write vring iov and increments that iov's ->used member. So, as we iterate over a descriptor chain, at any point, (riov/wriov)->used value gives the number of descriptor enteries available, which are to be read or written by the device. As all read iovs must precede the write iovs, wiov->used should be zero when we are traversing a read descriptor. Current code checks for wiov->i, to figure out whether any previous entry in the current descriptor chain was a write descriptor. However, iov->i is only incremented, when these vring iovs are consumed, at a later point, and remain 0 in __vringh_iov(). So, correct the check for read and write descriptor order, to use wiov->used. Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Signed-off-by: Neeraj Upadhyay Link: https://lore.kernel.org/r/1624591502-4827-1-git-send-email-neeraju@codeaurora.org Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 4af8fa259d65..14e2043d7685 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -359,7 +359,7 @@ __vringh_iov(struct vringh *vrh, u16 i, iov = wiov; else { iov = riov; - if (unlikely(wiov && wiov->i)) { + if (unlikely(wiov && wiov->used)) { vringh_bad("Readable desc %p after writable", &descs[i]); err = -EINVAL; From 82e89ea077b93b3c131fa175b0df3acb5b1d5cdf Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 9 Aug 2021 18:16:09 +0800 Subject: [PATCH 088/187] virtio-blk: Add validation for block size in config space An untrusted device might presents an invalid block size in configuration space. This tries to add validation for it in the validate callback and clear the VIRTIO_BLK_F_BLK_SIZE feature bit if the value is out of the supported range. And we also double check the value in virtblk_probe() in case that it's changed after the validation. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210809101609.148-1-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/block/virtio_blk.c | 39 ++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 4b49df2dfd23..afb37aac09e8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -692,6 +692,28 @@ static const struct blk_mq_ops virtio_mq_ops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); +static int virtblk_validate(struct virtio_device *vdev) +{ + u32 blk_size; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + if (!virtio_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) + return 0; + + blk_size = virtio_cread32(vdev, + offsetof(struct virtio_blk_config, blk_size)); + + if (blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE) + __virtio_clear_bit(vdev, VIRTIO_BLK_F_BLK_SIZE); + + return 0; +} + static int virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -703,12 +725,6 @@ static int virtblk_probe(struct virtio_device *vdev) u8 physical_block_exp, alignment_offset; unsigned int queue_depth; - if (!vdev->config->get) { - dev_err(&vdev->dev, "%s failure: config access disabled\n", - __func__); - return -EINVAL; - } - err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), GFP_KERNEL); if (err < 0) @@ -823,6 +839,14 @@ static int virtblk_probe(struct virtio_device *vdev) else blk_size = queue_logical_block_size(q); + if (unlikely(blk_size < SECTOR_SIZE || blk_size > PAGE_SIZE)) { + dev_err(&vdev->dev, + "block size is changed unexpectedly, now is %u\n", + blk_size); + err = -EINVAL; + goto err_cleanup_disk; + } + /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, @@ -881,6 +905,8 @@ static int virtblk_probe(struct virtio_device *vdev) device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); return 0; +err_cleanup_disk: + blk_cleanup_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: @@ -983,6 +1009,7 @@ static struct virtio_driver virtio_blk = { .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, + .validate = virtblk_validate, .probe = virtblk_probe, .remove = virtblk_remove, .config_changed = virtblk_config_changed, From ea2f6af16532511eb1cd8eb62845c37861f24ce8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 10 Aug 2021 12:25:05 -0400 Subject: [PATCH 089/187] vringh: pull in spinlock header we use a spinlock now pull in the correct header to make vring.h self sufficient. Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 84db7b8f912f..212892cf9822 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -14,6 +14,7 @@ #include #include #include +#include #if IS_REACHABLE(CONFIG_VHOST_IOTLB) #include #include From f8ce72632fa7ed286cc9a62c35e279330a14d3e0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 10 Aug 2021 12:26:05 -0400 Subject: [PATCH 090/187] virtio_ring: pull in spinlock header we use a spinlock now pull in the correct header to make virtio_ring.c self sufficient. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index c2aaa0eff6df..dd95dfd85e98 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifdef DEBUG From a24ce06c70fe7df795a846ad713ccaa9b56a7666 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 10 Aug 2021 12:26:05 -0400 Subject: [PATCH 091/187] tools/virtio: fix build We use a spinlock now so add a stub. Ignore bogus uninitialized variable warnings. Signed-off-by: Michael S. Tsirkin --- tools/virtio/Makefile | 3 +- tools/virtio/linux/spinlock.h | 56 +++++++++++++++++++++++++++++++++++ tools/virtio/linux/virtio.h | 2 ++ 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 tools/virtio/linux/spinlock.h diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile index b587b9a7a124..0d7bbe49359d 100644 --- a/tools/virtio/Makefile +++ b/tools/virtio/Makefile @@ -4,7 +4,8 @@ test: virtio_test vringh_test virtio_test: virtio_ring.o virtio_test.o vringh_test: vringh_test.o vringh.o virtio_ring.o -CFLAGS += -g -O2 -Werror -Wall -I. -I../include/ -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -include ../../include/linux/kconfig.h +CFLAGS += -g -O2 -Werror -Wno-maybe-uninitialized -Wall -I. -I../include/ -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE -include ../../include/linux/kconfig.h +LDFLAGS += -lpthread vpath %.c ../../drivers/virtio ../../drivers/vhost mod: ${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test V=${V} diff --git a/tools/virtio/linux/spinlock.h b/tools/virtio/linux/spinlock.h new file mode 100644 index 000000000000..028e3cdcc5d3 --- /dev/null +++ b/tools/virtio/linux/spinlock.h @@ -0,0 +1,56 @@ +#ifndef SPINLOCK_H_STUB +#define SPINLOCK_H_STUB + +#include + +typedef pthread_spinlock_t spinlock_t; + +static inline void spin_lock_init(spinlock_t *lock) +{ + int r = pthread_spin_init(lock, 0); + assert(!r); +} + +static inline void spin_lock(spinlock_t *lock) +{ + int ret = pthread_spin_lock(lock); + assert(!ret); +} + +static inline void spin_unlock(spinlock_t *lock) +{ + int ret = pthread_spin_unlock(lock); + assert(!ret); +} + +static inline void spin_lock_bh(spinlock_t *lock) +{ + spin_lock(lock); +} + +static inline void spin_unlock_bh(spinlock_t *lock) +{ + spin_unlock(lock); +} + +static inline void spin_lock_irq(spinlock_t *lock) +{ + spin_lock(lock); +} + +static inline void spin_unlock_irq(spinlock_t *lock) +{ + spin_unlock(lock); +} + +static inline void spin_lock_irqsave(spinlock_t *lock, unsigned long f) +{ + spin_lock(lock); +} + +static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long f) +{ + spin_unlock(lock); +} + +#endif diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index 5d90254ddae4..363b98228301 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -3,6 +3,7 @@ #define LINUX_VIRTIO_H #include #include +#include struct device { void *parent; @@ -12,6 +13,7 @@ struct virtio_device { struct device dev; u64 features; struct list_head vqs; + spinlock_t vqs_list_lock; }; struct virtqueue { From 08dbd5660232bede7916d8568003012c1182cc9a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Aug 2021 08:37:13 +0300 Subject: [PATCH 092/187] vdpa/mlx5: Avoid destroying MR on empty iotlb The current code treats an empty iotlb provdied in set_map() as a special case and destroy the memory region object. This must not be done since the virtqueue objects reference this MR. Doing so will cause the driver unload to emit errors and log timeouts caused by the firmware complaining on busy resources. This patch treats an empty iotlb as any other change of mapping. In this case, mlx5_vdpa_create_mr() will fail and the entire set_map() call to fail. This issue has not been encountered before but was seen to occur in a non-official version of qemu. Since qemu is a userspace program, the driver must protect against such case. Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210811053713.66658-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/core/mr.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index dcee6039e966..e59135fa867e 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -512,11 +512,6 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) mutex_unlock(&mr->mkey_mtx); } -static bool map_empty(struct vhost_iotlb *iotlb) -{ - return !vhost_iotlb_itree_first(iotlb, 0, U64_MAX); -} - int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, bool *change_map) { @@ -524,10 +519,6 @@ int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *io int err = 0; *change_map = false; - if (map_empty(iotlb)) { - mlx5_vdpa_destroy_mr(mvdev); - return 0; - } mutex_lock(&mr->mkey_mtx); if (mr->initialized) { mlx5_vdpa_info(mvdev, "memory map update\n"); From 879753c816dbbdb2a9a395aa4448d29feee92d1a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Aug 2021 08:37:59 +0300 Subject: [PATCH 093/187] vdpa/mlx5: Fix queue type selection logic get_queue_type() comments that splict virtqueue is preferred, however, the actual logic preferred packed virtqueues. Since firmware has not supported packed virtqueues we ended up using split virtqueues as was desired. Since we do not advertise support for packed virtqueues, we add a check to verify split virtqueues are indeed supported. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210811053759.66752-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 14 ++++++++++---- include/linux/mlx5/mlx5_ifc_vdpa.h | 10 ++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2a31467f7ac5..b1230fa2f5d1 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -753,12 +753,12 @@ static int get_queue_type(struct mlx5_vdpa_net *ndev) type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type); /* prefer split queue */ - if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED) - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; + if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT) + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; - WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)); + WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)); - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; } static bool vq_is_tx(u16 idx) @@ -2030,6 +2030,12 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) return -ENOSPC; mdev = mgtdev->madev->mdev; + if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) & + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) { + dev_warn(mdev->device, "missing support for split virtqueues\n"); + return -EOPNOTSUPP; + } + /* we save one virtqueue for control virtqueue should we require it */ max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 98b56b75c625..1a9c9d94cb59 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -11,13 +11,15 @@ enum { }; enum { - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1, // do I check this caps? - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, }; enum { - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT), + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED), }; struct mlx5_ifc_virtio_q_bits { From 31697ef7f3f45293bba3da87bcc710953e97fc3e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 6 Aug 2021 09:43:11 +0900 Subject: [PATCH 094/187] pinctrl: k210: Fix k210_fpioa_probe() In k210_fpioa_probe(), add missing calls to clk_disable_unprepare() in case of error after cenabling the clk and pclk clocks. Also add missing error handling when enabling pclk. Reported-by: kernel test robot Reported-by: Dan Carpenter Fixes: d4c34d09ab03 ("pinctrl: Add RISC-V Canaan Kendryte K210 FPIOA driver") Cc: Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20210806004311.52859-1-damien.lemoal@wdc.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-k210.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c index f831526d06ff..49e32684dbb2 100644 --- a/drivers/pinctrl/pinctrl-k210.c +++ b/drivers/pinctrl/pinctrl-k210.c @@ -950,23 +950,37 @@ static int k210_fpioa_probe(struct platform_device *pdev) return ret; pdata->pclk = devm_clk_get_optional(dev, "pclk"); - if (!IS_ERR(pdata->pclk)) - clk_prepare_enable(pdata->pclk); + if (!IS_ERR(pdata->pclk)) { + ret = clk_prepare_enable(pdata->pclk); + if (ret) + goto disable_clk; + } pdata->sysctl_map = syscon_regmap_lookup_by_phandle_args(np, "canaan,k210-sysctl-power", 1, &pdata->power_offset); - if (IS_ERR(pdata->sysctl_map)) - return PTR_ERR(pdata->sysctl_map); + if (IS_ERR(pdata->sysctl_map)) { + ret = PTR_ERR(pdata->sysctl_map); + goto disable_pclk; + } k210_fpioa_init_ties(pdata); pdata->pctl = pinctrl_register(&k210_pinctrl_desc, dev, (void *)pdata); - if (IS_ERR(pdata->pctl)) - return PTR_ERR(pdata->pctl); + if (IS_ERR(pdata->pctl)) { + ret = PTR_ERR(pdata->pctl); + goto disable_pclk; + } return 0; + +disable_pclk: + clk_disable_unprepare(pdata->pclk); +disable_clk: + clk_disable_unprepare(pdata->clk); + + return ret; } static const struct of_device_id k210_fpioa_dt_ids[] = { From 2d3a1e3615c5449a4583010f41a6f824a4ffa03e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Aug 2021 16:05:37 -0700 Subject: [PATCH 095/187] bpf: Add rcu_read_lock in bpf_get_current_[ancestor_]cgroup_id() helpers Currently, if bpf_get_current_cgroup_id() or bpf_get_current_ancestor_cgroup_id() helper is called with sleepable programs e.g., sleepable fentry/fmod_ret/fexit/lsm programs, a rcu warning may appear. For example, if I added the following hack to test_progs/test_lsm sleepable fentry program test_sys_setdomainname: --- a/tools/testing/selftests/bpf/progs/lsm.c +++ b/tools/testing/selftests/bpf/progs/lsm.c @@ -168,6 +168,10 @@ int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs) int buf = 0; long ret; + __u64 cg_id = bpf_get_current_cgroup_id(); + if (cg_id == 1000) + copy_test++; + ret = bpf_copy_from_user(&buf, sizeof(buf), ptr); if (len == -2 && ret == 0 && buf == 1234) copy_test++; I will hit the following rcu warning: include/linux/cgroup.h:481 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by test_progs/260: #0: ffffffffa5173360 (rcu_read_lock_trace){....}-{0:0}, at: __bpf_prog_enter_sleepable+0x0/0xa0 stack backtrace: CPU: 1 PID: 260 Comm: test_progs Tainted: G O 5.14.0-rc2+ #176 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x56/0x7b bpf_get_current_cgroup_id+0x9c/0xb1 bpf_prog_a29888d1c6706e09_test_sys_setdomainname+0x3e/0x89c bpf_trampoline_6442469132_0+0x2d/0x1000 __x64_sys_setdomainname+0x5/0x110 do_syscall_64+0x3a/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae I can get similar warning using bpf_get_current_ancestor_cgroup_id() helper. syzbot reported a similar issue in [1] for syscall program. Helper bpf_get_current_cgroup_id() or bpf_get_current_ancestor_cgroup_id() has the following callchain: task_dfl_cgroup task_css_set task_css_set_check and we have #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) Since cgroup_mutex/css_set_lock is not held and the task is not existing and rcu read_lock is not held, a warning will be issued. Note that bpf sleepable program is protected by rcu_read_lock_trace(). The above sleepable bpf programs are already protected by migrate_disable(). Adding rcu_read_lock() in these two helpers will silence the above warning. I marked the patch fixing 95b861a7935b ("bpf: Allow bpf_get_current_ancestor_cgroup_id for tracing") which added bpf_get_current_ancestor_cgroup_id() to tracing programs in 5.14. I think backporting 5.14 is probably good enough as sleepable progrems are not widely used. This patch should fix [1] as well since syscall program is a sleepable program protected with migrate_disable(). [1] https://lore.kernel.org/bpf/0000000000006d5cab05c7d9bb87@google.com/ Fixes: 95b861a7935b ("bpf: Allow bpf_get_current_ancestor_cgroup_id for tracing") Reported-by: syzbot+7ee5c2c09c284495371f@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810230537.2864668-1-yhs@fb.com --- kernel/bpf/helpers.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 7a97b2f4747d..55f83ea09dae 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -353,9 +353,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = { #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; + u64 cgrp_id; - return cgroup_id(cgrp); + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); + cgrp_id = cgroup_id(cgrp); + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { @@ -366,13 +372,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = { BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level) { - struct cgroup *cgrp = task_dfl_cgroup(current); + struct cgroup *cgrp; struct cgroup *ancestor; + u64 cgrp_id; + rcu_read_lock(); + cgrp = task_dfl_cgroup(current); ancestor = cgroup_ancestor(cgrp, ancestor_level); - if (!ancestor) - return 0; - return cgroup_id(ancestor); + cgrp_id = ancestor ? cgroup_id(ancestor) : 0; + rcu_read_unlock(); + + return cgrp_id; } const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = { From b93dfa6bda4d4e88e5386490f2b277a26958f9d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 11 Aug 2021 11:53:37 -0700 Subject: [PATCH 096/187] ACPI: NFIT: Fix support for virtual SPA ranges Fix the NFIT parsing code to treat a 0 index in a SPA Range Structure as a special case and not match Region Mapping Structures that use 0 to indicate that they are not mapped. Without this fix some platform BIOS descriptions of "virtual disk" ranges do not result in the pmem driver attaching to the range. Details: In addition to typical persistent memory ranges, the ACPI NFIT may also convey "virtual" ranges. These ranges are indicated by a UUID in the SPA Range Structure of UUID_VOLATILE_VIRTUAL_DISK, UUID_VOLATILE_VIRTUAL_CD, UUID_PERSISTENT_VIRTUAL_DISK, or UUID_PERSISTENT_VIRTUAL_CD. The critical difference between virtual ranges and UUID_PERSISTENT_MEMORY, is that virtual do not support associations with Region Mapping Structures. For this reason the "index" value of virtual SPA Range Structures is allowed to be 0. If a platform BIOS decides to represent NVDIMMs with disconnected "Region Mapping Structures" (range-index == 0), the kernel may falsely associate them with standalone ranges where the "SPA Range Structure Index" is also zero. When this happens the driver may falsely require labels where "virtual disks" are expected to be label-less. I.e. "label-less" is where the namespace-range == region-range and the pmem driver attaches with no user action to create a namespace. Cc: Jacek Zloch Cc: Lukasz Sobieraj Cc: "Lee, Chun-Yi" Cc: Fixes: c2f32acdf848 ("acpi, nfit: treat virtual ramdisk SPA as pmem region") Reported-by: Krzysztof Rusocki Reported-by: Damian Bassa Reviewed-by: Jeff Moyer Link: https://lore.kernel.org/r/162870796589.2521182.1240403310175570220.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 23d9a09d7060..a3ef6cce644c 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -3021,6 +3021,9 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping_desc *mapping; + /* range index 0 == unmapped in SPA or invalid-SPA */ + if (memdev->range_index == 0 || spa->range_index == 0) + continue; if (memdev->range_index != spa->range_index) continue; if (count >= ND_MAX_MAPPINGS) { From d9cee9f85b22fab88d2b76d2e92b18e3d0e6aa8c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jul 2021 09:46:04 -0700 Subject: [PATCH 097/187] libnvdimm/region: Fix label activation vs errors There are a few scenarios where init_active_labels() can return without registering deactivate_labels() to run when the region is disabled. In particular label error injection creates scenarios where a DIMM is disabled, but labels on other DIMMs in the region become activated. Arrange for init_active_labels() to always register deactivate_labels(). Reported-by: Krzysztof Kensicki Cc: Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation.") Reviewed-by: Jeff Moyer Link: https://lore.kernel.org/r/162766356450.3223041.1183118139023841447.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- drivers/nvdimm/namespace_devs.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 2403b71b601e..745478213ff2 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -2527,7 +2527,7 @@ static void deactivate_labels(void *region) static int init_active_labels(struct nd_region *nd_region) { - int i; + int i, rc = 0; for (i = 0; i < nd_region->ndr_mappings; i++) { struct nd_mapping *nd_mapping = &nd_region->mapping[i]; @@ -2546,13 +2546,14 @@ static int init_active_labels(struct nd_region *nd_region) else if (test_bit(NDD_LABELING, &nvdimm->flags)) /* fail, labels needed to disambiguate dpa */; else - return 0; + continue; dev_err(&nd_region->dev, "%s: is %s, failing probe\n", dev_name(&nd_mapping->nvdimm->dev), test_bit(NDD_LOCKED, &nvdimm->flags) ? "locked" : "disabled"); - return -ENXIO; + rc = -ENXIO; + goto out; } nd_mapping->ndd = ndd; atomic_inc(&nvdimm->busy); @@ -2586,13 +2587,17 @@ static int init_active_labels(struct nd_region *nd_region) break; } - if (i < nd_region->ndr_mappings) { + if (i < nd_region->ndr_mappings) + rc = -ENOMEM; + +out: + if (rc) { deactivate_labels(nd_region); - return -ENOMEM; + return rc; } return devm_add_action_or_reset(&nd_region->dev, deactivate_labels, - nd_region); + nd_region); } int nd_region_register_namespaces(struct nd_region *nd_region, int *err) From f21453b0ff6e307bfd59e7a126d9848cea25315c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 30 Jul 2021 13:00:20 -0700 Subject: [PATCH 098/187] tools/testing/nvdimm: Fix missing 'fallthrough' warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use "fallthrough;" to address: tools/testing/nvdimm/test/nfit.c: In function ‘nd_intel_test_finish_query’: tools/testing/nvdimm/test/nfit.c:436:37: warning: this statement may fall through [-Wimplicit-fallthrough=] 436 | fw->missed_activate = false; | ~~~~~~~~~~~~~~~~~~~~^~~~~~~ tools/testing/nvdimm/test/nfit.c:438:9: note: here 438 | case FW_STATE_UPDATED: | ^~~~ Reviewed-by: Jeff Moyer Link: https://lore.kernel.org/r/162767522046.3313209.14767278726893995797.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- tools/testing/nvdimm/test/nfit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 54f367cbadae..b1bff5fb0f65 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -434,7 +434,7 @@ static int nd_intel_test_finish_query(struct nfit_test *t, dev_dbg(dev, "%s: transition out verify\n", __func__); fw->state = FW_STATE_UPDATED; fw->missed_activate = false; - /* fall through */ + fallthrough; case FW_STATE_UPDATED: nd_cmd->status = 0; /* bogus test version */ From 0f78399551146bfbed357759e2ad5abb8d39e50a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 11 Aug 2021 07:41:45 -1000 Subject: [PATCH 099/187] Revert "block/mq-deadline: Add cgroup support" This reverts commit 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") and a follow-up commit c06bc5a3fb42 ("block/mq-deadline: Remove a WARN_ON_ONCE() call"). The added cgroup support has the following issues: * It breaks cgroup interface file format rule by adding custom elements to a nested key-value file. * It registers mq-deadline as a cgroup-aware policy even though all it's doing is collecting per-cgroup stats. Even if we need these stats, this isn't the right way to add them. * It hasn't been reviewed from cgroup side. Cc: Bart Van Assche Cc: Jens Axboe Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 6 - block/Makefile | 2 - block/mq-deadline-cgroup.c | 126 -------------------- block/mq-deadline-cgroup.h | 114 ------------------ block/{mq-deadline-main.c => mq-deadline.c} | 73 +++--------- 5 files changed, 14 insertions(+), 307 deletions(-) delete mode 100644 block/mq-deadline-cgroup.c delete mode 100644 block/mq-deadline-cgroup.h rename block/{mq-deadline-main.c => mq-deadline.c} (95%) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 64053d67a97b..2f2158e05a91 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -9,12 +9,6 @@ config MQ_IOSCHED_DEADLINE help MQ version of the deadline IO scheduler. -config MQ_IOSCHED_DEADLINE_CGROUP - tristate - default y - depends on MQ_IOSCHED_DEADLINE - depends on BLK_CGROUP - config MQ_IOSCHED_KYBER tristate "Kyber I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index bfbe4e13ca1e..1e1afa10f869 100644 --- a/block/Makefile +++ b/block/Makefile @@ -22,8 +22,6 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o -mq-deadline-y += mq-deadline-main.o -mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c deleted file mode 100644 index 3b4bfddec39f..000000000000 --- a/block/mq-deadline-cgroup.c +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -#include "mq-deadline-cgroup.h" - -static struct blkcg_policy dd_blkcg_policy; - -static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp) -{ - struct dd_blkcg *pd; - - pd = kzalloc(sizeof(*pd), gfp); - if (!pd) - return NULL; - pd->stats = alloc_percpu_gfp(typeof(*pd->stats), - GFP_KERNEL | __GFP_ZERO); - if (!pd->stats) { - kfree(pd); - return NULL; - } - return &pd->cpd; -} - -static void dd_cpd_free(struct blkcg_policy_data *cpd) -{ - struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd); - - free_percpu(dd_blkcg->stats); - kfree(dd_blkcg); -} - -static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd) -{ - return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy), - struct dd_blkcg, cpd); -} - -/* - * Convert an association between a block cgroup and a request queue into a - * pointer to the mq-deadline information associated with a (blkcg, queue) pair. - */ -struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) -{ - struct blkg_policy_data *pd; - - pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy); - if (!pd) - return NULL; - - return dd_blkcg_from_pd(pd); -} - -static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) -{ - static const char *const prio_class_name[] = { - [IOPRIO_CLASS_NONE] = "NONE", - [IOPRIO_CLASS_RT] = "RT", - [IOPRIO_CLASS_BE] = "BE", - [IOPRIO_CLASS_IDLE] = "IDLE", - }; - struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); - int res = 0; - u8 prio; - - for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) - res += scnprintf(buf + res, size - res, - " [%s] dispatched=%u inserted=%u merged=%u", - prio_class_name[prio], - ddcg_sum(blkcg, dispatched, prio) + - ddcg_sum(blkcg, merged, prio) - - ddcg_sum(blkcg, completed, prio), - ddcg_sum(blkcg, inserted, prio) - - ddcg_sum(blkcg, completed, prio), - ddcg_sum(blkcg, merged, prio)); - - return res; -} - -static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) -{ - struct dd_blkg *pd; - - pd = kzalloc(sizeof(*pd), gfp); - if (!pd) - return NULL; - return &pd->pd; -} - -static void dd_pd_free(struct blkg_policy_data *pd) -{ - struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd); - - kfree(dd_blkg); -} - -static struct blkcg_policy dd_blkcg_policy = { - .cpd_alloc_fn = dd_cpd_alloc, - .cpd_free_fn = dd_cpd_free, - - .pd_alloc_fn = dd_pd_alloc, - .pd_free_fn = dd_pd_free, - .pd_stat_fn = dd_pd_stat, -}; - -int dd_activate_policy(struct request_queue *q) -{ - return blkcg_activate_policy(q, &dd_blkcg_policy); -} - -void dd_deactivate_policy(struct request_queue *q) -{ - blkcg_deactivate_policy(q, &dd_blkcg_policy); -} - -int __init dd_blkcg_init(void) -{ - return blkcg_policy_register(&dd_blkcg_policy); -} - -void __exit dd_blkcg_exit(void) -{ - blkcg_policy_unregister(&dd_blkcg_policy); -} diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h deleted file mode 100644 index 0143fd74f3ce..000000000000 --- a/block/mq-deadline-cgroup.h +++ /dev/null @@ -1,114 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#if !defined(_MQ_DEADLINE_CGROUP_H_) -#define _MQ_DEADLINE_CGROUP_H_ - -#include - -struct request_queue; - -/** - * struct io_stats_per_prio - I/O statistics per I/O priority class. - * @inserted: Number of inserted requests. - * @merged: Number of merged requests. - * @dispatched: Number of dispatched requests. - * @completed: Number of I/O completions. - */ -struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - -/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */ -struct blkcg_io_stats { - struct io_stats_per_prio stats[4]; -}; - -/** - * struct dd_blkcg - Per cgroup data. - * @cpd: blkcg_policy_data structure. - * @stats: I/O statistics. - */ -struct dd_blkcg { - struct blkcg_policy_data cpd; /* must be the first member */ - struct blkcg_io_stats __percpu *stats; -}; - -/* - * Count one event of type 'event_type' and with I/O priority class - * 'prio_class'. - */ -#define ddcg_count(ddcg, event_type, prio_class) do { \ -if (ddcg) { \ - struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \ - \ - BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ - BUILD_BUG_ON(!__same_type((prio_class), u8)); \ - local_inc(&io_stats->stats[(prio_class)].event_type); \ - put_cpu_ptr(io_stats); \ -} \ -} while (0) - -/* - * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls - * across all CPUs. No locking or barriers since it is fine if the returned - * sum is slightly outdated. - */ -#define ddcg_sum(ddcg, event_type, prio) ({ \ - unsigned int cpu; \ - u32 sum = 0; \ - \ - BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ - BUILD_BUG_ON(!__same_type((prio), u8)); \ - for_each_present_cpu(cpu) \ - sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \ - stats[(prio)].event_type); \ - sum; \ -}) - -#ifdef CONFIG_BLK_CGROUP - -/** - * struct dd_blkg - Per (cgroup, request queue) data. - * @pd: blkg_policy_data structure. - */ -struct dd_blkg { - struct blkg_policy_data pd; /* must be the first member */ -}; - -struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio); -int dd_activate_policy(struct request_queue *q); -void dd_deactivate_policy(struct request_queue *q); -int __init dd_blkcg_init(void); -void __exit dd_blkcg_exit(void); - -#else /* CONFIG_BLK_CGROUP */ - -static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) -{ - return NULL; -} - -static inline int dd_activate_policy(struct request_queue *q) -{ - return 0; -} - -static inline void dd_deactivate_policy(struct request_queue *q) -{ -} - -static inline int dd_blkcg_init(void) -{ - return 0; -} - -static inline void dd_blkcg_exit(void) -{ -} - -#endif /* CONFIG_BLK_CGROUP */ - -#endif /* _MQ_DEADLINE_CGROUP_H_ */ diff --git a/block/mq-deadline-main.c b/block/mq-deadline.c similarity index 95% rename from block/mq-deadline-main.c rename to block/mq-deadline.c index 6f612e6dc82b..a09761cbdf12 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline.c @@ -25,7 +25,6 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" -#include "mq-deadline-cgroup.h" /* * See Documentation/block/deadline-iosched.rst @@ -57,6 +56,14 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; +/* I/O statistics per I/O priority. */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + /* I/O statistics for all I/O priorities (enum dd_prio). */ struct io_stats { struct io_stats_per_prio stats[DD_PRIO_COUNT]; @@ -79,9 +86,6 @@ struct deadline_data { * run time data */ - /* Request queue that owns this data structure. */ - struct request_queue *queue; - struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ @@ -234,10 +238,8 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_blkcg *blkcg = next->elv.priv[0]; dd_count(dd, merged, prio); - ddcg_count(blkcg, merged, ioprio_class); /* * if next expires before rq, assign its expire time to rq @@ -375,7 +377,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; - struct dd_blkcg *blkcg; enum dd_prio prio; u8 ioprio_class; @@ -474,8 +475,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, dispatched, prio); - blkcg = rq->elv.priv[0]; - ddcg_count(blkcg, dispatched, ioprio_class); /* * If the request needs its target zone locked, do it. */ @@ -569,8 +568,6 @@ static void dd_exit_sched(struct elevator_queue *e) struct deadline_data *dd = e->elevator_data; enum dd_prio prio; - dd_deactivate_policy(dd->queue); - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -584,7 +581,7 @@ static void dd_exit_sched(struct elevator_queue *e) } /* - * Initialize elevator private data (deadline_data) and associate with blkcg. + * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { @@ -593,12 +590,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) enum dd_prio prio; int ret = -ENOMEM; - /* - * Initialization would be very tricky if the queue is not frozen, - * hence the warning statement below. - */ - WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter)); - eq = elevator_alloc(q, e); if (!eq) return ret; @@ -614,8 +605,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) if (!dd->stats) goto free_dd; - dd->queue = q; - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -635,17 +624,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); - ret = dd_activate_policy(q); - if (ret) - goto free_stats; - - ret = 0; q->elevator = eq; return 0; -free_stats: - free_percpu(dd->stats); - free_dd: kfree(dd); @@ -718,7 +699,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; - struct dd_blkcg *blkcg; LIST_HEAD(free); lockdep_assert_held(&dd->lock); @@ -729,18 +709,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); - /* - * If a block cgroup has been associated with the submitter and if an - * I/O priority has been set in the associated block cgroup, use the - * lowest of the cgroup priority and the request priority for the - * request. If no priority has been set in the request, use the cgroup - * priority. - */ prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, inserted, prio); - blkcg = dd_blkcg_from_bio(rq->bio); - ddcg_count(blkcg, inserted, ioprio_class); - rq->elv.priv[0] = blkcg; if (blk_mq_sched_try_insert_merge(q, rq, &free)) { blk_mq_free_requests(&free); @@ -789,10 +759,12 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } -/* Callback from inside blk_mq_rq_ctx_init(). */ +/* + * Nothing to do here. This is defined only to ensure that .finish_request + * method is called upon request completion. + */ static void dd_prepare_request(struct request *rq) { - rq->elv.priv[0] = NULL; } /* @@ -815,13 +787,11 @@ static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; struct deadline_data *dd = q->elevator->elevator_data; - struct dd_blkcg *blkcg = rq->elv.priv[0]; const u8 ioprio_class = dd_rq_ioclass(rq); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; dd_count(dd, completed, prio); - ddcg_count(blkcg, completed, ioprio_class); if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -1144,26 +1114,11 @@ MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { - int ret; - - ret = elv_register(&mq_deadline); - if (ret) - goto out; - ret = dd_blkcg_init(); - if (ret) - goto unreg; - -out: - return ret; - -unreg: - elv_unregister(&mq_deadline); - goto out; + return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { - dd_blkcg_exit(); elv_unregister(&mq_deadline); } From 14c4c8e41511aa8fba7fb239b20b6539b5bce201 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Wed, 11 Aug 2021 08:59:14 -0700 Subject: [PATCH 100/187] cfi: Use rcu_read_{un}lock_sched_notrace If rcu_read_lock_sched tracing is enabled, the tracing subsystem can perform a jump which needs to be checked by CFI. For example, stm_ftrace source is enabled as a module and hooks into enabled ftrace events. This can cause an recursive loop where find_shadow_check_fn -> rcu_read_lock_sched -> (call to stm_ftrace generates cfi slowpath) -> find_shadow_check_fn -> rcu_read_lock_sched -> ... To avoid the recursion, either the ftrace codes needs to be marked with __no_cfi or CFI should not trace. Use the "_notrace" in CFI to avoid tracing so that CFI can guard ftrace. Signed-off-by: Elliot Berman Reviewed-by: Sami Tolvanen Cc: stable@vger.kernel.org Fixes: cf68fffb66d6 ("add support for Clang CFI") Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20210811155914.19550-1-quic_eberman@quicinc.com --- kernel/cfi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/cfi.c b/kernel/cfi.c index e17a56639766..9594cfd1cf2c 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -248,9 +248,9 @@ static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr) { cfi_check_fn fn; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr); - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); return fn; } @@ -269,11 +269,11 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) cfi_check_fn fn = NULL; struct module *mod; - rcu_read_lock_sched(); + rcu_read_lock_sched_notrace(); mod = __module_address(ptr); if (mod) fn = mod->cfi_check; - rcu_read_unlock_sched(); + rcu_read_unlock_sched_notrace(); return fn; } From c4b68e513953c3370ce02c3208c1c628c0b86fd3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Aug 2021 15:15:13 -0500 Subject: [PATCH 101/187] pinctrl: amd: Fix an issue with shutdown when system set to s0ix IRQs are getting armed on shutdown causing the system to immediately wake back up. Link: https://lkml.org/lkml/2021/8/2/1114 Reported-by: nix.or.die@googlemail.com Acked-by: Shyam Sundar S K Tested-by: Gabriel Craciunescu CC: Raul E Rangel Fixes: d62bd5ce12d7 ("pinctrl: amd: Implement irq_set_wake") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20210809201513.12367-1-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index a76be6cc26ee..5b764740b829 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -444,8 +444,7 @@ static int amd_gpio_irq_set_wake(struct irq_data *d, unsigned int on) unsigned long flags; struct gpio_chip *gc = irq_data_get_irq_chip_data(d); struct amd_gpio *gpio_dev = gpiochip_get_data(gc); - u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3) | - BIT(WAKE_CNTRL_OFF_S4); + u32 wake_mask = BIT(WAKE_CNTRL_OFF_S0I3) | BIT(WAKE_CNTRL_OFF_S3); raw_spin_lock_irqsave(&gpio_dev->lock, flags); pin_reg = readl(gpio_dev->base + (d->hwirq)*4); From a2befe9380dd04ee76c871568deca00eedf89134 Mon Sep 17 00:00:00 2001 From: Jaroslav Kysela Date: Wed, 11 Aug 2021 18:14:41 +0200 Subject: [PATCH 102/187] ALSA: hda - fix the 'Capture Switch' value change notifications The original code in the cap_put_caller() function does not handle correctly the positive values returned from the passed function for multiple iterations. It means that the change notifications may be lost. Fixes: 352f7f914ebb ("ALSA: hda - Merge Realtek parser code to generic parser") BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=213851 Cc: Signed-off-by: Jaroslav Kysela Link: https://lore.kernel.org/r/20210811161441.1325250-1-perex@perex.cz Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_generic.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_generic.c b/sound/pci/hda/hda_generic.c index e97d00585e8e..481d8f8d3396 100644 --- a/sound/pci/hda/hda_generic.c +++ b/sound/pci/hda/hda_generic.c @@ -3460,7 +3460,7 @@ static int cap_put_caller(struct snd_kcontrol *kcontrol, struct hda_gen_spec *spec = codec->spec; const struct hda_input_mux *imux; struct nid_path *path; - int i, adc_idx, err = 0; + int i, adc_idx, ret, err = 0; imux = &spec->input_mux; adc_idx = kcontrol->id.index; @@ -3470,9 +3470,13 @@ static int cap_put_caller(struct snd_kcontrol *kcontrol, if (!path || !path->ctls[type]) continue; kcontrol->private_value = path->ctls[type]; - err = func(kcontrol, ucontrol); - if (err < 0) + ret = func(kcontrol, ucontrol); + if (ret < 0) { + err = ret; break; + } + if (ret > 0) + err = 1; } mutex_unlock(&codec->control_mutex); if (err >= 0 && spec->cap_sync_hook) From 67bb66d32905627e29400e2cb7f87a7c4c8cf667 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Thu, 12 Aug 2021 11:28:39 +0900 Subject: [PATCH 103/187] ALSA: oxfw: fix functioal regression for silence in Apogee Duet FireWire OXFW 971 has no function to use the value in syt field of received isochronous packet for playback timing generation. In kernel prepatch for v5.14, ALSA OXFW driver got change to send NO_INFO value in the field instead of actual timing value. The change brings Apogee Duet FireWire to generate no playback sound, while output meter moves. As long as I investigate, _any_ value in the syt field takes the device to generate sound. It's reasonable to think that the device just ignores data blocks in packet with NO_INFO value in its syt field for audio data processing. This commit adds a new flag for the quirk to fix regression. Fixes: 029ffc429440 ("ALSA: oxfw: perform sequence replay for media clock recovery") Cc: Signed-off-by: Takashi Sakamoto Link: https://lore.kernel.org/r/20210812022839.42043-1-o-takashi@sakamocchi.jp Signed-off-by: Takashi Iwai --- sound/firewire/oxfw/oxfw-stream.c | 9 ++++++++- sound/firewire/oxfw/oxfw.c | 6 ++++-- sound/firewire/oxfw/oxfw.h | 5 +++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sound/firewire/oxfw/oxfw-stream.c b/sound/firewire/oxfw/oxfw-stream.c index 0ef242fdd3bc..fff18b5d4e05 100644 --- a/sound/firewire/oxfw/oxfw-stream.c +++ b/sound/firewire/oxfw/oxfw-stream.c @@ -153,7 +153,7 @@ static int init_stream(struct snd_oxfw *oxfw, struct amdtp_stream *stream) struct cmp_connection *conn; enum cmp_direction c_dir; enum amdtp_stream_direction s_dir; - unsigned int flags = CIP_UNAWARE_SYT; + unsigned int flags = 0; int err; if (!(oxfw->quirks & SND_OXFW_QUIRK_BLOCKING_TRANSMISSION)) @@ -161,6 +161,13 @@ static int init_stream(struct snd_oxfw *oxfw, struct amdtp_stream *stream) else flags |= CIP_BLOCKING; + // OXFW 970/971 has no function to generate playback timing according to the sequence + // of value in syt field, thus the packet should include NO_INFO value in the field. + // However, some models just ignore data blocks in packet with NO_INFO for audio data + // processing. + if (!(oxfw->quirks & SND_OXFW_QUIRK_IGNORE_NO_INFO_PACKET)) + flags |= CIP_UNAWARE_SYT; + if (stream == &oxfw->tx_stream) { conn = &oxfw->out_conn; c_dir = CMP_OUTPUT; diff --git a/sound/firewire/oxfw/oxfw.c b/sound/firewire/oxfw/oxfw.c index 84971d78d152..cb5b5e3a481b 100644 --- a/sound/firewire/oxfw/oxfw.c +++ b/sound/firewire/oxfw/oxfw.c @@ -159,8 +159,10 @@ static int detect_quirks(struct snd_oxfw *oxfw, const struct ieee1394_device_id return snd_oxfw_scs1x_add(oxfw); } - if (entry->vendor_id == OUI_APOGEE && entry->model_id == MODEL_DUET_FW) - oxfw->quirks |= SND_OXFW_QUIRK_BLOCKING_TRANSMISSION; + if (entry->vendor_id == OUI_APOGEE && entry->model_id == MODEL_DUET_FW) { + oxfw->quirks |= SND_OXFW_QUIRK_BLOCKING_TRANSMISSION | + SND_OXFW_QUIRK_IGNORE_NO_INFO_PACKET; + } /* * TASCAM FireOne has physical control and requires a pair of additional diff --git a/sound/firewire/oxfw/oxfw.h b/sound/firewire/oxfw/oxfw.h index ee47abcb0c90..c13034f6c2ca 100644 --- a/sound/firewire/oxfw/oxfw.h +++ b/sound/firewire/oxfw/oxfw.h @@ -42,6 +42,11 @@ enum snd_oxfw_quirk { SND_OXFW_QUIRK_BLOCKING_TRANSMISSION = 0x04, // Stanton SCS1.d and SCS1.m support unique transaction. SND_OXFW_QUIRK_SCS_TRANSACTION = 0x08, + // Apogee Duet FireWire ignores data blocks in packet with NO_INFO for audio data + // processing, while output level meter moves. Any value in syt field of packet takes + // the device to process audio data even if the value is invalid in a point of + // IEC 61883-1/6. + SND_OXFW_QUIRK_IGNORE_NO_INFO_PACKET = 0x10, }; /* This is an arbitrary number for convinience. */ From 98694166c27d473c36b434bd3572934c2f2a16ab Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 10 Aug 2021 16:13:16 +0000 Subject: [PATCH 104/187] powerpc/interrupt: Fix OOPS by not calling do_IRQ() from timer_interrupt() An interrupt handler shall not be called from another interrupt handler otherwise this leads to problems like the following: Kernel attempted to write user page (afd4fa84) - exploit attempt? (uid: 1000) ------------[ cut here ]------------ Bug: Write fault blocked by KUAP! WARNING: CPU: 0 PID: 1617 at arch/powerpc/mm/fault.c:230 do_page_fault+0x484/0x720 Modules linked in: CPU: 0 PID: 1617 Comm: sshd Tainted: G W 5.13.0-pmac-00010-g8393422eb77 #7 NIP: c001b77c LR: c001b77c CTR: 00000000 REGS: cb9e5bc0 TRAP: 0700 Tainted: G W (5.13.0-pmac-00010-g8393422eb77) MSR: 00021032 CR: 24942424 XER: 00000000 GPR00: c001b77c cb9e5c80 c1582c00 00000021 3ffffbff 085b0000 00000027 c8eb644c GPR08: 00000023 00000000 00000000 00000000 24942424 0063f8c8 00000000 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 cb9e5e90 GPR24: 00000040 afd4fa96 00000040 02000000 c1fda6c0 afd4fa84 00000300 cb9e5cc0 NIP [c001b77c] do_page_fault+0x484/0x720 LR [c001b77c] do_page_fault+0x484/0x720 Call Trace: [cb9e5c80] [c001b77c] do_page_fault+0x484/0x720 (unreliable) [cb9e5cb0] [c000424c] DataAccess_virt+0xd4/0xe4 --- interrupt: 300 at __copy_tofrom_user+0x110/0x20c NIP: c001f9b4 LR: c03250a0 CTR: 00000004 REGS: cb9e5cc0 TRAP: 0300 Tainted: G W (5.13.0-pmac-00010-g8393422eb77) MSR: 00009032 CR: 48028468 XER: 20000000 DAR: afd4fa84 DSISR: 0a000000 GPR00: 20726f6f cb9e5d80 c1582c00 00000004 cb9e5e3a 00000016 afd4fa80 00000000 GPR08: 3835202d 72777872 2d78722d 00000004 28028464 0063f8c8 00000000 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 c07640c4 cb9e5e98 cb9e5e90 GPR24: 00000040 afd4fa96 00000040 cb9e5e0c 00000daa a0000000 cb9e5e98 afd4fa56 NIP [c001f9b4] __copy_tofrom_user+0x110/0x20c LR [c03250a0] _copy_to_iter+0x144/0x990 --- interrupt: 300 [cb9e5d80] [c03e89c0] n_tty_read+0xa4/0x598 (unreliable) [cb9e5df0] [c03e2a0c] tty_read+0xdc/0x2b4 [cb9e5e80] [c0156bf8] vfs_read+0x274/0x340 [cb9e5f00] [c01571ac] ksys_read+0x70/0x118 [cb9e5f30] [c0016048] ret_from_syscall+0x0/0x28 --- interrupt: c00 at 0xa7855c88 NIP: a7855c88 LR: a7855c5c CTR: 00000000 REGS: cb9e5f40 TRAP: 0c00 Tainted: G W (5.13.0-pmac-00010-g8393422eb77) MSR: 0000d032 CR: 2402446c XER: 00000000 GPR00: 00000003 afd4ec70 a72137d0 0000000b afd4ecac 00004000 0065a990 00000800 GPR08: 00000000 a7947930 00000000 00000004 c15831b0 0063f8c8 00000000 000186a0 GPR16: afd52dd4 afd52dd0 afd52dcc afd52dc8 0065a990 0065a9e0 00000001 0065fac0 GPR24: 00000000 00000089 00664050 00000000 00668e30 a720c8dc a7943ff4 0065f9b0 NIP [a7855c88] 0xa7855c88 LR [a7855c5c] 0xa7855c5c --- interrupt: c00 Instruction dump: 3884aa88 38630178 48076861 807f0080 48042e45 2f830000 419e0148 3c80c079 3c60c076 38841be4 386301c0 4801f705 <0fe00000> 3860000b 4bfffe30 3c80c06b ---[ end trace fd69b91a8046c2e5 ]--- Here the problem is that by re-enterring an exception handler, kuap_save_and_lock() is called a second time with this time KUAP access locked, leading to regs->kuap being overwritten hence KUAP not being unlocked at exception exit as expected. Do not call do_IRQ() from timer_interrupt() directly. Instead, redefine do_IRQ() as a standard function named __do_IRQ(), and call it from both do_IRQ() and time_interrupt() handlers. Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers") Cc: stable@vger.kernel.org # v5.12+ Reported-by: Stan Johnson Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c17d234f4927d39a1d7100864a8e1145323d33a0.1628611927.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/interrupt.h | 3 +++ arch/powerpc/include/asm/irq.h | 2 +- arch/powerpc/kernel/irq.c | 7 ++++++- arch/powerpc/kernel/time.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index d4bdf7d274ac..6b800d3e2681 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -583,6 +583,9 @@ DECLARE_INTERRUPT_HANDLER_NMI(hmi_exception_realmode); DECLARE_INTERRUPT_HANDLER_ASYNC(TAUException); +/* irq.c */ +DECLARE_INTERRUPT_HANDLER_ASYNC(do_IRQ); + void __noreturn unrecoverable_exception(struct pt_regs *regs); void replay_system_reset(void); diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 4982f3711fc3..2b3278534bc1 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -52,7 +52,7 @@ extern void *mcheckirq_ctx[NR_CPUS]; extern void *hardirq_ctx[NR_CPUS]; extern void *softirq_ctx[NR_CPUS]; -extern void do_IRQ(struct pt_regs *regs); +void __do_IRQ(struct pt_regs *regs); extern void __init init_IRQ(void); extern void __do_irq(struct pt_regs *regs); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 91e63eac4e8f..551b653228c4 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -750,7 +750,7 @@ void __do_irq(struct pt_regs *regs) trace_irq_exit(regs); } -DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ) +void __do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); void *cursp, *irqsp, *sirqsp; @@ -774,6 +774,11 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ) set_irq_regs(old_regs); } +DEFINE_INTERRUPT_HANDLER_ASYNC(do_IRQ) +{ + __do_IRQ(regs); +} + static void *__init alloc_vm_stack(void) { return __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP, diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e45ce427bffb..c487ba5a6e11 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -586,7 +586,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) #if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC) if (atomic_read(&ppc_n_lost_interrupts) != 0) - do_IRQ(regs); + __do_IRQ(regs); #endif old_regs = set_irq_regs(regs); From 01fcac8e4dfc112f420dcaeb70056a74e326cacf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 10 Aug 2021 16:13:17 +0000 Subject: [PATCH 105/187] powerpc/interrupt: Do not call single_step_exception() from other exceptions single_step_exception() is called by emulate_single_step() which is called from (at least) alignment exception() handler and program_check_exception() handler. Redefine it as a regular __single_step_exception() which is called by both single_step_exception() handler and emulate_single_step() function. Fixes: 3a96570ffceb ("powerpc: convert interrupt handlers to use wrappers") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Christophe Leroy Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/aed174f5cbc06f2cf95233c071d8aac948e46043.1628611921.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/traps.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..d56254f05e17 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1104,7 +1104,7 @@ DEFINE_INTERRUPT_HANDLER(RunModeException) _exception(SIGTRAP, regs, TRAP_UNK, 0); } -DEFINE_INTERRUPT_HANDLER(single_step_exception) +static void __single_step_exception(struct pt_regs *regs) { clear_single_step(regs); clear_br_trace(regs); @@ -1121,6 +1121,11 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception) _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); } +DEFINE_INTERRUPT_HANDLER(single_step_exception) +{ + __single_step_exception(regs); +} + /* * After we have successfully emulated an instruction, we have to * check if the instruction was being single-stepped, and if so, @@ -1130,7 +1135,7 @@ DEFINE_INTERRUPT_HANDLER(single_step_exception) static void emulate_single_step(struct pt_regs *regs) { if (single_stepping(regs)) - single_step_exception(regs); + __single_step_exception(regs); } static inline int __parse_fpscr(unsigned long fpscr) From cbc06f051c524dcfe52ef0d1f30647828e226d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Sat, 7 Aug 2021 09:20:57 +0200 Subject: [PATCH 106/187] powerpc/xive: Do not skip CPU-less nodes when creating the IPIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On PowerVM, CPU-less nodes can be populated with hot-plugged CPUs at runtime. Today, the IPI is not created for such nodes, and hot-plugged CPUs use a bogus IPI, which leads to soft lockups. We can not directly allocate and request the IPI on demand because bringup_up() is called under the IRQ sparse lock. The alternative is to allocate the IPIs for all possible nodes at startup and to request the mapping on demand when the first CPU of a node is brought up. Fixes: 7dcc37b3eff9 ("powerpc/xive: Map one IPI interrupt per node") Cc: stable@vger.kernel.org # v5.13 Reported-by: Geetika Moolchandani Signed-off-by: Cédric Le Goater Tested-by: Srikar Dronamraju Tested-by: Laurent Vivier Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807072057.184698-1-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 35 +++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index dbdbbc2f1dc5..943fd30095af 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -67,6 +67,7 @@ static struct irq_domain *xive_irq_domain; static struct xive_ipi_desc { unsigned int irq; char name[16]; + atomic_t started; } *xive_ipis; /* @@ -1120,7 +1121,7 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = { .alloc = xive_ipi_irq_domain_alloc, }; -static int __init xive_request_ipi(void) +static int __init xive_init_ipis(void) { struct fwnode_handle *fwnode; struct irq_domain *ipi_domain; @@ -1144,10 +1145,6 @@ static int __init xive_request_ipi(void) struct xive_ipi_desc *xid = &xive_ipis[node]; struct xive_ipi_alloc_info info = { node }; - /* Skip nodes without CPUs */ - if (cpumask_empty(cpumask_of_node(node))) - continue; - /* * Map one IPI interrupt per node for all cpus of that node. * Since the HW interrupt number doesn't have any meaning, @@ -1159,11 +1156,6 @@ static int __init xive_request_ipi(void) xid->irq = ret; snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); - - ret = request_irq(xid->irq, xive_muxed_ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL); - - WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); } return ret; @@ -1178,6 +1170,22 @@ static int __init xive_request_ipi(void) return ret; } +static int __init xive_request_ipi(unsigned int cpu) +{ + struct xive_ipi_desc *xid = &xive_ipis[early_cpu_to_node(cpu)]; + int ret; + + if (atomic_inc_return(&xid->started) > 1) + return 0; + + ret = request_irq(xid->irq, xive_muxed_ipi_action, + IRQF_PERCPU | IRQF_NO_THREAD, + xid->name, NULL); + + WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); + return ret; +} + static int xive_setup_cpu_ipi(unsigned int cpu) { unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu); @@ -1192,6 +1200,9 @@ static int xive_setup_cpu_ipi(unsigned int cpu) if (xc->hw_ipi != XIVE_BAD_IRQ) return 0; + /* Register the IPI */ + xive_request_ipi(cpu); + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ if (xive_ops->get_ipi(cpu, xc)) return -EIO; @@ -1231,6 +1242,8 @@ static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) if (xc->hw_ipi == XIVE_BAD_IRQ) return; + /* TODO: clear IPI mapping */ + /* Mask the IPI */ xive_do_source_set_mask(&xc->ipi_data, true); @@ -1253,7 +1266,7 @@ void __init xive_smp_probe(void) smp_ops->cause_ipi = xive_cause_ipi; /* Register the IPI */ - xive_request_ipi(); + xive_init_ipis(); /* Allocate and setup IPI for the boot CPU */ xive_setup_cpu_ipi(smp_processor_id()); From 030d6dbf0c2e5fdf23ad29557f0c87a882993e26 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 22 Jul 2021 10:17:15 +0800 Subject: [PATCH 107/187] riscv: kexec: do not add '-mno-relax' flag if compiler doesn't support it The RISC-V special option '-mno-relax' which to disable linker relaxations is supported by GCC8+. For GCC7 and lower versions do not support this option. Fixes: fba8a8674f68 ("RISC-V: Add kexec support") Signed-off-by: Changbin Du Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index d3081e4d9600..3397ddac1a30 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -11,7 +11,7 @@ endif CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) ifdef CONFIG_KEXEC -AFLAGS_kexec_relocate.o := -mcmodel=medany -mno-relax +AFLAGS_kexec_relocate.o := -mcmodel=medany $(call cc-option,-mno-relax) endif extra-y += head.o From fdf3a7a1e0a67a52f631b055975c6ac7e0e49a65 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 26 Jul 2021 07:42:54 +0200 Subject: [PATCH 108/187] riscv: Fix comment regarding kernel mapping overlapping with IS_ERR_VALUE The current comment states that we check if the 64-bit kernel mapping overlaps with the last 4K of the address space that is reserved to error values in create_kernel_page_table, which is not the case since it is done in setup_vm. But anyway, remove the reference to any function and simply note that in 64-bit kernel, the check should be done as soon as the kernel mapping base address is known. Fixes: db6b84a368b4 ("riscv: Make sure the kernel mapping does not overlap with IS_ERR_VALUE") Signed-off-by: Alexandre Ghiti Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 88134cc288d9..7cb4f391d106 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -197,7 +197,7 @@ static void __init setup_bootmem(void) * if end of dram is equal to maximum addressable memory. For 64-bit * kernel, this problem can't happen here as the end of the virtual * address space is occupied by the kernel mapping then this check must - * be done in create_kernel_page_table. + * be done as soon as the kernel mapping base address is determined. */ max_mapped_addr = __pa(~(ulong)0); if (max_mapped_addr == (phys_ram_end - 1)) From 839ad22f755132838f406751439363c07272ad87 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 30 Jul 2021 17:01:46 -0700 Subject: [PATCH 109/187] x86/tools: Fix objdump version check again Skip (omit) any version string info that is parenthesized. Warning: objdump version 15) is older than 2.19 Warning: Skipping posttest. where 'objdump -v' says: GNU objdump (GNU Binutils; SUSE Linux Enterprise 15) 2.35.1.20201123-7.18 Fixes: 8bee738bb1979 ("x86: Fix objdump version check in chkobjdump.awk for different formats.") Signed-off-by: Randy Dunlap Signed-off-by: Thomas Gleixner Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/r/20210731000146.2720-1-rdunlap@infradead.org --- arch/x86/tools/chkobjdump.awk | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk index fd1ab80be0de..a4cf678cf5c8 100644 --- a/arch/x86/tools/chkobjdump.awk +++ b/arch/x86/tools/chkobjdump.awk @@ -10,6 +10,7 @@ BEGIN { /^GNU objdump/ { verstr = "" + gsub(/\(.*\)/, ""); for (i = 3; i <= NF; i++) if (match($(i), "^[0-9]")) { verstr = $(i); From 7f45621c14a209b986cd636447bb53b7f6f881c3 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 12 Aug 2021 16:55:12 +0200 Subject: [PATCH 110/187] platform/x86: asus-nb-wmi: Allow configuring SW_TABLET_MODE method with a module option Unfortunately we have been unable to find a reliable way to detect if and how SW_TABLET_MODE reporting is supported, so we are relying on DMI quirks for this. Add a module-option to specify the SW_TABLET_MODE method so that this can be easily tested without needing to rebuild the kernel. BugLink: https://gitlab.freedesktop.org/libinput/libinput/-/issues/639 Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210812145513.39117-1-hdegoede@redhat.com --- drivers/platform/x86/asus-nb-wmi.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 0cb927f0f301..9929eedf7dd8 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -41,6 +41,10 @@ static int wapf = -1; module_param(wapf, uint, 0444); MODULE_PARM_DESC(wapf, "WAPF value"); +static int tablet_mode_sw = -1; +module_param(tablet_mode_sw, uint, 0444); +MODULE_PARM_DESC(tablet_mode_sw, "Tablet mode detect: -1:auto 0:disable 1:kbd-dock 2:lid-flip"); + static struct quirk_entry *quirks; static bool asus_q500a_i8042_filter(unsigned char data, unsigned char str, @@ -477,6 +481,21 @@ static void asus_nb_wmi_quirks(struct asus_wmi_driver *driver) else wapf = quirks->wapf; + switch (tablet_mode_sw) { + case 0: + quirks->use_kbd_dock_devid = false; + quirks->use_lid_flip_devid = false; + break; + case 1: + quirks->use_kbd_dock_devid = true; + quirks->use_lid_flip_devid = false; + break; + case 2: + quirks->use_kbd_dock_devid = false; + quirks->use_lid_flip_devid = true; + break; + } + if (quirks->i8042_filter) { ret = i8042_install_filter(quirks->i8042_filter); if (ret) { From 73fcbad691110ece47a487c9e584822070e3626f Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 12 Aug 2021 16:55:13 +0200 Subject: [PATCH 111/187] platform/x86: asus-nb-wmi: Add tablet_mode_sw=lid-flip quirk for the TP200s The Asus TP200s / E205SA 360 degree hinges 2-in-1 supports reporting SW_TABLET_MODE info through the ASUS_WMI_DEVID_LID_FLIP WMI device-id. Add a quirk to enable this. BugLink: https://gitlab.freedesktop.org/libinput/libinput/-/issues/639 Signed-off-by: Hans de Goede Link: https://lore.kernel.org/r/20210812145513.39117-2-hdegoede@redhat.com --- drivers/platform/x86/asus-nb-wmi.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c index 9929eedf7dd8..a81dc4b191b7 100644 --- a/drivers/platform/x86/asus-nb-wmi.c +++ b/drivers/platform/x86/asus-nb-wmi.c @@ -462,6 +462,15 @@ static const struct dmi_system_id asus_quirks[] = { }, .driver_data = &quirk_asus_use_lid_flip_devid, }, + { + .callback = dmi_matched, + .ident = "ASUS TP200s / E205SA", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), + DMI_MATCH(DMI_PRODUCT_NAME, "E205SA"), + }, + .driver_data = &quirk_asus_use_lid_flip_devid, + }, {}, }; From 88ca2521bd5b4e8b83743c01a2d4cb09325b51e9 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Thu, 12 Aug 2021 13:09:27 +0000 Subject: [PATCH 112/187] xen/events: Fix race in set_evtchn_to_irq There is a TOCTOU issue in set_evtchn_to_irq. Rows in the evtchn_to_irq mapping are lazily allocated in this function. The check whether the row is already present and the row initialization is not synchronized. Two threads can at the same time allocate a new row for evtchn_to_irq and add the irq mapping to the their newly allocated row. One thread will overwrite what the other has set for evtchn_to_irq[row] and therefore the irq mapping is lost. This will trigger a BUG_ON later in bind_evtchn_to_cpu: INFO: pci 0000:1a:15.4: [1d0f:8061] type 00 class 0x010802 INFO: nvme 0000:1a:12.1: enabling device (0000 -> 0002) INFO: nvme nvme77: 1/0/0 default/read/poll queues CRIT: kernel BUG at drivers/xen/events/events_base.c:427! WARN: invalid opcode: 0000 [#1] SMP NOPTI WARN: Workqueue: nvme-reset-wq nvme_reset_work [nvme] WARN: RIP: e030:bind_evtchn_to_cpu+0xc2/0xd0 WARN: Call Trace: WARN: set_affinity_irq+0x121/0x150 WARN: irq_do_set_affinity+0x37/0xe0 WARN: irq_setup_affinity+0xf6/0x170 WARN: irq_startup+0x64/0xe0 WARN: __setup_irq+0x69e/0x740 WARN: ? request_threaded_irq+0xad/0x160 WARN: request_threaded_irq+0xf5/0x160 WARN: ? nvme_timeout+0x2f0/0x2f0 [nvme] WARN: pci_request_irq+0xa9/0xf0 WARN: ? pci_alloc_irq_vectors_affinity+0xbb/0x130 WARN: queue_request_irq+0x4c/0x70 [nvme] WARN: nvme_reset_work+0x82d/0x1550 [nvme] WARN: ? check_preempt_wakeup+0x14f/0x230 WARN: ? check_preempt_curr+0x29/0x80 WARN: ? nvme_irq_check+0x30/0x30 [nvme] WARN: process_one_work+0x18e/0x3c0 WARN: worker_thread+0x30/0x3a0 WARN: ? process_one_work+0x3c0/0x3c0 WARN: kthread+0x113/0x130 WARN: ? kthread_park+0x90/0x90 WARN: ret_from_fork+0x3a/0x50 This patch sets evtchn_to_irq rows via a cmpxchg operation so that they will be set only once. The row is now cleared before writing it to evtchn_to_irq in order to not create a race once the row is visible for other threads. While at it, do not require the page to be zeroed, because it will be overwritten with -1's in clear_evtchn_to_irq_row anyway. Signed-off-by: Maximilian Heyne Fixes: d0b075ffeede ("xen/events: Refactor evtchn_to_irq array to be dynamically allocated") Link: https://lore.kernel.org/r/20210812130930.127134-1-mheyne@amazon.de Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/events/events_base.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 154daddbdcb4..a78704ae3618 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -198,12 +198,12 @@ static void disable_dynirq(struct irq_data *data); static DEFINE_PER_CPU(unsigned int, irq_epoch); -static void clear_evtchn_to_irq_row(unsigned row) +static void clear_evtchn_to_irq_row(int *evtchn_row) { unsigned col; for (col = 0; col < EVTCHN_PER_ROW; col++) - WRITE_ONCE(evtchn_to_irq[row][col], -1); + WRITE_ONCE(evtchn_row[col], -1); } static void clear_evtchn_to_irq_all(void) @@ -213,7 +213,7 @@ static void clear_evtchn_to_irq_all(void) for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { if (evtchn_to_irq[row] == NULL) continue; - clear_evtchn_to_irq_row(row); + clear_evtchn_to_irq_row(evtchn_to_irq[row]); } } @@ -221,6 +221,7 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) { unsigned row; unsigned col; + int *evtchn_row; if (evtchn >= xen_evtchn_max_channels()) return -EINVAL; @@ -233,11 +234,18 @@ static int set_evtchn_to_irq(evtchn_port_t evtchn, unsigned int irq) if (irq == -1) return 0; - evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); - if (evtchn_to_irq[row] == NULL) + evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0); + if (evtchn_row == NULL) return -ENOMEM; - clear_evtchn_to_irq_row(row); + clear_evtchn_to_irq_row(evtchn_row); + + /* + * We've prepared an empty row for the mapping. If a different + * thread was faster inserting it, we can drop ours. + */ + if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL) + free_page((unsigned long) evtchn_row); } WRITE_ONCE(evtchn_to_irq[row][col], irq); From 41535701da3324b80029cabb501e86c4fafe339d Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Thu, 29 Jul 2021 07:45:29 +0000 Subject: [PATCH 113/187] cifs: Handle race conditions during rename When rename is executed on directory which has files for which close is deferred, then rename will fail with EACCES. This patch will try to close all deferred files when EACCES is received and retry rename on a directory. Signed-off-by: Rohith Surabattula Cc: stable@vger.kernel.org # 5.13 Reviewed-by: Shyam Prasad N Signed-off-by: Steve French --- fs/cifs/inode.c | 19 +++++++++++++++++-- fs/cifs/misc.c | 16 +++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b96b253e7635..65f8a70cece3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1625,7 +1625,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } - cifs_close_all_deferred_files(tcon); + cifs_close_deferred_file(CIFS_I(inode)); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -2084,6 +2084,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, FILE_UNIX_BASIC_INFO *info_buf_target; unsigned int xid; int rc, tmprc; + int retry_count = 0; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -2113,10 +2114,24 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_all_deferred_files(tcon); + cifs_close_deferred_file(CIFS_I(d_inode(source_dentry))); + if (d_inode(target_dentry) != NULL) + cifs_close_deferred_file(CIFS_I(d_inode(target_dentry))); + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + if (rc == -EACCES) { + while (retry_count < 3) { + cifs_close_all_deferred_files(tcon); + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, + to_name); + if (rc != -EACCES) + break; + retry_count++; + } + } + /* * No-replace is the natural behavior for CIFS, so skip unlink hacks. */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 844abeb2b48f..cdb1ec1461de 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -723,13 +723,19 @@ void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *cfile = NULL; - struct cifs_deferred_close *dclose; + + if (cifs_inode == NULL) + return; list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { - spin_lock(&cifs_inode->deferred_lock); - if (cifs_is_deferred_close(cfile, &dclose)) - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); - spin_unlock(&cifs_inode->deferred_lock); + if (delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } } } From 9e992755be8f2d458a0bcbefd19e493483c1dba2 Mon Sep 17 00:00:00 2001 From: Rohith Surabattula Date: Mon, 9 Aug 2021 09:32:46 +0000 Subject: [PATCH 114/187] cifs: Call close synchronously during unlink/rename/lease break. During unlink/rename/lease break, deferred work for close is scheduled immediately but in an asynchronous manner which might lead to race with actual(unlink/rename) commands. This change will schedule close synchronously which will avoid the race conditions with other commands. Signed-off-by: Rohith Surabattula Reviewed-by: Shyam Prasad N Cc: stable@vger.kernel.org # 5.13 Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 5 +++++ fs/cifs/file.c | 35 +++++++++++++++++------------------ fs/cifs/misc.c | 46 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 56 insertions(+), 30 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0bfc2f01030..c6a9542ca281 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1611,6 +1611,11 @@ struct dfs_info3_param { int ttl; }; +struct file_list { + struct list_head list; + struct cifsFileInfo *cfile; +}; + /* * common struct for holding inode info when searching for or updating an * inode with new info diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a72840a88f1..bb98fbdd22a9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4847,6 +4847,22 @@ void cifs_oplock_break(struct work_struct *work) cifs_dbg(VFS, "Push locks rc = %d\n", rc); oplock_break_ack: + /* + * When oplock break is received and there are no active + * file handles but cached, then schedule deferred close immediately. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + spin_unlock(&CIFS_I(inode)->deferred_lock); + if (is_deferred && + cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + if (cancel_delayed_work(&cfile->deferred)) { + _cifsFileInfo_put(cfile, false, false); + goto oplock_break_done; + } + } /* * releasing stale oplock after recent reconnect of smb session using * a now incorrect file handle is not a data integrity issue but do @@ -4858,24 +4874,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } - /* - * When oplock break is received and there are no active - * file handles but cached, then schedule deferred close immediately. - * So, new open will not use cached handle. - */ - spin_lock(&CIFS_I(inode)->deferred_lock); - is_deferred = cifs_is_deferred_close(cfile, &dclose); - if (is_deferred && - cfile->deferred_close_scheduled && - delayed_work_pending(&cfile->deferred)) { - /* - * If there is no pending work, mod_delayed_work queues new work. - * So, Increase the ref count to avoid use-after-free. - */ - if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) - cifsFileInfo_get(cfile); - } - spin_unlock(&CIFS_I(inode)->deferred_lock); +oplock_break_done: _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index cdb1ec1461de..9469f1cf0b46 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -723,20 +723,32 @@ void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *cfile = NULL; + struct file_list *tmp_list, *tmp_next_list; + struct list_head file_head; if (cifs_inode == NULL) return; + INIT_LIST_HEAD(&file_head); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { if (delayed_work_pending(&cfile->deferred)) { - /* - * If there is no pending work, mod_delayed_work queues new work. - * So, Increase the ref count to avoid use-after-free. - */ - if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) - cifsFileInfo_get(cfile); + if (cancel_delayed_work(&cfile->deferred)) { + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + continue; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); + } } } + spin_unlock(&cifs_inode->open_file_lock); + + list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { + _cifsFileInfo_put(tmp_list->cfile, true, false); + list_del(&tmp_list->list); + kfree(tmp_list); + } } void @@ -744,20 +756,30 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; struct list_head *tmp; + struct file_list *tmp_list, *tmp_next_list; + struct list_head file_head; + INIT_LIST_HEAD(&file_head); spin_lock(&tcon->open_file_lock); list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); if (delayed_work_pending(&cfile->deferred)) { - /* - * If there is no pending work, mod_delayed_work queues new work. - * So, Increase the ref count to avoid use-after-free. - */ - if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) - cifsFileInfo_get(cfile); + if (cancel_delayed_work(&cfile->deferred)) { + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + continue; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); + } } } spin_unlock(&tcon->open_file_lock); + + list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { + _cifsFileInfo_put(tmp_list->cfile, true, false); + list_del(&tmp_list->list); + kfree(tmp_list); + } } /* parses DFS refferal V3 structure From d03721a6e7e8c04261873b3840daa3ce2c5b0543 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:53 +0200 Subject: [PATCH 115/187] trace/osnoise: Add a header with PREEMPT_RT additional fields Some extra flags are printed to the trace header when using the PREEMPT_RT config. The extra flags are: need-resched-lazy, preempt-lazy-depth, and migrate-disable. Without printing these fields, the osnoise specific fields are shifted by three positions, for example: # tracer: osnoise # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth MAX # || / SINGLE Interference counters: # |||| RUNTIME NOISE %% OF CPU NOISE +-----------------------------+ # TASK-PID CPU# |||| TIMESTAMP IN US IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD # | | | |||| | | | | | | | | | | <...>-741 [000] ....... 1105.690909: 1000000 234 99.97660 36 21 0 1001 22 3 <...>-742 [001] ....... 1105.691923: 1000000 281 99.97190 197 7 0 1012 35 14 <...>-743 [002] ....... 1105.691958: 1000000 1324 99.86760 118 11 0 1016 155 143 <...>-744 [003] ....... 1105.691998: 1000000 109 99.98910 21 4 0 1004 33 7 <...>-745 [004] ....... 1105.692015: 1000000 2023 99.79770 97 37 0 1023 52 18 Add a new header for osnoise with the missing fields, to be used when the PREEMPT_RT is enabled. Link: https://lkml.kernel.org/r/1f03289d2a51fde5a58c2e7def063dc630820ad1.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index a7e3c24dee13..03ef720b491d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -253,10 +253,40 @@ static struct osnoise_data { */ static bool osnoise_busy; +#ifdef CONFIG_PREEMPT_RT /* * Print the osnoise header info. */ static void print_osnoise_headers(struct seq_file *s) +{ + if (osnoise_data.tainted) + seq_puts(s, "# osnoise is tainted!\n"); + + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + + seq_puts(s, "# |||||| / "); + seq_puts(s, " MAX\n"); + + seq_puts(s, "# ||||| / "); + seq_puts(s, " SINGLE Interference counters:\n"); + + seq_puts(s, "# ||||||| RUNTIME "); + seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); + + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US "); + seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); + + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | | | | | | | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ +static void print_osnoise_headers(struct seq_file *s) { if (osnoise_data.tainted) seq_puts(s, "# osnoise is tainted!\n"); @@ -279,6 +309,7 @@ static void print_osnoise_headers(struct seq_file *s) seq_puts(s, "# | | | |||| | | "); seq_puts(s, " | | | | | | | |\n"); } +#endif /* CONFIG_PREEMPT_RT */ /* * osnoise_taint - report an osnoise error. From e1c4ad4a7f58417a6c483432b69c640670b6fe3d Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:54 +0200 Subject: [PATCH 116/187] trace/timerlat: Add a header with PREEMPT_RT additional fields Some extra flags are printed to the trace header when using the PREEMPT_RT config. The extra flags are: need-resched-lazy, preempt-lazy-depth, and migrate-disable. Without printing these fields, the timerlat specific fields are shifted by three positions, for example: # tracer: timerlat # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # || / # |||| ACTIVATION # TASK-PID CPU# |||| TIMESTAMP ID CONTEXT LATENCY # | | | |||| | | | | -0 [000] d..h... 3279.798871: #1 context irq timer_latency 830 ns <...>-807 [000] ....... 3279.798881: #1 context thread timer_latency 11301 ns Add a new header for timerlat with the missing fields, to be used when the PREEMPT_RT is enabled. Link: https://lkml.kernel.org/r/babb83529a3211bd0805be0b8c21608230202c55.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 03ef720b491d..518a5c190b2b 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -354,6 +354,24 @@ static void trace_osnoise_sample(struct osnoise_sample *sample) /* * Print the timerlat header info. */ +#ifdef CONFIG_PREEMPT_RT +static void print_timerlat_headers(struct seq_file *s) +{ + seq_puts(s, "# _-------=> irqs-off\n"); + seq_puts(s, "# / _------=> need-resched\n"); + seq_puts(s, "# | / _-----=> need-resched-lazy\n"); + seq_puts(s, "# || / _----=> hardirq/softirq\n"); + seq_puts(s, "# ||| / _---=> preempt-depth\n"); + seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n"); + seq_puts(s, "# ||||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||||| /\n"); + seq_puts(s, "# ||||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||||| | | "); + seq_puts(s, " | |\n"); +} +#else /* CONFIG_PREEMPT_RT */ static void print_timerlat_headers(struct seq_file *s) { seq_puts(s, "# _-----=> irqs-off\n"); @@ -367,6 +385,7 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, "# | | | |||| | | "); seq_puts(s, " | |\n"); } +#endif /* CONFIG_PREEMPT_RT */ /* * Record an timerlat_sample into the tracer buffer. From 0e05ba498dd0a19fc12868a9506be0f86cf36912 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Sun, 18 Jul 2021 11:07:55 +0200 Subject: [PATCH 117/187] trace/osnoise: Print a stop tracing message When using osnoise/timerlat with stop tracing, sometimes it is not clear in which CPU the stop condition was hit, mainly when using some extra events. Print a message informing in which CPU the trace stopped, like in the example below: -0 [006] d.h. 2932.676616: #1672599 context irq timer_latency 34689 ns -0 [006] dNh. 2932.676618: irq_noise: local_timer:236 start 2932.676615639 duration 2391 ns -0 [006] dNh. 2932.676620: irq_noise: virtio0-output.0:47 start 2932.676620180 duration 86 ns -0 [003] d.h. 2932.676621: #1673374 context irq timer_latency 1200 ns -0 [006] d... 2932.676623: thread_noise: swapper/6:0 start 2932.676615964 duration 4339 ns -0 [003] dNh. 2932.676623: irq_noise: local_timer:236 start 2932.676620597 duration 1881 ns -0 [006] d... 2932.676623: sched_switch: prev_comm=swapper/6 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/6 next_pid=852 next_prio=4 timerlat/6-852 [006] .... 2932.676623: #1672599 context thread timer_latency 41931 ns -0 [003] d... 2932.676623: thread_noise: swapper/3:0 start 2932.676620854 duration 880 ns -0 [003] d... 2932.676624: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=timerlat/3 next_pid=849 next_prio=4 timerlat/6-852 [006] .... 2932.676624: timerlat_main: stop tracing hit on cpu 6 timerlat/3-849 [003] .... 2932.676624: #1673374 context thread timer_latency 4310 ns Link: https://lkml.kernel.org/r/b30a0d7542adba019185f44ee648e60e14923b11.1626598844.git.bristot@kernel.org Cc: Tom Zanussi Cc: Namhyung Kim Cc: Masami Hiramatsu Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 518a5c190b2b..b61eefe5ccf5 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1075,9 +1075,13 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample * /* * osnoise_stop_tracing - Stop tracing and the tracer. */ -static void osnoise_stop_tracing(void) +static __always_inline void osnoise_stop_tracing(void) { struct trace_array *tr = osnoise_trace; + + trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, + "stop tracing hit on cpu %d\n", smp_processor_id()); + tracer_tracing_off(tr); } From 12f9951d3f311acb1d4ffe8e839bc2c07983546f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 6 Aug 2021 21:50:27 +0200 Subject: [PATCH 118/187] tracing: define needed config DYNAMIC_FTRACE_WITH_ARGS Commit 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available") intends to enable config LIVEPATCH when ftrace with ARGS is available. However, the chain of configs to enable LIVEPATCH is incomplete, as HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, but the definition of DYNAMIC_FTRACE_WITH_ARGS, combining DYNAMIC_FTRACE and HAVE_DYNAMIC_FTRACE_WITH_ARGS, needed to enable LIVEPATCH, is missing in the commit. Fortunately, ./scripts/checkkconfigsymbols.py detects this and warns: DYNAMIC_FTRACE_WITH_ARGS Referencing files: kernel/livepatch/Kconfig So, define the config DYNAMIC_FTRACE_WITH_ARGS analogously to the already existing similar configs, DYNAMIC_FTRACE_WITH_REGS and DYNAMIC_FTRACE_WITH_DIRECT_CALLS, in ./kernel/trace/Kconfig to connect the chain of configs. Link: https://lore.kernel.org/kernel-janitors/CAKXUXMwT2zS9fgyQHKUUiqo8ynZBdx2UEUu1WnV_q0OCmknqhw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20210806195027.16808-1-lukas.bulwahn@gmail.com Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Peter Zijlstra Cc: Miroslav Benes Cc: stable@vger.kernel.org Fixes: 2860cd8a2353 ("livepatch: Use the default ftrace_ops instead of REGS when ARGS is available") Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d567b1717c4c..3ee23f4d437f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -219,6 +219,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +config DYNAMIC_FTRACE_WITH_ARGS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER From d0ac5fbaf783d59715b8bf426fdffc8c9e84187a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 5 Aug 2021 11:10:51 +0900 Subject: [PATCH 119/187] init: Suppress wrong warning for bootconfig cmdline parameter Since the 'bootconfig' command line parameter is handled before parsing the command line, it doesn't use early_param(). But in this case, kernel shows a wrong warning message about it. [ 0.013714] Kernel command line: ro console=ttyS0 bootconfig console=tty0 [ 0.013741] Unknown command line parameters: bootconfig To suppress this message, add a dummy handler for 'bootconfig'. Link: https://lkml.kernel.org/r/162812945097.77369.1849780946468010448.stgit@devnote2 Fixes: 86d1919a4fb0 ("init: print out unknown kernel parameters") Reviewed-by: Andrew Halaney Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- init/main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/init/main.c b/init/main.c index f5b8246e8aa1..8d97aba78c3a 100644 --- a/init/main.c +++ b/init/main.c @@ -397,6 +397,12 @@ static int __init bootconfig_params(char *param, char *val, return 0; } +static int __init warn_bootconfig(char *str) +{ + /* The 'bootconfig' has been handled by bootconfig_params(). */ + return 0; +} + static void __init setup_boot_config(void) { static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; @@ -475,9 +481,8 @@ static int __init warn_bootconfig(char *str) pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; } -early_param("bootconfig", warn_bootconfig); - #endif +early_param("bootconfig", warn_bootconfig); /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) From 5acce0bff2a0420ce87d4591daeb867f47d552c2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 8 Aug 2021 00:30:11 -0400 Subject: [PATCH 120/187] tracing / histogram: Fix NULL pointer dereference on strcmp() on NULL event name The following commands: # echo 'read_max u64 size;' > synthetic_events # echo 'hist:keys=common_pid:count=count:onmax($count).trace(read_max,count)' > events/syscalls/sys_enter_read/trigger Causes: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 4 PID: 1763 Comm: bash Not tainted 5.14.0-rc2-test+ #155 Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 RIP: 0010:strcmp+0xc/0x20 Code: 75 f7 31 c0 0f b6 0c 06 88 0c 02 48 83 c0 01 84 c9 75 f1 4c 89 c0 c3 0f 1f 80 00 00 00 00 31 c0 eb 08 48 83 c0 01 84 d2 74 0f <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 31 c0 c3 66 90 48 89 RSP: 0018:ffffb5fdc0963ca8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffffb3a4e040 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9714c0d0b640 RDI: 0000000000000000 RBP: 0000000000000000 R08: 00000022986b7cde R09: ffffffffb3a4dff8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9714c50603c8 R13: 0000000000000000 R14: ffff97143fdf9e48 R15: ffff9714c01a2210 FS: 00007f1fa6785740(0000) GS:ffff9714da400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000002d863004 CR4: 00000000001706e0 Call Trace: __find_event_file+0x4e/0x80 action_create+0x6b7/0xeb0 ? kstrdup+0x44/0x60 event_hist_trigger_func+0x1a07/0x2130 trigger_process_regex+0xbd/0x110 event_trigger_write+0x71/0xd0 vfs_write+0xe9/0x310 ksys_write+0x68/0xe0 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f1fa6879e87 The problem was the "trace(read_max,count)" where the "count" should be "$count" as "onmax()" only handles variables (although it really should be able to figure out that "count" is a field of sys_enter_read). But there's a path that does not find the variable and ends up passing a NULL for the event, which ends up getting passed to "strcmp()". Add a check for NULL to return and error on the command with: # cat error_log hist:syscalls:sys_enter_read: error: Couldn't create or find variable Command: hist:keys=common_pid:count=count:onmax($count).trace(read_max,count) ^ Link: https://lkml.kernel.org/r/20210808003011.4037f8d0@oasis.local.home Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Fixes: 50450603ec9cb tracing: Add 'onmax' hist trigger action support Reviewed-by: Tom Zanussi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 949ef09dc537..a48aa2a2875b 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3430,6 +3430,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, event = data->match_data.event; } + if (!event) + goto free; /* * At this point, we're looking at a field on another * event. Because we can't modify a hist trigger on From 064855a69003c24bd6b473b367d364e418c57625 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 2 Aug 2021 14:38:58 -0500 Subject: [PATCH 121/187] x86/resctrl: Fix default monitoring groups reporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Creating a new sub monitoring group in the root /sys/fs/resctrl leads to getting the "Unavailable" value for mbm_total_bytes and mbm_local_bytes on the entire filesystem. Steps to reproduce: 1. mount -t resctrl resctrl /sys/fs/resctrl/ 2. cd /sys/fs/resctrl/ 3. cat mon_data/mon_L3_00/mbm_total_bytes 23189832 4. Create sub monitor group: mkdir mon_groups/test1 5. cat mon_data/mon_L3_00/mbm_total_bytes Unavailable When a new monitoring group is created, a new RMID is assigned to the new group. But the RMID is not active yet. When the events are read on the new RMID, it is expected to report the status as "Unavailable". When the user reads the events on the default monitoring group with multiple subgroups, the events on all subgroups are consolidated together. Currently, if any of the RMID reads report as "Unavailable", then everything will be reported as "Unavailable". Fix the issue by discarding the "Unavailable" reads and reporting all the successful RMID reads. This is not a problem on Intel systems as Intel reports 0 on Inactive RMIDs. Fixes: d89b7379015f ("x86/intel_rdt/cqm: Add mon_data") Reported-by: Paweł Szulik Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Acked-by: Reinette Chatre Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=213311 Link: https://lkml.kernel.org/r/162793309296.9224.15871659871696482080.stgit@bmoger-ubuntu --- arch/x86/kernel/cpu/resctrl/monitor.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index f07c10b87a87..57e4bb695ff9 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -285,15 +285,14 @@ static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) return chunks >>= shift; } -static int __mon_event_count(u32 rmid, struct rmid_read *rr) +static u64 __mon_event_count(u32 rmid, struct rmid_read *rr) { struct mbm_state *m; u64 chunks, tval; tval = __rmid_read(rmid, rr->evtid); if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { - rr->val = tval; - return -EINVAL; + return tval; } switch (rr->evtid) { case QOS_L3_OCCUP_EVENT_ID: @@ -305,12 +304,6 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) case QOS_L3_MBM_LOCAL_EVENT_ID: m = &rr->d->mbm_local[rmid]; break; - default: - /* - * Code would never reach here because - * an invalid event id would fail the __rmid_read. - */ - return -EINVAL; } if (rr->first) { @@ -361,23 +354,29 @@ void mon_event_count(void *info) struct rdtgroup *rdtgrp, *entry; struct rmid_read *rr = info; struct list_head *head; + u64 ret_val; rdtgrp = rr->rgrp; - if (__mon_event_count(rdtgrp->mon.rmid, rr)) - return; + ret_val = __mon_event_count(rdtgrp->mon.rmid, rr); /* - * For Ctrl groups read data from child monitor groups. + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. */ head = &rdtgrp->mon.crdtgrp_list; if (rdtgrp->type == RDTCTRL_GROUP) { list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->mon.rmid, rr)) - return; + if (__mon_event_count(entry->mon.rmid, rr) == 0) + ret_val = 0; } } + + /* Report error if none of rmid_reads are successful */ + if (ret_val) + rr->val = ret_val; } /* From 1383279c6494c6b62d1d6939f34906a4d2ef721c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 5 Aug 2021 11:38:04 -0700 Subject: [PATCH 122/187] KVM: x86: Allow guest to set EFER.NX=1 on non-PAE 32-bit kernels Remove an ancient restriction that disallowed exposing EFER.NX to the guest if EFER.NX=0 on the host, even if NX is fully supported by the CPU. The motivation of the check, added by commit 2cc51560aed0 ("KVM: VMX: Avoid saving and restoring msr_efer on lightweight vmexit"), was to rule out the case of host.EFER.NX=0 and guest.EFER.NX=1 so that KVM could run the guest with the host's EFER.NX and thus avoid context switching EFER if the only divergence was the NX bit. Fast forward to today, and KVM has long since stopped running the guest with the host's EFER.NX. Not only does KVM context switch EFER if host.EFER.NX=1 && guest.EFER.NX=0, KVM also forces host.EFER.NX=0 && guest.EFER.NX=1 when using shadow paging (to emulate SMEP). Furthermore, the entire motivation for the restriction was made obsolete over a decade ago when Intel added dedicated host and guest EFER fields in the VMCS (Nehalem timeframe), which reduced the overhead of context switching EFER from 400+ cycles (2 * WRMSR + 1 * RDMSR) to a mere ~2 cycles. In practice, the removed restriction only affects non-PAE 32-bit kernels, as EFER.NX is set during boot if NX is supported and the kernel will use PAE paging (32-bit or 64-bit), regardless of whether or not the kernel will actually use NX itself (mark PTEs non-executable). Alternatively and/or complementarily, startup_32_smp() in head_32.S could be modified to set EFER.NX=1 regardless of paging mode, thus eliminating the scenario where NX is supported but not enabled. However, that runs the risk of breaking non-KVM non-PAE kernels (though the risk is very, very low as there are no known EFER.NX errata), and also eliminates an easy-to-use mechanism for stressing KVM's handling of guest vs. host EFER across nested virtualization transitions. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20210805183804.1221554-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 739be5da3bca..fe03bd978761 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -208,30 +208,6 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_after_set_cpuid(vcpu); } -static int is_efer_nx(void) -{ - return host_efer & EFER_NX; -} - -static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) -{ - int i; - struct kvm_cpuid_entry2 *e, *entry; - - entry = NULL; - for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - e = &vcpu->arch.cpuid_entries[i]; - if (e->function == 0x80000001) { - entry = e; - break; - } - } - if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { - cpuid_entry_clear(entry, X86_FEATURE_NX); - printk(KERN_INFO "kvm: guest NX capability removed\n"); - } -} - int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -302,7 +278,6 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries = e2; vcpu->arch.cpuid_nent = cpuid->nent; - cpuid_fix_nx_cap(vcpu); kvm_update_cpuid_runtime(vcpu); kvm_vcpu_after_set_cpuid(vcpu); @@ -401,7 +376,6 @@ static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) void kvm_set_cpu_caps(void) { - unsigned int f_nx = is_efer_nx() ? F(NX) : 0; #ifdef CONFIG_X86_64 unsigned int f_gbpages = F(GBPAGES); unsigned int f_lm = F(LM); @@ -515,7 +489,7 @@ void kvm_set_cpu_caps(void) F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(NX) | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) ); From ffbe17cadaf564b5da0e4eabdcff1b719e184a76 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 9 Aug 2021 07:00:58 -0400 Subject: [PATCH 123/187] KVM: x86: remove dead initialization hv_vcpu is initialized again a dozen lines below, and at this point vcpu->arch.hyperv is not valid. Remove the initializer. Reported-by: kernel test robot Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 0b38f944c6b6..41d2a53c5dea 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1933,7 +1933,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *entry; - struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); + struct kvm_vcpu_hv *hv_vcpu; entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE, 0); if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) { From 85aa8889b82e0eec680a21ea28dbf57c6acfe182 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Fri, 6 Aug 2021 15:22:29 -0700 Subject: [PATCH 124/187] kvm: vmx: Sync all matching EPTPs when injecting nested EPT fault When a nested EPT violation/misconfig is injected into the guest, the shadow EPT PTEs associated with that address need to be synced. This is done by kvm_inject_emulated_page_fault() before it calls nested_ept_inject_page_fault(). However, that will only sync the shadow EPT PTE associated with the current L1 EPTP. Since the ASID is based on EP4TA rather than the full EPTP, so syncing the current EPTP is not enough. The SPTEs associated with any other L1 EPTPs in the prev_roots cache with the same EP4TA also need to be synced. Signed-off-by: Junaid Shahid Message-Id: <20210806222229.1645356-1-junaids@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 53 ++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1a52134b0c42..81160ebe6acb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -330,6 +330,31 @@ void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu) vcpu_put(vcpu); } +#define EPTP_PA_MASK GENMASK_ULL(51, 12) + +static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) +{ + return VALID_PAGE(root_hpa) && + ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); +} + +static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp, + gpa_t addr) +{ + uint i; + struct kvm_mmu_root_info *cached_root; + + WARN_ON_ONCE(!mmu_is_nested(vcpu)); + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { + cached_root = &vcpu->arch.mmu->prev_roots[i]; + + if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd, + eptp)) + vcpu->arch.mmu->invlpg(vcpu, addr, cached_root->hpa); + } +} + static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -342,10 +367,22 @@ static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, vm_exit_reason = EXIT_REASON_PML_FULL; vmx->nested.pml_full = false; exit_qualification &= INTR_INFO_UNBLOCK_NMI; - } else if (fault->error_code & PFERR_RSVD_MASK) - vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; - else - vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + } else { + if (fault->error_code & PFERR_RSVD_MASK) + vm_exit_reason = EXIT_REASON_EPT_MISCONFIG; + else + vm_exit_reason = EXIT_REASON_EPT_VIOLATION; + + /* + * Although the caller (kvm_inject_emulated_page_fault) would + * have already synced the faulting address in the shadow EPT + * tables for the current EPTP12, we also need to sync it for + * any other cached EPTP02s based on the same EP4TA, since the + * TLB associates mappings to the EP4TA rather than the full EPTP. + */ + nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer, + fault->address); + } nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification); vmcs12->guest_physical_address = fault->address; @@ -5325,14 +5362,6 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return nested_vmx_succeed(vcpu); } -#define EPTP_PA_MASK GENMASK_ULL(51, 12) - -static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp) -{ - return VALID_PAGE(root_hpa) && - ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK)); -} - /* Emulate the INVEPT instruction */ static int handle_invept(struct kvm_vcpu *vcpu) { From 18712c13709d2de9516c5d3414f707c4f0a9c190 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Aug 2021 21:56:15 -0700 Subject: [PATCH 125/187] KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF Use vmx_need_pf_intercept() when determining if L0 wants to handle a #PF in L2 or if the VM-Exit should be forwarded to L1. The current logic fails to account for the case where #PF is intercepted to handle guest.MAXPHYADDR < host.MAXPHYADDR and ends up reflecting all #PFs into L1. At best, L1 will complain and inject the #PF back into L2. At worst, L1 will eat the unexpected fault and cause L2 to hang on infinite page faults. Note, while the bug was technically introduced by the commit that added support for the MAXPHYADDR madness, the shame is all on commit a0c134347baf ("KVM: VMX: introduce vmx_need_pf_intercept"). Fixes: 1dbf5d68af6f ("KVM: VMX: Add guest physical address check in EPT violation and misconfig") Cc: stable@vger.kernel.org Cc: Peter Shier Cc: Oliver Upton Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20210812045615.3167686-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 81160ebe6acb..b3f77d18eb5a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5855,7 +5855,8 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_flags || !enable_ept; + return vcpu->arch.apf.host_apf_flags || + vmx_need_pf_intercept(vcpu); else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) From 524a1e4e381fc5e7781008d5bd420fd1357c0113 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:14:13 -0700 Subject: [PATCH 126/187] KVM: x86/mmu: Don't leak non-leaf SPTEs when zapping all SPTEs Pass "all ones" as the end GFN to signal "zap all" for the TDP MMU and really zap all SPTEs in this case. As is, zap_gfn_range() skips non-leaf SPTEs whose range exceeds the range to be zapped. If shadow_phys_bits is not aligned to the range size of top-level SPTEs, e.g. 512gb with 4-level paging, the "zap all" flows will skip top-level SPTEs whose range extends beyond shadow_phys_bits and leak their SPs when the VM is destroyed. Use the current upper bound (based on host.MAXPHYADDR) to detect that the caller wants to zap all SPTEs, e.g. instead of using the max theoretical gfn, 1 << (52 - 12). The more precise upper bound allows the TDP iterator to terminate its walk earlier when running on hosts with MAXPHYADDR < 52. Add a WARN on kmv->arch.tdp_mmu_pages when the TDP MMU is destroyed to help future debuggers should KVM decide to leak SPTEs again. The bug is most easily reproduced by running (and unloading!) KVM in a VM whose host.MAXPHYADDR < 39, as the SPTE for gfn=0 will be skipped. ============================================================================= BUG kvm_mmu_page_header (Not tainted): Objects remaining in kvm_mmu_page_header on __kmem_cache_shutdown() ----------------------------------------------------------------------------- Slab 0x000000004d8f7af1 objects=22 used=2 fp=0x00000000624d29ac flags=0x4000000000000200(slab|zone=1) CPU: 0 PID: 1582 Comm: rmmod Not tainted 5.14.0-rc2+ #420 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 Call Trace: dump_stack_lvl+0x45/0x59 slab_err+0x95/0xc9 __kmem_cache_shutdown.cold+0x3c/0x158 kmem_cache_destroy+0x3d/0xf0 kvm_mmu_module_exit+0xa/0x30 [kvm] kvm_arch_exit+0x5d/0x90 [kvm] kvm_exit+0x78/0x90 [kvm] vmx_exit+0x1a/0x50 [kvm_intel] __x64_sys_delete_module+0x13f/0x220 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: faaf05b00aec ("kvm: x86/mmu: Support zapping SPTEs in the TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181414.3376143-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 0853370bd811..8783b9eb2b33 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -43,6 +43,7 @@ void kvm_mmu_uninit_tdp_mmu(struct kvm *kvm) if (!kvm->arch.tdp_mmu_enabled) return; + WARN_ON(!list_empty(&kvm->arch.tdp_mmu_pages)); WARN_ON(!list_empty(&kvm->arch.tdp_mmu_roots)); /* @@ -81,8 +82,6 @@ static void tdp_mmu_free_sp_rcu_callback(struct rcu_head *head) void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); - kvm_lockdep_assert_mmu_lock_held(kvm, shared); if (!refcount_dec_and_test(&root->tdp_mmu_root_count)) @@ -94,7 +93,7 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, list_del_rcu(&root->link); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); - zap_gfn_range(kvm, root, 0, max_gfn, false, false, shared); + zap_gfn_range(kvm, root, 0, -1ull, false, false, shared); call_rcu(&root->rcu_head, tdp_mmu_free_sp_rcu_callback); } @@ -724,8 +723,17 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, bool can_yield, bool flush, bool shared) { + gfn_t max_gfn_host = 1ULL << (shadow_phys_bits - PAGE_SHIFT); + bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will + * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, + * and so KVM will never install a SPTE for such addresses. + */ + end = min(end, max_gfn_host); + kvm_lockdep_assert_mmu_lock_held(kvm, shared); rcu_read_lock(); @@ -744,9 +752,10 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, /* * If this is a non-last-level SPTE that covers a larger range * than should be zapped, continue, and zap the mappings at a - * lower level. + * lower level, except when zapping all SPTEs. */ - if ((iter.gfn < start || + if (!zap_all && + (iter.gfn < start || iter.gfn + KVM_PAGES_PER_HPAGE(iter.level) > end) && !is_last_spte(iter.old_spte, iter.level)) continue; @@ -794,12 +803,11 @@ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, void kvm_tdp_mmu_zap_all(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); bool flush = false; int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, max_gfn, + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush, false); if (flush) @@ -838,7 +846,6 @@ static struct kvm_mmu_page *next_invalidated_root(struct kvm *kvm, */ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) { - gfn_t max_gfn = 1ULL << (shadow_phys_bits - PAGE_SHIFT); struct kvm_mmu_page *next_root; struct kvm_mmu_page *root; bool flush = false; @@ -854,8 +861,7 @@ void kvm_tdp_mmu_zap_invalidated_roots(struct kvm *kvm) rcu_read_unlock(); - flush = zap_gfn_range(kvm, root, 0, max_gfn, true, flush, - true); + flush = zap_gfn_range(kvm, root, 0, -1ull, true, flush, true); /* * Put the reference acquired in From 0103098fb4f13b447b26ed514bcd3140f6791047 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:14:14 -0700 Subject: [PATCH 127/187] KVM: x86/mmu: Don't step down in the TDP iterator when zapping all SPTEs Set the min_level for the TDP iterator at the root level when zapping all SPTEs to optimize the iterator's try_step_down(). Zapping a non-leaf SPTE will recursively zap all its children, thus there is no need for the iterator to attempt to step down. This avoids rereading the top-level SPTEs after they are zapped by causing try_step_down() to short-circuit. In most cases, optimizing try_step_down() will be in the noise as the cost of zapping SPTEs completely dominates the overall time. The optimization is however helpful if the zap occurs with relatively few SPTEs, e.g. if KVM is zapping in response to multiple memslot updates when userspace is adding and removing read-only memslots for option ROMs. In that case, the task doing the zapping likely isn't a vCPU thread, but it still holds mmu_lock for read and thus can be a noisy neighbor of sorts. Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181414.3376143-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 8783b9eb2b33..d80cb122b5f3 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -727,6 +727,12 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, bool zap_all = (start == 0 && end >= max_gfn_host); struct tdp_iter iter; + /* + * No need to try to step down in the iterator when zapping all SPTEs, + * zapping the top-level non-leaf SPTEs will recurse on their children. + */ + int min_level = zap_all ? root->role.level : PG_LEVEL_4K; + /* * Bound the walk at host.MAXPHYADDR, guest accesses beyond that will * hit a #PF(RSVD) and never get to an EPT Violation/Misconfig / #NPF, @@ -738,7 +744,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, rcu_read_lock(); - tdp_root_for_each_pte(iter, root, start, end) { + for_each_tdp_pte_min_level(iter, root->spt, root->role.level, + min_level, start, end) { retry: if (can_yield && tdp_mmu_iter_cond_resched(kvm, &iter, flush, shared)) { From ce25681d59ffc4303321e555a2d71b1946af07da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 Aug 2021 11:18:15 -0700 Subject: [PATCH 128/187] KVM: x86/mmu: Protect marking SPs unsync when using TDP MMU with spinlock Add yet another spinlock for the TDP MMU and take it when marking indirect shadow pages unsync. When using the TDP MMU and L1 is running L2(s) with nested TDP, KVM may encounter shadow pages for the TDP entries managed by L1 (controlling L2) when handling a TDP MMU page fault. The unsync logic is not thread safe, e.g. the kvm_mmu_page fields are not atomic, and misbehaves when a shadow page is marked unsync via a TDP MMU page fault, which runs with mmu_lock held for read, not write. Lack of a critical section manifests most visibly as an underflow of unsync_children in clear_unsync_child_bit() due to unsync_children being corrupted when multiple CPUs write it without a critical section and without atomic operations. But underflow is the best case scenario. The worst case scenario is that unsync_children prematurely hits '0' and leads to guest memory corruption due to KVM neglecting to properly sync shadow pages. Use an entirely new spinlock even though piggybacking tdp_mmu_pages_lock would functionally be ok. Usurping the lock could degrade performance when building upper level page tables on different vCPUs, especially since the unsync flow could hold the lock for a comparatively long time depending on the number of indirect shadow pages and the depth of the paging tree. For simplicity, take the lock for all MMUs, even though KVM could fairly easily know that mmu_lock is held for write. If mmu_lock is held for write, there cannot be contention for the inner spinlock, and marking shadow pages unsync across multiple vCPUs will be slow enough that bouncing the kvm_arch cacheline should be in the noise. Note, even though L2 could theoretically be given access to its own EPT entries, a nested MMU must hold mmu_lock for write and thus cannot race against a TDP MMU page fault. I.e. the additional spinlock only _needs_ to be taken by the TDP MMU, as opposed to being taken by any MMU for a VM that is running with the TDP MMU enabled. Holding mmu_lock for read also prevents the indirect shadow page from being freed. But as above, keep it simple and always take the lock. Alternative #1, the TDP MMU could simply pass "false" for can_unsync and effectively disable unsync behavior for nested TDP. Write protecting leaf shadow pages is unlikely to noticeably impact traditional L1 VMMs, as such VMMs typically don't modify TDP entries, but the same may not hold true for non-standard use cases and/or VMMs that are migrating physical pages (from L1's perspective). Alternative #2, the unsync logic could be made thread safe. In theory, simply converting all relevant kvm_mmu_page fields to atomics and using atomic bitops for the bitmap would suffice. However, (a) an in-depth audit would be required, (b) the code churn would be substantial, and (c) legacy shadow paging would incur additional atomic operations in performance sensitive paths for no benefit (to legacy shadow paging). Fixes: a2855afc7ee8 ("KVM: x86/mmu: Allow parallel page faults for the TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210812181815.3378104-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/locking.rst | 8 ++++---- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu/mmu.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 35eca377543d..88fa495abbac 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -25,10 +25,10 @@ On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock -- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock is - taken inside kvm->arch.mmu_lock, and cannot be taken without already - holding kvm->arch.mmu_lock (typically with ``read_lock``, otherwise - there's no need to take kvm->arch.tdp_mmu_pages_lock at all). +- kvm->arch.mmu_lock is an rwlock. kvm->arch.tdp_mmu_pages_lock and + kvm->arch.mmu_unsync_pages_lock are taken inside kvm->arch.mmu_lock, and + cannot be taken without already holding kvm->arch.mmu_lock (typically with + ``read_lock`` for the TDP MMU, thus the need for additional spinlocks). Everything else is a leaf: no other lock is taken inside the critical sections. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 974cbfb1eefe..af6ce8d4c86a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1038,6 +1038,13 @@ struct kvm_arch { struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; + /* + * Protects marking pages unsync during page faults, as TDP MMU page + * faults only take mmu_lock for read. For simplicity, the unsync + * pages lock is always taken when marking pages unsync regardless of + * whether mmu_lock is held for read or write. + */ + spinlock_t mmu_unsync_pages_lock; struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c4f4fa23320e..47b765270239 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2535,6 +2535,7 @@ static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *sp; + bool locked = false; /* * Force write-protection if the page is being tracked. Note, the page @@ -2557,9 +2558,34 @@ int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) if (sp->unsync) continue; + /* + * TDP MMU page faults require an additional spinlock as they + * run with mmu_lock held for read, not write, and the unsync + * logic is not thread safe. Take the spinklock regardless of + * the MMU type to avoid extra conditionals/parameters, there's + * no meaningful penalty if mmu_lock is held for write. + */ + if (!locked) { + locked = true; + spin_lock(&vcpu->kvm->arch.mmu_unsync_pages_lock); + + /* + * Recheck after taking the spinlock, a different vCPU + * may have since marked the page unsync. A false + * positive on the unprotected check above is not + * possible as clearing sp->unsync _must_ hold mmu_lock + * for write, i.e. unsync cannot transition from 0->1 + * while this CPU holds mmu_lock for read (or write). + */ + if (READ_ONCE(sp->unsync)) + continue; + } + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } + if (locked) + spin_unlock(&vcpu->kvm->arch.mmu_unsync_pages_lock); /* * We need to ensure that the marking of unsync pages is visible @@ -5537,6 +5563,8 @@ void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; + spin_lock_init(&kvm->arch.mmu_unsync_pages_lock); + if (!kvm_mmu_init_tdp_mmu(kvm)) /* * No smp_load/store wrappers needed here as we are in From 7a3dc4f35bf8e1a07e5c3f8ecc8ac923f48493fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Aug 2021 12:36:14 +0200 Subject: [PATCH 129/187] driver core: Add missing kernel doc for device::msi_lock Fixes: 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/device.h b/include/linux/device.h index e53aa5065f58..65d84b67b024 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -407,6 +407,7 @@ struct dev_links_info { * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. + * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. From 454bb6775202d94f0f489c4632efecdb62d3c904 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 31 Jul 2021 14:21:30 +0800 Subject: [PATCH 130/187] blk-mq: clear active_queues before clearing BLK_MQ_F_TAG_QUEUE_SHARED We run a test that delete and recover devcies frequently(two devices on the same host), and we found that 'active_queues' is super big after a period of time. If device a and device b share a tag set, and a is deleted, then blk_mq_exit_queue() will clear BLK_MQ_F_TAG_QUEUE_SHARED because there is only one queue that are using the tag set. However, if b is still active, the active_queues of b might never be cleared even if b is deleted. Thus clear active_queues before BLK_MQ_F_TAG_QUEUE_SHARED is cleared. Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210731062130.1533893-1-yukuai3@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 2c4ac51e54eb..2fe396385a4a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2994,10 +2994,12 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) int i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) + if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; - else + } else { + blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; + } } } From 8f40d0370795313b6f1b1782035919cfc76b159f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Aug 2021 08:57:07 -0600 Subject: [PATCH 131/187] tools/io_uring/io_uring-cp: sync with liburing example This example is missing a few fixes that are in the liburing version, synchronize with the upstream version. Reported-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- tools/io_uring/io_uring-cp.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c index 81461813ec62..d9bd6f5f8f46 100644 --- a/tools/io_uring/io_uring-cp.c +++ b/tools/io_uring/io_uring-cp.c @@ -131,8 +131,7 @@ static int copy_file(struct io_uring *ring, off_t insize) writes = reads = offset = 0; while (insize || write_left) { - unsigned long had_reads; - int got_comp; + int had_reads, got_comp; /* * Queue up as many reads as we can @@ -174,8 +173,13 @@ static int copy_file(struct io_uring *ring, off_t insize) if (!got_comp) { ret = io_uring_wait_cqe(ring, &cqe); got_comp = 1; - } else + } else { ret = io_uring_peek_cqe(ring, &cqe); + if (ret == -EAGAIN) { + cqe = NULL; + ret = 0; + } + } if (ret < 0) { fprintf(stderr, "io_uring_peek_cqe: %s\n", strerror(-ret)); @@ -194,7 +198,7 @@ static int copy_file(struct io_uring *ring, off_t insize) fprintf(stderr, "cqe failed: %s\n", strerror(-cqe->res)); return 1; - } else if ((size_t) cqe->res != data->iov.iov_len) { + } else if (cqe->res != data->iov.iov_len) { /* Short read/write, adjust and requeue */ data->iov.iov_base += cqe->res; data->iov.iov_len -= cqe->res; @@ -221,6 +225,25 @@ static int copy_file(struct io_uring *ring, off_t insize) } } + /* wait out pending writes */ + while (writes) { + struct io_data *data; + + ret = io_uring_wait_cqe(ring, &cqe); + if (ret) { + fprintf(stderr, "wait_cqe=%d\n", ret); + return 1; + } + if (cqe->res < 0) { + fprintf(stderr, "write res=%d\n", cqe->res); + return 1; + } + data = io_uring_cqe_get_data(cqe); + free(data); + writes--; + io_uring_cqe_seen(ring, cqe); + } + return 0; } From 45c709f8c71b525b51988e782febe84ce933e7e0 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 12 Aug 2021 17:18:10 +0200 Subject: [PATCH 132/187] bpf: Clear zext_dst of dead insns "access skb fields ok" verifier test fails on s390 with the "verifier bug. zext_dst is set, but no reg is defined" message. The first insns of the test prog are ... 0: 61 01 00 00 00 00 00 00 ldxw %r0,[%r1+0] 8: 35 00 00 01 00 00 00 00 jge %r0,0,1 10: 61 01 00 08 00 00 00 00 ldxw %r0,[%r1+8] ... and the 3rd one is dead (this does not look intentional to me, but this is a separate topic). sanitize_dead_code() converts dead insns into "ja -1", but keeps zext_dst. When opt_subreg_zext_lo32_rnd_hi32() tries to parse such an insn, it sees this discrepancy and bails. This problem can be seen only with JITs whose bpf_jit_needs_zext() returns true. Fix by clearning dead insns' zext_dst. The commits that contributed to this problem are: 1. 5aa5bd14c5f8 ("bpf: add initial suite for selftests"), which introduced the test with the dead code. 2. 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag"), which introduced the zext_dst flag. 3. 83a2881903f3 ("bpf: Account for BPF_FETCH in insn_has_def32()"), which introduced the sanity check. 4. 9183671af6db ("bpf: Fix leakage under speculation on mispredicted branches"), which bisect points to. It's best to fix this on stable branches that contain the second one, since that's the point where the inconsistency was introduced. Fixes: 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag") Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210812151811.184086-2-iii@linux.ibm.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f9bda5476ea5..381d3d6f24bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11663,6 +11663,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) if (aux_data[i].seen) continue; memcpy(insn + i, &trap, sizeof(trap)); + aux_data[i].zext_dst = false; } } From 3776f3517ed94d40ff0e3851d7ce2ce17b63099f Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 12 Aug 2021 17:18:11 +0200 Subject: [PATCH 133/187] selftests, bpf: Test that dead ldx_w insns are accepted Prevent regressions related to zero-extension metadata handling during dead code sanitization. Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210812151811.184086-3-iii@linux.ibm.com --- tools/testing/selftests/bpf/verifier/dead_code.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 2c8935b3e65d..ee454327e5c6 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -159,3 +159,15 @@ .result = ACCEPT, .retval = 2, }, +{ + "dead code: zero extension", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -4), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .retval = 0, +}, From cddce01160582a5f52ada3da9626c052d852ec42 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Fri, 13 Aug 2021 23:13:30 +0800 Subject: [PATCH 134/187] nbd: Aovid double completion of a request There is a race between iterating over requests in nbd_clear_que() and completing requests in recv_work(), which can lead to double completion of a request. To fix it, flush the recv worker before iterating over the requests and don't abort the completed request while iterating. Fixes: 96d97e17828f ("nbd: clear_sock on netlink disconnect") Reported-by: Jiang Yadong Signed-off-by: Xie Yongji Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20210813151330.96-1-xieyongji@bytedance.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c38317979f74..19f5d5a8b16a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -818,6 +818,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + mutex_lock(&cmd->lock); cmd->status = BLK_STS_IOERR; mutex_unlock(&cmd->lock); @@ -2004,15 +2008,19 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) { mutex_lock(&nbd->config_lock); nbd_disconnect(nbd); - nbd_clear_sock(nbd); - mutex_unlock(&nbd->config_lock); + sock_shutdown(nbd); /* * Make sure recv thread has finished, so it does not drop the last * config ref and try to destroy the workqueue from inside the work - * queue. + * queue. And this also ensure that we can safely call nbd_clear_que() + * to cancel the inflight I/Os. */ if (nbd->recv_workq) flush_workqueue(nbd->recv_workq); + nbd_clear_que(nbd); + nbd->task_setup = NULL; + mutex_unlock(&nbd->config_lock); + if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, &nbd->config->runtime_flags)) nbd_config_put(nbd); From 3c603136c9f82833813af77185618de5af67676c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Aug 2021 14:42:39 -0700 Subject: [PATCH 135/187] bnxt: don't lock the tx queue from napi poll We can't take the tx lock from the napi poll routine, because netpoll can poll napi at any moment, including with the tx lock already held. The tx lock is protecting against two paths - the disable path, and (as Michael points out) the NETDEV_TX_BUSY case which may occur if NAPI completions race with start_xmit and both decide to re-enable the queue. For the disable/ifdown path use synchronize_net() to make sure closing the device does not race we restarting the queues. Annotate accesses to dev_state against data races. For the NAPI cleanup vs start_xmit path - appropriate barriers are already in place in the main spot where Tx queue is stopped but we need to do the same careful dance in the TX_BUSY case. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reviewed-by: Michael Chan Reviewed-by: Edwin Peer Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 54 ++++++++++++++--------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2fe743503949..84aa25875050 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -365,6 +365,26 @@ static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) return md_dst->u.port_info.port_id; } +static bool bnxt_txr_netif_try_stop_queue(struct bnxt *bp, + struct bnxt_tx_ring_info *txr, + struct netdev_queue *txq) +{ + netif_tx_stop_queue(txq); + + /* netif_tx_stop_queue() must be done before checking + * tx index in bnxt_tx_avail() below, because in + * bnxt_tx_int(), we update tx index before checking for + * netif_tx_queue_stopped(). + */ + smp_mb(); + if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) { + netif_tx_wake_queue(txq); + return false; + } + + return true; +} + static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bnxt *bp = netdev_priv(dev); @@ -393,8 +413,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { - netif_tx_stop_queue(txq); - return NETDEV_TX_BUSY; + if (bnxt_txr_netif_try_stop_queue(bp, txr, txq)) + return NETDEV_TX_BUSY; } length = skb->len; @@ -626,16 +646,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) if (netdev_xmit_more() && !tx_buf->is_push) bnxt_db_write(bp, &txr->tx_db, prod); - netif_tx_stop_queue(txq); - - /* netif_tx_stop_queue() must be done before checking - * tx index in bnxt_tx_avail() below, because in - * bnxt_tx_int(), we update tx index before checking for - * netif_tx_queue_stopped(). - */ - smp_mb(); - if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) - netif_tx_wake_queue(txq); + bnxt_txr_netif_try_stop_queue(bp, txr, txq); } return NETDEV_TX_OK; @@ -732,14 +743,9 @@ static void bnxt_tx_int(struct bnxt *bp, struct bnxt_napi *bnapi, int nr_pkts) smp_mb(); if (unlikely(netif_tx_queue_stopped(txq)) && - (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh)) { - __netif_tx_lock(txq, smp_processor_id()); - if (netif_tx_queue_stopped(txq) && - bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && - txr->dev_state != BNXT_DEV_STATE_CLOSING) - netif_tx_wake_queue(txq); - __netif_tx_unlock(txq); - } + bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && + READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING) + netif_tx_wake_queue(txq); } static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, @@ -9165,9 +9171,11 @@ void bnxt_tx_disable(struct bnxt *bp) if (bp->tx_ring) { for (i = 0; i < bp->tx_nr_rings; i++) { txr = &bp->tx_ring[i]; - txr->dev_state = BNXT_DEV_STATE_CLOSING; + WRITE_ONCE(txr->dev_state, BNXT_DEV_STATE_CLOSING); } } + /* Make sure napi polls see @dev_state change */ + synchronize_net(); /* Drop carrier first to prevent TX timeout */ netif_carrier_off(bp->dev); /* Stop all TX queues */ @@ -9181,8 +9189,10 @@ void bnxt_tx_enable(struct bnxt *bp) for (i = 0; i < bp->tx_nr_rings; i++) { txr = &bp->tx_ring[i]; - txr->dev_state = 0; + WRITE_ONCE(txr->dev_state, 0); } + /* Make sure napi polls see @dev_state change */ + synchronize_net(); netif_tx_wake_all_queues(bp->dev); if (bp->link_info.link_up) netif_carrier_on(bp->dev); From 01cca6b9330ac7460de44eeeb3a0607f8aae69ff Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Aug 2021 14:42:40 -0700 Subject: [PATCH 136/187] bnxt: disable napi before canceling DIM napi schedules DIM, napi has to be disabled first, then DIM canceled. Noticed while reading the code. Fixes: 0bc0b97fca73 ("bnxt_en: cleanup DIM work on device shutdown") Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt moderation") Reviewed-by: Michael Chan Reviewed-by: Edwin Peer Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 84aa25875050..721b5df36311 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -9134,10 +9134,9 @@ static void bnxt_disable_napi(struct bnxt *bp) for (i = 0; i < bp->cp_nr_rings; i++) { struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + napi_disable(&bp->bnapi[i]->napi); if (bp->bnapi[i]->rx_ring) cancel_work_sync(&cpr->dim.work); - - napi_disable(&bp->bnapi[i]->napi); } } From e8d8c5d80f5e9d4586c68061b62c642752289095 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Aug 2021 14:42:41 -0700 Subject: [PATCH 137/187] bnxt: make sure xmit_more + errors does not miss doorbells skbs are freed on error and not put on the ring. We may, however, be in a situation where we're freeing the last skb of a batch, and there is a doorbell ring pending because of xmit_more() being true earlier. Make sure we ring the door bell in such situations. Since errors are rare don't pay attention to xmit_more() and just always flush the pending frames. The busy case should be safe to be left alone because it can only happen if start_xmit races with completions and they both enable the queue. In that case the kick can't be pending. Noticed while reading the code. Fixes: 4d172f21cefe ("bnxt_en: Implement xmit_more.") Reviewed-by: Michael Chan Reviewed-by: Edwin Peer Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 39 +++++++++++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 1 + 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 721b5df36311..389016ea65cf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -72,7 +72,8 @@ #include "bnxt_debugfs.h" #define BNXT_TX_TIMEOUT (5 * HZ) -#define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW) +#define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \ + NETIF_MSG_TX_ERR) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Broadcom BCM573xx network driver"); @@ -365,6 +366,13 @@ static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) return md_dst->u.port_info.port_id; } +static void bnxt_txr_db_kick(struct bnxt *bp, struct bnxt_tx_ring_info *txr, + u16 prod) +{ + bnxt_db_write(bp, &txr->tx_db, prod); + txr->kick_pending = 0; +} + static bool bnxt_txr_netif_try_stop_queue(struct bnxt *bp, struct bnxt_tx_ring_info *txr, struct netdev_queue *txq) @@ -413,6 +421,10 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) free_size = bnxt_tx_avail(bp, txr); if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { + /* We must have raced with NAPI cleanup */ + if (net_ratelimit() && txr->kick_pending) + netif_warn(bp, tx_err, dev, + "bnxt: ring busy w/ flush pending!\n"); if (bnxt_txr_netif_try_stop_queue(bp, txr, txq)) return NETDEV_TX_BUSY; } @@ -537,21 +549,16 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) normal_tx: if (length < BNXT_MIN_PKT_SIZE) { pad = BNXT_MIN_PKT_SIZE - length; - if (skb_pad(skb, pad)) { + if (skb_pad(skb, pad)) /* SKB already freed. */ - tx_buf->skb = NULL; - return NETDEV_TX_OK; - } + goto tx_kick_pending; length = BNXT_MIN_PKT_SIZE; } mapping = dma_map_single(&pdev->dev, skb->data, len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(&pdev->dev, mapping))) { - dev_kfree_skb_any(skb); - tx_buf->skb = NULL; - return NETDEV_TX_OK; - } + if (unlikely(dma_mapping_error(&pdev->dev, mapping))) + goto tx_free; dma_unmap_addr_set(tx_buf, mapping, mapping); flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | @@ -638,13 +645,15 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) txr->tx_prod = prod; if (!netdev_xmit_more() || netif_xmit_stopped(txq)) - bnxt_db_write(bp, &txr->tx_db, prod); + bnxt_txr_db_kick(bp, txr, prod); + else + txr->kick_pending = 1; tx_done: if (unlikely(bnxt_tx_avail(bp, txr) <= MAX_SKB_FRAGS + 1)) { if (netdev_xmit_more() && !tx_buf->is_push) - bnxt_db_write(bp, &txr->tx_db, prod); + bnxt_txr_db_kick(bp, txr, prod); bnxt_txr_netif_try_stop_queue(bp, txr, txq); } @@ -659,7 +668,6 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) /* start back at beginning and unmap skb */ prod = txr->tx_prod; tx_buf = &txr->tx_buf_ring[prod]; - tx_buf->skb = NULL; dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), skb_headlen(skb), PCI_DMA_TODEVICE); prod = NEXT_TX(prod); @@ -673,7 +681,12 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) PCI_DMA_TODEVICE); } +tx_free: dev_kfree_skb_any(skb); +tx_kick_pending: + if (txr->kick_pending) + bnxt_txr_db_kick(bp, txr, txr->tx_prod); + txr->tx_buf_ring[txr->tx_prod].skb = NULL; return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index bcf8d00b8c80..ba4e0fc38520 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -786,6 +786,7 @@ struct bnxt_tx_ring_info { u16 tx_prod; u16 tx_cons; u16 txq_index; + u8 kick_pending; struct bnxt_db_info tx_db; struct tx_bd *tx_desc_ring[MAX_TX_PAGES]; From fb9f7190092d2bbd1f8f0b1cc252732cbe99a87e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 12 Aug 2021 14:42:42 -0700 Subject: [PATCH 138/187] bnxt: count Tx drops Drivers should count packets they are dropping. Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") Reviewed-by: Michael Chan Reviewed-by: Edwin Peer Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 389016ea65cf..dd2d2a5fef15 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -412,6 +412,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) i = skb_get_queue_mapping(skb); if (unlikely(i >= bp->tx_nr_rings)) { dev_kfree_skb_any(skb); + atomic_long_inc(&dev->tx_dropped); return NETDEV_TX_OK; } @@ -687,6 +688,7 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) if (txr->kick_pending) bnxt_txr_db_kick(bp, txr, txr->tx_prod); txr->tx_buf_ring[txr->tx_prod].skb = NULL; + atomic_long_inc(&dev->tx_dropped); return NETDEV_TX_OK; } From 6c7a00b843370feaf7710cef2350367c7e61cd1a Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Fri, 13 Aug 2021 16:54:24 -0700 Subject: [PATCH 139/187] kasan, kmemleak: reset tags when scanning block Patch series "kasan, slub: reset tag when printing address", v3. With hardware tag-based kasan enabled, we reset the tag when we access metadata to avoid from false alarm. This patch (of 2): Kmemleak needs to scan kernel memory to check memory leak. With hardware tag-based kasan enabled, when it scans on the invalid slab and dereference, the issue will occur as below. Hardware tag-based KASAN doesn't use compiler instrumentation, we can not use kasan_disable_current() to ignore tag check. Based on the below report, there are 11 0xf7 granules, which amounts to 176 bytes, and the object is allocated from the kmalloc-256 cache. So when kmemleak accesses the last 256-176 bytes, it causes faults, as those are marked with KASAN_KMALLOC_REDZONE == KASAN_TAG_INVALID == 0xfe. Thus, we reset tags before accessing metadata to avoid from false positives. BUG: KASAN: out-of-bounds in scan_block+0x58/0x170 Read at addr f7ff0000c0074eb0 by task kmemleak/138 Pointer tag: [f7], memory tag: [fe] CPU: 7 PID: 138 Comm: kmemleak Not tainted 5.14.0-rc2-00001-g8cae8cd89f05-dirty #134 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1b0 show_stack+0x1c/0x30 dump_stack_lvl+0x68/0x84 print_address_description+0x7c/0x2b4 kasan_report+0x138/0x38c __do_kernel_fault+0x190/0x1c4 do_tag_check_fault+0x78/0x90 do_mem_abort+0x44/0xb4 el1_abort+0x40/0x60 el1h_64_sync_handler+0xb4/0xd0 el1h_64_sync+0x78/0x7c scan_block+0x58/0x170 scan_gray_list+0xdc/0x1a0 kmemleak_scan+0x2ac/0x560 kmemleak_scan_thread+0xb0/0xe0 kthread+0x154/0x160 ret_from_fork+0x10/0x18 Allocated by task 0: kasan_save_stack+0x2c/0x60 __kasan_kmalloc+0xec/0x104 __kmalloc+0x224/0x3c4 __register_sysctl_paths+0x200/0x290 register_sysctl_table+0x2c/0x40 sysctl_init+0x20/0x34 proc_sys_init+0x3c/0x48 proc_root_init+0x80/0x9c start_kernel+0x648/0x6a4 __primary_switched+0xc0/0xc8 Freed by task 0: kasan_save_stack+0x2c/0x60 kasan_set_track+0x2c/0x40 kasan_set_free_info+0x44/0x54 ____kasan_slab_free.constprop.0+0x150/0x1b0 __kasan_slab_free+0x14/0x20 slab_free_freelist_hook+0xa4/0x1fc kfree+0x1e8/0x30c put_fs_context+0x124/0x220 vfs_kern_mount.part.0+0x60/0xd4 kern_mount+0x24/0x4c bdev_cache_init+0x70/0x9c vfs_caches_init+0xdc/0xf4 start_kernel+0x638/0x6a4 __primary_switched+0xc0/0xc8 The buggy address belongs to the object at ffff0000c0074e00 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 176 bytes inside of 256-byte region [ffff0000c0074e00, ffff0000c0074f00) The buggy address belongs to the page: page:(____ptrval____) refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x100074 head:(____ptrval____) order:2 compound_mapcount:0 compound_pincount:0 flags: 0xbfffc0000010200(slab|head|node=0|zone=2|lastcpupid=0xffff|kasantag=0x0) raw: 0bfffc0000010200 0000000000000000 dead000000000122 f5ff0000c0002300 raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000c0074c00: f0 f0 f0 f0 f0 f0 f0 f0 f0 fe fe fe fe fe fe fe ffff0000c0074d00: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe >ffff0000c0074e00: f7 f7 f7 f7 f7 f7 f7 f7 f7 f7 f7 fe fe fe fe fe ^ ffff0000c0074f00: fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe ffff0000c0075000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== Disabling lock debugging due to kernel taint kmemleak: 181 new suspected memory leaks (see /sys/kernel/debug/kmemleak) Link: https://lkml.kernel.org/r/20210804090957.12393-1-Kuan-Ying.Lee@mediatek.com Link: https://lkml.kernel.org/r/20210804090957.12393-2-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Acked-by: Catalin Marinas Reviewed-by: Andrey Konovalov Cc: Marco Elver Cc: Nicholas Tang Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Chinwen Chang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 228a2fbe0657..73d46d16d575 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -290,7 +290,7 @@ static void hex_dump_object(struct seq_file *seq, warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len); kasan_disable_current(); warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE, - HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII); kasan_enable_current(); } @@ -1171,7 +1171,7 @@ static bool update_checksum(struct kmemleak_object *object) kasan_disable_current(); kcsan_disable_current(); - object->checksum = crc32(0, (void *)object->pointer, object->size); + object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); kasan_enable_current(); kcsan_enable_current(); @@ -1246,7 +1246,7 @@ static void scan_block(void *_start, void *_end, break; kasan_disable_current(); - pointer = *ptr; + pointer = *(unsigned long *)kasan_reset_tag((void *)ptr); kasan_enable_current(); untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); From 340caf178ddc2efb0294afaf54c715f7928c258e Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Fri, 13 Aug 2021 16:54:27 -0700 Subject: [PATCH 140/187] kasan, slub: reset tag when printing address The address still includes the tags when it is printed. With hardware tag-based kasan enabled, we will get a false positive KASAN issue when we access metadata. Reset the tag before we access the metadata. Link: https://lkml.kernel.org/r/20210804090957.12393-3-Kuan-Ying.Lee@mediatek.com Fixes: aa1ef4d7b3f6 ("kasan, mm: reset tags when accessing metadata") Signed-off-by: Kuan-Ying Lee Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Chinwen Chang Cc: Nicholas Tang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index af984e4990e8..1583354fbf48 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -576,8 +576,8 @@ static void print_section(char *level, char *text, u8 *addr, unsigned int length) { metadata_access_enable(); - print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS, - 16, 1, addr, length, 1); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); metadata_access_disable(); } From 1ed7ce574c136569f55fb5c32e69e382c77ba500 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 13 Aug 2021 16:54:31 -0700 Subject: [PATCH 141/187] slub: fix kmalloc_pagealloc_invalid_free unit test The unit test kmalloc_pagealloc_invalid_free makes sure that for the higher order slub allocation which goes to page allocator, the free is called with the correct address i.e. the virtual address of the head page. Commit f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free") unified the free code paths for page allocator based slub allocations but instead of using the address passed by the caller, it extracted the address from the page. Thus making the unit test kmalloc_pagealloc_invalid_free moot. So, fix this by using the address passed by the caller. Should we fix this? I think yes because dev expect kasan to catch these type of programming bugs. Link: https://lkml.kernel.org/r/20210802180819.1110165-1-shakeelb@google.com Fixes: f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free") Signed-off-by: Shakeel Butt Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Acked-by: Roman Gushchin Cc: Michal Hocko Cc: Muchun Song Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1583354fbf48..5c2a13d66e71 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3236,12 +3236,12 @@ struct detached_freelist { struct kmem_cache *s; }; -static inline void free_nonslab_page(struct page *page) +static inline void free_nonslab_page(struct page *page, void *object) { unsigned int order = compound_order(page); VM_BUG_ON_PAGE(!PageCompound(page), page); - kfree_hook(page_address(page)); + kfree_hook(object); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); __free_pages(page, order); } @@ -3282,7 +3282,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, if (!s) { /* Handle kalloc'ed objects */ if (unlikely(!PageSlab(page))) { - free_nonslab_page(page); + free_nonslab_page(page, object); p[size] = NULL; /* mark object processed */ return size; } @@ -4258,7 +4258,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { - free_nonslab_page(page); + free_nonslab_page(page, object); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); From a7f1d48585b34730765dcda09ead6edc4ac16a5c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 13 Aug 2021 16:54:34 -0700 Subject: [PATCH 142/187] mm: slub: fix slub_debug disabling for list of slabs Vijayanand Jitta reports: Consider the scenario where CONFIG_SLUB_DEBUG_ON is set and we would want to disable slub_debug for few slabs. Using boot parameter with slub_debug=-,slab_name syntax doesn't work as expected i.e; only disabling debugging for the specified list of slabs. Instead it disables debugging for all slabs, which is wrong. This patch fixes it by delaying the moment when the global slub_debug flags variable is updated. In case a "slub_debug=-,slab_name" has been passed, the global flags remain as initialized (depending on CONFIG_SLUB_DEBUG_ON enabled or disabled) and are not simply reset to 0. Link: https://lkml.kernel.org/r/8a3d992a-473a-467b-28a0-4ad2ff60ab82@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Vijayanand Jitta Reviewed-by: Vijayanand Jitta Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5c2a13d66e71..f77d8cd79ef7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1400,12 +1400,13 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) static int __init setup_slub_debug(char *str) { slab_flags_t flags; + slab_flags_t global_flags; char *saved_str; char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; - slub_debug = DEBUG_DEFAULT_FLAGS; + global_flags = DEBUG_DEFAULT_FLAGS; if (*str++ != '=' || !*str) /* * No options specified. Switch on full debugging. @@ -1417,7 +1418,7 @@ static int __init setup_slub_debug(char *str) str = parse_slub_debug_flags(str, &flags, &slab_list, true); if (!slab_list) { - slub_debug = flags; + global_flags = flags; global_slub_debug_changed = true; } else { slab_list_specified = true; @@ -1426,16 +1427,18 @@ static int __init setup_slub_debug(char *str) /* * For backwards compatibility, a single list of flags with list of - * slabs means debugging is only enabled for those slabs, so the global - * slub_debug should be 0. We can extended that to multiple lists as + * slabs means debugging is only changed for those slabs, so the global + * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ if (slab_list_specified) { if (!global_slub_debug_changed) - slub_debug = 0; + global_flags = slub_debug; slub_debug_string = saved_str; } out: + slub_debug = global_flags; if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else From eb2faa513c246ed47ae34a205928ab663bc5a18f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 13 Aug 2021 16:54:37 -0700 Subject: [PATCH 143/187] mm/madvise: report SIGBUS as -EFAULT for MADV_POPULATE_(READ|WRITE) Doing some extended tests and polishing the man page update for MADV_POPULATE_(READ|WRITE), I realized that we end up converting also SIGBUS (via -EFAULT) to -EINVAL, making it look like yet another madvise() user error. We want to report only problematic mappings and permission problems that the user could have know as -EINVAL. Let's not convert -EFAULT arising due to SIGBUS (or SIGSEGV) to -EINVAL, but instead indicate -EFAULT to user space. While we could also convert it to -ENOMEM, using -EFAULT looks more helpful when user space might want to troubleshoot what's going wrong: MADV_POPULATE_(READ|WRITE) is not part of an final Linux release and we can still adjust the behavior. Link: https://lkml.kernel.org/r/20210726154932.102880-1-david@redhat.com Fixes: 4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault page tables") Signed-off-by: David Hildenbrand Cc: Arnd Bergmann Cc: Michal Hocko Cc: Oscar Salvador Cc: Matthew Wilcox (Oracle) Cc: Andrea Arcangeli Cc: Minchan Kim Cc: Jann Horn Cc: Jason Gunthorpe Cc: Dave Hansen Cc: Hugh Dickins Cc: Rik van Riel Cc: Michael S. Tsirkin Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Chris Zankel Cc: Max Filippov Cc: Mike Kravetz Cc: Peter Xu Cc: Rolf Eike Beer Cc: Ram Pai Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 7 +++++-- mm/madvise.c | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 42b8b1fa6521..b94717977d17 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1558,9 +1558,12 @@ long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start, gup_flags |= FOLL_WRITE; /* - * See check_vma_flags(): Will return -EFAULT on incompatible mappings - * or with insufficient permissions. + * We want to report -EINVAL instead of -EFAULT for any permission + * problems or incompatible mappings. */ + if (check_vma_flags(vma, gup_flags)) + return -EINVAL; + return __get_user_pages(mm, start, nr_pages, gup_flags, NULL, NULL, locked); } diff --git a/mm/madvise.c b/mm/madvise.c index 6d3d348b17f4..5c065bc8b5f6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -862,10 +862,12 @@ static long madvise_populate(struct vm_area_struct *vma, switch (pages) { case -EINTR: return -EINTR; - case -EFAULT: /* Incompatible mappings / permissions. */ + case -EINVAL: /* Incompatible mappings / permissions. */ return -EINVAL; case -EHWPOISON: return -EHWPOISON; + case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ + return -EFAULT; default: pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); From 7fa0dacbaf1259fd3d1dda6d602fdd084dea9c0e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 13 Aug 2021 16:54:41 -0700 Subject: [PATCH 144/187] mm/memcg: fix incorrect flushing of lruvec data in obj_stock When mod_objcg_state() is called with a pgdat that is different from that in the obj_stock, the old lruvec data cached in obj_stock are flushed out. Unfortunately, they were flushed to the new pgdat and so the data go to the wrong node. This will screw up the slab data reported in /sys/devices/system/node/node*/meminfo. Fix that by flushing the data to the cached pgdat instead. Link: https://lkml.kernel.org/r/20210802143834.30578-1-longman@redhat.com Fixes: 68ac5b3c8db2 ("mm/memcg: cache vmstat data in percpu memcg_stock_pcp") Signed-off-by: Waiman Long Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Tejun Heo Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Muchun Song Cc: Alex Shi Cc: Chris Down Cc: Yafang Shao Cc: Wei Yang Cc: Masayoshi Mizuma Cc: Xing Zhengjun Cc: Matthew Wilcox Cc: Waiman Long Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index eb8e87c4833f..702a81dfe72d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3106,13 +3106,15 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ + struct pglist_data *oldpg = stock->cached_pgdat; + if (stock->nr_slab_reclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } From 854f32648b8a5e424d682953b1a9f3b7c3322701 Mon Sep 17 00:00:00 2001 From: Liang Wang Date: Fri, 13 Aug 2021 16:54:45 -0700 Subject: [PATCH 145/187] lib: use PFN_PHYS() in devmem_is_allowed() The physical address may exceed 32 bits on 32-bit systems with more than 32 bits of physcial address. Use PFN_PHYS() in devmem_is_allowed(), or the physical address may overflow and be truncated. We found this bug when mapping a high addresses through devmem tool, when CONFIG_STRICT_DEVMEM is enabled on the ARM with ARM_LPAE and devmem is used to map a high address that is not in the iomem address range, an unexpected error indicating no permission is returned. This bug was initially introduced from v2.6.37, and the function was moved to lib in v5.11. Link: https://lkml.kernel.org/r/20210731025057.78825-1-wangliang101@huawei.com Fixes: 087aaffcdf9c ("ARM: implement CONFIG_STRICT_DEVMEM by disabling access to RAM via /dev/mem") Fixes: 527701eda5f1 ("lib: Add a generic version of devmem_is_allowed()") Signed-off-by: Liang Wang Reviewed-by: Luis Chamberlain Cc: Palmer Dabbelt Cc: Greg Kroah-Hartman Cc: Russell King Cc: Liang Wang Cc: Xiaoming Ni Cc: Kefeng Wang Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/devmem_is_allowed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/devmem_is_allowed.c b/lib/devmem_is_allowed.c index c0d67c541849..60be9e24bd57 100644 --- a/lib/devmem_is_allowed.c +++ b/lib/devmem_is_allowed.c @@ -19,7 +19,7 @@ */ int devmem_is_allowed(unsigned long pfn) { - if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + if (iomem_is_exclusive(PFN_PHYS(pfn))) return 0; if (!page_is_ram(pfn)) return 1; From 5f773519639041faed4132f5943d7895f9078a4c Mon Sep 17 00:00:00 2001 From: Maciej Machnikowski Date: Fri, 13 Aug 2021 09:50:18 -0700 Subject: [PATCH 146/187] ice: Fix perout start time rounding Internal tests found out that the latest code doesn't bring up 1PPS out as expected. As a result of incorrect define used to round the time up the time was round down to the past second boundary. Fix define used for rounding to properly round up to the next Top of second in ice_ptp_cfg_clkout to fix it. Fixes: 172db5f91d5f ("ice: add support for auxiliary input/output pins") Signed-off-by: Maciej Machnikowski Tested-by: Sunitha Mekala Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20210813165018.2196013-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ice/ice_ptp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index 5d5207b56ca9..9e3ddb9b8b51 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -656,7 +656,7 @@ static int ice_ptp_cfg_clkout(struct ice_pf *pf, unsigned int chan, * maintaining phase */ if (start_time < current_time) - start_time = div64_u64(current_time + NSEC_PER_MSEC - 1, + start_time = div64_u64(current_time + NSEC_PER_SEC - 1, NSEC_PER_SEC) * NSEC_PER_SEC + phase; start_time -= E810_OUT_PROP_DELAY_NS; From da94692001ea45ffa1f5e9f17ecdef7aecd90c27 Mon Sep 17 00:00:00 2001 From: Kristin Paget Date: Sat, 14 Aug 2021 15:46:05 -0700 Subject: [PATCH 147/187] ALSA: hda/realtek: Enable 4-speaker output for Dell XPS 15 9510 laptop The 2021-model XPS 15 appears to use the same 4-speakers-on-ALC289 audio setup as the Precision models, so requires the same quirk to enable woofer output. Tested on my own 9510. Signed-off-by: Kristin Paget Cc: Link: https://lore.kernel.org/r/e1fc95c5-c10a-1f98-a5c2-dd6e336157e1@tombom.co.uk Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index a065260d0d20..96f32eaa24df 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8332,6 +8332,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0a2e, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a30, "Dell", ALC236_FIXUP_DELL_AIO_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0a58, "Dell", ALC255_FIXUP_DELL_HEADSET_MIC), + SND_PCI_QUIRK(0x1028, 0x0a61, "Dell XPS 15 9510", ALC289_FIXUP_DUAL_SPK), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 7c60610d476766e128cc4284bb6349732cbd6606 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 15 Aug 2021 13:40:53 -1000 Subject: [PATCH 148/187] Linux 5.14-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eae1314a5b86..c19d1638da25 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Opossums on Parade # *DOCUMENTATION* From 19d1532a187669ce86d5a2696eb7275310070793 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Fri, 13 Aug 2021 18:14:33 +0300 Subject: [PATCH 149/187] net: 6pack: fix slab-out-of-bounds in decode_data Syzbot reported slab-out-of bounds write in decode_data(). The problem was in missing validation checks. Syzbot's reproducer generated malicious input, which caused decode_data() to be called a lot in sixpack_decode(). Since rx_count_cooked is only 400 bytes and noone reported before, that 400 bytes is not enough, let's just check if input is malicious and complain about buffer overrun. Fail log: ================================================================== BUG: KASAN: slab-out-of-bounds in drivers/net/hamradio/6pack.c:843 Write of size 1 at addr ffff888087c5544e by task kworker/u4:0/7 CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.6.0-rc3-syzkaller #0 ... Workqueue: events_unbound flush_to_ldisc Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:641 __asan_report_store1_noabort+0x17/0x20 mm/kasan/generic_report.c:137 decode_data.part.0+0x23b/0x270 drivers/net/hamradio/6pack.c:843 decode_data drivers/net/hamradio/6pack.c:965 [inline] sixpack_decode drivers/net/hamradio/6pack.c:968 [inline] Reported-and-tested-by: syzbot+fc8cd9a673d4577fb2e4@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Pavel Skripkin Reviewed-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/net/hamradio/6pack.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index fcf3af76b6d7..8fe8887d506a 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -827,6 +827,12 @@ static void decode_data(struct sixpack *sp, unsigned char inbyte) return; } + if (sp->rx_count_cooked + 2 >= sizeof(sp->cooked_buf)) { + pr_err("6pack: cooked buffer overrun, data loss\n"); + sp->rx_count = 0; + return; + } + buf = sp->raw_buf; sp->cooked_buf[sp->rx_count_cooked++] = buf[0] | ((buf[1] << 2) & 0xc0); From 55c8fca1dae1fb0d11deaa21b65a647dedb1bc50 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 13 Aug 2021 20:33:27 +0300 Subject: [PATCH 150/187] ptp_pch: Restore dependency on PCI During the swap dependency on PCH_GBE to selection PTP_1588_CLOCK_PCH incidentally dropped the implicit dependency on the PCI. Restore it. Fixes: 18d359ceb044 ("pch_gbe, ptp_pch: Fix the dependency direction between these drivers") Reported-by: kernel test robot Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/ptp/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig index 8c20e524e9ad..e085c255da0c 100644 --- a/drivers/ptp/Kconfig +++ b/drivers/ptp/Kconfig @@ -90,7 +90,8 @@ config PTP_1588_CLOCK_INES config PTP_1588_CLOCK_PCH tristate "Intel PCH EG20T as PTP clock" depends on X86_32 || COMPILE_TEST - depends on HAS_IOMEM && NET + depends on HAS_IOMEM && PCI + depends on NET imply PTP_1588_CLOCK help This driver adds support for using the PCH EG20T as a PTP From 7387a72c5f84f0dfb57618f9e4770672c0d2e4c9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 15 Aug 2021 03:13:36 -0400 Subject: [PATCH 151/187] tipc: call tipc_wait_for_connect only when dlen is not 0 __tipc_sendmsg() is called to send SYN packet by either tipc_sendmsg() or tipc_connect(). The difference is in tipc_connect(), it will call tipc_wait_for_connect() after __tipc_sendmsg() to wait until connecting is done. So there's no need to wait in __tipc_sendmsg() for this case. This patch is to fix it by calling tipc_wait_for_connect() only when dlen is not 0 in __tipc_sendmsg(), which means it's called by tipc_connect(). Note this also fixes the failure in tipcutils/test/ptts/: # ./tipcTS & # ./tipcTC 9 (hang) Fixes: 36239dab6da7 ("tipc: fix implicit-connect for SYN+") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 75b99b7eda22..8754bd885169 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1518,7 +1518,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(syn && !rc)) { tipc_set_sk_state(sk, TIPC_CONNECTING); - if (timeout) { + if (dlen && timeout) { timeout = msecs_to_jiffies(timeout); tipc_wait_for_connect(sock, &timeout); } From 37110237f31105d679fc0aa7b11cdec867750ea7 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Sun, 15 Aug 2021 14:05:08 +0300 Subject: [PATCH 152/187] qed: qed ll2 race condition fixes Avoiding qed ll2 race condition and NULL pointer dereference as part of the remove and recovery flows. Changes form V1: - Change (!p_rx->set_prod_addr). - qed_ll2.c checkpatch fixes. Change from V2: - Revert "qed_ll2.c checkpatch fixes". Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 02a4610d9330..c46a7f756ed5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -327,6 +327,9 @@ static int qed_ll2_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) unsigned long flags; int rc = -EINVAL; + if (!p_ll2_conn) + return rc; + spin_lock_irqsave(&p_tx->lock, flags); if (p_tx->b_completing_packet) { rc = -EBUSY; @@ -500,7 +503,16 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) unsigned long flags = 0; int rc = 0; + if (!p_ll2_conn) + return rc; + spin_lock_irqsave(&p_rx->lock, flags); + + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) { + spin_unlock_irqrestore(&p_rx->lock, flags); + return 0; + } + cq_new_idx = le16_to_cpu(*p_rx->p_fw_cons); cq_old_idx = qed_chain_get_cons_idx(&p_rx->rcq_chain); @@ -821,6 +833,9 @@ static int qed_ll2_lb_rxq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) struct qed_ll2_info *p_ll2_conn = (struct qed_ll2_info *)p_cookie; int rc; + if (!p_ll2_conn) + return 0; + if (!QED_LL2_RX_REGISTERED(p_ll2_conn)) return 0; @@ -844,6 +859,9 @@ static int qed_ll2_lb_txq_completion(struct qed_hwfn *p_hwfn, void *p_cookie) u16 new_idx = 0, num_bds = 0; int rc; + if (!p_ll2_conn) + return 0; + if (!QED_LL2_TX_REGISTERED(p_ll2_conn)) return 0; @@ -1728,6 +1746,8 @@ int qed_ll2_post_rx_buffer(void *cxt, if (!p_ll2_conn) return -EINVAL; p_rx = &p_ll2_conn->rx_queue; + if (!p_rx->set_prod_addr) + return -EIO; spin_lock_irqsave(&p_rx->lock, flags); if (!list_empty(&p_rx->free_descq)) From d33d19d313d3466abdf8b0428be7837aff767802 Mon Sep 17 00:00:00 2001 From: Shai Malin Date: Sun, 15 Aug 2021 14:06:39 +0300 Subject: [PATCH 153/187] qed: Fix null-pointer dereference in qed_rdma_create_qp() Fix a possible null-pointer dereference in qed_rdma_create_qp(). Changes from V2: - Revert checkpatch fixes. Reported-by: TOTE Robot Signed-off-by: Ariel Elior Signed-off-by: Shai Malin Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_rdma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index da864d12916b..4f4b79250a2b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -1285,8 +1285,7 @@ qed_rdma_create_qp(void *rdma_cxt, if (!rdma_cxt || !in_params || !out_params || !p_hwfn->p_rdma_info->active) { - DP_ERR(p_hwfn->cdev, - "qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", + pr_err("qed roce create qp failed due to NULL entry (rdma_cxt=%p, in=%p, out=%p, roce_info=?\n", rdma_cxt, in_params, out_params); return NULL; } From 976e52b718c3de9077fff8f3f674afb159c57fb1 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 15 Aug 2021 16:15:36 -0400 Subject: [PATCH 154/187] bnxt_en: Disable aRFS if running on 212 firmware 212 firmware broke aRFS, so disable it. Traffic may stop after ntuple filters are inserted and deleted by the 212 firmware. Fixes: ae10ae740ad2 ("bnxt_en: Add new hardware RFS mode.") Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index dd2d2a5fef15..207ef7dc9bff 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -10792,6 +10792,9 @@ static bool bnxt_rfs_supported(struct bnxt *bp) return true; return false; } + /* 212 firmware is broken for aRFS */ + if (BNXT_FW_MAJ(bp) == 212) + return false; if (BNXT_PF(bp) && !BNXT_CHIP_TYPE_NITRO_A0(bp)) return true; if (bp->flags & BNXT_FLAG_NEW_RSS_CAP) From 828affc27ed43441bd1efdaf4e07e96dd43a0362 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sun, 15 Aug 2021 16:15:37 -0400 Subject: [PATCH 155/187] bnxt_en: Add missing DMA memory barriers Each completion ring entry has a valid bit to indicate that the entry contains a valid completion event. The driver's main poll loop __bnxt_poll_work() has the proper dma_rmb() to make sure the valid bit of the next entry has been checked before proceeding further. But when we call bnxt_rx_pkt() to process the RX event, the RX completion event consists of two completion entries and only the first entry has been checked to be valid. We need the same barrier after checking the next completion entry. Add missing dma_rmb() barriers in bnxt_rx_pkt() and other similar locations. Fixes: 67a95e2022c7 ("bnxt_en: Need memory barrier when processing the completion ring.") Reported-by: Lance Richardson Reviewed-by: Andy Gospodarek Reviewed-by: Lance Richardson Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 207ef7dc9bff..8a97640cdfe7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -1788,6 +1788,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) return -EBUSY; + /* The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); prod = rxr->rx_prod; if (cmp_type == CMP_TYPE_RX_L2_TPA_START_CMP) { @@ -2010,6 +2014,10 @@ static int bnxt_force_rx_discard(struct bnxt *bp, if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) return -EBUSY; + /* The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); cmp_type = RX_CMP_TYPE(rxcmp); if (cmp_type == CMP_TYPE_RX_L2_CMP) { rxcmp1->rx_cmp_cfa_code_errors_v2 |= @@ -2475,6 +2483,10 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) if (!TX_CMP_VALID(txcmp, raw_cons)) break; + /* The valid test of the entry must be done first before + * reading any further. + */ + dma_rmb(); if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { tmp_raw_cons = NEXT_RAW_CMP(raw_cons); cp_cons = RING_CMP(tmp_raw_cons); From 3f79f6f6247c83f448c8026c3ee16d4636ef8d4f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 6 Aug 2021 14:26:24 +1000 Subject: [PATCH 156/187] btrfs: prevent rename2 from exchanging a subvol with a directory from different parents Cross-rename lacks a check when that would prevent exchanging a directory and subvolume from different parent subvolume. This causes data inconsistencies and is caught before commit by tree-checker, turning the filesystem to read-only. Calling the renameat2 with RENAME_EXCHANGE flags like renameat2(AT_FDCWD, namesrc, AT_FDCWD, namedest, (1 << 1)) on two paths: namesrc = dir1/subvol1/dir2 namedest = subvol2/subvol3 will cause key order problem with following write time tree-checker report: [1194842.307890] BTRFS critical (device loop1): corrupt leaf: root=5 block=27574272 slot=10 ino=258, invalid previous key objectid, have 257 expect 258 [1194842.322221] BTRFS info (device loop1): leaf 27574272 gen 8 total ptrs 11 free space 15444 owner 5 [1194842.331562] BTRFS info (device loop1): refs 2 lock_owner 0 current 26561 [1194842.338772] item 0 key (256 1 0) itemoff 16123 itemsize 160 [1194842.338793] inode generation 3 size 16 mode 40755 [1194842.338801] item 1 key (256 12 256) itemoff 16111 itemsize 12 [1194842.338809] item 2 key (256 84 2248503653) itemoff 16077 itemsize 34 [1194842.338817] dir oid 258 type 2 [1194842.338823] item 3 key (256 84 2363071922) itemoff 16043 itemsize 34 [1194842.338830] dir oid 257 type 2 [1194842.338836] item 4 key (256 96 2) itemoff 16009 itemsize 34 [1194842.338843] item 5 key (256 96 3) itemoff 15975 itemsize 34 [1194842.338852] item 6 key (257 1 0) itemoff 15815 itemsize 160 [1194842.338863] inode generation 6 size 8 mode 40755 [1194842.338869] item 7 key (257 12 256) itemoff 15801 itemsize 14 [1194842.338876] item 8 key (257 84 2505409169) itemoff 15767 itemsize 34 [1194842.338883] dir oid 256 type 2 [1194842.338888] item 9 key (257 96 2) itemoff 15733 itemsize 34 [1194842.338895] item 10 key (258 12 256) itemoff 15719 itemsize 14 [1194842.339163] BTRFS error (device loop1): block=27574272 write time tree block corruption detected [1194842.339245] ------------[ cut here ]------------ [1194842.443422] WARNING: CPU: 6 PID: 26561 at fs/btrfs/disk-io.c:449 csum_one_extent_buffer+0xed/0x100 [btrfs] [1194842.511863] CPU: 6 PID: 26561 Comm: kworker/u17:2 Not tainted 5.14.0-rc3-git+ #793 [1194842.511870] Hardware name: empty empty/S3993, BIOS PAQEX0-3 02/24/2008 [1194842.511876] Workqueue: btrfs-worker-high btrfs_work_helper [btrfs] [1194842.511976] RIP: 0010:csum_one_extent_buffer+0xed/0x100 [btrfs] [1194842.512068] RSP: 0018:ffffa2c284d77da0 EFLAGS: 00010282 [1194842.512074] RAX: 0000000000000000 RBX: 0000000000001000 RCX: ffff928867bd9978 [1194842.512078] RDX: 0000000000000000 RSI: 0000000000000027 RDI: ffff928867bd9970 [1194842.512081] RBP: ffff92876b958000 R08: 0000000000000001 R09: 00000000000c0003 [1194842.512085] R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 [1194842.512088] R13: ffff92875f989f98 R14: 0000000000000000 R15: 0000000000000000 [1194842.512092] FS: 0000000000000000(0000) GS:ffff928867a00000(0000) knlGS:0000000000000000 [1194842.512095] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1194842.512099] CR2: 000055f5384da1f0 CR3: 0000000102fe4000 CR4: 00000000000006e0 [1194842.512103] Call Trace: [1194842.512128] ? run_one_async_free+0x10/0x10 [btrfs] [1194842.631729] btree_csum_one_bio+0x1ac/0x1d0 [btrfs] [1194842.631837] run_one_async_start+0x18/0x30 [btrfs] [1194842.631938] btrfs_work_helper+0xd5/0x1d0 [btrfs] [1194842.647482] process_one_work+0x262/0x5e0 [1194842.647520] worker_thread+0x4c/0x320 [1194842.655935] ? process_one_work+0x5e0/0x5e0 [1194842.655946] kthread+0x135/0x160 [1194842.655953] ? set_kthread_struct+0x40/0x40 [1194842.655965] ret_from_fork+0x1f/0x30 [1194842.672465] irq event stamp: 1729 [1194842.672469] hardirqs last enabled at (1735): [] console_trylock_spinning+0x185/0x1a0 [1194842.672477] hardirqs last disabled at (1740): [] console_trylock_spinning+0x15c/0x1a0 [1194842.672482] softirqs last enabled at (1666): [] __do_softirq+0x2e1/0x50a [1194842.672491] softirqs last disabled at (1651): [] __irq_exit_rcu+0xa7/0xd0 The corrupted data will not be written, and filesystem can be unmounted and mounted again (all changes since the last commit will be lost). Add the missing check for new_ino so that all non-subvolumes must reside under the same parent subvolume. There's an exception allowing to exchange two subvolumes from any parents as the directory representing a subvolume is only a logical link and does not have any other structures related to the parent subvolume, unlike files, directories etc, that are always in the inode namespace of the parent subvolume. Fixes: cdd1fedf8261 ("btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT") CC: stable@vger.kernel.org # 4.7+ Reviewed-by: Nikolay Borisov Signed-off-by: NeilBrown Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0117d867ecf8..06f9f167222b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9226,8 +9226,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, bool dest_log_pinned = false; bool need_abort = false; - /* we only allow rename subvolume link between subvolumes */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + /* + * For non-subvolumes allow exchange only within one subvolume, in the + * same inode namespace. Two subvolumes (represented as directory) can + * be exchanged as they're a logical link and have a fixed inode number. + */ + if (root != dest && + (old_ino != BTRFS_FIRST_FREE_OBJECTID || + new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; /* close the race window with snapshot create/destroy ioctl */ From 4f3f2e3fa0431b93745b110da1c365806c5acce3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 16 Aug 2021 14:13:33 +0300 Subject: [PATCH 157/187] net: iosm: Prevent underflow in ipc_chnl_cfg_get() The bounds check on "index" doesn't catch negative values. Using ARRAY_SIZE() directly is more readable and more robust because it prevents negative values for "index". Fortunately we only pass valid values to ipc_chnl_cfg_get() so this patch does not affect runtime. Reported-by: Solomon Ucko Signed-off-by: Dan Carpenter Reviewed-by: M Chetan Kumar Signed-off-by: David S. Miller --- drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c b/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c index 804e6c4f2c78..519361ec40df 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c +++ b/drivers/net/wwan/iosm/iosm_ipc_chnl_cfg.c @@ -64,10 +64,9 @@ static struct ipc_chnl_cfg modem_cfg[] = { int ipc_chnl_cfg_get(struct ipc_chnl_cfg *chnl_cfg, int index) { - int array_size = ARRAY_SIZE(modem_cfg); - - if (index >= array_size) { - pr_err("index: %d and array_size %d", index, array_size); + if (index >= ARRAY_SIZE(modem_cfg)) { + pr_err("index: %d and array size %zu", index, + ARRAY_SIZE(modem_cfg)); return -ECHRNG; } From 0f923e07124df069ba68d8bb12324398f4b6b709 Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Thu, 15 Jul 2021 01:56:24 +0300 Subject: [PATCH 158/187] KVM: nSVM: avoid picking up unsupported bits from L2 in int_ctl (CVE-2021-3653) * Invert the mask of bits that we pick from L2 in nested_vmcb02_prepare_control * Invert and explicitly use VIRQ related bits bitmask in svm_clear_vintr This fixes a security issue that allowed a malicious L1 to run L2 with AVIC enabled, which allowed the L2 to exploit the uninitialized and enabled AVIC to read/write the host physical memory at some offsets. Fixes: 3d6368ef580a ("KVM: SVM: Add VMRUN handler") Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 2 ++ arch/x86/kvm/svm/nested.c | 10 +++++++--- arch/x86/kvm/svm/svm.c | 9 +++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index e322676039f4..b00dbc5fac2b 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -184,6 +184,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_IGN_TPR_SHIFT 20 #define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT) +#define V_IRQ_INJECTION_BITS_MASK (V_IRQ_MASK | V_INTR_PRIO_MASK | V_IGN_TPR_MASK) + #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 61738ff8ef33..28381ca5221c 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -503,7 +503,11 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { - const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; + const u32 int_ctl_vmcb01_bits = + V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK; + + const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; + struct kvm_vcpu *vcpu = &svm->vcpu; /* @@ -535,8 +539,8 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) vcpu->arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; svm->vmcb->control.int_ctl = - (svm->nested.ctl.int_ctl & ~mask) | - (svm->vmcb01.ptr->control.int_ctl & mask); + (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | + (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits); svm->vmcb->control.virt_ext = svm->nested.ctl.virt_ext; svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e8ccab50ebf6..69639f9624f5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1589,17 +1589,18 @@ static void svm_set_vintr(struct vcpu_svm *svm) static void svm_clear_vintr(struct vcpu_svm *svm) { - const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ - svm->vmcb->control.int_ctl &= mask; + svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; if (is_guest_mode(&svm->vcpu)) { - svm->vmcb01.ptr->control.int_ctl &= mask; + svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != (svm->nested.ctl.int_ctl & V_TPR_MASK)); - svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; + + svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & + V_IRQ_INJECTION_BITS_MASK; } vmcb_mark_dirty(svm->vmcb, VMCB_INTR); From c7dfa4009965a9b2d7b329ee970eb8da0d32f0bc Mon Sep 17 00:00:00 2001 From: Maxim Levitsky Date: Mon, 19 Jul 2021 16:05:00 +0300 Subject: [PATCH 159/187] KVM: nSVM: always intercept VMLOAD/VMSAVE when nested (CVE-2021-3656) If L1 disables VMLOAD/VMSAVE intercepts, and doesn't enable Virtual VMLOAD/VMSAVE (currently not supported for the nested hypervisor), then VMLOAD/VMSAVE must operate on the L1 physical memory, which is only possible by making L0 intercept these instructions. Failure to do so allowed the nested guest to run VMLOAD/VMSAVE unintercepted, and thus read/write portions of the host physical memory. Fixes: 89c8a4984fc9 ("KVM: SVM: Enable Virtual VMLOAD VMSAVE feature") Suggested-by: Paolo Bonzini Signed-off-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 28381ca5221c..e5515477c30a 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -158,6 +158,9 @@ void recalc_intercepts(struct vcpu_svm *svm) /* If SMI is not intercepted, ignore guest SMI intercept as well */ if (!intercept_smi) vmcb_clr_intercept(c, INTERCEPT_SMI); + + vmcb_set_intercept(c, INTERCEPT_VMLOAD); + vmcb_set_intercept(c, INTERCEPT_VMSAVE); } static void copy_vmcb_control_area(struct vmcb_control_area *dst, From 6c34df6f350df9579ce99d887a2b5fa14cc13b32 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Sat, 14 Aug 2021 11:45:38 +0800 Subject: [PATCH 160/187] tracing: Apply trace filters on all output channels The event filters are not applied on all of the output, which results in the flood of printk when using tp_printk. Unfolding event_trigger_unlock_commit_regs() into trace_event_buffer_commit(), so the filters can be applied on every output. Link: https://lkml.kernel.org/r/20210814034538.8428-1-kernelfans@gmail.com Cc: stable@vger.kernel.org Fixes: 0daa2302968c1 ("tracing: Add tp_printk cmdline to have tracepoints go to printk()") Signed-off-by: Pingfan Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 18 +++++++++++++++--- kernel/trace/trace.h | 32 -------------------------------- 2 files changed, 15 insertions(+), 35 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 33899a71fdc1..a1adb29ef5c1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2897,14 +2897,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write, void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) { + enum event_trigger_type tt = ETT_NONE; + struct trace_event_file *file = fbuffer->trace_file; + + if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event, + fbuffer->entry, &tt)) + goto discard; + if (static_key_false(&tracepoint_printk_key.key)) output_printk(fbuffer); if (static_branch_unlikely(&trace_event_exports_enabled)) ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT); - event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer, - fbuffer->event, fbuffer->entry, - fbuffer->trace_ctx, fbuffer->regs); + + trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer, + fbuffer->event, fbuffer->trace_ctx, fbuffer->regs); + +discard: + if (tt) + event_triggers_post_call(file, tt); + } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index a180abf76d4e..4a0e693000c6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1389,38 +1389,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, event_triggers_post_call(file, tt); } -/** - * event_trigger_unlock_commit_regs - handle triggers and finish event commit - * @file: The file pointer associated with the event - * @buffer: The ring buffer that the event is being written to - * @event: The event meta data in the ring buffer - * @entry: The event itself - * @trace_ctx: The tracing context flags. - * - * This is a helper function to handle triggers that require data - * from the event itself. It also tests the event against filters and - * if the event is soft disabled and should be discarded. - * - * Same as event_trigger_unlock_commit() but calls - * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). - */ -static inline void -event_trigger_unlock_commit_regs(struct trace_event_file *file, - struct trace_buffer *buffer, - struct ring_buffer_event *event, - void *entry, unsigned int trace_ctx, - struct pt_regs *regs) -{ - enum event_trigger_type tt = ETT_NONE; - - if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) - trace_buffer_unlock_commit_regs(file->tr, buffer, event, - trace_ctx, regs); - - if (tt) - event_triggers_post_call(file, tt); -} - #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) From 09e856d54bda5f288ef8437a90ab2b9b3eab83d1 Mon Sep 17 00:00:00 2001 From: Lahav Schlesinger Date: Sun, 15 Aug 2021 12:00:02 +0000 Subject: [PATCH 161/187] vrf: Reset skb conntrack connection on VRF rcv To fix the "reverse-NAT" for replies. When a packet is sent over a VRF, the POST_ROUTING hooks are called twice: Once from the VRF interface, and once from the "actual" interface the packet will be sent from: 1) First SNAT: l3mdev_l3_out() -> vrf_l3_out() -> .. -> vrf_output_direct() This causes the POST_ROUTING hooks to run. 2) Second SNAT: 'ip_output()' calls POST_ROUTING hooks again. Similarly for replies, first ip_rcv() calls PRE_ROUTING hooks, and second vrf_l3_rcv() calls them again. As an example, consider the following SNAT rule: > iptables -t nat -A POSTROUTING -p udp -m udp --dport 53 -j SNAT --to-source 2.2.2.2 -o vrf_1 In this case sending over a VRF will create 2 conntrack entries. The first is from the VRF interface, which performs the IP SNAT. The second will run the SNAT, but since the "expected reply" will remain the same, conntrack randomizes the source port of the packet: e..g With a socket bound to 1.1.1.1:10000, sending to 3.3.3.3:53, the conntrack rules are: udp 17 29 src=2.2.2.2 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=61033 packets=0 bytes=0 mark=0 use=1 udp 17 29 src=1.1.1.1 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=10000 packets=0 bytes=0 mark=0 use=1 i.e. First SNAT IP from 1.1.1.1 --> 2.2.2.2, and second the src port is SNAT-ed from 10000 --> 61033. But when a reply is sent (3.3.3.3:53 -> 2.2.2.2:61033) only the later conntrack entry is matched: udp 17 29 src=2.2.2.2 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 src=3.3.3.3 dst=2.2.2.2 sport=53 dport=61033 packets=1 bytes=49 mark=0 use=1 udp 17 28 src=1.1.1.1 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=10000 packets=0 bytes=0 mark=0 use=1 And a "port 61033 unreachable" ICMP packet is sent back. The issue is that when PRE_ROUTING hooks are called from vrf_l3_rcv(), the skb already has a conntrack flow attached to it, which means nf_conntrack_in() will not resolve the flow again. This means only the dest port is "reverse-NATed" (61033 -> 10000) but the dest IP remains 2.2.2.2, and since the socket is bound to 1.1.1.1 it's not received. This can be verified by logging the 4-tuple of the packet in '__udp4_lib_rcv()'. The fix is then to reset the flow when skb is received on a VRF, to let conntrack resolve the flow again (which now will hit the earlier flow). To reproduce: (Without the fix "Got pkt_to_nat_port" will not be printed by running 'bash ./repro'): $ cat run_in_A1.py import logging logging.getLogger("scapy.runtime").setLevel(logging.ERROR) from scapy.all import * import argparse def get_packet_to_send(udp_dst_port, msg_name): return Ether(src='11:22:33:44:55:66', dst=iface_mac)/ \ IP(src='3.3.3.3', dst='2.2.2.2')/ \ UDP(sport=53, dport=udp_dst_port)/ \ Raw(f'{msg_name}\x0012345678901234567890') parser = argparse.ArgumentParser() parser.add_argument('-iface_mac', dest="iface_mac", type=str, required=True, help="From run_in_A3.py") parser.add_argument('-socket_port', dest="socket_port", type=str, required=True, help="From run_in_A3.py") parser.add_argument('-v1_mac', dest="v1_mac", type=str, required=True, help="From script") args, _ = parser.parse_known_args() iface_mac = args.iface_mac socket_port = int(args.socket_port) v1_mac = args.v1_mac print(f'Source port before NAT: {socket_port}') while True: pkts = sniff(iface='_v0', store=True, count=1, timeout=10) if 0 == len(pkts): print('Something failed, rerun the script :(', flush=True) break pkt = pkts[0] if not pkt.haslayer('UDP'): continue pkt_sport = pkt.getlayer('UDP').sport print(f'Source port after NAT: {pkt_sport}', flush=True) pkt_to_send = get_packet_to_send(pkt_sport, 'pkt_to_nat_port') sendp(pkt_to_send, '_v0', verbose=False) # Will not be received pkt_to_send = get_packet_to_send(socket_port, 'pkt_to_socket_port') sendp(pkt_to_send, '_v0', verbose=False) break $ cat run_in_A2.py import socket import netifaces print(f"{netifaces.ifaddresses('e00000')[netifaces.AF_LINK][0]['addr']}", flush=True) s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, str('vrf_1' + '\0').encode('utf-8')) s.connect(('3.3.3.3', 53)) print(f'{s. getsockname()[1]}', flush=True) s.settimeout(5) while True: try: # Periodically send in order to keep the conntrack entry alive. s.send(b'a'*40) resp = s.recvfrom(1024) msg_name = resp[0].decode('utf-8').split('\0')[0] print(f"Got {msg_name}", flush=True) except Exception as e: pass $ cat repro.sh ip netns del A1 2> /dev/null ip netns del A2 2> /dev/null ip netns add A1 ip netns add A2 ip -n A1 link add _v0 type veth peer name _v1 netns A2 ip -n A1 link set _v0 up ip -n A2 link add e00000 type bond ip -n A2 link add lo0 type dummy ip -n A2 link add vrf_1 type vrf table 10001 ip -n A2 link set vrf_1 up ip -n A2 link set e00000 master vrf_1 ip -n A2 addr add 1.1.1.1/24 dev e00000 ip -n A2 link set e00000 up ip -n A2 link set _v1 master e00000 ip -n A2 link set _v1 up ip -n A2 link set lo0 up ip -n A2 addr add 2.2.2.2/32 dev lo0 ip -n A2 neigh add 1.1.1.10 lladdr 77:77:77:77:77:77 dev e00000 ip -n A2 route add 3.3.3.3/32 via 1.1.1.10 dev e00000 table 10001 ip netns exec A2 iptables -t nat -A POSTROUTING -p udp -m udp --dport 53 -j \ SNAT --to-source 2.2.2.2 -o vrf_1 sleep 5 ip netns exec A2 python3 run_in_A2.py > x & XPID=$! sleep 5 IFACE_MAC=`sed -n 1p x` SOCKET_PORT=`sed -n 2p x` V1_MAC=`ip -n A2 link show _v1 | sed -n 2p | awk '{print $2'}` ip netns exec A1 python3 run_in_A1.py -iface_mac ${IFACE_MAC} -socket_port \ ${SOCKET_PORT} -v1_mac ${SOCKET_PORT} sleep 5 kill -9 $XPID wait $XPID 2> /dev/null ip netns del A1 ip netns del A2 tail x -n 2 rm x set +x Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") Signed-off-by: Lahav Schlesinger Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210815120002.2787653-1-lschlesinger@drivenets.com Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 2b1b944d4b28..8bbe2a7bb141 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1367,6 +1367,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); bool is_ndisc = ipv6_ndisc_frame(skb); + nf_reset_ct(skb); + /* loopback, multicast & non-ND link-local traffic; do not push through * packet taps again. Reset pkt_type for upper layers to process skb. * For strict packets with a source LLA, determine the dst using the @@ -1429,6 +1431,8 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, skb->skb_iif = vrf_dev->ifindex; IPCB(skb)->flags |= IPSKB_L3SLAVE; + nf_reset_ct(skb); + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) goto out; From 0165c4e19f6ec76b535de090e4bd145c73810c51 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Mon, 16 Aug 2021 20:42:59 +0300 Subject: [PATCH 162/187] ALSA: hda: Fix hang during shutdown due to link reset During system shutdown codecs may be still active, and resetting the controller->codec HW link in this state - based on the bug reporter's tests - leads to the shutdown sequence to get stuck. This happens at least on the reporter's KBL system with an ALC662 codec. For now fix the issue by skipping the link reset step. Fixes: 472e18f63c42 ("ALSA: hda: Release controller display power during shutdown/reboot") References: https://bugzilla.kernel.org/show_bug.cgi?id=214045 References: https://gitlab.freedesktop.org/drm/intel/-/issues/3618#note_1024665 Reported-and-tested-by: youling257@gmail.com Cc: youling257@gmail.com Signed-off-by: Imre Deak Link: https://lore.kernel.org/r/20210816174259.2759103-1-imre.deak@intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index 0322b289505e..0062c18b646a 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -883,10 +883,11 @@ static unsigned int azx_get_pos_skl(struct azx *chip, struct azx_dev *azx_dev) return azx_get_pos_posbuf(chip, azx_dev); } -static void azx_shutdown_chip(struct azx *chip) +static void __azx_shutdown_chip(struct azx *chip, bool skip_link_reset) { azx_stop_chip(chip); - azx_enter_link_reset(chip); + if (!skip_link_reset) + azx_enter_link_reset(chip); azx_clear_irq_pending(chip); display_power(chip, false); } @@ -895,6 +896,11 @@ static void azx_shutdown_chip(struct azx *chip) static DEFINE_MUTEX(card_list_lock); static LIST_HEAD(card_list); +static void azx_shutdown_chip(struct azx *chip) +{ + __azx_shutdown_chip(chip, false); +} + static void azx_add_card_list(struct azx *chip) { struct hda_intel *hda = container_of(chip, struct hda_intel, chip); @@ -2385,7 +2391,7 @@ static void azx_shutdown(struct pci_dev *pci) return; chip = card->private_data; if (chip && chip->running) - azx_shutdown_chip(chip); + __azx_shutdown_chip(chip, true); } /* PCI IDs */ From 4bf61ad5f0204b67ba570da6e5c052c2095e29df Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 17 Aug 2021 07:24:32 +0200 Subject: [PATCH 163/187] ALSA: hda/via: Apply runtime PM workaround for ASUS B23E ASUS B23E requires the same workaround like other machines with VT1802, otherwise it looses the codec power on a few nodes and the sound kept silence. Fixes: a0645daf1610 ("ALSA: HDA: Early Forbid of runtime PM") Link: https://lore.kernel.org/r/ac2232f142efcd67fe6ac38897f704f7176bd200.camel@gmail.com Cc: Link: https://lore.kernel.org/r/20210817052432.14751-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_via.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index a5c1a2c4eae4..773a136161f1 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -1041,6 +1041,7 @@ static const struct hda_fixup via_fixups[] = { }; static const struct snd_pci_quirk vt2002p_fixups[] = { + SND_PCI_QUIRK(0x1043, 0x13f7, "Asus B23E", VIA_FIXUP_POWER_SAVE), SND_PCI_QUIRK(0x1043, 0x1487, "Asus G75", VIA_FIXUP_ASUS_G75), SND_PCI_QUIRK(0x1043, 0x8532, "Asus X202E", VIA_FIXUP_INTMIC_BOOST), SND_PCI_QUIRK_VENDOR(0x1558, "Clevo", VIA_FIXUP_POWER_SAVE), From dbcf24d153884439dad30484a0e3f02350692e4c Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 17 Aug 2021 16:06:59 +0800 Subject: [PATCH 164/187] virtio-net: use NETIF_F_GRO_HW instead of NETIF_F_LRO Commit a02e8964eaf92 ("virtio-net: ethtool configurable LRO") maps LRO to virtio guest offloading features and allows the administrator to enable and disable those features via ethtool. This leads to several issues: - For a device that doesn't support control guest offloads, the "LRO" can't be disabled triggering WARN in dev_disable_lro() when turning off LRO or when enabling forwarding bridging etc. - For a device that supports control guest offloads, the guest offloads are disabled in cases of bridging, forwarding etc slowing down the traffic. Fix this by using NETIF_F_GRO_HW instead. Though the spec does not guarantee packets to be re-segmented as the original ones, we can add that to the spec, possibly with a flag for devices to differentiate between GRO and LRO. Further, we never advertised LRO historically before a02e8964eaf92 ("virtio-net: ethtool configurable LRO") and so bridged/forwarded configs effectively always relied on virtio receive offloads behaving like GRO - thus even if this breaks any configs it is at least not a regression. Fixes: a02e8964eaf92 ("virtio-net: ethtool configurable LRO") Acked-by: Michael S. Tsirkin Reported-by: Ivan Tested-by: Ivan Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 56c3f8519093..eee493685aad 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -63,7 +63,7 @@ static const unsigned long guest_offloads[] = { VIRTIO_NET_F_GUEST_CSUM }; -#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ +#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO)) @@ -2515,7 +2515,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) { - NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first"); + NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first"); return -EOPNOTSUPP; } @@ -2646,15 +2646,15 @@ static int virtnet_set_features(struct net_device *dev, u64 offloads; int err; - if ((dev->features ^ features) & NETIF_F_LRO) { + if ((dev->features ^ features) & NETIF_F_GRO_HW) { if (vi->xdp_enabled) return -EBUSY; - if (features & NETIF_F_LRO) + if (features & NETIF_F_GRO_HW) offloads = vi->guest_offloads_capable; else offloads = vi->guest_offloads_capable & - ~GUEST_OFFLOAD_LRO_MASK; + ~GUEST_OFFLOAD_GRO_HW_MASK; err = virtnet_set_guest_offloads(vi, offloads); if (err) @@ -3134,9 +3134,9 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= NETIF_F_RXCSUM; if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) - dev->features |= NETIF_F_LRO; + dev->features |= NETIF_F_GRO_HW; if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) - dev->hw_features |= NETIF_F_LRO; + dev->hw_features |= NETIF_F_GRO_HW; dev->vlan_features = dev->features; From 276e189f8e4e3cce1634d6bac4ed0d9ca242441b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 17 Aug 2021 12:12:22 +0200 Subject: [PATCH 165/187] mac80211: fix locking in ieee80211_restart_work() Ilan's change to move locking around accidentally lost the wiphy_lock() during some porting, add it back. Fixes: 45daaa131841 ("mac80211: Properly WARN on HW scan before restart") Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20210817121210.47bdb177064f.Ib1ef79440cd27f318c028ddfc0c642406917f512@changeid Signed-off-by: Jakub Kicinski --- net/mac80211/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 05f4c3c72619..fcae76ddd586 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -260,6 +260,8 @@ static void ieee80211_restart_work(struct work_struct *work) flush_work(&local->radar_detected_work); rtnl_lock(); + /* we might do interface manipulations, so need both */ + wiphy_lock(local->hw.wiphy); WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); From 0a298d133893c72c96e2156ed7cb0f0c4a306a3e Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Mon, 16 Aug 2021 21:14:04 +0800 Subject: [PATCH 166/187] net: qlcnic: add missed unlock in qlcnic_83xx_flash_read32 qlcnic_83xx_unlock_flash() is called on all paths after we call qlcnic_83xx_lock_flash(), except for one error path on failure of QLCRD32(), which may cause a deadlock. This bug is suggested by a static analysis tool, please advise. Fixes: 81d0aeb0a4fff ("qlcnic: flash template based firmware reset recovery") Signed-off-by: Dinghao Liu Link: https://lore.kernel.org/r/20210816131405.24024-1-dinghao.liu@zju.edu.cn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index d8882d0b6b49..d51bac7ba5af 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -3156,8 +3156,10 @@ int qlcnic_83xx_flash_read32(struct qlcnic_adapter *adapter, u32 flash_addr, indirect_addr = QLC_83XX_FLASH_DIRECT_DATA(addr); ret = QLCRD32(adapter, indirect_addr, &err); - if (err == -EIO) + if (err == -EIO) { + qlcnic_83xx_unlock_flash(adapter); return err; + } word = ret; *(u32 *)p_data = word; From 1b80fec7b043552e01609bae7d0aad07aa742adc Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 17 Aug 2021 13:37:36 -0700 Subject: [PATCH 167/187] ixgbe, xsk: clean up the resources in ixgbe_xsk_pool_enable error path In ixgbe_xsk_pool_enable(), if ixgbe_xsk_wakeup() fails, We should restore the previous state and clean up the resources. Add the missing clear af_xdp_zc_qps and unmap dma to fix this bug. Fixes: d49e286d354e ("ixgbe: add tracking of AF_XDP zero-copy state for each queue pair") Fixes: 4a9b32f30f80 ("ixgbe: fix potential RX buffer starvation for AF_XDP") Signed-off-by: Wang Hai Acked-by: Magnus Karlsson Tested-by: Sandeep Penigalapati Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20210817203736.3529939-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 96dd1a4f956a..b1d22e4d5ec9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -52,8 +52,11 @@ static int ixgbe_xsk_pool_enable(struct ixgbe_adapter *adapter, /* Kick start the NAPI context so that receiving will start */ err = ixgbe_xsk_wakeup(adapter->netdev, qid, XDP_WAKEUP_RX); - if (err) + if (err) { + clear_bit(qid, adapter->af_xdp_zc_qps); + xsk_pool_dma_unmap(pool, IXGBE_RX_DMA_ATTR); return err; + } } return 0; From b9570f5c9240cadf87fb5f9313e8f425aa9e788f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 17 Aug 2021 17:46:28 +0200 Subject: [PATCH 168/187] platform/x86: gigabyte-wmi: add support for X570 GAMING X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported as working here: https://github.com/t-8ch/linux-gigabyte-wmi-driver/issues/1#issuecomment-900263115 Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20210817154628.84992-1-linux@weissschuh.net Signed-off-by: Hans de Goede --- drivers/platform/x86/gigabyte-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index fbb224a82e34..9e8cfac403d3 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -147,6 +147,7 @@ static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550M DS3H"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("Z390 I AORUS PRO WIFI-CF"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 AORUS ELITE"), + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 GAMING X"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 I AORUS PRO WIFI"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("X570 UD"), { } From 86b9bbd332d0510679c7fedcee3e3bd278be5756 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Mon, 16 Aug 2021 13:59:17 +0200 Subject: [PATCH 169/187] sch_cake: fix srchost/dsthost hashing mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When adding support for using the skb->hash value as the flow hash in CAKE, I accidentally introduced a logic error that broke the host-only isolation modes of CAKE (srchost and dsthost keywords). Specifically, the flow_hash variable should stay initialised to 0 in cake_hash() in pure host-based hashing mode. Add a check for this before using the skb->hash value as flow_hash. Fixes: b0c19ed6088a ("sch_cake: Take advantage of skb->hash where appropriate") Reported-by: Pete Heist Tested-by: Pete Heist Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- net/sched/sch_cake.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 951542843cab..28af8b1e1bb1 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -720,7 +720,7 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, skip_hash: if (flow_override) flow_hash = flow_override - 1; - else if (use_skbhash) + else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; From ed5d2937a6a8f12e7f815748f991990e79ac4cd1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 17 Aug 2021 17:52:45 +0300 Subject: [PATCH 170/187] net: dsa: sja1105: fix use-after-free after calling of_find_compatible_node, or worse It seems that of_find_compatible_node has a weird calling convention in which it calls of_node_put() on the "from" node argument, instead of leaving that up to the caller. This comes from the fact that of_find_compatible_node with a non-NULL "from" argument it only supposed to be used as the iterator function of for_each_compatible_node(). OF iterator functions call of_node_get on the next OF node and of_node_put() on the previous one. When of_find_compatible_node calls of_node_put, it actually never expects the refcount to drop to zero, because the call is done under the atomic devtree_lock context, and when the refcount drops to zero it triggers a kobject and a sysfs file deletion, which assume blocking context. So any driver call to of_find_compatible_node is probably buggy because an unexpected of_node_put() takes place. What should be done is to use the of_get_compatible_child() function. Fixes: 5a8f09748ee7 ("net: dsa: sja1105: register the MDIO buses for 100base-T1 and 100base-TX") Link: https://lore.kernel.org/netdev/20210814010139.kzryimmp4rizlznt@skbuf/ Suggested-by: Frank Rowand Suggested-by: Rob Herring Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/sja1105/sja1105_mdio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_mdio.c b/drivers/net/dsa/sja1105/sja1105_mdio.c index 19aea8fb76f6..705d3900e43a 100644 --- a/drivers/net/dsa/sja1105/sja1105_mdio.c +++ b/drivers/net/dsa/sja1105/sja1105_mdio.c @@ -284,8 +284,7 @@ static int sja1105_mdiobus_base_tx_register(struct sja1105_private *priv, struct mii_bus *bus; int rc = 0; - np = of_find_compatible_node(mdio_node, NULL, - "nxp,sja1110-base-tx-mdio"); + np = of_get_compatible_child(mdio_node, "nxp,sja1110-base-tx-mdio"); if (!np) return 0; @@ -339,8 +338,7 @@ static int sja1105_mdiobus_base_t1_register(struct sja1105_private *priv, struct mii_bus *bus; int rc = 0; - np = of_find_compatible_node(mdio_node, NULL, - "nxp,sja1110-base-t1-mdio"); + np = of_get_compatible_child(mdio_node, "nxp,sja1110-base-t1-mdio"); if (!np) return 0; From 663d946af5fb2fde0c0498f11fb295e9e8db979f Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Tue, 17 Aug 2021 20:38:01 -0700 Subject: [PATCH 171/187] net: mdio-mux: Delete unnecessary devm_kfree The whole point of devm_* APIs is that you don't have to undo them if you are returning an error that's going to get propagated out of a probe() function. So delete unnecessary devm_kfree() call in the error return path. Fixes: b60161668199 ("mdio: mux: Correct mdio_mux_init error path issues") Signed-off-by: Saravana Kannan Reviewed-by: Andrew Lunn Acked-by: Marc Zyngier Tested-by: Marc Zyngier Acked-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-mux.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/mdio/mdio-mux.c b/drivers/net/mdio/mdio-mux.c index 110e4ee85785..5b37284f54d6 100644 --- a/drivers/net/mdio/mdio-mux.c +++ b/drivers/net/mdio/mdio-mux.c @@ -181,7 +181,6 @@ int mdio_mux_init(struct device *dev, } dev_err(dev, "Error: No acceptable child buses found\n"); - devm_kfree(dev, pb); err_pb_kz: put_device(&parent_bus->dev); err_parent_bus: From 99d81e942474cc7677d12f673f42a7ea699e2589 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Tue, 17 Aug 2021 20:38:02 -0700 Subject: [PATCH 172/187] net: mdio-mux: Don't ignore memory allocation errors If we are seeing memory allocation errors, don't try to continue registering child mdiobus devices. It's unlikely they'll succeed. Fixes: 342fa1964439 ("mdio: mux: make child bus walking more permissive and errors more verbose") Signed-off-by: Saravana Kannan Reviewed-by: Andrew Lunn Acked-by: Marc Zyngier Tested-by: Marc Zyngier Acked-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-mux.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/net/mdio/mdio-mux.c b/drivers/net/mdio/mdio-mux.c index 5b37284f54d6..13035e2685c4 100644 --- a/drivers/net/mdio/mdio-mux.c +++ b/drivers/net/mdio/mdio-mux.c @@ -82,6 +82,17 @@ static int mdio_mux_write(struct mii_bus *bus, int phy_id, static int parent_count; +static void mdio_mux_uninit_children(struct mdio_mux_parent_bus *pb) +{ + struct mdio_mux_child_bus *cb = pb->children; + + while (cb) { + mdiobus_unregister(cb->mii_bus); + mdiobus_free(cb->mii_bus); + cb = cb->next; + } +} + int mdio_mux_init(struct device *dev, struct device_node *mux_node, int (*switch_fn)(int cur, int desired, void *data), @@ -144,7 +155,7 @@ int mdio_mux_init(struct device *dev, cb = devm_kzalloc(dev, sizeof(*cb), GFP_KERNEL); if (!cb) { ret_val = -ENOMEM; - continue; + goto err_loop; } cb->bus_number = v; cb->parent = pb; @@ -152,8 +163,7 @@ int mdio_mux_init(struct device *dev, cb->mii_bus = mdiobus_alloc(); if (!cb->mii_bus) { ret_val = -ENOMEM; - devm_kfree(dev, cb); - continue; + goto err_loop; } cb->mii_bus->priv = cb; @@ -181,6 +191,10 @@ int mdio_mux_init(struct device *dev, } dev_err(dev, "Error: No acceptable child buses found\n"); + +err_loop: + mdio_mux_uninit_children(pb); + of_node_put(child_bus_node); err_pb_kz: put_device(&parent_bus->dev); err_parent_bus: @@ -192,14 +206,8 @@ EXPORT_SYMBOL_GPL(mdio_mux_init); void mdio_mux_uninit(void *mux_handle) { struct mdio_mux_parent_bus *pb = mux_handle; - struct mdio_mux_child_bus *cb = pb->children; - - while (cb) { - mdiobus_unregister(cb->mii_bus); - mdiobus_free(cb->mii_bus); - cb = cb->next; - } + mdio_mux_uninit_children(pb); put_device(&pb->mii_bus->dev); } EXPORT_SYMBOL_GPL(mdio_mux_uninit); From 7bd0cef5dac685f09ef8b0b2a7748ff42d284dc7 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Tue, 17 Aug 2021 20:38:03 -0700 Subject: [PATCH 173/187] net: mdio-mux: Handle -EPROBE_DEFER correctly When registering mdiobus children, if we get an -EPROBE_DEFER, we shouldn't ignore it and continue registering the rest of the mdiobus children. This would permanently prevent the deferring child mdiobus from working instead of reattempting it in the future. So, if a child mdiobus needs to be reattempted in the future, defer the entire mdio-mux initialization. This fixes the issue where PHYs sitting under the mdio-mux aren't initialized correctly if the PHY's interrupt controller is not yet ready when the mdio-mux is being probed. Additional context in the link below. Fixes: 0ca2997d1452 ("netdev/of/phy: Add MDIO bus multiplexer support.") Link: https://lore.kernel.org/lkml/CAGETcx95kHrv8wA-O+-JtfH7H9biJEGJtijuPVN0V5dUKUAB3A@mail.gmail.com/#t Signed-off-by: Saravana Kannan Reviewed-by: Andrew Lunn Acked-by: Marc Zyngier Tested-by: Marc Zyngier Acked-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: David S. Miller --- drivers/net/mdio/mdio-mux.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/mdio/mdio-mux.c b/drivers/net/mdio/mdio-mux.c index 13035e2685c4..ebd001f0eece 100644 --- a/drivers/net/mdio/mdio-mux.c +++ b/drivers/net/mdio/mdio-mux.c @@ -175,11 +175,15 @@ int mdio_mux_init(struct device *dev, cb->mii_bus->write = mdio_mux_write; r = of_mdiobus_register(cb->mii_bus, child_bus_node); if (r) { + mdiobus_free(cb->mii_bus); + if (r == -EPROBE_DEFER) { + ret_val = r; + goto err_loop; + } + devm_kfree(dev, cb); dev_err(dev, "Error: Failed to register MDIO bus for child %pOF\n", child_bus_node); - mdiobus_free(cb->mii_bus); - devm_kfree(dev, cb); } else { cb->next = pb->children; pb->children = cb; From 01634047bf0d5c2d9b7d8095bb4de1663dbeedeb Mon Sep 17 00:00:00 2001 From: "kaixi.fan" Date: Wed, 18 Aug 2021 10:22:15 +0800 Subject: [PATCH 174/187] ovs: clear skb->tstamp in forwarding path fq qdisc requires tstamp to be cleared in the forwarding path. Now ovs doesn't clear skb->tstamp. We encountered a problem with linux version 5.4.56 and ovs version 2.14.1, and packets failed to dequeue from qdisc when fq qdisc was attached to ovs port. Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Signed-off-by: kaixi.fan Signed-off-by: xiexiaohui Reviewed-by: Cong Wang Signed-off-by: David S. Miller --- net/openvswitch/vport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 88deb5b41429..cf2ce5812489 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -507,6 +507,7 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) } skb->dev = vport->dev; + skb->tstamp = 0; vport->ops->send(skb); return; From a786e3195d6af183033e86f0518ffd2c51c0e8ac Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 17 Aug 2021 19:37:23 +0300 Subject: [PATCH 175/187] net: asix: fix uninit value bugs Syzbot reported uninit-value in asix_mdio_read(). The problem was in missing error handling. asix_read_cmd() should initialize passed stack variable smsr, but it can fail in some cases. Then while condidition checks possibly uninit smsr variable. Since smsr is uninitialized stack variable, driver can misbehave, because smsr will be random in case of asix_read_cmd() failure. Fix it by adding error handling and just continue the loop instead of checking uninit value. Added helper function for checking Host_En bit, since wrong loop was used in 4 functions and there is no need in copy-pasting code parts. Cc: Robert Foss Fixes: d9fe64e51114 ("net: asix: Add in_pm parameter") Reported-by: syzbot+a631ec9e717fb0423053@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 70 +++++++++++++++-------------------- 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index ac92bc52a85e..38cda590895c 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -63,6 +63,29 @@ void asix_write_cmd_async(struct usbnet *dev, u8 cmd, u16 value, u16 index, value, index, data, size); } +static int asix_check_host_enable(struct usbnet *dev, int in_pm) +{ + int i, ret; + u8 smsr; + + for (i = 0; i < 30; ++i) { + ret = asix_set_sw_mii(dev, in_pm); + if (ret == -ENODEV || ret == -ETIMEDOUT) + break; + usleep_range(1000, 1100); + ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, + 0, 0, 1, &smsr, in_pm); + if (ret == -ENODEV) + break; + else if (ret < 0) + continue; + else if (smsr & AX_HOST_EN) + break; + } + + return ret; +} + static void reset_asix_rx_fixup_info(struct asix_rx_fixup_info *rx) { /* Reset the variables that have a lifetime outside of @@ -467,19 +490,11 @@ int asix_mdio_read(struct net_device *netdev, int phy_id, int loc) { struct usbnet *dev = netdev_priv(netdev); __le16 res; - u8 smsr; - int i = 0; int ret; mutex_lock(&dev->phy_mutex); - do { - ret = asix_set_sw_mii(dev, 0); - if (ret == -ENODEV || ret == -ETIMEDOUT) - break; - usleep_range(1000, 1100); - ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, - 0, 0, 1, &smsr, 0); - } while (!(smsr & AX_HOST_EN) && (i++ < 30) && (ret != -ENODEV)); + + ret = asix_check_host_enable(dev, 0); if (ret == -ENODEV || ret == -ETIMEDOUT) { mutex_unlock(&dev->phy_mutex); return ret; @@ -505,23 +520,14 @@ static int __asix_mdio_write(struct net_device *netdev, int phy_id, int loc, { struct usbnet *dev = netdev_priv(netdev); __le16 res = cpu_to_le16(val); - u8 smsr; - int i = 0; int ret; netdev_dbg(dev->net, "asix_mdio_write() phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", phy_id, loc, val); mutex_lock(&dev->phy_mutex); - do { - ret = asix_set_sw_mii(dev, 0); - if (ret == -ENODEV) - break; - usleep_range(1000, 1100); - ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, - 0, 0, 1, &smsr, 0); - } while (!(smsr & AX_HOST_EN) && (i++ < 30) && (ret != -ENODEV)); + ret = asix_check_host_enable(dev, 0); if (ret == -ENODEV) goto out; @@ -561,19 +567,11 @@ int asix_mdio_read_nopm(struct net_device *netdev, int phy_id, int loc) { struct usbnet *dev = netdev_priv(netdev); __le16 res; - u8 smsr; - int i = 0; int ret; mutex_lock(&dev->phy_mutex); - do { - ret = asix_set_sw_mii(dev, 1); - if (ret == -ENODEV || ret == -ETIMEDOUT) - break; - usleep_range(1000, 1100); - ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, - 0, 0, 1, &smsr, 1); - } while (!(smsr & AX_HOST_EN) && (i++ < 30) && (ret != -ENODEV)); + + ret = asix_check_host_enable(dev, 1); if (ret == -ENODEV || ret == -ETIMEDOUT) { mutex_unlock(&dev->phy_mutex); return ret; @@ -595,22 +593,14 @@ asix_mdio_write_nopm(struct net_device *netdev, int phy_id, int loc, int val) { struct usbnet *dev = netdev_priv(netdev); __le16 res = cpu_to_le16(val); - u8 smsr; - int i = 0; int ret; netdev_dbg(dev->net, "asix_mdio_write() phy_id=0x%02x, loc=0x%02x, val=0x%04x\n", phy_id, loc, val); mutex_lock(&dev->phy_mutex); - do { - ret = asix_set_sw_mii(dev, 1); - if (ret == -ENODEV) - break; - usleep_range(1000, 1100); - ret = asix_read_cmd(dev, AX_CMD_STATMNGSTS_REG, - 0, 0, 1, &smsr, 1); - } while (!(smsr & AX_HOST_EN) && (i++ < 30) && (ret != -ENODEV)); + + ret = asix_check_host_enable(dev, 1); if (ret == -ENODEV) { mutex_unlock(&dev->phy_mutex); return; From 1e35b8a7780a0c043cc5389420f069b69343f5d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 18 Aug 2021 18:44:35 +0200 Subject: [PATCH 176/187] platform/x86: gigabyte-wmi: add support for B450M S2H V2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported as working here: https://github.com/t-8ch/linux-gigabyte-wmi-driver/issues/1#issuecomment-901207693 Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20210818164435.99821-1-linux@weissschuh.net Signed-off-by: Hans de Goede --- drivers/platform/x86/gigabyte-wmi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/platform/x86/gigabyte-wmi.c b/drivers/platform/x86/gigabyte-wmi.c index 9e8cfac403d3..7f3a03f937f6 100644 --- a/drivers/platform/x86/gigabyte-wmi.c +++ b/drivers/platform/x86/gigabyte-wmi.c @@ -140,6 +140,7 @@ static u8 gigabyte_wmi_detect_sensor_usability(struct wmi_device *wdev) }} static const struct dmi_system_id gigabyte_wmi_known_working_platforms[] = { + DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B450M S2H V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 AORUS ELITE V2"), DMI_EXACT_MATCH_GIGABYTE_BOARD_NAME("B550 GAMING X V2"), From 3b844826b6c6affa80755254da322b017358a2f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 5 Aug 2021 10:04:43 -0700 Subject: [PATCH 177/187] pipe: avoid unnecessary EPOLLET wakeups under normal loads I had forgotten just how sensitive hackbench is to extra pipe wakeups, and commit 3a34b13a88ca ("pipe: make pipe writes always wake up readers") ended up causing a quite noticeable regression on larger machines. Now, hackbench isn't necessarily a hugely meaningful benchmark, and it's not clear that this matters in real life all that much, but as Mel points out, it's used often enough when comparing kernels and so the performance regression shows up like a sore thumb. It's easy enough to fix at least for the common cases where pipes are used purely for data transfer, and you never have any exciting poll usage at all. So set a special 'poll_usage' flag when there is polling activity, and make the ugly "EPOLLET has crazy legacy expectations" semantics explicit to only that case. I would love to limit it to just the broken EPOLLET case, but the pipe code can't see the difference between epoll and regular select/poll, so any non-read/write waiting will trigger the extra wakeup behavior. That is sufficient for at least the hackbench case. Apart from making the odd extra wakeup cases more explicitly about EPOLLET, this also makes the extra wakeup be at the _end_ of the pipe write, not at the first write chunk. That is actually much saner semantics (as much as you can call any of the legacy edge-triggered expectations for EPOLLET "sane") since it means that you know the wakeup will happen once the write is done, rather than possibly in the middle of one. [ For stable people: I'm putting a "Fixes" tag on this, but I leave it up to you to decide whether you actually want to backport it or not. It likely has no impact outside of synthetic benchmarks - Linus ] Link: https://lore.kernel.org/lkml/20210802024945.GA8372@xsang-OptiPlex-9020/ Fixes: 3a34b13a88ca ("pipe: make pipe writes always wake up readers") Reported-by: kernel test robot Tested-by: Sandeep Patil Tested-by: Mel Gorman Signed-off-by: Linus Torvalds --- fs/pipe.c | 15 +++++++++------ include/linux/pipe_fs_i.h | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 8e6ef62aeb1c..678dee2a8228 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -444,9 +444,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) #endif /* - * Epoll nonsensically wants a wakeup whether the pipe - * was already empty or not. - * * If it wasn't empty we try to merge new data into * the last buffer. * @@ -455,9 +452,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * spanning multiple pages. */ head = pipe->head; - was_empty = true; + was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); - if (chars && !pipe_empty(head, pipe->tail)) { + if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; @@ -590,8 +587,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs + * + * Epoll nonsensically wants a wakeup whether the pipe + * was already empty or not. */ - if (was_empty) { + if (was_empty || pipe->poll_usage) { wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } @@ -654,6 +654,9 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; + /* Epoll has some historical nasty semantics, this enables them */ + pipe->poll_usage = 1; + /* * Reading pipe state only -- no need for acquiring the semaphore. * diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5d2705f1d01c..fc5642431b92 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -48,6 +48,7 @@ struct pipe_buffer { * @files: number of struct file referring this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter + * @poll_usage: is this pipe used for epoll, which has crazy wakeups? * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers @@ -70,6 +71,7 @@ struct pipe_inode_info { unsigned int files; unsigned int r_counter; unsigned int w_counter; + unsigned int poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; From c1930148a3941f891ddbd76fceaa4e10a957ccf2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 17 Aug 2021 19:04:25 +0300 Subject: [PATCH 178/187] net: mscc: ocelot: allow forwarding from bridge ports to the tag_8021q CPU port Currently we are unable to ping a bridge on top of a felix switch which uses the ocelot-8021q tagger. The packets are dropped on the ingress of the user port and the 'drop_local' counter increments (the counter which denotes drops due to no valid destinations). Dumping the PGID tables, it becomes clear that the PGID_SRC of the user port is zero, so it has no valid destinations. But looking at the code, the cpu_fwd_mask (the bit mask of DSA tag_8021q ports) is clearly missing from the forwarding mask of ports that are under a bridge. So this has always been broken. Looking at the version history of the patch, in v7 https://patchwork.kernel.org/project/netdevbpf/patch/20210125220333.1004365-12-olteanv@gmail.com/ the code looked like this: /* Standalone ports forward only to DSA tag_8021q CPU ports */ unsigned long mask = cpu_fwd_mask; (...) } else if (ocelot->bridge_fwd_mask & BIT(port)) { mask |= ocelot->bridge_fwd_mask & ~BIT(port); while in v8 (the merged version) https://patchwork.kernel.org/project/netdevbpf/patch/20210129010009.3959398-12-olteanv@gmail.com/ it looked like this: unsigned long mask; (...) } else if (ocelot->bridge_fwd_mask & BIT(port)) { mask = ocelot->bridge_fwd_mask & ~BIT(port); So the breakage was introduced between v7 and v8 of the patch. Fixes: e21268efbe26 ("net: dsa: felix: perform switch setup for tag_8021q") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20210817160425.3702809-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index adfb9781799e..2948d731a1c1 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1334,6 +1334,7 @@ void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot) struct net_device *bond = ocelot_port->bond; mask = ocelot_get_bridge_fwd_mask(ocelot, bridge); + mask |= cpu_fwd_mask; mask &= ~BIT(port); if (bond) { mask &= ~ocelot_get_bond_mask(ocelot, bond, From fb4b1373dcab086d0619c29310f0466a0b2ceb8a Mon Sep 17 00:00:00 2001 From: Gerd Rausch Date: Tue, 17 Aug 2021 10:04:37 -0700 Subject: [PATCH 179/187] net/rds: dma_map_sg is entitled to merge entries Function "dma_map_sg" is entitled to merge adjacent entries and return a value smaller than what was passed as "nents". Subsequently "ib_map_mr_sg" needs to work with this value ("sg_dma_len") rather than the original "nents" parameter ("sg_len"). This old RDS bug was exposed and reliably causes kernel panics (using RDMA operations "rds-stress -D") on x86_64 starting with: commit c588072bba6b ("iommu/vt-d: Convert intel iommu driver to the iommu ops") Simply put: Linux 5.11 and later. Signed-off-by: Gerd Rausch Acked-by: Santosh Shilimkar Link: https://lore.kernel.org/r/60efc69f-1f35-529d-a7ef-da0549cad143@oracle.com Signed-off-by: Jakub Kicinski --- net/rds/ib_frmr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 9b6ffff72f2d..28c1b0022178 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -131,9 +131,9 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_len)) + if (unlikely(ret != ibmr->sg_dma_len)) return ret < 0 ? ret : -EINVAL; if (cmpxchg(&frmr->fr_state, From a0eea5f10eeb5180d115452b0d77afa6603dfe18 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 18 Aug 2021 16:42:36 -0700 Subject: [PATCH 180/187] mptcp: fix memory leak on address flush The endpoint cleanup path is prone to a memory leak, as reported by syzkaller: BUG: memory leak unreferenced object 0xffff88810680ea00 (size 64): comm "syz-executor.6", pid 6191, jiffies 4295756280 (age 24.138s) hex dump (first 32 bytes): 58 75 7d 3c 80 88 ff ff 22 01 00 00 00 00 ad de Xu}<...."....... 01 00 02 00 00 00 00 00 ac 1e 00 07 00 00 00 00 ................ backtrace: [<0000000072a9f72a>] kmalloc include/linux/slab.h:591 [inline] [<0000000072a9f72a>] mptcp_nl_cmd_add_addr+0x287/0x9f0 net/mptcp/pm_netlink.c:1170 [<00000000f6e931bf>] genl_family_rcv_msg_doit.isra.0+0x225/0x340 net/netlink/genetlink.c:731 [<00000000f1504a2c>] genl_family_rcv_msg net/netlink/genetlink.c:775 [inline] [<00000000f1504a2c>] genl_rcv_msg+0x341/0x5b0 net/netlink/genetlink.c:792 [<0000000097e76f6a>] netlink_rcv_skb+0x148/0x430 net/netlink/af_netlink.c:2504 [<00000000ceefa2b8>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:803 [<000000008ff91aec>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000008ff91aec>] netlink_unicast+0x537/0x750 net/netlink/af_netlink.c:1340 [<0000000041682c35>] netlink_sendmsg+0x846/0xd80 net/netlink/af_netlink.c:1929 [<00000000df3aa8e7>] sock_sendmsg_nosec net/socket.c:704 [inline] [<00000000df3aa8e7>] sock_sendmsg+0x14e/0x190 net/socket.c:724 [<000000002154c54c>] ____sys_sendmsg+0x709/0x870 net/socket.c:2403 [<000000001aab01d7>] ___sys_sendmsg+0xff/0x170 net/socket.c:2457 [<00000000fa3b1446>] __sys_sendmsg+0xe5/0x1b0 net/socket.c:2486 [<00000000db2ee9c7>] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [<00000000db2ee9c7>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80 [<000000005873517d>] entry_SYSCALL_64_after_hwframe+0x44/0xae We should not require an allocation to cleanup stuff. Rework the code a bit so that the additional RCU work is no more needed. Fixes: 1729cf186d8a ("mptcp: create the listening socket for new port") Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 44 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 56263c2c4014..7b3794459783 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1135,36 +1135,12 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, return 0; } -struct addr_entry_release_work { - struct rcu_work rwork; - struct mptcp_pm_addr_entry *entry; -}; - -static void mptcp_pm_release_addr_entry(struct work_struct *work) +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) { - struct addr_entry_release_work *w; - struct mptcp_pm_addr_entry *entry; - - w = container_of(to_rcu_work(work), struct addr_entry_release_work, rwork); - entry = w->entry; - if (entry) { - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); - } - kfree(w); -} - -static void mptcp_pm_free_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - struct addr_entry_release_work *w; - - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - INIT_RCU_WORK(&w->rwork, mptcp_pm_release_addr_entry); - w->entry = entry; - queue_rcu_work(system_wq, &w->rwork); - } + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); } static int mptcp_nl_remove_id_zero_address(struct net *net, @@ -1244,7 +1220,8 @@ static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&pernet->lock); mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), &entry->addr); - mptcp_pm_free_addr_entry(entry); + synchronize_rcu(); + __mptcp_pm_release_addr_entry(entry); return ret; } @@ -1297,6 +1274,7 @@ static void mptcp_nl_remove_addrs_list(struct net *net, } } +/* caller must ensure the RCU grace period is already elapsed */ static void __flush_addrs(struct list_head *list) { while (!list_empty(list)) { @@ -1305,7 +1283,7 @@ static void __flush_addrs(struct list_head *list) cur = list_entry(list->next, struct mptcp_pm_addr_entry, list); list_del_rcu(&cur->list); - mptcp_pm_free_addr_entry(cur); + __mptcp_pm_release_addr_entry(cur); } } @@ -1329,6 +1307,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); spin_unlock_bh(&pernet->lock); mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); __flush_addrs(&free_list); return 0; } @@ -1939,7 +1918,8 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); /* net is removed from namespace list, can't race with - * other modifiers + * other modifiers, also netns core already waited for a + * RCU grace period. */ __flush_addrs(&pernet->local_addr_list); } From 67b12f792d5eaeb8b4fca3b2053e6b819eb3bf0f Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Wed, 18 Aug 2021 16:42:37 -0700 Subject: [PATCH 181/187] mptcp: full fully established support after ADD_ADDR If directly after an MP_CAPABLE 3WHS, the client receives an ADD_ADDR with HMAC from the server, it is enough to switch to a "fully established" mode because it has received more MPTCP options. It was then OK to enable the "fully_established" flag on the MPTCP socket. Still, best to check if the ADD_ADDR looks valid by looking if it contains an HMAC (no 'echo' bit). If an ADD_ADDR echo is received while we are not in "fully established" mode, it is strange and then we should not switch to this mode now. But that is not enough. On one hand, the path-manager has be notified the state has changed. On the other hand, the "fully_established" flag on the subflow socket should be turned on as well not to re-send the MP_CAPABLE 3rd ACK content with the next ACK. Fixes: 84dfe3677a6f ("mptcp: send out dedicated ADD_ADDR packet") Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/options.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 4452455aef7f..7adcbc1f7d49 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -885,20 +885,16 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return subflow->mp_capable; } - if (mp_opt->dss && mp_opt->use_ack) { + if ((mp_opt->dss && mp_opt->use_ack) || + (mp_opt->add_addr && !mp_opt->echo)) { /* subflows are fully established as soon as we get any - * additional ack. + * additional ack, including ADD_ADDR. */ subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); goto fully_established; } - if (mp_opt->add_addr) { - WRITE_ONCE(msk->fully_established, true); - return true; - } - /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. From a876a33d2a1102f99fc782fefb784f4dd4841d8c Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Thu, 19 Aug 2021 11:05:36 +0800 Subject: [PATCH 182/187] r8152: fix writing USB_BP2_EN The register of USB_BP2_EN is 16 bits, so we should use ocp_write_word(), not ocp_write_byte(). Fixes: 9370f2d05a2a ("support request_firmware for RTL8153") Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index e09b107b5c99..3fd17b6dc61d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3965,7 +3965,7 @@ static void rtl_clear_bp(struct r8152 *tp, u16 type) case RTL_VER_15: default: if (type == MCU_TYPE_USB) { - ocp_write_byte(tp, MCU_TYPE_USB, USB_BP2_EN, 0); + ocp_write_word(tp, MCU_TYPE_USB, USB_BP2_EN, 0); ocp_write_word(tp, MCU_TYPE_USB, USB_BP_8, 0); ocp_write_word(tp, MCU_TYPE_USB, USB_BP_9, 0); From 6633fb83f1faddbfcac09e35edcae96bd0468335 Mon Sep 17 00:00:00 2001 From: Hayes Wang Date: Thu, 19 Aug 2021 11:05:37 +0800 Subject: [PATCH 183/187] r8152: fix the maximum number of PLA bp for RTL8153C The maximum PLA bp number of RTL8153C is 16, not 8. That is, the bp 0 ~ 15 are at 0xfc28 ~ 0xfc46, and the bp_en is at 0xfc48. Fixes: 195aae321c82 ("r8152: support new chips") Signed-off-by: Hayes Wang Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 3fd17b6dc61d..79832374f78d 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -3955,13 +3955,24 @@ static void rtl_clear_bp(struct r8152 *tp, u16 type) case RTL_VER_06: ocp_write_byte(tp, type, PLA_BP_EN, 0); break; + case RTL_VER_14: + ocp_write_word(tp, type, USB_BP2_EN, 0); + + ocp_write_word(tp, type, USB_BP_8, 0); + ocp_write_word(tp, type, USB_BP_9, 0); + ocp_write_word(tp, type, USB_BP_10, 0); + ocp_write_word(tp, type, USB_BP_11, 0); + ocp_write_word(tp, type, USB_BP_12, 0); + ocp_write_word(tp, type, USB_BP_13, 0); + ocp_write_word(tp, type, USB_BP_14, 0); + ocp_write_word(tp, type, USB_BP_15, 0); + break; case RTL_VER_08: case RTL_VER_09: case RTL_VER_10: case RTL_VER_11: case RTL_VER_12: case RTL_VER_13: - case RTL_VER_14: case RTL_VER_15: default: if (type == MCU_TYPE_USB) { @@ -4331,7 +4342,6 @@ static bool rtl8152_is_fw_mac_ok(struct r8152 *tp, struct fw_mac *mac) case RTL_VER_11: case RTL_VER_12: case RTL_VER_13: - case RTL_VER_14: case RTL_VER_15: fw_reg = 0xf800; bp_ba_addr = PLA_BP_BA; @@ -4339,6 +4349,13 @@ static bool rtl8152_is_fw_mac_ok(struct r8152 *tp, struct fw_mac *mac) bp_start = PLA_BP_0; max_bp = 8; break; + case RTL_VER_14: + fw_reg = 0xf800; + bp_ba_addr = PLA_BP_BA; + bp_en_addr = USB_BP2_EN; + bp_start = PLA_BP_0; + max_bp = 16; + break; default: goto out; } From a222be597e316389f9f8c26033352c124ce93056 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Wed, 18 Aug 2021 10:42:16 -0700 Subject: [PATCH 184/187] i40e: Fix ATR queue selection Without this patch, ATR does not work. Receive/transmit uses queue selection based on SW DCB hashing method. If traffic classes are not configured for PF, then use netdev_pick_tx function for selecting queue for packet transmission. Instead of calling i40e_swdcb_skb_tx_hash, call netdev_pick_tx, which ensures that packet is transmitted/received from CPU that is running the application. Reproduction steps: 1. Load i40e driver 2. Map each MSI interrupt of i40e port for each CPU 3. Disable ntuple, enable ATR i.e.: ethtool -K $interface ntuple off ethtool --set-priv-flags $interface flow-director-atr 4. Run application that is generating traffic and is bound to a single CPU, i.e.: taskset -c 9 netperf -H 1.1.1.1 -t TCP_RR -l 10 5. Observe behavior: Application's traffic should be restricted to the CPU provided in taskset. Fixes: 89ec1f0886c1 ("i40e: Fix queue-to-TC mapping on Tx") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Arkadiusz Kubalewski Tested-by: Dave Switzer Signed-off-by: Tony Nguyen Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 3f25bd8c4924..10a83e5385c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3663,8 +3663,7 @@ u16 i40e_lan_select_queue(struct net_device *netdev, /* is DCB enabled at all? */ if (vsi->tc_config.numtc == 1) - return i40e_swdcb_skb_tx_hash(netdev, skb, - netdev->real_num_tx_queues); + return netdev_pick_tx(netdev, skb, sb_dev); prio = skb->priority; hw = &vsi->back->hw; From 8da80c9d50220a8e4190a4eaa0dd6aeefcbbb5bf Mon Sep 17 00:00:00 2001 From: Sylwester Dziedziuch Date: Wed, 18 Aug 2021 10:42:17 -0700 Subject: [PATCH 185/187] iavf: Fix ping is lost after untrusted VF had tried to change MAC Make changes to MAC address dependent on the response of PF. Disallow changes to HW MAC address and MAC filter from untrusted VF, thanks to that ping is not lost if VF tries to change MAC. Add a new field in iavf_mac_filter, to indicate whether there was response from PF for given filter. Based on this field pass or discard the filter. If untrusted VF tried to change it's address, it's not changed. Still filter was changed, because of that ping couldn't go through. Fixes: c5c922b3e09b ("iavf: fix MAC address setting for VFs when filter is rejected") Signed-off-by: Przemyslaw Patynowski Signed-off-by: Sylwester Dziedziuch Signed-off-by: Mateusz Palczewski Tested-by: Gurucharan G Signed-off-by: Tony Nguyen Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + .../net/ethernet/intel/iavf/iavf_virtchnl.c | 47 ++++++++++++++++++- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index e8bd04100ecd..90793b36126e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -136,6 +136,7 @@ struct iavf_q_vector { struct iavf_mac_filter { struct list_head list; u8 macaddr[ETH_ALEN]; + bool is_new_mac; /* filter is new, wait for PF decision */ bool remove; /* filter needs to be removed */ bool add; /* filter needs to be added */ }; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 244ec74ceca7..606a01ce4073 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -751,6 +751,7 @@ struct iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, list_add_tail(&f->list, &adapter->mac_filter_list); f->add = true; + f->is_new_mac = true; adapter->aq_required |= IAVF_FLAG_AQ_ADD_MAC_FILTER; } else { f->remove = false; diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 0eab3c43bdc5..3c735968e1b8 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -540,6 +540,47 @@ void iavf_del_ether_addrs(struct iavf_adapter *adapter) kfree(veal); } +/** + * iavf_mac_add_ok + * @adapter: adapter structure + * + * Submit list of filters based on PF response. + **/ +static void iavf_mac_add_ok(struct iavf_adapter *adapter) +{ + struct iavf_mac_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) { + f->is_new_mac = false; + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); +} + +/** + * iavf_mac_add_reject + * @adapter: adapter structure + * + * Remove filters from list based on PF response. + **/ +static void iavf_mac_add_reject(struct iavf_adapter *adapter) +{ + struct net_device *netdev = adapter->netdev; + struct iavf_mac_filter *f, *ftmp; + + spin_lock_bh(&adapter->mac_vlan_list_lock); + list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) { + if (f->remove && ether_addr_equal(f->macaddr, netdev->dev_addr)) + f->remove = false; + + if (f->is_new_mac) { + list_del(&f->list); + kfree(f); + } + } + spin_unlock_bh(&adapter->mac_vlan_list_lock); +} + /** * iavf_add_vlans * @adapter: adapter structure @@ -1492,6 +1533,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, case VIRTCHNL_OP_ADD_ETH_ADDR: dev_err(&adapter->pdev->dev, "Failed to add MAC filter, error %s\n", iavf_stat_str(&adapter->hw, v_retval)); + iavf_mac_add_reject(adapter); /* restore administratively set MAC address */ ether_addr_copy(adapter->hw.mac.addr, netdev->dev_addr); break; @@ -1639,10 +1681,11 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, } } switch (v_opcode) { - case VIRTCHNL_OP_ADD_ETH_ADDR: { + case VIRTCHNL_OP_ADD_ETH_ADDR: + if (!v_retval) + iavf_mac_add_ok(adapter); if (!ether_addr_equal(netdev->dev_addr, adapter->hw.mac.addr)) ether_addr_copy(netdev->dev_addr, adapter->hw.mac.addr); - } break; case VIRTCHNL_OP_GET_STATS: { struct iavf_eth_stats *stats = From fa05bdb89b01b098aad19ec0ebc4d1cc7b11177e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 19 Aug 2021 13:58:42 +0300 Subject: [PATCH 186/187] Revert "flow_offload: action should not be NULL when it is referenced" This reverts commit 9ea3e52c5bc8bb4a084938dc1e3160643438927a. Cited commit added a check to make sure 'action' is not NULL, but 'action' is already dereferenced before the check, when calling flow_offload_has_one_action(). Therefore, the check does not make any sense and results in a smatch warning: include/net/flow_offload.h:322 flow_action_mixed_hw_stats_check() warn: variable dereferenced before check 'action' (see line 319) Fix by reverting this commit. Cc: gushengxian Fixes: 9ea3e52c5bc8 ("flow_offload: action should not be NULL when it is referenced") Signed-off-by: Ido Schimmel Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20210819105842.1315705-1-idosch@idosch.org Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index f3c2841566a0..1b9d75aedb22 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -319,14 +319,12 @@ flow_action_mixed_hw_stats_check(const struct flow_action *action, if (flow_offload_has_one_action(action)) return true; - if (action) { - flow_action_for_each(i, action_entry, action) { - if (i && action_entry->hw_stats != last_hw_stats) { - NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); - return false; - } - last_hw_stats = action_entry->hw_stats; + flow_action_for_each(i, action_entry, action) { + if (i && action_entry->hw_stats != last_hw_stats) { + NL_SET_ERR_MSG_MOD(extack, "Mixing HW stats types for actions is not supported"); + return false; } + last_hw_stats = action_entry->hw_stats; } return true; } From cd0a719fbd702eb4b455a6ad986483750125588a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 19 Aug 2021 17:17:55 +0300 Subject: [PATCH 187/187] net: dpaa2-switch: disable the control interface on error path Currently dpaa2_switch_takedown has a funny name and does not do the opposite of dpaa2_switch_init, which makes probing fail when we need to handle an -EPROBE_DEFER. A sketch of what dpaa2_switch_init does: dpsw_open dpaa2_switch_detect_features dpsw_reset for (i = 0; i < ethsw->sw_attr.num_ifs; i++) { dpsw_if_disable dpsw_if_set_stp dpsw_vlan_remove_if_untagged dpsw_if_set_tci dpsw_vlan_remove_if } dpsw_vlan_remove alloc_ordered_workqueue dpsw_fdb_remove dpaa2_switch_ctrl_if_setup When dpaa2_switch_takedown is called from the error path of dpaa2_switch_probe(), the control interface, enabled by dpaa2_switch_ctrl_if_setup from dpaa2_switch_init, remains enabled, because dpaa2_switch_takedown does not call dpaa2_switch_ctrl_if_teardown. Since dpaa2_switch_probe might fail due to EPROBE_DEFER of a PHY, this means that a second probe of the driver will happen with the control interface directly enabled. This will trigger a second error: [ 93.273528] fsl_dpaa2_switch dpsw.0: dpsw_ctrl_if_set_pools() failed [ 93.281966] fsl_dpaa2_switch dpsw.0: fsl_mc_driver_probe failed: -13 [ 93.288323] fsl_dpaa2_switch: probe of dpsw.0 failed with error -13 Which if we investigate the /dev/dpaa2_mc_console log, we find out is caused by: [E, ctrl_if_set_pools:2211, DPMNG] ctrl_if must be disabled So make dpaa2_switch_takedown do the opposite of dpaa2_switch_init (in reasonable limits, no reason to change STP state, re-add VLANs etc), and rename it to something more conventional, like dpaa2_switch_teardown. Fixes: 613c0a5810b7 ("staging: dpaa2-switch: enable the control interface") Signed-off-by: Vladimir Oltean Reviewed-by: Ioana Ciornei Link: https://lore.kernel.org/r/20210819141755.1931423-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- .../ethernet/freescale/dpaa2/dpaa2-switch.c | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c index 68b78642c045..98cc0133c343 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c @@ -3038,17 +3038,6 @@ static int dpaa2_switch_port_init(struct ethsw_port_priv *port_priv, u16 port) return err; } -static void dpaa2_switch_takedown(struct fsl_mc_device *sw_dev) -{ - struct device *dev = &sw_dev->dev; - struct ethsw_core *ethsw = dev_get_drvdata(dev); - int err; - - err = dpsw_close(ethsw->mc_io, 0, ethsw->dpsw_handle); - if (err) - dev_warn(dev, "dpsw_close err %d\n", err); -} - static void dpaa2_switch_ctrl_if_teardown(struct ethsw_core *ethsw) { dpsw_ctrl_if_disable(ethsw->mc_io, 0, ethsw->dpsw_handle); @@ -3058,6 +3047,21 @@ static void dpaa2_switch_ctrl_if_teardown(struct ethsw_core *ethsw) dpaa2_switch_free_dpbp(ethsw); } +static void dpaa2_switch_teardown(struct fsl_mc_device *sw_dev) +{ + struct device *dev = &sw_dev->dev; + struct ethsw_core *ethsw = dev_get_drvdata(dev); + int err; + + dpaa2_switch_ctrl_if_teardown(ethsw); + + destroy_workqueue(ethsw->workqueue); + + err = dpsw_close(ethsw->mc_io, 0, ethsw->dpsw_handle); + if (err) + dev_warn(dev, "dpsw_close err %d\n", err); +} + static int dpaa2_switch_remove(struct fsl_mc_device *sw_dev) { struct ethsw_port_priv *port_priv; @@ -3068,8 +3072,6 @@ static int dpaa2_switch_remove(struct fsl_mc_device *sw_dev) dev = &sw_dev->dev; ethsw = dev_get_drvdata(dev); - dpaa2_switch_ctrl_if_teardown(ethsw); - dpaa2_switch_teardown_irqs(sw_dev); dpsw_disable(ethsw->mc_io, 0, ethsw->dpsw_handle); @@ -3084,9 +3086,7 @@ static int dpaa2_switch_remove(struct fsl_mc_device *sw_dev) kfree(ethsw->acls); kfree(ethsw->ports); - dpaa2_switch_takedown(sw_dev); - - destroy_workqueue(ethsw->workqueue); + dpaa2_switch_teardown(sw_dev); fsl_mc_portal_free(ethsw->mc_io); @@ -3199,7 +3199,7 @@ static int dpaa2_switch_probe(struct fsl_mc_device *sw_dev) GFP_KERNEL); if (!(ethsw->ports)) { err = -ENOMEM; - goto err_takedown; + goto err_teardown; } ethsw->fdbs = kcalloc(ethsw->sw_attr.num_ifs, sizeof(*ethsw->fdbs), @@ -3270,8 +3270,8 @@ static int dpaa2_switch_probe(struct fsl_mc_device *sw_dev) err_free_ports: kfree(ethsw->ports); -err_takedown: - dpaa2_switch_takedown(sw_dev); +err_teardown: + dpaa2_switch_teardown(sw_dev); err_free_cmdport: fsl_mc_portal_free(ethsw->mc_io);