From f8447cf22ec39b2ec3498f0205d4fde3d7efcb27 Mon Sep 17 00:00:00 2001
From: youzhongyang
Date: Wed, 24 May 2023 15:23:42 -0400
Subject: [PATCH 01/38] Linux 6.4 compat: reclaimed_slab renamed to reclaimed

Reviewed-by: Richard Yao
Reviewed-by: Brian Atkinson
Reviewed-by: Brian Behlendorf
Signed-off-by: Youzhong Yang
Closes #14891
---
 config/kernel-reclaim_state.m4       | 26 ++++++++++++++++++++++++++
 config/kernel.m4                     |  2 ++
 module/os/linux/spl/spl-kmem-cache.c |  5 ++++-
 module/os/linux/zfs/arc_os.c         |  4 ++++
 4 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 config/kernel-reclaim_state.m4

diff --git a/config/kernel-reclaim_state.m4 b/config/kernel-reclaim_state.m4
new file mode 100644
index 000000000000..9936b3c1001f
--- /dev/null
+++ b/config/kernel-reclaim_state.m4
@@ -0,0 +1,26 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [
+	dnl #
+	dnl # 6.4 API change
+	dnl # The reclaimed_slab of struct reclaim_state
+	dnl # is renamed to reclaimed
+	dnl #
+	ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [
+		#include <linux/swap.h>
+		static const struct reclaim_state
+		    rs __attribute__ ((unused)) = {
+			.reclaimed = 100,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [
+	AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field])
+	ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1,
+		    [struct reclaim_state has reclaimed])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
diff --git a/config/kernel.m4 b/config/kernel.m4
index 439ffdf5a898..cb7e736c9a43 100644
--- a/config/kernel.m4
+++ b/config/kernel.m4
@@ -153,6 +153,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_IATTR_VFSID
 	ZFS_AC_KERNEL_SRC_FILEMAP
 	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
+	ZFS_AC_KERNEL_SRC_RECLAIMED
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -285,6 +286,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_IATTR_VFSID
 	ZFS_AC_KERNEL_FILEMAP
 	ZFS_AC_KERNEL_WRITEPAGE_T
+	ZFS_AC_KERNEL_RECLAIMED
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
index 963e7a1ec96a..745d03012f9d 100644
--- a/module/os/linux/spl/spl-kmem-cache.c
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -182,8 +182,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 	 * of that infrastructure we are responsible for incrementing it.
 	 */
 	if (current->reclaim_state)
+#ifdef HAVE_RECLAIM_STATE_RECLAIMED
+		current->reclaim_state->reclaimed += size >> PAGE_SHIFT;
+#else
 		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
-
+#endif
 	vfree(ptr);
 }

diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c
index b7d6053529b4..29a8802b8367 100644
--- a/module/os/linux/zfs/arc_os.c
+++ b/module/os/linux/zfs/arc_os.c
@@ -219,7 +219,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
 	arc_reduce_target_size(ptob(sc->nr_to_scan));
 	arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
 	if (current->reclaim_state != NULL)
+#ifdef HAVE_RECLAIM_STATE_RECLAIMED
+		current->reclaim_state->reclaimed += sc->nr_to_scan;
+#else
 		current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+#endif

 	/*
 	 * We are experiencing memory pressure which the arc_evict_zthr was

From 9d618615d1ede4dd40a69386bc300580550fd4d0 Mon Sep 17 00:00:00 2001
From: Akash B
Date: Thu, 25 May 2023 00:58:09 +0530
Subject: [PATCH 02/38] Fix concurrent resilvers initiated at the same time

For draid vdevs it was possible to initiate both the sequential and
healing resilver at the same time.
This fixes the following two scenarios.

1) There's a window where a sequential rebuild can be started via ZED
even if a healing resilver has been scheduled.
   - This is fixed by adding an additional check in spa_vdev_attach()
for any scheduled resilver, and returning the appropriate error code
when a resilver is already in progress.

2) It was possible for zpool clear to start a healing resilver when
it wasn't needed at all. This occurs because during vdev_open() the
device is presumed to be healthy; it is only once the device has been
validated by vdev_validate() that it is set unavailable. However, by
that point an async resilver will have already been requested if the
DTL isn't empty.
   - This is fixed by cancelling the SPA_ASYNC_RESILVER request
immediately at the end of vdev_reopen() when a resilver is unneeded.

Finally, a test case was added to ZTS for verification.

Reviewed-by: Brian Behlendorf
Reviewed-by: Dipak Ghosh
Signed-off-by: Akash B
Closes #14881
Closes #14892
---
 module/zfs/spa.c                  |   5 +-
 module/zfs/vdev.c                 |  13 ++-
 tests/runfiles/common.run         |   3 +-
 tests/zfs-tests/tests/Makefile.am |   1 +
 .../zpool_resilver_concurrent.ksh | 101 ++++++++++++++++++
 5 files changed, 120 insertions(+), 3 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh

diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 1fc2c5e8c55d..27bbb8f09259 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -33,6 +33,7 @@
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley
+ * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  */

 /*
@@ -6874,9 +6875,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

-		if (dsl_scan_resilvering(spa_get_dsl(spa)))
+		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
+		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RESILVER_IN_PROGRESS));
+		}
 	} else {
 		if (vdev_rebuild_active(rvd))
 			return (spa_vdev_exit(spa, NULL, txg,
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index c243dddb7e61..58dcd9f79799 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -29,7 +29,7 @@
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2021, Klara Inc.
- * Copyright [2021] Hewlett Packard Enterprise Development LP
+ * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
  */

 #include <sys/zfs_context.h>
@@ -2699,6 +2699,17 @@ vdev_reopen(vdev_t *vd)
 		(void) vdev_validate(vd);
 	}

+	/*
+	 * Recheck whether a resilver is still needed and cancel any
+	 * scheduled resilver if it is unneeded.
+	 */
+	if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
+	    spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
+		mutex_enter(&spa->spa_async_lock);
+		spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
+		mutex_exit(&spa->spa_async_lock);
+	}
+
 	/*
 	 * Reassess parent vdev's health.
*/ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9ed1a6d37a97..10525289a3bd 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -472,7 +472,8 @@ tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] -tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', + 'zpool_resilver_concurrent'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ad4aec543299..129893cd61f3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1142,6 +1142,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ + functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh new file mode 100755 index 000000000000..4c3b09796869 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 Hewlett Packard Enterprise Development LP. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify 'zpool clear' doesn't cause concurrent resilvers +# +# STRATEGY: +# 1. Create N(10) virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Force-fault 2 vdevs and verify distributed spare is kicked in. +# 5. Free the distributed spare by replacing the faulty drive. +# 6. Run zpool clear and verify that it does not initiate 2 resilvers +# concurrently while distributed spare gets kicked in. 
+# + +verify_runnable "global" + +typeset -ir devs=10 +typeset -ir nparity=1 +typeset -ir ndata=8 +typeset -ir dspare=1 + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$BASEDIR/vdev$i" + done + + for dir in $BASEDIR; do + if [[ -d $dir ]]; then + log_must rm -rf $dir + fi + done + + zed_stop + zed_cleanup +} + +log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers" +log_onexit cleanup + +setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs + +# ZED needed for sequential resilver +zed_setup +log_must zed_start + +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5 +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6 + +log_must zpool labelclear -f $BASEDIR/vdev5 +log_must zpool labelclear -f $BASEDIR/vdev6 + +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5 +sync_pool $TESTPOOL + +log_must zpool events -c +log_must zpool clear $TESTPOOL +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +nof_resilver=$(zpool events | grep -c resilver_start) +if [ $nof_resilver = 1 ] ; then + log_must verify_pool $TESTPOOL + log_pass "zpool clear on draid pool doesn't cause concurrent resilvers" +else + log_fail "FAIL: sequential and healing resilver initiated concurrently" +fi From 79b20949b25c8db4d379f6486b0835a6613b480c Mon Sep 17 00:00:00 2001 From: Dimitri John Ledkov <19779+xnox@users.noreply.github.com> Date: Wed, 24 May 2023 20:31:28 +0100 Subject: [PATCH 03/38] systemd: Use non-absolute paths in Exec* lines Since systemd v239, Exec* binaries are resolved from PATH when they are not-absolute. Switch to this by default for ease of downstream maintenance. Many downstream distributions move individual binaries to locations that existing compile-time configurations cannot accommodate. 
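As an illustrative sketch (not taken from this change itself; the real
unit updates are in the diff below), a unit template that previously
pinned the binary path at build time through the autoconf @sbindir@
substitution:

    ExecStart=@sbindir@/zpool import -aN

can now name the binary alone, and systemd (v239 or newer) resolves it
at runtime from its compiled-in search path:

    ExecStart=zpool import -aN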
Reviewed-by: Brian Behlendorf Signed-off-by: Dimitri John Ledkov Closes #14880 --- etc/systemd/system/zfs-import-cache.service.in | 2 +- etc/systemd/system/zfs-import-scan.service.in | 2 +- etc/systemd/system/zfs-mount.service.in | 2 +- etc/systemd/system/zfs-scrub@.service.in | 10 +++++----- etc/systemd/system/zfs-share.service.in | 2 +- etc/systemd/system/zfs-trim@.service.in | 10 +++++----- etc/systemd/system/zfs-volume-wait.service.in | 2 +- etc/systemd/system/zfs-zed.service.in | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in index fd822989da93..6d9a065e7e3a 100644 --- a/etc/systemd/system/zfs-import-cache.service.in +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -15,7 +15,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS +ExecStart=zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in index c5dd45d87e68..fb524f3b0889 100644 --- a/etc/systemd/system/zfs-import-scan.service.in +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS +ExecStart=zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index 66d894923f4a..fc4e1c49f1c5 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -12,7 +12,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zfs mount -a +ExecStart=zfs mount -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-scrub@.service.in b/etc/systemd/system/zfs-scrub@.service.in index 8ffffeb0cf6c..2bb2757d5e97 100644 --- a/etc/systemd/system/zfs-scrub@.service.in +++ b/etc/systemd/system/zfs-scrub@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=/bin/sh -c '\ -if @sbindir@/zpool status %i | grep -q "scrub in progress"; then\ -exec @sbindir@/zpool wait -t scrub %i;\ -else exec @sbindir@/zpool scrub -w %i; fi' -ExecStop=-/bin/sh -c '@sbindir@/zpool scrub -p %i 2>/dev/null || true' +ExecStart=sh -c '\ +if zpool status %i | grep -q "scrub in progress"; then\ +exec zpool wait -t scrub %i;\ +else exec zpool scrub -w %i; fi' +ExecStop=-sh -c 'zpool scrub -p %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 1a6342a06fec..dd321f490fe6 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zfs share -a +ExecStart=zfs share -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-trim@.service.in b/etc/systemd/system/zfs-trim@.service.in index 423fb448c16f..f55e36cd8454 100644 --- a/etc/systemd/system/zfs-trim@.service.in +++ 
b/etc/systemd/system/zfs-trim@.service.in
@@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs

 [Service]
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=/bin/sh -c '\
-if @sbindir@/zpool status %i | grep -q "(trimming)"; then\
-exec @sbindir@/zpool wait -t trim %i;\
-else exec @sbindir@/zpool trim -w %i; fi'
-ExecStop=-/bin/sh -c '@sbindir@/zpool trim -s %i 2>/dev/null || true'
+ExecStart=sh -c '\
+if zpool status %i | grep -q "(trimming)"; then\
+exec zpool wait -t trim %i;\
+else exec zpool trim -w %i; fi'
+ExecStop=-sh -c 'zpool trim -s %i 2>/dev/null || true'
diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in
index 110c0f5f52ee..a86a3561e032 100644
--- a/etc/systemd/system/zfs-volume-wait.service.in
+++ b/etc/systemd/system/zfs-volume-wait.service.in
@@ -9,7 +9,7 @@ ConditionPathIsDirectory=/sys/module/zfs
 Type=oneshot
 RemainAfterExit=yes
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=@bindir@/zvol_wait
+ExecStart=zvol_wait

 [Install]
 WantedBy=zfs-volumes.target
diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in
index be2fc67348f9..ac58ad3eff7b 100644
--- a/etc/systemd/system/zfs-zed.service.in
+++ b/etc/systemd/system/zfs-zed.service.in
@@ -5,7 +5,7 @@ ConditionPathIsDirectory=/sys/module/zfs

 [Service]
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=@sbindir@/zed -F
+ExecStart=zed -F
 Restart=always

 [Install]

From f63811f07213361d49878648bd597af88d06859c Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 25 May 2023 12:48:43 -0400
Subject: [PATCH 04/38] ZIL: Reduce scope of per-dataset zl_issuer_lock.

Before this change the ZIL copied all log data while holding the lock,
which caused huge lock contention on workloads with many big parallel
writes. This change splits the process into two parts: first,
zil_lwb_assign() estimates the log space needed for all transactions
and zil_lwb_write_close() allocates blocks and zios while holding the
lock; then, after the lock is dropped, zil_lwb_commit() copies the
data and zil_lwb_write_issue() issues the I/Os.

While there, also slightly reduce the scope of zl_lock.

Reviewed-by: Paul Dagnelie
Reviewed-by: Prakash Surya
Reviewed-by: Richard Yao
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14841
---
 include/sys/zil_impl.h |   7 +-
 module/zfs/zil.c       | 432 +++++++++++++++++++++++++++--------------
 2 files changed, 287 insertions(+), 152 deletions(-)

diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index bb85bf6d1eb1..03a409c5257c 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -44,7 +44,7 @@ extern "C" {
  * must be held.
  *
  * After the lwb is "opened", it can transition into the "issued" state
- * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
+ * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
  * be held when making this transition.
  *
  * After the lwb's write zio completes, it transitions into the "write
@@ -93,20 +93,23 @@ typedef struct lwb {
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
 	boolean_t	lwb_fastwrite;	/* is blk marked for fastwrite? 
*/ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ + int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ + hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ + list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ } lwb_t; /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d887e4900d1d..f2798270a8a2 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,6 +146,9 @@ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); + static int zil_bp_compare(const void *x1, const void *x2) { @@ -747,20 +750,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb->lwb_blk = *bp; lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; + lwb->lwb_indirect = B_FALSE; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = lwb->lwb_nfilled = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_max_txg = txg; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); @@ -1397,6 +1401,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } + mutex_exit(&zilog->zl_lock); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); @@ -1429,8 +1435,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - mutex_exit(&zilog->zl_lock); - mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); @@ -1666,46 +1670,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + if (lwb->lwb_root_zio != NULL) + return; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, 
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - - lwb->lwb_state = LWB_STATE_OPENED; - - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; } - mutex_exit(&zilog->zl_lock); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, + &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); + + lwb->lwb_state = LWB_STATE_OPENED; + + zil_lwb_set_zio_dependency(zilog, lwb); + zilog->zl_last_lwb_opened = lwb; + mutex_exit(&zilog->zl_lock); } /* @@ -1736,11 +1735,11 @@ static const struct { static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; @@ -1748,7 +1747,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; - uint64_t zil_blksz, wsz; + uint64_t zil_blksz; int i, error; boolean_t slog; @@ -1757,16 +1756,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; + /* + * If this lwb includes indirect writes, we have to commit before + * creating the transaction, otherwise we may end up in dead lock. + */ + if (lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; } - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. 
@@ -1816,17 +1816,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; BP_ZERO(bp); error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, - lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, - lwb->lwb_nused); - } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1838,17 +1834,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } + lwb->lwb_state = LWB_STATE_ISSUED; + + dmu_tx_commit(tx); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +/* + * Finalize previously closed block and issue the write zio. + * Does not require locking. + */ +static void +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + zil_chain_t *zilc; + int wsz; + + /* Actually fill the lwb with the data if not yet. */ + if (!lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + } + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); + ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; + zilc = (zil_chain_t *)lwb->lwb_buf; } else { wsz = lwb->lwb_sz; + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); } - zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; @@ -1858,22 +1884,20 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) */ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + } spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); - - dmu_tx_commit(tx); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); } /* @@ -1909,13 +1933,19 @@ zil_max_copied_data(zilog_t *zilog) sizeof (lr_write_t)); } +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. 
+ */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1923,8 +1953,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1938,24 +1968,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { + if (lr->lrc_txtype == TX_COMMIT) { mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; } else { - dlen = dpad = 0; + dlen = 0; } - reclen = lrc->lrc_reclen; + reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; cont: /* @@ -1968,7 +1997,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); @@ -1987,19 +2017,99 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - memcpy(lr_buf, lrc, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + /* + * We have to really issue all queued LWBs before we may have to + * wait for a txg sync. Otherwise we may end up in a dead lock. 
+ */ + if (lr->lrc_txtype == TX_WRITE) { + boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); + if (frozen || itx->itx_wr_state == WR_INDIRECT) { + lwb_t *tlwb; + while ((tlwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, tlwb); + } + if (itx->itx_wr_state == WR_INDIRECT) + lwb->lwb_indirect = B_TRUE; + if (frozen) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + } + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. + */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_COMMIT) + return; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lr->lrc_reclen; + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, @@ -2010,14 +2120,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; + lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, - dnow); + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; @@ -2044,9 +2150,11 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ - memset((char *)dbuf + lrwb->lr_length, 0, dpad); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); + } /* * Typically, the only return values we should see from @@ -2074,39 +2182,26 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error); zfs_fallthrough; case EIO: - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lwb->lwb_indirect) { + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + } else { + lwb->lwb_write_zio->io_error = error; + } zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return (lwb); + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. 
- */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -2131,6 +2226,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { @@ -2162,7 +2267,7 @@ zil_itxg_clean(void *arg) /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -2561,7 +2666,7 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; @@ -2663,18 +2768,23 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + first = B_FALSE; + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } } else { @@ -2690,6 +2800,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2735,13 +2847,13 @@ zil_process_commit_list(zilog_t *zilog) * on the system, such that this function will be * immediately called again (not necessarily by the same * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to + * zil_lwb_assign(). This way, the lwb is guaranteed to * be "full" when it is issued to disk, and we'll make * use of the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the + * zil_lwb_assign(), zil_commit_waiter() will issue the * lwb's zio. 
If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount @@ -2773,10 +2885,15 @@ zil_process_commit_list(zilog_t *zilog) zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); zilog->zl_cur_used = 0; - if (lwb == NULL) + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) + != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); + } } } } @@ -2799,9 +2916,13 @@ zil_process_commit_list(zilog_t *zilog) static void zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2828,10 +2949,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); } static void @@ -2858,7 +2982,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2876,8 +3000,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) + if (zcw->zcw_done) { + lwb = NULL; goto out; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2896,15 +3022,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * if it's ISSUED or OPENED, and block any other threads that might * attempt to issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + lwb->lwb_state == LWB_STATE_FLUSH_DONE) { + lwb = NULL; goto out; + } ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -2914,7 +3042,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); @@ -2934,7 +3062,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. 
When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
@@ -2956,12 +3084,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 		 * lock, which occurs prior to calling dmu_tx_commit()
 		 */
 		mutex_exit(&zcw->zcw_lock);
+		zil_lwb_write_issue(zilog, lwb);
+		lwb = NULL;
 		zil_commit_writer_stall(zilog);
 		mutex_enter(&zcw->zcw_lock);
 	}

out:
 	mutex_exit(&zilog->zl_issuer_lock);
+	if (lwb)
+		zil_lwb_write_issue(zilog, lwb);
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
}

@@ -2976,7 +3108,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 * waited "long enough" and the lwb is still in the "open" state.
 *
 * Given a sufficient amount of itxs being generated and written using
- * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
 * function. If this does not occur, this secondary responsibility will
 * ensure the lwb is issued even if there is not other synchronous
 * activity on the system.
@@ -3656,7 +3788,7 @@ zil_close(zilog_t *zilog)
 	/*
 	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
 	 * on the time when the dmu_tx transaction is assigned in
-	 * zil_lwb_write_issue().
+	 * zil_lwb_write_close().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);

From b6fbe61fa6a75747d9b65082ad4dbec05305d496 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 25 May 2023 16:51:53 -0400
Subject: [PATCH 05/38] zil: Add some more statistics.

In addition to the number of actual log bytes written, also account
for the total bytes written including padding, and for the total bytes
allocated (bytes <= write <= alloc). This makes it possible to monitor
ZIL traffic and space efficiency.

Add a dtrace probe for ZIL block size selection.

Make zilstat report more information while fitting it into a narrower
output.

Reviewed-by: Ameer Hamza
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14863 --- cmd/zilstat.in | 180 +++++++++++++----- include/os/linux/zfs/sys/trace_zil.h | 34 ++++ include/sys/zil.h | 12 +- module/zfs/dataset_kstats.c | 6 +- module/zfs/zil.c | 31 +++ .../cli_user/misc/zilstat_001_pos.ksh | 2 +- 6 files changed, 213 insertions(+), 52 deletions(-) diff --git a/cmd/zilstat.in b/cmd/zilstat.in index cf4e2e0dd0c8..e8678e20cafa 100755 --- a/cmd/zilstat.in +++ b/cmd/zilstat.in @@ -36,31 +36,49 @@ import argparse from argparse import RawTextHelpFormatter cols = { - # hdr: [size, scale, kstat name] + # hdr: [size, scale, kstat name] "time": [8, -1, "time"], "pool": [12, -1, "pool"], "ds": [12, -1, "dataset_name"], "obj": [12, -1, "objset"], - "zcc": [10, 1000, "zil_commit_count"], - "zcwc": [10, 1000, "zil_commit_writer_count"], - "ziic": [10, 1000, "zil_itx_indirect_count"], - "zic": [10, 1000, "zil_itx_count"], - "ziib": [10, 1024, "zil_itx_indirect_bytes"], - "zicc": [10, 1000, "zil_itx_copied_count"], - "zicb": [10, 1024, "zil_itx_copied_bytes"], - "zinc": [10, 1000, "zil_itx_needcopy_count"], - "zinb": [10, 1024, "zil_itx_needcopy_bytes"], - "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"], - "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"], - "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"], - "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"], + "cc": [5, 1000, "zil_commit_count"], + "cwc": [5, 1000, "zil_commit_writer_count"], + "ic": [5, 1000, "zil_itx_count"], + "iic": [5, 1000, "zil_itx_indirect_count"], + "iib": [5, 1024, "zil_itx_indirect_bytes"], + "icc": [5, 1000, "zil_itx_copied_count"], + "icb": [5, 1024, "zil_itx_copied_bytes"], + "inc": [5, 1000, "zil_itx_needcopy_count"], + "inb": [5, 1024, "zil_itx_needcopy_bytes"], + "idc": [5, 1000, "icc+inc"], + "idb": [5, 1024, "icb+inb"], + "iwc": [5, 1000, "iic+idc"], + "iwb": [5, 1024, "iib+idb"], + "imnc": [6, 1000, "zil_itx_metaslab_normal_count"], + "imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"], + "imnw": [6, 1024, "zil_itx_metaslab_normal_write"], + "imna": [6, 1024, "zil_itx_metaslab_normal_alloc"], + "imsc": [6, 1000, "zil_itx_metaslab_slog_count"], + "imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"], + "imsw": [6, 1024, "zil_itx_metaslab_slog_write"], + "imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"], + "imc": [5, 1000, "imnc+imsc"], + "imb": [5, 1024, "imnb+imsb"], + "imw": [5, 1024, "imnw+imsw"], + "ima": [5, 1024, "imna+imsa"], + "se%": [3, 100, "imb/ima"], + "sen%": [4, 100, "imnb/imna"], + "ses%": [4, 100, "imsb/imsa"], + "te%": [3, 100, "imb/imw"], + "ten%": [4, 100, "imnb/imnw"], + "tes%": [4, 100, "imsb/imsw"], } -hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \ - "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] -ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb", - "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]") @@ -105,7 +123,7 @@ def print_header(): global sep for col in hdr: new_col = col - if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']: + if interval > 0 and cols[col][1] > 100: new_col += "/s" sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep)) sys.stdout.write("\n") @@ -115,7 +133,7 @@ def print_values(v): global sep for col in hdr: val = v[cols[col][2]] - if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0: + if interval 
> 0 and cols[col][1] > 100: val = v[cols[col][2]] // interval sys.stdout.write("%s%s" % ( prettynum(cols[col][0], cols[col][1], val), sep)) @@ -237,9 +255,7 @@ def init(): invalid = [] for ele in hdr: - if gFlag and ele not in ghdr: - invalid.append(ele) - elif ele not in cols: + if ele not in cols: invalid.append(ele) if len(invalid) > 0: @@ -403,17 +419,17 @@ def calculate_diff(): diff = copy.deepcopy(curr) for pool in curr: for objset in curr[pool]: - for col in hdr: - if col not in ['time', 'pool', 'ds', 'obj']: - key = cols[col][2] - # If prev is NULL, this is the - # first time we are here - if not prev: - diff[pool][objset][key] = 0 - else: - diff[pool][objset][key] \ - = curr[pool][objset][key] \ - - prev[pool][objset][key] + for key in curr[pool][objset]: + if not isinstance(diff[pool][objset][key], int): + continue + # If prev is NULL, this is the + # first time we are here + if not prev: + diff[pool][objset][key] = 0 + else: + diff[pool][objset][key] \ + = curr[pool][objset][key] \ + - prev[pool][objset][key] def zil_build_dict(pool = "GLOBAL"): global kstat @@ -425,10 +441,77 @@ def zil_build_dict(pool = "GLOBAL"): if objset not in curr[pool]: curr[pool][objset] = dict() curr[pool][objset][key] = val - curr[pool][objset]["pool"] = pool - curr[pool][objset]["objset"] = objset - curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \ - time.localtime()) + +def zil_extend_dict(): + global diff + for pool in diff: + for objset in diff[pool]: + diff[pool][objset]["pool"] = pool + diff[pool][objset]["objset"] = objset + diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \ + time.localtime()) + diff[pool][objset]["icc+inc"] = \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["icb+inb"] = \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["iic+idc"] = \ + diff[pool][objset]["zil_itx_indirect_count"] + \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["iib+idb"] = \ + diff[pool][objset]["zil_itx_indirect_bytes"] + \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["imnc+imsc"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_count"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_count"] + diff[pool][objset]["imnb+imsb"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] + diff[pool][objset]["imnw+imsw"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + diff[pool][objset]["imna+imsa"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + if diff[pool][objset]["imna+imsa"] > 0: + diff[pool][objset]["imb/ima"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imna+imsa"] + else: + diff[pool][objset]["imb/ima"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imna"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + else: + diff[pool][objset]["imnb/imna"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsa"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + else: + 
diff[pool][objset]["imsb/imsa"] = 100 + if diff[pool][objset]["imnw+imsw"] > 0: + diff[pool][objset]["imb/imw"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imnw+imsw"] + else: + diff[pool][objset]["imb/imw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imnw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + else: + diff[pool][objset]["imnb/imnw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + else: + diff[pool][objset]["imsb/imsw"] = 100 def sign_handler_epipe(sig, frame): print("Caught EPIPE signal: " + str(frame)) @@ -437,30 +520,31 @@ def sign_handler_epipe(sig, frame): def main(): global interval - global curr + global curr, diff hprint = False init() signal.signal(signal.SIGINT, signal.SIG_DFL) signal.signal(signal.SIGPIPE, sign_handler_epipe) + zil_process_kstat() + if not curr: + print ("Error: No stats to show") + sys.exit(0) + print_header() if interval > 0: + time.sleep(interval) while True: calculate_diff() if not diff: print ("Error: No stats to show") sys.exit(0) - if hprint == False: - print_header() - hprint = True + zil_extend_dict() print_dict(diff) time.sleep(interval) else: - zil_process_kstat() - if not curr: - print ("Error: No stats to show") - sys.exit(0) - print_header() - print_dict(curr) + diff = curr + zil_extend_dict() + print_dict(diff) if __name__ == '__main__': main() diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index 7bddd9d1f469..afa1a274e43c 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -215,6 +215,39 @@ DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \ TP_ARGS(zilog, zcw)) DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * zilog_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_zil_block_size_class, + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), + TP_ARGS(zilog, res, s1), + TP_STRUCT__entry( + ZILOG_TP_STRUCT_ENTRY + __field(uint64_t, res) + __field(uint64_t, s1) + ), + TP_fast_assign( + ZILOG_TP_FAST_ASSIGN + __entry->res = res; + __entry->s1 = s1; + ), + TP_printk( + ZILOG_TP_PRINTK_FMT " res %llu s1 %llu", + ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1) +); + +#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \ +DEFINE_EVENT(zfs_zil_block_size_class, name, \ + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \ + TP_ARGS(zilog, res, s1)) +DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size); + #endif /* _TRACE_ZIL_H */ #undef TRACE_INCLUDE_PATH @@ -228,6 +261,7 @@ DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); DEFINE_DTRACE_PROBE2(zil__process__commit__itx); DEFINE_DTRACE_PROBE2(zil__process__normal__itx); DEFINE_DTRACE_PROBE2(zil__commit__io__error); +DEFINE_DTRACE_PROBE3(zil__block__size); #endif /* HAVE_DECLARE_EVENT_CLASS */ #endif /* _KERNEL */ diff --git a/include/sys/zil.h b/include/sys/zil.h index cff8ebcad819..4747ecc067a9 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -489,18 +489,22 @@ typedef struct zil_stats { * Transactions which have been allocated to the "normal" * (i.e. not slog) storage pool. 
Note that "bytes" accumulate * the actual log record sizes - which do not include the actual - * data in case of indirect writes. + * data in case of indirect writes. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_normal_count; kstat_named_t zil_itx_metaslab_normal_bytes; + kstat_named_t zil_itx_metaslab_normal_write; + kstat_named_t zil_itx_metaslab_normal_alloc; /* * Transactions which have been allocated to the "slog" storage pool. * If there are no separate log devices, this is the same as the - * "normal" pool. + * "normal" pool. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_slog_count; kstat_named_t zil_itx_metaslab_slog_bytes; + kstat_named_t zil_itx_metaslab_slog_write; + kstat_named_t zil_itx_metaslab_slog_alloc; } zil_kstat_values_t; typedef struct zil_sums { @@ -515,8 +519,12 @@ typedef struct zil_sums { wmsum_t zil_itx_needcopy_bytes; wmsum_t zil_itx_metaslab_normal_count; wmsum_t zil_itx_metaslab_normal_bytes; + wmsum_t zil_itx_metaslab_normal_write; + wmsum_t zil_itx_metaslab_normal_alloc; wmsum_t zil_itx_metaslab_slog_count; wmsum_t zil_itx_metaslab_slog_bytes; + wmsum_t zil_itx_metaslab_slog_write; + wmsum_t zil_itx_metaslab_slog_alloc; } zil_sums_t; #define ZIL_STAT_INCR(zil, stat, val) \ diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 57b8faf213eb..767a461e0026 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -49,8 +49,12 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 } + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } } }; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index f2798270a8a2..509fd39d3590 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -116,8 +116,12 @@ static zil_kstat_values_t zil_stats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; @@ -378,8 +382,12 @@ zil_sums_init(zil_sums_t *zs) wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void @@ -396,8 +404,12 @@ zil_sums_fini(zil_sums_t *zs) wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_write); + 
wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_write);
+ wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
 void
@@ -425,10 +437,18 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
 wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
 zs->zil_itx_metaslab_normal_bytes.value.ui64 =
 wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
+ zs->zil_itx_metaslab_normal_write.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
+ zs->zil_itx_metaslab_normal_alloc.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
 zs->zil_itx_metaslab_slog_count.value.ui64 =
 wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
 zs->zil_itx_metaslab_slog_bytes.value.ui64 =
 wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
+ zs->zil_itx_metaslab_slog_write.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
+ zs->zil_itx_metaslab_slog_alloc.value.ui64 =
+ wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
 }
 
 /*
@@ -1814,6 +1834,9 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
 zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
 for (i = 0; i < ZIL_PREV_BLKS; i++)
 zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
+ uint64_t, zil_blksz,
+ uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
 zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
@@ -1888,10 +1911,18 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 lwb->lwb_nused);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
+ wsz);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
+ BP_GET_LSIZE(&lwb->lwb_blk));
 } else {
 ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
 ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
 lwb->lwb_nused);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
+ wsz);
+ ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
+ BP_GET_LSIZE(&lwb->lwb_blk));
 }
 spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
 zil_lwb_add_block(lwb, &lwb->lwb_blk);
diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
index 9bf6a94cfc84..9deee67a56ca 100755
--- a/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh
@@ -25,7 +25,7 @@
 is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null &&
 log_unsupported "python3 sysctl module missing"
 
 set -A args "" "-s \",\"" "-v" \
- "-f time,zcwc,zimnb,zimsb"
+ "-f time,cwc,imnb,imsb"
 
 log_assert "zilstat generates output and doesn't return an error code"
 
From 9d618615d1ede4dd40a69386bc300580550fd4d0 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Thu, 25 May 2023 13:53:08 -0700
Subject: [PATCH 06/38] Update compatibility.d files

Add an openzfs-2.2 compatibility file for the next release.

Edon-R support has been enabled for FreeBSD, removing the need for
different FreeBSD and Linux files. Symlinks for the -linux and
-freebsd names are created for any scripts expecting that convention.

Additionally, a symlink for ubuntu-22.04 was added.
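For example, a pool restricted to this feature set can be created with
(the pool and vdev names here are illustrative only):

  # zpool create -o compatibility=openzfs-2.2 tank mirror sda sdb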
Signed-off-by: Brian Behlendorf Closes #14833 --- cmd/zpool/Makefile.am | 6 +++- cmd/zpool/compatibility.d/openzfs-2.2 | 40 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 cmd/zpool/compatibility.d/openzfs-2.2 diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 3c7c8a9aebe2..de700eabf86b 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -145,6 +145,7 @@ dist_zpoolcompat_DATA = \ %D%/compatibility.d/openzfs-2.0-linux \ %D%/compatibility.d/openzfs-2.1-freebsd \ %D%/compatibility.d/openzfs-2.1-linux \ + %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -173,7 +174,10 @@ zpoolcompatlinks = \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ - "zol-0.8 ubuntu-20.04" + "zol-0.8 ubuntu-20.04" \ + "openzfs-2.1-linux ubuntu-22.04" \ + "openzfs-2.2 openzfs-2.2-linux" \ + "openzfs-2.2 openzfs-2.2-freebsd" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/cmd/zpool/compatibility.d/openzfs-2.2 b/cmd/zpool/compatibility.d/openzfs-2.2 new file mode 100644 index 000000000000..c9491cd8dc42 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.2 @@ -0,0 +1,40 @@ +# Features supported by OpenZFS 2.2 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress From ff03dfd4d8cc533fcec5f63dd7cc5aa20f99cb18 Mon Sep 17 00:00:00 2001 From: Damiano Albani Date: Fri, 26 May 2023 01:10:54 +0200 Subject: [PATCH 07/38] Add missing files to Debian DKMS package Reviewed-by: Tino Reichardt Reviewed-by: Umer Saleem Reviewed-by: Brian Behlendorf Signed-off-by: Damiano Albani Closes #14887 Closes #14889 --- contrib/debian/rules.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 63892c6ca243..f0791cfabd38 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -7,8 +7,8 @@ NAME := $(shell awk '$$1 == "Name:" { print $$2; }' META) LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META) LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." 
$$3+1}' META)
-DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \
- COPYRIGHT LICENSE README.md
+DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \
+ COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md
 
 ifndef KVERS
 KVERS=$(shell uname -r)
From bb736d98d133b4449a4e3bb97a914651677e6713 Mon Sep 17 00:00:00 2001
From: George Amanakis
Date: Fri, 26 May 2023 18:53:00 +0200
Subject: [PATCH 08/38] Fix inconsistent definition of
 zfs_scrub_error_blocks_per_txg

Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: George Amanakis
Closes #14894
---
 module/zfs/dsl_scan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 5e3559b251e3..6cad339104a4 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -234,7 +234,7 @@ static int zfs_resilver_disable_defer = B_FALSE;
 static int zfs_free_bpobj_enabled = 1;
 
 /* Error blocks to be scrubbed in one txg. */
-unsigned long zfs_scrub_error_blocks_per_txg = 1 << 12;
+static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
 
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
@@ -5242,6 +5242,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
 "Process all resilvers immediately");
 
-ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW,
 "Error blocks to be scrubbed in one txg");
 /* END CSTYLED */
From 677c6f8457943fe5b56d7aa8807010a104563e4a Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 26 May 2023 13:03:12 -0400
Subject: [PATCH 09/38] btree: Implement faster binary search algorithm

This implements a binary search algorithm for B-Trees that reduces
branching to the absolute minimum necessary for a binary search
algorithm. It also enables the compiler to inline the comparator to
ensure that the only slowdown when doing binary search is from waiting
for memory accesses. Additionally, it instructs the compiler to unroll
the loop, which gives an additional 40% improvement with Clang and an
8% improvement with GCC.

Consumers must opt into using the faster algorithm. At present, only
B-Trees used inside kernel code have been modified to use the faster
algorithm.

Micro-benchmarks suggest that this can improve binary search
performance by up to 3.5 times when compiling with Clang 16 and up to
1.9 times when compiling with GCC 12.2.
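As a rough standalone sketch of the technique (types and names here are
illustrative; the patch itself generates this loop from a macro, shown
below), each step becomes a conditional pointer advance that the
compiler can lower to a conditional move instead of a branch:

/* Branch-reduced bisection over a sorted array; illustrative only. */
static const uint64_t *
find_u64(const uint64_t *buf, uint32_t nelems, uint64_t value)
{
	const uint64_t *i = buf;

	if (nelems == 0)
		return (NULL);
	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		/* Skip the first half iff its last element is < value. */
		i += (i[half - 1] < value) * half;
	}
	return (*i == value ? i : NULL);
}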
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14866 --- cmd/zdb/zdb.c | 8 ++-- include/sys/btree.h | 66 ++++++++++++++++++++++++++++++-- module/Kbuild.in | 14 +++++++ module/Makefile.bsd | 14 +++++++ module/zfs/btree.c | 22 +++++++---- module/zfs/dsl_scan.c | 7 +++- module/zfs/metaslab.c | 23 +++++++---- module/zfs/range_tree.c | 18 ++++++++- module/zfs/zap_micro.c | 6 ++- tests/zfs-tests/cmd/btree_test.c | 2 +- 10 files changed, 154 insertions(+), 26 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5ab13b470dc0..61f1258f72b9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -326,7 +326,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) int err; struct sublivelist_verify *sv = args; - zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, @@ -390,7 +390,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); @@ -682,7 +682,7 @@ livelist_metaslab_validate(spa_t *spa) (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); @@ -716,7 +716,7 @@ livelist_metaslab_validate(spa_t *spa) mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, - livelist_block_compare, + livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); diff --git a/include/sys/btree.h b/include/sys/btree.h index 883abb5181c9..6e05eee8f01d 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -105,8 +105,13 @@ typedef struct zfs_btree_index { boolean_t bti_before; } zfs_btree_index_t; -typedef struct btree { +typedef struct btree zfs_btree_t; +typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, + const void *, zfs_btree_index_t *); + +struct btree { int (*bt_compar) (const void *, const void *); + bt_find_in_buf_f bt_find_in_buf; size_t bt_elem_size; size_t bt_leaf_size; uint32_t bt_leaf_cap; @@ -115,7 +120,54 @@ typedef struct btree { uint64_t bt_num_nodes; zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading -} zfs_btree_t; +}; + +/* + * Implementation of Shar's algorithm designed to accelerate binary search by + * eliminating impossible to predict branches. + * + * For optimality, this should be used to generate the search function in the + * same file as the comparator and the comparator should be marked + * `__attribute__((always_inline) inline` so that the compiler will inline it. + * + * Arguments are: + * + * NAME - The function name for this instance of the search function. Use it + * in a subsequent call to zfs_btree_create(). + * T - The element type stored inside the B-Tree. + * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, + * or +1 -1 for <, 0 for ==, and +1 for >. 
For trivial comparisons, + * TREE_CMP() from avl.h can be used in a boilerplate function. + */ +/* BEGIN CSTYLED */ +#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ +_Pragma("GCC diagnostic push") \ +_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ +static void * \ +NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ + const void *value, zfs_btree_index_t *where) \ +{ \ + T *i = (T *)buf; \ + (void) tree; \ + _Pragma("GCC unroll 9") \ + while (nelems > 1) { \ + uint32_t half = nelems / 2; \ + nelems -= half; \ + i += (COMP(&i[half - 1], value) < 0) * half; \ + } \ + \ + int comp = COMP(i, value); \ + where->bti_offset = (i - (T *)buf) + (comp < 0); \ + where->bti_before = (comp != 0); \ + \ + if (comp == 0) { \ + return (i); \ + } \ + \ + return (NULL); \ +} \ +_Pragma("GCC diagnostic pop") +/* END CSTYLED */ /* * Allocate and deallocate caches for btree nodes. @@ -129,13 +181,19 @@ void zfs_btree_fini(void); * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > + * find - optional function to accelerate searches inside B-Tree nodes + * through Shar's algorithm and comparator inlining. Setting this to + * NULL will use a generic function. The function should be created + * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar. + * compar should be marked `__attribute__((always_inline)) inline` or + * performance is unlikely to improve very much. * size - the value of sizeof(struct my_type) * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), - size_t); + bt_find_in_buf_f, size_t); void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), - size_t, size_t); + bt_find_in_buf_f, size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node diff --git a/module/Kbuild.in b/module/Kbuild.in index 8d29f56c2fb8..29a55c9778b1 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. +# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +ifeq ($(CONFIG_X86),y) +ifeq ($(CONFIG_CC_IS_CLANG),y) +CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false +endif +endif + ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 365609fb8585..9464223f6ca6 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -400,6 +400,20 @@ beforeinstall: .include +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. 
+# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +.if ${CC} == "clang" +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64" +CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false +CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false +CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false +CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false +.endif +.endif + CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 4c25afaa8199..af2b94a850be 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - size_t size) + bt_find_in_buf_f bt_find_in_buf, size_t size) { - zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); + zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, + BTREE_LEAF_SIZE); } +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, + const void *value, zfs_btree_index_t *where); + void zfs_btree_create_custom(zfs_btree_t *tree, int (*compar) (const void *, const void *), + bt_find_in_buf_f bt_find_in_buf, size_t size, size_t lsize) { size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); @@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree, ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; + tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? + zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; tree->bt_leaf_cap = P2ALIGN(esize / size, 2); @@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, + void *d = tree->bt_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); @@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) */ zfs_btree_leaf_t *leaf = (depth == 0 ? 
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + + void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); @@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; ASSERT(zfs_btree_is_core(par_hdr)); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint32_t offset = idx.bti_offset; @@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 6cad339104a4..9ee719a5eef6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * with single operation. Plus it makes scrubs more sequential and reduces * chances that minor extent change move it within the B-tree. */ +__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { @@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y) return (TREE_CMP(*a, *b)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, + ext_size_compare) + static void ext_size_create(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; - zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t)); + zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, + sizeof (uint64_t)); } static void diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 24d52a74933f..94b131fcdb79 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ +__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } + typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt) range_tree_walk(rt, metaslab_size_sorted_add, &arg); } + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, + range_seg32_t, metaslab_rangesize32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, + range_seg64_t, metaslab_rangesize64_compare) + /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. @@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg) size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; + bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; + bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 894c30fcae16..5174e2c46633 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } +__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, + range_tree_seg32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, + range_tree_seg64_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, + range_tree_seg_gap_compare) + range_tree_t * range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap) @@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; + bt_find = 
range_tree_seg32_find_in_buf;
 break;
 case RANGE_SEG64:
 size = sizeof (range_seg64_t);
 compare = range_tree_seg64_compare;
+ bt_find = range_tree_seg64_find_in_buf;
 break;
 case RANGE_SEG_GAP:
 size = sizeof (range_seg_gap_t);
 compare = range_tree_seg_gap_compare;
+ bt_find = range_tree_seg_gap_find_in_buf;
 break;
 default:
 panic("Invalid range seg type %d", type);
 }
- zfs_btree_create(&rt->rt_root, compare, size);
+ zfs_btree_create(&rt->rt_root, compare, bt_find, size);
 
 rt->rt_ops = ops;
 rt->rt_gap = gap;
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index d6ad8b2b8bc5..085d9cd8b4b6 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size)
 }
 }
 
+__attribute__((always_inline)) inline
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
@@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2)
 (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
+ mze_compare)
+
 static void
 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
 {
@@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 * 62 entries before we have to add 2KB B-tree core node.
 */
 zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
- sizeof (mzap_ent_t), 512);
+ mze_find_in_buf, sizeof (mzap_ent_t), 512);
 
 zap_name_t *zn = zap_name_alloc(zap);
 for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
diff --git a/tests/zfs-tests/cmd/btree_test.c b/tests/zfs-tests/cmd/btree_test.c
index 9a34bf559be0..fda9229915ce 100644
--- a/tests/zfs-tests/cmd/btree_test.c
+++ b/tests/zfs-tests/cmd/btree_test.c
@@ -501,7 +501,7 @@ main(int argc, char *argv[])
 srandom(seed);
 
 zfs_btree_init();
- zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
+ zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
 
 /*
 * This runs the named negative test. None of them should
From d3e0138a3d186d61a13b9b8450c3b0d1b0ba9398 Mon Sep 17 00:00:00 2001
From: Colm
Date: Fri, 26 May 2023 10:04:19 -0700
Subject: [PATCH 10/38] Adding new read-only compatible zpool features to
 compatibility.d/grub2

GRUB2 is compatible with all "read-only compatible" features, so it is
safe to add new features of this type to the grub2 compatibility list.
We generally want to include all compatible features, to minimize the
differences between grub2-compatible pools and no-compatibility pools.

Adding the new features `livelist` and `zpool_checkpoint` accordingly.

Also adding them to the man page which references this file as an
example, for consistency.
Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Colm Buckley
Closes #14893
---
 cmd/zpool/compatibility.d/grub2 | 2 ++
 man/man7/zpool-features.7 | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2
index 4e8f21362554..fec73a269a78 100644
--- a/cmd/zpool/compatibility.d/grub2
+++ b/cmd/zpool/compatibility.d/grub2
@@ -8,5 +8,7 @@ extensible_dataset
 filesystem_limits
 hole_birth
 large_blocks
+livelist
 lz4_compress
 spacemap_histogram
+zpool_checkpoint
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index 2b7dcb63829c..b901ce6c2935 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -228,8 +228,10 @@ extensible_dataset
 filesystem_limits
 hole_birth
 large_blocks
+livelist
 lz4_compress
 spacemap_histogram
+zpool_checkpoint
 .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev
 .Ed
From 365bae0eab3bf1c9ce29789094fb352a7f269974 Mon Sep 17 00:00:00 2001
From: Mike Swanson
Date: Fri, 26 May 2023 15:37:15 -0700
Subject: [PATCH 11/38] Add compatibility symlinks for FreeBSD 12.{3,4} and
 13.{0,1,2}

Reviewed-by: Richard Yao
Reviewed-by: Brian Behlendorf
Signed-off-by: Mike Swanson
Closes #14902
---
 cmd/zpool/Makefile.am | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
index de700eabf86b..d08b8e1791b6 100644
--- a/cmd/zpool/Makefile.am
+++ b/cmd/zpool/Makefile.am
@@ -169,6 +169,11 @@ zpoolcompatlinks = \
 "freebsd-11.3 freebsd-12.0" \
 "freebsd-11.3 freebsd-12.1" \
 "freebsd-11.3 freebsd-12.2" \
+ "freebsd-11.3 freebsd-12.3" \
+ "freebsd-11.3 freebsd-12.4" \
+ "openzfs-2.1-freebsd freebsd-13.0" \
+ "openzfs-2.1-freebsd freebsd-13.1" \
+ "openzfs-2.1-freebsd freebsd-13.2" \
 "freebsd-11.3 freenas-11.3" \
 "freenas-11.0 freenas-11.1" \
 "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \
From 20494d47d22a60964274c73db3b22bc385eb9667 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Fri, 26 May 2023 15:39:23 -0700
Subject: [PATCH 12/38] ZTS: Add zpool_resilver_concurrent exception

The zpool_resilver_concurrent test case requires the ZED which is not
used on FreeBSD. Add this test to the known list of skipped tests for
FreeBSD.

Signed-off-by: Brian Behlendorf
Closes #14904
---
 tests/test-runner/bin/zts-report.py.in | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in
index 3f7498f5c6bf..3eeee35878f8 100755
--- a/tests/test-runner/bin/zts-report.py.in
+++ b/tests/test-runner/bin/zts-report.py.in
@@ -163,6 +163,8 @@ if sys.platform.startswith('freebsd'):
 known.update({
 'cli_root/zfs_receive/receive-o-x_props_override':
 ['FAIL', known_reason],
+ 'cli_root/zpool_resilver/zpool_resilver_concurrent':
+ ['SKIP', na_reason],
 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason],
 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
From 0f03a411615a797425de488eecfaaf63fc41acfe Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 26 May 2023 18:47:52 -0400
Subject: [PATCH 13/38] Use __attribute__((malloc)) on memory allocation
 functions

This informs the C compiler that pointers returned from these functions
do not alias any other object, which allows it to do better code
optimization and should make the compiled code smaller.
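A small hypothetical example of what the attribute permits (not code
from this patch; my_alloc is a made-up allocator): because the returned
pointer is assumed not to alias any existing object, stores through it
cannot clobber other memory the compiler is tracking:

#include <stddef.h>

__attribute__((malloc))
extern void *my_alloc(size_t sz);	/* hypothetical allocator */

int
example(int *dst)
{
	int *p = my_alloc(sizeof (int));

	*dst = 1;
	*p = 2;			/* cannot alias *dst ... */
	return (*dst);		/* ... so this may fold to return (1) */
}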
References: https://stackoverflow.com/a/53654773 https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute https://clang.llvm.org/docs/AttributeReference.html#malloc Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14827 --- include/os/freebsd/spl/sys/kmem.h | 3 ++- include/os/linux/spl/sys/kmem.h | 16 ++++++++-------- include/os/linux/spl/sys/vmem.h | 6 ++++-- include/sys/abd.h | 5 +++++ lib/libspl/include/umem.h | 7 ++++--- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h index 27d290863c0b..c633799318d5 100644 --- a/include/os/freebsd/spl/sys/kmem.h +++ b/include/os/freebsd/spl/sys/kmem.h @@ -75,7 +75,7 @@ typedef struct kmem_cache { extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); @@ -83,6 +83,7 @@ kmem_cache_t *kmem_cache_create(const char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); +__attribute__((malloc)) void *kmem_cache_alloc(kmem_cache_t *cache, int flags); void kmem_cache_free(kmem_cache_t *cache, void *buf); boolean_t kmem_cache_reap_active(void); diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h index 594425f7b297..8a203f7bb8e2 100644 --- a/include/os/linux/spl/sys/kmem.h +++ b/include/os/linux/spl/sys/kmem.h @@ -31,10 +31,10 @@ #include extern int kmem_debugging(void); -extern char *kmem_vasprintf(const char *fmt, va_list ap) - __attribute__((format(printf, 1, 0))); -extern char *kmem_asprintf(const char *fmt, ...) 
- __attribute__((format(printf, 1, 2))); +__attribute__((format(printf, 1, 0))) +extern char *kmem_vasprintf(const char *fmt, va_list ap); +__attribute__((format(printf, 1, 2))) +extern char *kmem_asprintf(const char *fmt, ...); extern char *kmem_strdup(const char *str); extern void kmem_strfree(char *str); @@ -186,10 +186,10 @@ extern unsigned int spl_kmem_alloc_max; #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) #define kmem_cache_reap_active spl_kmem_cache_reap_active -extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); -extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); extern void spl_kmem_free(const void *ptr, size_t sz); /* diff --git a/include/os/linux/spl/sys/vmem.h b/include/os/linux/spl/sys/vmem.h index e77af2a7a48c..92585a17e263 100644 --- a/include/os/linux/spl/sys/vmem.h +++ b/include/os/linux/spl/sys/vmem.h @@ -91,8 +91,10 @@ typedef struct vmem { } vmem_t; #define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) #define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) -extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); -extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); diff --git a/include/sys/abd.h b/include/sys/abd.h index 82c51cb05cbc..750f9986c1da 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -86,10 +86,15 @@ extern int zfs_abd_scatter_enabled; * Allocations and deallocations */ +__attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_gang(void); +__attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h index 77c216721253..9039212baf14 100644 --- a/lib/libspl/include/umem.h +++ b/lib/libspl/include/umem.h @@ -83,7 +83,7 @@ const char *_umem_debug_init(void); const char *_umem_options_init(void); const char *_umem_logging_init(void); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc(size_t size, int flags) { @@ -96,7 +96,7 @@ umem_alloc(size_t size, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { @@ -118,7 +118,7 @@ umem_alloc_aligned(size_t size, size_t align, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_zalloc(size_t size, int flags) { @@ -188,6 +188,7 @@ umem_cache_destroy(umem_cache_t *cp) umem_free(cp, sizeof (umem_cache_t)); } +__attribute__((malloc)) static inline void * umem_cache_alloc(umem_cache_t *cp, int flags) { 
From e085e98d541ad96bfe16be98a235bbe4e00b2e08 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 29 May 2023 12:55:35 -0700 Subject: [PATCH 14/38] ZTS: zvol_misc_trim disable blk mq Disable the zvol_misc_fua.ksh and zvol_misc_trim.ksh test cases on impacted kernels. This issue is being actively worked in #14872 and as part of that fix this commit will be reverted. VERIFY(zh->zh_claim_txg == 0) failed PANIC at zil.c:904:zil_create() Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Issue #14872 Closes #14870 --- tests/test-runner/bin/zts-report.py.in | 2 ++ .../tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh | 9 +++++++++ .../tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh | 10 +++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 3eeee35878f8..ef1a46dca72a 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -279,6 +279,8 @@ elif sys.platform.startswith('linux'): 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], + 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 9ebd5b149118..619d8d0e8f07 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -45,6 +45,15 @@ fi if ! is_linux ; then log_unsupported "Only linux supports dd with oflag=dsync for FUA writes" +else + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi fi typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index 46cac3ecb6c2..c0b191aafd45 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -44,6 +44,15 @@ verify_runnable "global" if is_linux ; then + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + # We need '--force' here since the prior tests may leave a filesystem # on the zvol, and blkdiscard will see that filesystem and print a # warning unless you force it. 
@@ -123,7 +132,6 @@ log_must zfs set compression=off $TESTPOOL/$TESTVOL
 
 # Remove old data from previous tests
 log_must $trimcmd $zvolpath
-
 set_blk_mq 1
 log_must_busy zpool export $TESTPOOL
 log_must zpool import $TESTPOOL
From 928c81f4dfa994aad9a9406dee695ed954d77371 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <73643340+lumigch@users.noreply.github.com>
Date: Tue, 30 May 2023 23:15:24 +0100
Subject: [PATCH 15/38] Fix NULL pointer dereference when doing concurrent
 'send' operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A NULL pointer dereference will occur when doing a 'zfs send -S' on a
dataset that is still being received. The problem is that the new
'send' will rightfully fail to own the datasets (i.e.
dsl_dataset_own_force() will fail), but then dmu_send() will still do
the dsl_dataset_disown().

Reviewed-by: Brian Behlendorf
Signed-off-by: Luís Henriques
Closes #14903
Closes #14890
---
 module/zfs/dmu_send.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 5b7f5543ad09..b3ebdec6b45c 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2793,6 +2793,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
 }
 
 if (err == 0) {
+ owned = B_TRUE;
 err = zap_lookup(dspp.dp->dp_meta_objset,
 dspp.to_ds->ds_object,
 DS_FIELD_RESUME_TOGUID, 8, 1,
@@ -2806,21 +2807,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
 sizeof (dspp.saved_toname),
 dspp.saved_toname);
 }
- if (err != 0)
+ /* Only disown if there was an error in the lookups */
+ if (owned && (err != 0))
 dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
 
 kmem_strfree(name);
 } else {
 err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
 FTAG, &dspp.to_ds);
+ if (err == 0)
+ owned = B_TRUE;
 }
- owned = B_TRUE;
 } else {
 err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
 &dspp.to_ds);
 }
 
 if (err != 0) {
+ /* Note: dsl dataset is not owned at this point */
 dsl_pool_rele(dspp.dp, FTAG);
 return (err);
 }
From 2810dda80b8e1d629236b82c5bee6a4ef717e02e Mon Sep 17 00:00:00 2001
From: Rich Ercolani <214141+rincebrain@users.noreply.github.com>
Date: Wed, 31 May 2023 19:58:41 -0400
Subject: [PATCH 16/38] Revert "initramfs: use `mount.zfs` instead of `mount`"

This broke mounting of snapshots on / for users.

See https://github.com/openzfs/zfs/issues/9461#issuecomment-1376162949
for more context.

Reviewed-by: Brian Behlendorf
Signed-off-by: Rich Ercolani
Closes #14908
---
 contrib/initramfs/scripts/zfs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs
index 7f977a30f75b..0a2bd2efda7a 100644
--- a/contrib/initramfs/scripts/zfs
+++ b/contrib/initramfs/scripts/zfs
@@ -344,7 +344,7 @@ mount_fs()
 
 # Need the _original_ datasets mountpoint!
 mountpoint=$(get_fs_value "$fs" mountpoint)
- ZFS_CMD="mount.zfs -o zfsutil"
+ ZFS_CMD="mount -o zfsutil -t zfs"
 if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then
 # Can't use the mountpoint property. Might be one of our
 # clones. Check the 'org.zol:mountpoint' property set in
@@ -361,7 +361,7 @@ mount_fs()
 fi
 # Don't use mount.zfs -o zfsutils for legacy mountpoint
 if [ "$mountpoint" = "legacy" ]; then
- ZFS_CMD="mount.zfs"
+ ZFS_CMD="mount -t zfs"
 fi
 # Last hail-mary: Hope 'rootmnt' is set!
 mountpoint=""
@@ -944,7 +944,7 @@ mountroot()
 echo " not specified on the kernel command line."
echo "" echo "Manually mount the root filesystem on $rootmnt and then exit." - echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" + echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" shell fi From c47b708647d10e6391101492dbd0f63a386ccd10 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 20:00:48 -0300 Subject: [PATCH 17/38] PAM: do not fail to mount if the key's already loaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we're expecting a working home directory on login, it would be rather frustrating to not have it mounted just because it e.g. failed to unmount once on logout. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 979546ab3090..9d9076e1aa0d 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -386,7 +386,7 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); - if (ret) { + if (ret && ret != EEXIST) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); From bd4962b5ac42940a0c674b03ae9f47e36b13c908 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 21:56:39 -0300 Subject: [PATCH 18/38] PAM: use boolean_t for config flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we already use boolean_t in the file, we can use it here. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 9d9076e1aa0d..b3086e038e5b 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -437,7 +437,7 @@ typedef struct { char *dsname; uid_t uid; const char *username; - int unmount_and_unload; + boolean_t unmount_and_unload; } zfs_key_config_t; static int @@ -471,7 +471,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, } config->uid = entry->pw_uid; config->username = name; - config->unmount_and_unload = 1; + config->unmount_and_unload = B_TRUE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -482,7 +482,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { - config->unmount_and_unload = 0; + config->unmount_and_unload = B_FALSE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); From 850bccd3bc163a602700c4a4b15c8d52c0b6231c Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 19:35:57 -0300 Subject: [PATCH 19/38] PAM: add 'recursive_homes' flag to use with 'prop_mountpoint' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not always desirable to have a fixed flat homes directory. 
With the 'recursive_homes' flag, 'prop_mountpoint' search would traverse the whole tree starting at 'homes' (which can now be '*' to mean all pools) to find a dataset with a mountpoint matching the home directory. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 36 +++++++--- tests/runfiles/linux.run | 2 +- .../tests/functional/pam/cleanup.ksh | 1 + .../tests/functional/pam/pam_recursive.ksh | 72 +++++++++++++++++++ 4 files changed, 99 insertions(+), 12 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/pam/pam_recursive.ksh diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index b3086e038e5b..259ac7a8f191 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -438,6 +438,7 @@ typedef struct { uid_t uid; const char *username; boolean_t unmount_and_unload; + boolean_t recursive_homes; } zfs_key_config_t; static int @@ -472,6 +473,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; + config->recursive_homes = B_FALSE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -483,6 +485,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; + } else if (strcmp(argv[c], "recursive_homes") == 0) { + config->recursive_homes = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); @@ -517,8 +521,12 @@ find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { + if (target->recursive_homes) { + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, target); + } zfs_close(zhp); - return (0); + return (target->dsname != NULL); } target->dsname = strdup(zfs_get_name(zhp)); @@ -531,17 +539,23 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { - zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, - ZFS_TYPE_FILESYSTEM); - if (zhp == NULL) { - pam_syslog(NULL, LOG_ERR, "dataset %s not found", - config->homes_prefix); - return (NULL); - } + if (strcmp(config->homes_prefix, "*") == 0) { + (void) zfs_iter_root(g_zfs, + find_dsname_by_prop_value, config); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + config->homes_prefix, ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + pam_syslog(NULL, LOG_ERR, + "dataset %s not found", + config->homes_prefix); + return (NULL); + } - (void) zfs_iter_filesystems_v2(zhp, 0, - find_dsname_by_prop_value, config); - zfs_close(zhp); + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, config); + zfs_close(zhp); + } char *dsname = config->dsname; config->dsname = NULL; return (dsname); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4df770d61f07..97fc250a7cbf 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -140,7 +140,7 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_nounmount', 'pam_short_password'] +tests = ['pam_basic', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = 
['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index 971c7fce64e5..dbcb175ed069 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -25,5 +25,6 @@ rmconfig destroy_pool $TESTPOOL del_user ${username} +del_user ${username}rec del_group pamtestgroup log_must rm -rf "$runstatedir" $TESTDIRS diff --git a/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh b/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh new file mode 100755 index 000000000000..3714b179b852 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +username="${username}rec" + +# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests, +# and a user which references that mountpoint +log_must zfs create "$TESTPOOL/pampam" +log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam" +echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \ + -o keylocation=prompt "$TESTPOOL/pampam/pam/${username}" +log_must zfs unmount "$TESTPOOL/pampam/pam/${username}" +log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}" +log_must add_user pamtestgroup ${username} "$TESTDIR/rec" + +function keystatus { + log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ] +} + +log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" +keystatus unavailable + +function test_session { + echo "recurpass" | pamtester ${pamservice} ${username} open_session + references 1 + log_must ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus available + + log_must pamtester ${pamservice} ${username} close_session + references 0 + log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus unavailable +} + +genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +log_pass "done." 
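For illustration, a hypothetical pam.d entry combining the options
exercised by the test above (the runstatedir path is made up):

  session    optional    pam_zfs_key.so homes=* recursive_homes prop_mountpoint runstatedir=/run/pam_zfs_key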
From f2f3ec17edb5015c068c737f328654ae2c36a790 Mon Sep 17 00:00:00 2001
From: Val Packett
Date: Fri, 5 May 2023 22:02:13 -0300
Subject: [PATCH 20/38] PAM: add 'forceunmount' flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Probably not always a good idea, but it's nice to have the option.
It is a workaround for FreeBSD calling the PAM session end earlier
than the last process is actually done touching the mount, for
example.

Reviewed-by: Brian Behlendorf
Reviewed-by: Felix Dörre
Signed-off-by: Val Packett
Closes #14834
---
 contrib/pam_zfs_key/pam_zfs_key.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c
index 259ac7a8f191..c6abb34619d6 100644
--- a/contrib/pam_zfs_key/pam_zfs_key.c
+++ b/contrib/pam_zfs_key/pam_zfs_key.c
@@ -406,14 +406,14 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name,
 }
 
 static int
-unmount_unload(pam_handle_t *pamh, const char *ds_name)
+unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force)
 {
 zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM);
 if (ds == NULL) {
 pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name);
 return (-1);
 }
- int ret = zfs_unmount(ds, NULL, 0);
+ int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0);
 if (ret) {
 pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret);
 zfs_close(ds);
@@ -438,6 +438,7 @@ typedef struct {
 uid_t uid;
 const char *username;
 boolean_t unmount_and_unload;
+ boolean_t force_unmount;
 boolean_t recursive_homes;
 } zfs_key_config_t;
 
@@ -473,6 +474,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config,
 config->uid = entry->pw_uid;
 config->username = name;
 config->unmount_and_unload = B_TRUE;
+ config->force_unmount = B_FALSE;
 config->recursive_homes = B_FALSE;
 config->dsname = NULL;
 config->homedir = NULL;
@@ -485,6 +487,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config,
 config->runstatedir = strdup(argv[c] + 12);
 } else if (strcmp(argv[c], "nounmount") == 0) {
 config->unmount_and_unload = B_FALSE;
+ } else if (strcmp(argv[c], "forceunmount") == 0) {
+ config->force_unmount = B_TRUE;
 } else if (strcmp(argv[c], "recursive_homes") == 0) {
 config->recursive_homes = B_TRUE;
 } else if (strcmp(argv[c], "prop_mountpoint") == 0) {
@@ -882,7 +886,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags,
 zfs_key_config_free(&config);
 return (PAM_SESSION_ERR);
 }
- if (unmount_unload(pamh, dataset) == -1) {
+ if (unmount_unload(pamh, dataset, config.force_unmount) == -1) {
 free(dataset);
 pam_zfs_free();
 zfs_key_config_free(&config);
From e3ba6b93de32bc76e9e616472af1c7aafea585a5 Mon Sep 17 00:00:00 2001
From: Val Packett
Date: Fri, 5 May 2023 22:34:58 -0300
Subject: [PATCH 21/38] PAM: add 'uid_min' and 'uid_max' options for changing
 the uid range
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of a fixed >=1000 check, allow the configuration to override
the minimum UID and add a maximum one as well.

While here, add the uid range check to the authenticate method as well,
and fix the return in the chauthtok method (it seems very wrong to
report success when we've done absolutely nothing).
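For illustration (the dataset name and numeric bounds are
hypothetical), the new options are passed the same way as the existing
ones:

  password   required    pam_zfs_key.so homes=rpool/home uid_min=1000 uid_max=60000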
Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index c6abb34619d6..6b7a41fa1739 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -435,6 +435,8 @@ typedef struct { char *runstatedir; char *homedir; char *dsname; + uid_t uid_min; + uid_t uid_max; uid_t uid; const char *username; boolean_t unmount_and_unload; @@ -471,6 +473,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, free(config->homes_prefix); return (PAM_USER_UNKNOWN); } + config->uid_min = 1000; + config->uid_max = MAXUID; config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; @@ -485,6 +489,10 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); + } else if (strncmp(argv[c], "uid_min=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_min); + } else if (strncmp(argv[c], "uid_max=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_max); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; } else if (strcmp(argv[c], "forceunmount") == 0) { @@ -673,6 +681,10 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, if (config_err != PAM_SUCCESS) { return (config_err); } + if (config.uid < config.uid_min || config.uid > config.uid_max) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } const pw_password_t *token = pw_fetch_lazy(pamh); if (token == NULL) { @@ -724,9 +736,9 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); - return (PAM_SUCCESS); + return (PAM_SERVICE_ERR); } { if (pam_zfs_init(pamh) != 0) { @@ -806,7 +818,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -864,7 +876,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } From db994458bbee99968827d88afd823d36ef82af28 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 22:17:12 -0300 Subject: [PATCH 22/38] PAM: support password changes even when not mounted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's usually no requirement that a user be logged in for changing their password, so let's not be surprising here. We need to use the fetch_lazy mechanism for the old password to avoid a double prompt for it, so that mechanism is now generalized a bit. 
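For illustration, with this change a conventional stack such as the
following (hypothetical; pam_unix.so is the stock password module) can
rotate the ZFS wrapping key even when the home dataset is not currently
mounted, prompting for the old password only once:

  password   required    pam_unix.so
  password   required    pam_zfs_key.so homes=rpool/home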
Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 70 ++++++++++++------- tests/runfiles/linux.run | 3 +- .../functional/pam/pam_change_unmounted.ksh | 55 +++++++++++++++ .../functional/pam/pam_short_password.ksh | 2 +- 4 files changed, 102 insertions(+), 28 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 6b7a41fa1739..08a8640669b3 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -67,6 +67,7 @@ pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; +static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok"; static libzfs_handle_t *g_zfs; @@ -160,10 +161,10 @@ pw_free(pw_password_t *pw) } static pw_password_t * -pw_fetch(pam_handle_t *pamh) +pw_fetch(pam_handle_t *pamh, int tok) { const char *token; - if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { + if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); @@ -177,13 +178,13 @@ pw_fetch(pam_handle_t *pamh) } static const pw_password_t * -pw_fetch_lazy(pam_handle_t *pamh) +pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name) { - pw_password_t *pw = pw_fetch(pamh); + pw_password_t *pw = pw_fetch(pamh, tok); if (pw == NULL) { return (NULL); } - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); + int ret = pam_set_data(pamh, var_name, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); @@ -193,23 +194,23 @@ pw_fetch_lazy(pam_handle_t *pamh) } static const pw_password_t * -pw_get(pam_handle_t *pamh) +pw_get(pam_handle_t *pamh, int tok, const char *var_name) { const pw_password_t *authtok = NULL; - int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, + int ret = pam_get_data(pamh, var_name, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) - return (pw_fetch_lazy(pamh)); + return (pw_fetch_lazy(pamh, tok, var_name)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int -pw_clear(pam_handle_t *pamh) +pw_clear(pam_handle_t *pamh, const char *var_name) { - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); + int ret = pam_set_data(pamh, var_name, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); @@ -686,7 +687,8 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, return (PAM_SERVICE_ERR); } - const pw_password_t *token = pw_fetch_lazy(pamh); + const pw_password_t *token = pw_fetch_lazy(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_AUTH_ERR); @@ -740,6 +742,8 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } + const pw_password_t *old_token = pw_get(pamh, + PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME); { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); @@ -751,49 +755,62 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - int key_loaded = is_key_loaded(pamh, dataset); - if (key_loaded == -1) { + if (!old_token) { + pam_syslog(pamh, LOG_ERR, + "old password from PAM stack is null"); 
free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - free(dataset); - pam_zfs_free(); - if (! key_loaded) { + if (decrypt_mount(pamh, dataset, + old_token->value, B_TRUE) == -1) { pam_syslog(pamh, LOG_ERR, - "key not loaded, returning try_again"); + "old token mismatch"); + free(dataset); + pam_zfs_free(); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, + PASSWORD_VAR_NAME); if (token == NULL) { + pam_syslog(pamh, LOG_ERR, "new password unavailable"); + pam_zfs_free(); zfs_key_config_free(&config); - return (PAM_SERVICE_ERR); - } - if (pam_zfs_init(pamh) != 0) { - zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - if (change_key(pamh, dataset, token->value) == -1) { + int was_loaded = is_key_loaded(pamh, dataset); + if (!was_loaded && decrypt_mount(pamh, dataset, + old_token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } + int changed = change_key(pamh, dataset, token->value); + if (!was_loaded) { + unmount_unload(pamh, dataset, config.force_unmount); + } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 || + pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) { return (PAM_SERVICE_ERR); } } else { @@ -829,7 +846,8 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SUCCESS); } - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); @@ -853,7 +871,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) { return (PAM_SERVICE_ERR); } return (PAM_SUCCESS); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 97fc250a7cbf..618eeb934017 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -140,7 +140,8 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', + 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh b/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh new file mode 100755 index 000000000000..91b202f7609d --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/tests/functional/pam/utilities.kshlib
+
+if [ -n "$ASAN_OPTIONS" ]; then
+	export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
+fi
+
+log_mustnot ismounted "$TESTPOOL/pam/${username}"
+keystatus unavailable
+
+genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}"
+
+printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok
+
+log_mustnot ismounted "$TESTPOOL/pam/${username}"
+keystatus unavailable
+
+echo "secondpass" | pamtester ${pamservice} ${username} open_session
+references 1
+log_must ismounted "$TESTPOOL/pam/${username}"
+keystatus available
+
+printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok
+
+log_must ismounted "$TESTPOOL/pam/${username}"
+keystatus available
+
+log_must pamtester ${pamservice} ${username} close_session
+references 0
+log_mustnot ismounted "$TESTPOOL/pam/${username}"
+keystatus unavailable
+
+log_pass "done."
diff --git a/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh b/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
index 443e07d7f003..079608583a72 100755
--- a/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
+++ b/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh
@@ -52,7 +52,7 @@ log_must ismounted "$TESTPOOL/pam/${username}"
 keystatus available
 
 # Change user and dataset password to short one.
-printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok
+printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok
 
 # Unmount and unload key.
 log_must pamtester ${pamservice} ${username} close_session

From 4f583a827e3b40f3bdfba78db75e1fe1feff122c Mon Sep 17 00:00:00 2001
From: Val Packett
Date: Thu, 11 May 2023 18:16:57 -0300
Subject: [PATCH 23/38] PAM: enable testing on FreeBSD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Brian Behlendorf
Reviewed-by: Felix Dörre
Signed-off-by: Val Packett
Closes #14834
---
 tests/runfiles/freebsd.run | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run
index c7ca1d769fc3..13696d645850 100644
--- a/tests/runfiles/freebsd.run
+++ b/tests/runfiles/freebsd.run
@@ -25,3 +25,8 @@ tags = ['functional']
 [tests/functional/cli_root/zfs_jail:FreeBSD]
 tests = ['zfs_jail_001_pos']
 tags = ['functional', 'cli_root', 'zfs_jail']
+
+[tests/functional/pam:FreeBSD]
+tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
+    'pam_short_password']
+tags = ['functional', 'pam']

From 482da24e20735ebd67fb582ad8025ba517801503 Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Fri, 2 Jun 2023 14:01:58 -0400
Subject: [PATCH 24/38] ZIL: Allow to replay blocks of any size.

There seems to be no reason for ZIL blocks to be limited to 128KB
other than that the replay code is written in such a way. This change
does not increase the limit yet, it just removes the artificial
limitation.
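In sketch form, the interface change looks like this (names as in the
diff below; abbreviated and illustrative, not a complete excerpt):

    /* Before: every log block was copied into a caller-supplied
     * staging buffer, capping replayable blocks at 128KB: */
    lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
    error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, lrbuf, &end);

    /* After: the ARC buffer itself is handed back and the records
     * are parsed in place, whatever the block size: */
    arc_buf_t *abuf = NULL;
    error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf);
    /* ... parse records in [lrp, end) ... */
    arc_buf_destroy(abuf, &abuf);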
Avoiding the extra memcpy() may save us a second during replay.

Reviewed-by: Brian Behlendorf
Reviewed-by: Prakash Surya
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14910
---
 module/zfs/zil.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 509fd39d3590..8672a61387a5 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -248,11 +248,10 @@ zil_kstats_global_update(kstat_t *ksp, int rw)
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
-    blkptr_t *nbp, void *dst, char **end)
+    blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
-	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
@@ -269,7 +268,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
-	    &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
@@ -284,23 +283,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
+		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
-			zil_chain_t *zilc = abuf->b_data;
+			zil_chain_t *zilc = (*abuf)->b_data;
 			char *lr = (char *)(zilc + 1);
-			uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
-			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+			    sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+			    zilc->zc_nused < sizeof (*zilc) ||
+			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
-				ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
-				memcpy(dst, lr, len);
-				*end = (char *)dst + len;
+				*begin = lr;
+				*end = lr + zilc->zc_nused - sizeof (*zilc);
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
-			char *lr = abuf->b_data;
-			uint64_t size = BP_GET_LSIZE(bp);
+			char *lr = (*abuf)->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
@@ -308,15 +307,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
-				ASSERT3U(zilc->zc_nused, <=,
-				    SPA_OLD_MAXBLOCKSIZE);
-				memcpy(dst, lr, zilc->zc_nused);
-				*end = (char *)dst + zilc->zc_nused;
+				*begin = lr;
+				*end = lr + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
-
-		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
@@ -468,7 +463,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
-	char *lrbuf, *lrp;
 	int error = 0;
 
 	/*
@@ -486,13 +480,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
*/ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -508,8 +502,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); + &lrp, &end, &abuf); if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -522,7 +518,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); @@ -536,6 +532,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -545,7 +542,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } From 6c29422e90dca7a7ef6a69c721238a1c33f393f9 Mon Sep 17 00:00:00 2001 From: Graham Perrin Date: Fri, 2 Jun 2023 19:25:13 +0100 Subject: [PATCH 25/38] zfs-create(8): ZFS for swap: caution, clarity Make the section heading more generic (the section relates to ZFS files as well as ZFS volumes). Swapping to a ZFS volume is prone to deadlock. Remove the related instruction, direct readers to OpenZFS FAQ. Related, but not linked from within the manual page: (Using a zvol for a swap device on Linux). Reviewed-by: Brian Behlendorf Signed-off-by: Graham Perrin Issue #7734 Closes #14756 --- man/man8/zfs-create.8 | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/man/man8/zfs-create.8 b/man/man8/zfs-create.8 index a7b6097c37f1..b3997d32767c 100644 --- a/man/man8/zfs-create.8 +++ b/man/man8/zfs-create.8 @@ -234,14 +234,11 @@ if the volume is not sparse. Print verbose information about the created dataset. .El .El -.Ss ZFS Volumes as Swap -ZFS volumes may be used as swap devices. -After creating the volume with the -.Nm zfs Cm create Fl V -enable the swap area using the -.Xr swapon 8 -command. -Swapping to files on ZFS filesystems is not supported. +.Ss ZFS for Swap +Swapping to a ZFS volume is prone to deadlock and not recommended. +See OpenZFS FAQ. +.Pp +Swapping to a file on a ZFS filesystem is not supported. . .Sh EXAMPLES .\" These are, respectively, examples 1, 10 from zfs.8 From dae3c549f59a4650edd07b86707166765c240310 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 5 Jun 2023 11:08:24 -0700 Subject: [PATCH 26/38] Linux 6.3 compat: META (#14930) Update the META file to reflect compatibility with the 6.3 kernel. Signed-off-by: Brian Behlendorf Reviewed-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 8779e512f7be..e4b476aff112 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.2 +Linux-Maximum: 6.3 Linux-Minimum: 3.10 From 5ba4025a8d94699d2638938a0cdf790113ff0531 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 5 Jun 2023 14:51:44 -0400 Subject: [PATCH 27/38] Introduce zfs_refcount_(add|remove)_few(). 
There are two places where we need to add/remove several references
with the semantics of zfs_refcount_(add|remove). But when debug/tracing
is disabled, it is a crime to run multiple atomic_inc() in a loop,
especially under a congested pool-wide allocator lock. The newly
introduced functions implement the same semantics as the loop, but
without the overhead in production builds.

Reviewed-by: Rich Ercolani
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14934
---
 include/sys/zfs_refcount.h | 12 +++++++++---
 module/zfs/dmu_zfetch.c    |  3 +--
 module/zfs/metaslab.c      |  6 ++----
 module/zfs/refcount.c      | 18 ++++++++++++++++++
 4 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h
index 42f846b8920a..4efa266a53c5 100644
--- a/include/sys/zfs_refcount.h
+++ b/include/sys/zfs_refcount.h
@@ -73,13 +73,15 @@ int64_t zfs_refcount_count(zfs_refcount_t *);
 int64_t zfs_refcount_add(zfs_refcount_t *, const void *);
 int64_t zfs_refcount_remove(zfs_refcount_t *, const void *);
 /*
- * Note that (add|remove)_many add/remove one reference with "number" N,
- * _not_ make N references with "number" 1, which is what vanilla
- * zfs_refcount_(add|remove) would do if called N times.
+ * Note that (add|remove)_many adds/removes one reference with "number" N,
+ * _not_ N references with "number" 1, which is what (add|remove)_few does,
+ * or what vanilla zfs_refcount_(add|remove) called N times would do.
  *
  * Attempting to remove a reference with number N when none exists is a
  * panic on debug kernels with reference_tracking enabled.
  */
+void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *);
+void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *);
 int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *);
 int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *);
 void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *);
@@ -108,6 +110,10 @@ typedef struct refcount {
 #define	zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count)
 #define	zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
 #define	zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
+#define	zfs_refcount_add_few(rc, number, holder) \
+	atomic_add_64(&(rc)->rc_count, number)
+#define	zfs_refcount_remove_few(rc, number, holder) \
+	atomic_add_64(&(rc)->rc_count, -number)
 #define	zfs_refcount_add_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, number)
 #define	zfs_refcount_remove_many(rc, number, holder) \
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
index ffc012e6c217..b70459380c24 100644
--- a/module/zfs/dmu_zfetch.c
+++ b/module/zfs/dmu_zfetch.c
@@ -520,8 +520,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
 	issued = pf_end - pf_start + ipf_end - ipf_start;
 	if (issued > 1) {
 		/* More references on top of taken in dmu_zfetch_prepare(). */
-		for (int i = 0; i < issued - 1; i++)
-			zfs_refcount_add(&zs->zs_refs, NULL);
+		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
 	} else if (issued == 0) {
 		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 94b131fcdb79..176247d63b76 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -5650,8 +5650,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
 	 * We reserve the slots individually so that we can unreserve
 	 * them individually when an I/O completes.
 	 */
-	for (int d = 0; d < slots; d++)
-		zfs_refcount_add(&mca->mca_alloc_slots, zio);
+	zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
 	zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
 	return (B_TRUE);
 }
@@ -5665,8 +5664,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
 	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
 
 	ASSERT(mc->mc_alloc_throttle_enabled);
-	for (int d = 0; d < slots; d++)
-		zfs_refcount_remove(&mca->mca_alloc_slots, zio);
+	zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
 }
 
 static int
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 62ec03e1035a..c9a504f67451 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -151,6 +151,15 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
 	return (zfs_refcount_add_many(rc, 1, holder));
 }
 
+void
+zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+	if (!rc->rc_tracked)
+		(void) zfs_refcount_add_many(rc, number, holder);
+	else for (; number > 0; number--)
+		(void) zfs_refcount_add(rc, holder);
+}
+
 int64_t
 zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
     const void *holder)
@@ -204,6 +213,15 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
 	return (zfs_refcount_remove_many(rc, 1, holder));
 }
 
+void
+zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+	if (!rc->rc_tracked)
+		(void) zfs_refcount_remove_many(rc, number, holder);
+	else for (; number > 0; number--)
+		(void) zfs_refcount_remove(rc, holder);
+}
+
 void
 zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
 {

From 2b9f8ba6736419d38292bee218f2756997a02c8c Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Sun, 4 Jun 2023 11:14:20 +1000
Subject: [PATCH 28/38] znode: expose zfs_get_zplprop to libzpool

There's no particular reason this function should be kernel-only, and
I want to use it (indirectly) from zdb.

I've moved it to zfs_znode.c because libzpool does not compile in
zfs_vfsops.c, and this at least matches the header it's imported from.

Sponsored-By: Klara, Inc.
Reviewed-by: Tino Reichardt Reviewed-by: WHR Signed-off-by: Rob Norris Closes #14642 --- include/sys/zfs_znode.h | 2 +- module/os/freebsd/zfs/zfs_vfsops.c | 86 ----------------------------- module/os/freebsd/zfs/zfs_znode.c | 87 ++++++++++++++++++++++++++++++ module/os/linux/zfs/zfs_vfsops.c | 85 ----------------------------- module/os/linux/zfs/zfs_znode.c | 85 +++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+), 172 deletions(-) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 012e7403e2a6..2f266f53247e 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -158,6 +158,7 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); #ifdef _KERNEL #include @@ -280,7 +281,6 @@ extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 30851f5273a2..33759fa26169 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -2216,92 +2216,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_NFSV4; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. 
diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index d26d89544e7c..c4f2b722ef4e 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -2069,6 +2069,93 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_NFSV4; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + + void zfs_znode_update_vfs(znode_t *zp) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 48945b8af8c1..6b6293b9e482 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -2052,91 +2052,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. 
- */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index c104cd661bf5..02b1af3edc4f 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -2254,6 +2254,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); From 8653f1de48ee5b41e2bf421785e1279a08c6b903 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 15 Mar 2023 18:18:10 +1100 Subject: [PATCH 29/38] zdb: add -B option to generate backup stream This is more-or-less like `zfs send`, but specifying the snapshot by its objset id for situations where it can't be referenced any other way. 
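Example usage (the pool name and objset ID are placeholders; the
optional flag string mirrors the zfs-send flags, as exercised by the
new test below):

    # Stream objset 54 of rpool as an embedded-block, compressed,
    # large-block send stream. Redirection is required: like zfs send,
    # zdb -B refuses to write a stream to a terminal.
    zdb -B rpool/54 ecL > backup.zfs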
Sponsored-By: Klara, Inc. Reviewed-by: Tino Reichardt Reviewed-by: WHR Signed-off-by: Rob Norris Closes #14642 --- cmd/zdb/zdb.c | 97 ++++++++++++++++++- man/man8/zdb.8 | 25 ++++- module/zfs/dmu_send.c | 3 +- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cli_root/zdb/zdb_backup.ksh | 55 +++++++++++ 6 files changed, 174 insertions(+), 9 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 61f1258f72b9..105d36882291 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,6 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome + * Copyright (c) 2023, Klara Inc. */ #include @@ -789,6 +790,9 @@ usage(void) "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" + "\t%s -B [-e [-V] [-p ...]] [-I ]\n" + "\t\t[-o =]... [-t ] [-U ] [-x ]\n" + "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" @@ -802,7 +806,7 @@ usage(void) "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -825,6 +829,8 @@ usage(void) (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); + (void) fprintf(stderr, " -B --backup " + "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " @@ -4875,6 +4881,81 @@ dump_path(char *ds, char *path, uint64_t *retobj) return (err); } +static int +dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) +{ + const char *p = (const char *)buf; + ssize_t nwritten; + + (void) os; + (void) arg; + + /* Write the data out, handling short writes and signals. 
*/ + while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { + if (nwritten < 0) { + if (errno == EINTR) + continue; + return (errno); + } + p += nwritten; + len -= nwritten; + } + + return (0); +} + +static void +dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) +{ + boolean_t embed = B_FALSE; + boolean_t large_block = B_FALSE; + boolean_t compress = B_FALSE; + boolean_t raw = B_FALSE; + + const char *c; + for (c = flagstr; c != NULL && *c != '\0'; c++) { + switch (*c) { + case 'e': + embed = B_TRUE; + break; + case 'L': + large_block = B_TRUE; + break; + case 'c': + compress = B_TRUE; + break; + case 'w': + raw = B_TRUE; + break; + default: + fprintf(stderr, "dump_backup: invalid flag " + "'%c'\n", *c); + return; + } + } + + if (isatty(STDOUT_FILENO)) { + fprintf(stderr, "dump_backup: stream cannot be written " + "to a terminal\n"); + return; + } + + offset_t off = 0; + dmu_send_outparams_t out = { + .dso_outfunc = dump_backup_bytes, + .dso_dryrun = B_FALSE, + }; + + int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, + large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, + &off, &out); + if (err != 0) { + fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", + strerror(err)); + return; + } +} + static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { @@ -8695,6 +8776,7 @@ main(int argc, char **argv) struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, + {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, @@ -8736,10 +8818,11 @@ main(int argc, char **argv) }; while ((c = getopt_long(argc, argv, - "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", + "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': + case 'B': case 'c': case 'C': case 'd': @@ -8887,7 +8970,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL) + if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -9073,7 +9156,8 @@ main(int argc, char **argv) checkpoint_pool, error); } - } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { + } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || + objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); @@ -9209,7 +9293,10 @@ main(int argc, char **argv) strerror(errno)); } } - if (os != NULL) { + if (dump_opt['B']) { + dump_backup(target, objset_id, + argc > 0 ? argv[0] : NULL); + } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 26c67dabd705..031953c543a1 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd October 7, 2020 +.Dd June 4, 2023 .Dt ZDB 8 .Os . 
@@ -41,6 +41,13 @@ .Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm +.Fl B +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … +.Op Fl U Ar cache +.Op Fl K Ar key +.Ar poolname Ns Ar / Ns Ar objset-ID +.Op Ar backup-flags +.Nm .Fl C .Op Fl A .Op Fl U Ar cache @@ -123,6 +130,22 @@ Display options: Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. +.It Fl B , -backup +Generate a backup stream, similar to +.Nm zfs Cm send , +but for the numeric objset ID, and without opening the dataset. +This can be useful in recovery scenarios if dataset metadata has become +corrupted but the dataset itself is readable. +The optional +.Ar flags +argument is a string of one or more of the letters +.Sy e , +.Sy L , +.Sy c , +and +.Sy w , +which correspond to the same flags in +.Xr zfs-send 8 . .It Fl c , -checksum Verify the checksum of all metadata blocks while printing block statistics .Po see diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b3ebdec6b45c..2d37ed2cdfb5 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1955,7 +1955,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; -#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1964,7 +1964,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } -#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 10525289a3bd..342f56d50d04 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -128,7 +128,7 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] + 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] pre = post = tags = ['functional', 'cli_root', 'zdb'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 129893cd61f3..ff65dc1ac2b0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -572,6 +572,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ + functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh new file mode 100755 index 000000000000..d98ab86ab667 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +write_count=8 +blksize=131072 + +tmpfile=$TEST_BASE_DIR/tmpfile + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm $tmpfile.1 $tmpfile.2 +} + +log_onexit cleanup + +log_assert "Verify that zfs send and zdb -B produce the same stream" + +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count + +snap=$TESTPOOL/$TESTFS@snap +log_must zfs snapshot $snap +typeset -i objsetid=$(zfs get -Ho value objsetid $snap) + +sync_pool $TESTPOOL + +log_must eval "zfs send -ecL $snap > $tmpfile.1" +log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2" + +typeset sum1=$(cat $tmpfile.1 | md5sum) +typeset sum2=$(cat $tmpfile.2 | md5sum) + +log_must test "$sum1" = "$sum2" + +log_pass "zfs send and zdb -B produce the same stream" From bcd5321039c3de29c14eac1068d392c15ad7fe2c Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 6 Jun 2023 21:32:37 +0200 Subject: [PATCH 30/38] Fix the L2ARC write size calculating logic l2arc_write_size() should return the write size after adjusting for trim and overhead of the L2ARC log blocks. Also take into account the allocated size of log blocks when deciding when to stop writing buffers to L2ARC. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14939 --- module/zfs/arc.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a78f664c4fe8..6f68c29fc7f5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -965,7 +965,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -8175,7 +8175,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8192,35 +8192,40 @@ l2arc_write_size(l2arc_dev_t *dev) if (arc_warm == B_FALSE) size += l2arc_write_boost; - /* - * Make sure the write size does not exceed the size of the cache - * device. This is important in l2arc_evict(), otherwise infinite - * iteration can occur. - */ - dev_size = dev->l2ad_end - dev->l2ad_start; - /* We need to add in the worst case scenario of log block overhead. */ - tsize = size + l2arc_log_blk_overhead(size, dev); + size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); } - if (tsize >= dev_size) { + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. 
+	 */
+	if (size >= dev->l2ad_end - dev->l2ad_start) {
 		cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
 		    "plus the overhead of log blocks (persistent L2ARC, "
 		    "%llu bytes) exceeds the size of the cache device "
 		    "(guid %llu), resetting them to the default (%d)",
 		    (u_longlong_t)l2arc_log_blk_overhead(size, dev),
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+
 		size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
 
 		if (arc_warm == B_FALSE)
 			size += l2arc_write_boost;
+
+		size += l2arc_log_blk_overhead(size, dev);
+		if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
+			size += MAX(64 * 1024 * 1024,
+			    (size * l2arc_trim_ahead) / 100);
+		}
 	}
 
 	return (size);
@@ -9413,8 +9418,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 			 * arcstat_l2_{size,asize} kstats are updated
 			 * internally.
 			 */
-			if (l2arc_log_blk_insert(dev, hdr))
-				l2arc_log_blk_commit(dev, pio, cb);
+			if (l2arc_log_blk_insert(dev, hdr)) {
+				/*
+				 * l2ad_hand has been accounted for in
+				 * l2arc_log_blk_commit().
+				 */
+				write_asize +=
+				    l2arc_log_blk_commit(dev, pio, cb);
+			}
 
 			zio_nowait(wzio);
 		}
@@ -10564,7 +10575,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
 * This function allocates some memory to temporarily hold the serialized
 * buffer to be written. This is then released in l2arc_write_done.
 */
-static void
+static uint64_t
l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
{
	l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
@@ -10675,6 +10686,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
	dev->l2ad_log_ent_idx = 0;
	dev->l2ad_log_blk_payload_asize = 0;
	dev->l2ad_log_blk_payload_start = 0;
+
+	return (asize);
 }
 
 /*

From 93f8abeff08e9c4363ec4d53d501cf21830c95e1 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 7 Jun 2023 10:43:43 -0700
Subject: [PATCH 31/38] Linux: Never sleep in kmem_cache_alloc(..., KM_NOSLEEP)
 (#14926)

When a kmem cache is exhausted and needs to be expanded a new
slab is allocated. KM_SLEEP callers can block and wait for the
allocation, but KM_NOSLEEP callers were incorrectly allowed to
block as well.

Resolve this by attempting an emergency allocation as a best
effort. This may fail but that's fine since any KM_NOSLEEP consumer
is required to handle an allocation failure.

Signed-off-by: Brian Behlendorf
Reviewed-by: Adam Moss
Reviewed-by: Brian Atkinson
Reviewed-by: Richard Yao
Reviewed-by: Tony Hutter
---
 module/os/linux/spl/spl-kmem-cache.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
index 745d03012f9d..3c30dfc577b4 100644
--- a/module/os/linux/spl/spl-kmem-cache.c
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -1015,9 +1015,19 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
-	might_sleep();
+
 	*obj = NULL;
 
+	/*
+	 * Since we can't sleep, attempt an emergency allocation to satisfy
+	 * the request. The only alternative is to fail the allocation, but
+	 * it's preferable to try. The use of KM_NOSLEEP is expected to be rare.
+	 */
+	if (flags & KM_NOSLEEP)
+		return (spl_emergency_alloc(skc, flags, obj));
+
+	might_sleep();
+
 	/*
 	 * Before allocating a new slab wait for any reaping to complete and
 	 * then return so the local magazine can be rechecked for new objects.
From 6c962690245a6a2a4dfc2350c71a249641139c26 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Wed, 7 Jun 2023 14:14:05 -0400 Subject: [PATCH 32/38] Revert "systemd: Use non-absolute paths in Exec* lines" This reverts commit 79b20949b25c8db4d379f6486b0835a6613b480c since it doesn't work with the systemd version shipped with RHEL7-based systems. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14943 Closes #14945 --- etc/systemd/system/zfs-import-cache.service.in | 2 +- etc/systemd/system/zfs-import-scan.service.in | 2 +- etc/systemd/system/zfs-mount.service.in | 2 +- etc/systemd/system/zfs-scrub@.service.in | 10 +++++----- etc/systemd/system/zfs-share.service.in | 2 +- etc/systemd/system/zfs-trim@.service.in | 10 +++++----- etc/systemd/system/zfs-volume-wait.service.in | 2 +- etc/systemd/system/zfs-zed.service.in | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in index 6d9a065e7e3a..fd822989da93 100644 --- a/etc/systemd/system/zfs-import-cache.service.in +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -15,7 +15,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS +ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in index fb524f3b0889..c5dd45d87e68 100644 --- a/etc/systemd/system/zfs-import-scan.service.in +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS +ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index fc4e1c49f1c5..66d894923f4a 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -12,7 +12,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zfs mount -a +ExecStart=@sbindir@/zfs mount -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-scrub@.service.in b/etc/systemd/system/zfs-scrub@.service.in index 2bb2757d5e97..8ffffeb0cf6c 100644 --- a/etc/systemd/system/zfs-scrub@.service.in +++ b/etc/systemd/system/zfs-scrub@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=sh -c '\ -if zpool status %i | grep -q "scrub in progress"; then\ -exec zpool wait -t scrub %i;\ -else exec zpool scrub -w %i; fi' -ExecStop=-sh -c 'zpool scrub -p %i 2>/dev/null || true' +ExecStart=/bin/sh -c '\ +if @sbindir@/zpool status %i | grep -q "scrub in progress"; then\ +exec @sbindir@/zpool wait -t scrub %i;\ +else exec @sbindir@/zpool scrub -w %i; fi' +ExecStop=-/bin/sh -c '@sbindir@/zpool scrub -p %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index dd321f490fe6..1a6342a06fec 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -14,7 +14,7 
@@ ConditionPathIsDirectory=/sys/module/zfs
 Type=oneshot
 RemainAfterExit=yes
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=zfs share -a
+ExecStart=@sbindir@/zfs share -a
 
 [Install]
 WantedBy=zfs.target
diff --git a/etc/systemd/system/zfs-trim@.service.in b/etc/systemd/system/zfs-trim@.service.in
index f55e36cd8454..423fb448c16f 100644
--- a/etc/systemd/system/zfs-trim@.service.in
+++ b/etc/systemd/system/zfs-trim@.service.in
@@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs
 
 [Service]
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=sh -c '\
-if zpool status %i | grep -q "(trimming)"; then\
-exec zpool wait -t trim %i;\
-else exec zpool trim -w %i; fi'
-ExecStop=-sh -c 'zpool trim -s %i 2>/dev/null || true'
+ExecStart=/bin/sh -c '\
+if @sbindir@/zpool status %i | grep -q "(trimming)"; then\
+exec @sbindir@/zpool wait -t trim %i;\
+else exec @sbindir@/zpool trim -w %i; fi'
+ExecStop=-/bin/sh -c '@sbindir@/zpool trim -s %i 2>/dev/null || true'
diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in
index a86a3561e032..110c0f5f52ee 100644
--- a/etc/systemd/system/zfs-volume-wait.service.in
+++ b/etc/systemd/system/zfs-volume-wait.service.in
@@ -9,7 +9,7 @@ ConditionPathIsDirectory=/sys/module/zfs
 Type=oneshot
 RemainAfterExit=yes
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=zvol_wait
+ExecStart=@bindir@/zvol_wait
 
 [Install]
 WantedBy=zfs-volumes.target
diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in
index ac58ad3eff7b..be2fc67348f9 100644
--- a/etc/systemd/system/zfs-zed.service.in
+++ b/etc/systemd/system/zfs-zed.service.in
@@ -5,7 +5,7 @@ ConditionPathIsDirectory=/sys/module/zfs
 
 [Service]
 EnvironmentFile=-@initconfdir@/zfs
-ExecStart=zed -F
+ExecStart=@sbindir@/zed -F
 Restart=always
 
 [Install]

From 55b1842f92a24fe7192d129bca7b60882080d31a Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Fri, 9 Jun 2023 13:08:05 -0400
Subject: [PATCH 33/38] ZIL: Fix race introduced by f63811f0721.

We are not allowed to access lwb after setting LWB_STATE_FLUSH_DONE
state and dropping zl_lock, since it may be freed by zil_sync().
To free itxs and waiters after dropping the lock, we need to move the
lwb_itxs and lwb_waiters list elements to local storage.

Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14957 Closes #14959 --- module/zfs/zil.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8672a61387a5..8c1fe5f66838 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1393,9 +1393,14 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zil_commit_waiter_t *zcw; itx_t *itx; uint64_t txg; + list_t itxs, waiters; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1404,9 +1409,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_root_zio = NULL; - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1417,15 +1419,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } + list_move_tail(&itxs, &lwb->lwb_itxs); + list_move_tail(&waiters, &lwb->lwb_waiters); + + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + mutex_exit(&zilog->zl_lock); - while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) + while ((itx = list_remove_head(&itxs)) != NULL) zil_itx_destroy(itx); + list_destroy(&itxs); - while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been @@ -1450,6 +1458,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } + list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; From b3ad3f48d9d215ce9bea1090d86ced17862ea441 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 13:12:52 -0400 Subject: [PATCH 34/38] Use list_remove_head() where possible. ... instead of list_head() + list_remove(). On FreeBSD the list functions are not inlined, so in addition to more compact code this also saves another function call. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14955 --- cmd/zed/agents/zfs_agents.c | 7 ++----- cmd/zed/agents/zfs_mod.c | 7 ++----- module/os/freebsd/zfs/zfs_acl.c | 4 +--- module/os/linux/zfs/zfs_acl.c | 4 +--- module/zfs/arc.c | 13 +++---------- module/zfs/bplist.c | 8 ++------ module/zfs/dmu_objset.c | 3 +-- module/zfs/dmu_tx.c | 3 +-- module/zfs/dsl_dataset.c | 3 +-- module/zfs/dsl_dir.c | 3 +-- module/zfs/dsl_scan.c | 4 +--- module/zfs/fm.c | 3 +-- module/zfs/refcount.c | 7 ++----- module/zfs/spa.c | 6 +++--- module/zfs/spa_misc.c | 3 +-- module/zfs/vdev_indirect.c | 11 +++++------ module/zfs/zfs_fm.c | 3 +-- module/zfs/zfs_fuid.c | 8 ++------ module/zfs/zfs_onexit.c | 3 +-- module/zfs/zvol.c | 7 ++----- 20 files changed, 34 insertions(+), 76 deletions(-) diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index a2daa77a61fe..8fabb8d081a5 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -369,9 +369,7 @@ zfs_agent_consumer_thread(void *arg) return (NULL); } - if ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); - + if ((event = list_remove_head(&agent_events)) != NULL) { (void) pthread_mutex_unlock(&agent_lock); /* dispatch to all event subscribers */ @@ -434,8 +432,7 @@ zfs_agent_fini(void) (void) pthread_join(g_agents_tid, NULL); /* drain any pending events */ - while ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); + while ((event = list_remove_head(&agent_events)) != NULL) { nvlist_free(event->ae_nvl); free(event); } diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 1c82bd4f0010..b07a02712295 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -1288,17 +1288,14 @@ zfs_slm_fini(void) tpool_destroy(g_tpool); } - while ((pool = (list_head(&g_pool_list))) != NULL) { - list_remove(&g_pool_list, pool); + while ((pool = list_remove_head(&g_pool_list)) != NULL) { zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); - while ((device = (list_head(&g_device_list))) != NULL) { - list_remove(&g_device_list, device); + while ((device = list_remove_head(&g_device_list)) != NULL) free(device); - } list_destroy(&g_device_list); libzfs_fini(g_zfshdl); diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index a077076927a1..20466aeaaa05 100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -495,10 +495,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index ff26f47f2e04..a1fd3c9856cc 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6f68c29fc7f5..dcd4620fcd20 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7866,8 +7866,7 @@ arc_fini(void) taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); + while ((p = 
list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); @@ -8324,20 +8323,14 @@ l2arc_dev_get_next(void) static void l2arc_do_free_on_write(void) { - list_t *buflist; - l2arc_data_free_t *df, *df_prev; + l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); + while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); - list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } - mutex_exit(&l2arc_free_on_write_mtx); } diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index 1c1f7892bb7d..da7360f8ce10 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { + while ((bpe = list_remove_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); + while ((bpe = list_remove_head(&bpl->bpl_list))) kmem_free(bpe, sizeof (*bpe)); - } mutex_exit(&bpl->bpl_lock); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c19ebf424953..778b18817eef 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1755,9 +1755,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { + while ((dr = list_remove_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); zio_nowait(dr->dr_zio); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index c4e274bd4c42..0eb8c17e331a 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1396,8 +1396,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; - while ((dcb = list_tail(cb_list)) != NULL) { - list_remove(cb_list, dcb); + while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 14e7ced4007c..d6db61729223 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3782,8 +3782,7 @@ snaplist_destroy(list_t *l, const void *tag) if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); + while ((snap = list_remove_tail(l)) != NULL) { dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index eac9828a204a..bbe6a03d620f 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1490,7 +1490,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) if (tr_cookie == NULL) return; - while ((tr = list_head(tr_list)) != NULL) { + while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1500,7 +1500,6 @@ 
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) } else { arc_tempreserve_clear(tr->tr_size); } - list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 9ee719a5eef6..1dd44171c10e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3437,10 +3437,8 @@ scan_io_queues_run_one(void *arg) * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); + while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); - } queue->q_zio = NULL; mutex_exit(q_lock); diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 76956572f8bd..77d87b694a43 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev) list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_head(&ev->ev_ze_list)) != NULL) { - list_remove(&ev->ev_ze_list, ze); + while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { ze->ze_zevent = NULL; ze->ze_dropped++; } diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index c9a504f67451..601d27f8c47a 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -88,14 +88,11 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) reference_t *ref; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_head(&rc->rc_list))) { - list_remove(&rc->rc_list, ref); + while ((ref = list_remove_head(&rc->rc_list))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_list); - while ((ref = list_head(&rc->rc_removed))) { - list_remove(&rc->rc_removed, ref); + while ((ref = list_remove_head(&rc->rc_removed))) { kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 27bbb8f09259..88ee4ea9f458 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1609,16 +1609,16 @@ spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; + log_summary_entry_t *e; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); - list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 89e1ce7165db..014c539eb683 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -814,8 +814,7 @@ spa_remove(spa_t *spa) if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); + while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index a16ad2f4e7cf..89667585345d 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio) indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { + while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < 
is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } - list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; list_destroy(&is->is_unique_child); @@ -1659,8 +1658,8 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; is->is_unique_children = 0; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index bdd0e96c327a..c42ef048dd74 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1522,9 +1522,8 @@ zfs_ereport_fini(void) { recent_events_node_t *entry; - while ((entry = list_head(&recent_events_list)) != NULL) { + while ((entry = list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); - list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 44aaae9c1264..add4241dcc99 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -699,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); + while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL) kmem_free(zfuid, sizeof (zfs_fuid_t)); - } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char *)) * fuidp->z_domain_cnt); - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); + while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL) kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c index 63acf7ab2e4d..7bf804b67790 100644 --- a/module/zfs/zfs_onexit.c +++ b/module/zfs/zfs_onexit.c @@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo) zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); + while ((ap = list_remove_head(&zo->zo_actions)) != NULL) { mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 06bc75c634a6..cd4e6f0c7558 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1203,8 +1203,7 @@ zvol_create_minors_recursive(const char *name) * Prefetch is completed, we can do zvol_os_create_minor * sequentially. 
*/ - while ((job = list_head(&minors_list)) != NULL) { - list_remove(&minors_list, job); + while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); @@ -1311,10 +1310,8 @@ zvol_remove_minors_impl(const char *name) rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_head(&free_list)) != NULL) { - list_remove(&free_list, zv); + while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); - } } /* Remove minor for this specific volume only */ From 90ccfd426d9c9152c3f054d1335ab245d794d974 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 13:14:05 -0400 Subject: [PATCH 35/38] Improve l2arc reporting in arc_summary. - Do not report L2ARC as FAULTED in the presence of in-flight writes. - Report read and write I/Os, bytes and errors. - Remove a few numbers not important to the average user. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #12304 Closes #14946 --- cmd/arc_summary | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 5d10e903fcba..6b29a611dab3 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -842,7 +842,8 @@ def section_l2arc(kstats_dict): ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), - ('I/O errors:', 'l2_io_error')) + ('Read errors:', 'l2_io_error'), + ('Write errors:', 'l2_writes_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) @@ -878,28 +879,20 @@ def section_l2arc(kstats_dict): prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) - prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() - print('L2ARC writes:') - - if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: - prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) - prt_i2('Done ratio:', - f_perc(arc_stats['l2_writes_done'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_done'])) - prt_i2('Error ratio:', - f_perc(arc_stats['l2_writes_error'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_error'])) - else: - prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + print('L2ARC I/O:') + prt_i2('Reads:', + f_bytes(arc_stats['l2_read_bytes']), + f_hits(arc_stats['l2_hits'])) + prt_i2('Writes:', + f_bytes(arc_stats['l2_write_bytes']), + f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') - prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) - prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) + prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) + prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) print() From 6db4ed51d6c2cd8ec6a3ad318118f7a0d6b6dfe8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 9 Jun 2023 11:10:01 -0700 Subject: [PATCH 36/38] ZTS: Skip checkpoint_discard_busy Until the ASSERT which is occasionally hit while running checkpoint_discard_busy is resolved, skip this test case.
Signed-off-by: Brian Behlendorf Issue #12053 Closes #14952 --- tests/test-runner/bin/zts-report.py.in | 1 + .../functional/pool_checkpoint/checkpoint_discard_busy.ksh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ef1a46dca72a..9517ce8073a5 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -152,6 +152,7 @@ known = { ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], + 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index f970935f5bd0..087aef9027ea 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -38,6 +38,8 @@ verify_runnable "global" +log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" + function test_cleanup { # reset memory limit to 16M From 70ea484e3ec56c529c6c5027ffc43840100ce224 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 15:40:55 -0400 Subject: [PATCH 37/38] Finally drop the long-disabled vdev cache. It was a vdev-level read cache, designed to aggregate many small reads by speculatively issuing bigger reads instead and caching the result. But since it had almost no idea of what was going on, with the exception of the ZIO_FLAG_DONT_CACHE flag set by higher layers, it was found to do more harm than good, for which reason it has been disabled for the past 12 years. These days we have much better instruments to enlarge the I/Os, such as speculative and prescient prefetches, the I/O scheduler, I/O aggregation, etc. Besides the dead code removal, this also eliminates one extra mutex lock/unlock per write inside vdev_cache_write(), which was not otherwise disabled and still tried to do some work. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
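The per-write overhead in question is visible in the deleted vdev_cache_write() further below; abridged, every write through a leaf vdev paid for this even with zfs_vdev_cache_size=0:

	mutex_enter(&vc->vc_lock);
	ve_search.ve_offset = min_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
	/* ... update or invalidate any overlapping cache lines ... */
	mutex_exit(&vc->vc_lock);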
Closes #14953 --- cmd/arc_summary | 35 -- cmd/zdb/zdb.c | 7 +- include/os/linux/kernel/linux/mod_compat.h | 1 - include/sys/spa.h | 4 - include/sys/vdev.h | 6 - include/sys/vdev_impl.h | 20 - include/sys/zio.h | 1 - lib/libzpool/Makefile.am | 1 - man/man4/zfs.4 | 15 - man/man8/zpool-events.8 | 1 - module/Kbuild.in | 1 - module/Makefile.bsd | 1 - module/os/freebsd/zfs/sysctl_os.c | 2 - module/zfs/arc.c | 11 +- module/zfs/dmu_recv.c | 4 +- module/zfs/spa_misc.c | 2 - module/zfs/vdev.c | 7 +- module/zfs/vdev_cache.c | 436 --------------------- module/zfs/vdev_queue.c | 5 +- module/zfs/zio.c | 15 +- 20 files changed, 13 insertions(+), 562 deletions(-) delete mode 100644 module/zfs/vdev_cache.c diff --git a/cmd/arc_summary b/cmd/arc_summary index 6b29a611dab3..426e0207052d 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -64,7 +64,6 @@ SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats - 'vdev': 'vdev_cache_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} @@ -90,8 +89,6 @@ if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl - VDEV_CACHE_SIZE = 'vdev.cache_size' - def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE @@ -135,8 +132,6 @@ elif sys.platform.startswith('linux'): SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' - VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' - def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: @@ -952,35 +947,6 @@ def section_tunables(*_): print() -def section_vdev(kstats_dict): - """Collect information on VDEV caches""" - - # Currently [Nov 2017] the VDEV cache is disabled, because it is actually - # harmful. When this is the case, we just skip the whole entry. See - # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c - # for details - tunables = get_vdev_params() - - if tunables[VDEV_CACHE_SIZE] == '0': - print('VDEV cache disabled, skipping section\n') - return - - vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) - - vdev_cache_total = int(vdev_stats['hits']) +\ - int(vdev_stats['misses']) +\ - int(vdev_stats['delegations']) - - prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) - prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), - f_hits(vdev_stats['hits'])) - prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), - f_hits(vdev_stats['misses'])) - prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), - f_hits(vdev_stats['delegations'])) - print() - - def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. 
Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h @@ -1008,7 +974,6 @@ section_calls = {'arc': section_arc, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, - 'vdev': section_vdev, 'zil': section_zil} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 105d36882291..04a10c4eedd7 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8546,9 +8546,9 @@ zdb_read_block(char *thing, spa_t *spa) */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | - ZIO_FLAG_OPTIONAL, NULL, NULL)); + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, + NULL, NULL)); } error = zio_wait(zio); @@ -8642,7 +8642,6 @@ zdb_read_block(char *thing, spa_t *spa) zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 09d109d191bf..8e20a9613539 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -68,7 +68,6 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, - zfs_vdev_cache, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/include/sys/spa.h b/include/sys/spa.h index ed752967cca6..1fa2044008dc 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1174,10 +1174,6 @@ extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index d529bbcdd9a4..26c834ff57cf 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -158,12 +158,6 @@ extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index ea3043c82a39..74b3737d8ee5 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -57,8 +57,6 @@ extern "C" { * Forward declarations that lots of things need. 
*/ typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; @@ -132,23 +130,6 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - clock_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - typedef struct vdev_queue_class { uint32_t vqc_active; @@ -443,7 +424,6 @@ struct vdev { boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 695bc09e6cb7..6b1352a72b9a 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -190,7 +190,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) -#define ZIO_FLAG_DONT_CACHE (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index ceac2963e647..58d7f07527aa 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -135,7 +135,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/uberblock.c \ module/zfs/unique.c \ module/zfs/vdev.c \ - module/zfs/vdev_cache.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ module/zfs/vdev_indirect.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 9ec940a94488..5fbd9d7db93f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2028,21 +2028,6 @@ Max vdev I/O aggregation size. .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . -.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint -Shift size to inflate reads to. -. -.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint -Inflate reads smaller than this value to meet the -.Sy zfs_vdev_cache_bshift -size -.Pq default Sy 64 KiB . -. -.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint -Total size of the per-disk cache in bytes. -.Pp -Currently this feature is disabled, as it has been found to not be helpful -for performance and in some cases harmful. -. 
.It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 0ba93e4166e7..341f902fe66e 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -456,7 +456,6 @@ ZIO_FLAG_CANFAIL:0x00000080 ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 -ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 diff --git a/module/Kbuild.in b/module/Kbuild.in index 29a55c9778b1..485331ac655e 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -382,7 +382,6 @@ ZFS_OBJS := \ uberblock.o \ unique.o \ vdev.o \ - vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 9464223f6ca6..0c4d8bfe1159 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -308,7 +308,6 @@ SRCS+= abd.c \ uberblock.c \ unique.c \ vdev.c \ - vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_indirect.c \ diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index cc616f33db96..8ae2f23c3ecf 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -872,8 +872,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, "Enable to bypass vdev_validate()."); /* END CSTYLED */ -/* vdev_cache.c */ - /* vdev_mirror.c */ /* vdev_queue.c */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dcd4620fcd20..3dbaaa76b4a5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6106,8 +6106,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -10177,8 +10176,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10498,11 +10496,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index c22a95f8647f..2fdd7c1ece73 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1371,8 +1371,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; - zio_flag_t flags = ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | 
ZIO_FLAG_CANFAIL; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 014c539eb683..9ef948e9e434 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2438,7 +2438,6 @@ spa_init(spa_mode_t mode) zio_init(); dmu_init(); zil_init(); - vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); @@ -2462,7 +2461,6 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); - vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 58dcd9f79799..612e66c3a8a8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -715,7 +715,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); - vdev_cache_init(vd); return (vd); } @@ -1096,7 +1095,6 @@ vdev_free(vdev_t *vd) * Clean up vdev structure. */ vdev_queue_fini(vd); - vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1720,8 +1718,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -2612,8 +2609,6 @@ vdev_close(vdev_t *vd) vd->vdev_ops->vdev_op_close(vd); - vdev_cache_purge(vd); - /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c deleted file mode 100644 index f0a17600d58e..000000000000 --- a/module/zfs/vdev_cache.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. 
In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A further enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). 
- */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - zio_t *fio; - uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zfs_vdev_cache_size == 0) - return (B_FALSE); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. 
- */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. - */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, 
ZMOD_RW, - "Inflate reads small than max"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD, - "Total size of the per-disk cache"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW, - "Shift size to inflate reads too"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 1a75d68abd9e..abb7d0662b8c 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -748,8 +748,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -907,7 +906,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c17ca5e1d651..d7b2217623e6 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1617,12 +1617,6 @@ zio_read_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -3955,9 +3949,6 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (zio); - if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -3994,9 +3985,6 @@ zio_vdev_io_done(zio_t *zio) vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); @@ -4106,8 +4094,7 @@ zio_vdev_io_assess(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); From feff9dfed3df1bbae5dd74959a6ad87d11f27ffb Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Sat, 10 Jun 2023 02:05:47 +0200 Subject: [PATCH 38/38] Fix the L2ARC write size calculating logic (2) While commit bcd5321 adjusts the write size based on the size of the log block, this happens after comparing the unadjusted write size to the evicted (target) size. In this case l2ad_hand will exceed l2ad_evict and violate an assertion at the end of l2arc_write_buffers(). Fix this by adding the max log block size to the allocated size of the buffer to be committed before comparing the result to the target size. Also reset the l2arc_trim_ahead ZFS module variable when the adjusted write size exceeds the size of the L2ARC device. 
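The corrected bound, abridged from the l2arc_write_buffers() hunk below (write_asize is the running total committed so far, asize the allocated size of the current buffer):

	/*
	 * Reserve room for the pending log block before comparing
	 * against the eviction target, so l2ad_hand cannot pass
	 * l2ad_evict once the log block is committed.
	 */
	if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) {
		full = B_TRUE;
		mutex_exit(hash_lock);
		break;
	}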
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14936 Closes #14954 --- module/zfs/arc.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3dbaaa76b4a5..a23715309f2b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8206,7 +8206,7 @@ l2arc_write_size(l2arc_dev_t *dev) * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - if (size >= dev->l2ad_end - dev->l2ad_start) { + if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -8216,6 +8216,11 @@ l2arc_write_size(l2arc_dev_t *dev) size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + if (l2arc_trim_ahead > 1) { + cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); + l2arc_trim_ahead = 1; + } + if (arc_warm == B_FALSE) size += l2arc_write_boost; @@ -8842,7 +8847,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9036,7 +9041,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9296,7 +9301,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -9412,7 +9423,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) */ if (l2arc_log_blk_insert(dev, hdr)) { /* - * l2ad_hand has been accounted for in + * l2ad_hand will be adjusted in * l2arc_log_blk_commit(). */ write_asize +=