Notable upstream pull request merges:
 #15290 54b1b1d89 import: require force when cachefile hostid doesn't
                  match on-disk
 #15319 342357cd9 Reduce number of metaslab preload taskq threads
 #15340 2a6c62109 ARC: Remove b_cv from struct l1arc_buf_hdr
 #15347 75a2eb7fa ARC: Drop different size headers for crypto
 #15350 96b9cf42e ARC: Remove b_bufcnt/b_ebufcnt from ARC headers
 #15353 66b81b349 ZIL: Reduce maximum size of WR_COPIED to 7.5K
 #15362 5b8688e62 zfsconcepts: add description of block cloning

Obtained from:	OpenZFS
OpenZFS commit:	66b81b3497
Martin Matuska 2023-10-08 09:43:15 +02:00
commit b2526e8bfe
25 changed files with 512 additions and 332 deletions

View file

@ -0,0 +1,21 @@
env:
CIRRUS_CLONE_DEPTH: 1
ARCH: amd64
build_task:
matrix:
freebsd_instance:
image_family: freebsd-12-4
freebsd_instance:
image_family: freebsd-13-2
freebsd_instance:
image_family: freebsd-14-0-snap
prepare_script:
- pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py39-packaging py39-cffi py39-sysctl
configure_script:
- env MAKE=gmake ./autogen.sh
- env MAKE=gmake ./configure --with-config="user" --with-python=3.9
build_script:
- gmake -j `sysctl -n kern.smp.cpus`
install_script:
- gmake install

View file

@ -42,6 +42,7 @@
!udev/**
!.editorconfig
!.cirrus.yml
!.gitignore
!.gitmodules
!AUTHORS
@ -60,7 +61,6 @@
!TEST
!zfs.release.in
#
# Normal rules
#

View file

@ -3122,12 +3122,21 @@ zfs_force_import_required(nvlist_t *config)
nvlist_t *nvinfo;
state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
/*
* The hostid on LOAD_INFO comes from the MOS label via
* spa_tryimport(). If it's not there then we're likely talking to an
* older kernel, so use the top one, which will be from the label
* discovered in zpool_find_import(), or if a cachefile is in use, the
* local hostid.
*/
if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0)
nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid())
return (B_TRUE);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) {
mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_MMP_STATE);
@ -3198,7 +3207,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
time_t timestamp = 0;
uint64_t hostid = 0;
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(nvinfo,
ZPOOL_CONFIG_HOSTNAME);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(config,
ZPOOL_CONFIG_HOSTNAME);
@ -3206,7 +3218,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
timestamp = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_TIMESTAMP);
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_HOSTID);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_HOSTID);

View file

@ -358,6 +358,9 @@ AC_DEFUN([ZFS_AC_RPM], [
AS_IF([test -n "$udevruledir" ], [
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"'
])
AS_IF([test -n "$bashcompletiondir" ], [
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"'
])
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)'
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)'
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)'

View file

@ -51,7 +51,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_bufcnt)
__field(arc_buf_contents_t, hdr_type)
__field(uint16_t, hdr_psize)
__field(uint16_t, hdr_lsize)
@ -70,7 +69,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
__entry->hdr_flags = ab->b_flags;
__entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt;
__entry->hdr_psize = ab->b_psize;
__entry->hdr_lsize = ab->b_lsize;
__entry->hdr_spa = ab->b_spa;
@ -84,12 +82,12 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu "
"flags 0x%x type %u psize %u lsize %u spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize,
__entry->hdr_type, __entry->hdr_psize,
__entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
__entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits,
@ -192,7 +190,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_bufcnt)
__field(arc_buf_contents_t, hdr_type)
__field(uint16_t, hdr_psize)
__field(uint16_t, hdr_lsize)
@ -223,7 +220,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
__entry->hdr_flags = hdr->b_flags;
__entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt;
__entry->hdr_psize = hdr->b_psize;
__entry->hdr_lsize = hdr->b_lsize;
__entry->hdr_spa = hdr->b_spa;
@ -255,7 +251,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_blkid = zb->zb_blkid;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u "
"flags 0x%x psize %u lsize %u spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@ -264,7 +260,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize,
__entry->hdr_psize, __entry->hdr_lsize,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,

View file

@ -159,10 +159,6 @@ struct arc_write_callback {
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
/* for waiting on reads to complete */
kcondvar_t b_cv;
uint8_t b_byteswap;
/* protected by arc state mutex */
arc_state_t *b_state;
multilist_node_t b_arc_node;
@ -173,7 +169,7 @@ typedef struct l1arc_buf_hdr {
uint32_t b_mru_ghost_hits;
uint32_t b_mfu_hits;
uint32_t b_mfu_ghost_hits;
uint32_t b_bufcnt;
uint8_t b_byteswap;
arc_buf_t *b_buf;
/* self protecting */
@ -436,12 +432,12 @@ typedef struct l2arc_dev {
*/
typedef struct arc_buf_hdr_crypt {
abd_t *b_rabd; /* raw encrypted data */
dmu_object_type_t b_ot; /* object type */
uint32_t b_ebufcnt; /* count of encrypted buffers */
/* dsobj for looking up encryption key for l2arc encryption */
uint64_t b_dsobj;
dmu_object_type_t b_ot; /* object type */
/* encryption parameters */
uint8_t b_salt[ZIO_DATA_SALT_LEN];
uint8_t b_iv[ZIO_DATA_IV_LEN];

View file

@ -250,7 +250,6 @@ struct metaslab_group {
int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;

View file

@ -424,7 +424,9 @@ struct spa {
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */
uint64_t spa_multihost; /* multihost aware (mmp) */
mmp_thread_t spa_mmp; /* multihost mmp thread */
list_t spa_leaf_list; /* list of leaf vdevs */
@ -448,8 +450,6 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */
taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */
};
extern char *spa_config_path;

View file

@ -402,6 +402,12 @@ Practical upper limit of total metaslabs per top-level vdev.
.It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable metaslab group preloading.
.
.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint
Maximum number of metaslabs per group to preload.
.
.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint
Percentage of CPUs to run a metaslab preload taskq.
.
.It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Give more weight to metaslabs with lower LBAs,
assuming they have greater bandwidth,
@ -2144,6 +2150,11 @@ On very fragmented pools, lowering this
.Pq typically to Sy 36 KiB
can improve performance.
.
.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint
This sets the maximum number of write bytes logged via WR_COPIED.
It trades an additional memory copy and possibly worse log space efficiency
against an additional range lock/unlock.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds for which the ZIL will bother to
delay a block commit, waiting for more records.

View file

@ -28,8 +28,9 @@
.\" Copyright 2019 Richard Laager. All rights reserved.
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\" Copyright 2023 Klara, Inc.
.\"
.Dd June 30, 2019
.Dd October 6, 2023
.Dt ZFSCONCEPTS 7
.Os
.
@ -205,3 +206,40 @@ practices, such as regular backups.
Consider using the
.Sy compression
property as a less resource-intensive alternative.
.Ss Block cloning
Block cloning is a facility that allows a file (or parts of a file) to be
.Qq cloned ,
that is, a shallow copy made where the existing data blocks are referenced
rather than copied.
Later modifications to the data will cause a copy of the data block to be taken
and that copy modified.
This facility is used to implement
.Qq reflinks
or
.Qq file-level copy-on-write .
.Pp
Cloned blocks are tracked in a special on-disk structure called the Block
Reference Table
.Po BRT
.Pc .
Unlike deduplication, this table has minimal overhead, so it can be enabled at
all times.
.Pp
Also unlike deduplication, cloning must be requested by a user program.
Many common file copying programs, including newer versions of
.Nm /bin/cp ,
will try to create clones automatically.
Look for
.Qq clone ,
.Qq dedupe
or
.Qq reflink
in the documentation for more information.
.Pp
There are some limitations to block cloning.
Only whole blocks can be cloned, and blocks cannot be cloned if they are not
yet written to disk, if they are encrypted, or if the source and destination
.Sy recordsize
properties differ.
The OS may add additional restrictions;
for example, most versions of Linux will not allow clones across datasets.
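For illustration, a user program might request a clone along these lines (a
minimal sketch, not part of this change; the use of copy_file_range(2) is an
assumption about how a copier asks for one, and whether the kernel services it
via the BRT depends on the OS, the pool, and the constraints above):

#define _GNU_SOURCE	/* needed for copy_file_range() with glibc; harmless on FreeBSD */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int srcfd = open(argv[1], O_RDONLY);
	if (srcfd == -1) {
		perror("open src");
		return (1);
	}

	struct stat st;
	if (fstat(srcfd, &st) == -1) {
		perror("fstat");
		return (1);
	}

	int dstfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (dstfd == -1) {
		perror("open dst");
		return (1);
	}

	/*
	 * Let the kernel do the copy.  On OpenZFS this may be satisfied by
	 * block cloning (BRT) when the source blocks are already on disk,
	 * aligned, and cloning is permitted; otherwise it degrades to an
	 * ordinary copy with the same result.
	 */
	off_t left = st.st_size;
	while (left > 0) {
		ssize_t n = copy_file_range(srcfd, NULL, dstfd, NULL,
		    (size_t)left, 0);
		if (n == -1) {
			perror("copy_file_range");
			return (1);
		}
		if (n == 0)
			break;
		left -= n;
	}

	close(srcfd);
	close(dstfd);
	return (0);
}

On Linux, a whole-file clone can also be requested explicitly with the FICLONE
ioctl, which returns an error instead of falling back to a plain copy when
cloning is not possible.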

View file

@ -614,28 +614,6 @@ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct,
" space map to continue allocations in a first-fit fashion");
/* END CSTYLED */
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
extern int metaslab_load_pct;
/* BEGIN CSTYLED */
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct,
CTLFLAG_RWTUN, &metaslab_load_pct, 0,
"Percentage of cpus that can be used by the metaslab taskq");
/* END CSTYLED */
/*
* Max number of metaslabs per group to preload.
*/
extern uint_t metaslab_preload_limit;
/* BEGIN CSTYLED */
SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit,
CTLFLAG_RWTUN, &metaslab_preload_limit, 0,
"Max number of metaslabs per group to preload");
/* END CSTYLED */
/* mmp.c */
int

View file

@ -748,8 +748,7 @@ taskq_t *arc_prune_taskq;
* Other sizes
*/
#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
@ -1113,7 +1112,6 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
*/
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_full_crypt_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
@ -1134,7 +1132,6 @@ buf_fini(void)
for (int i = 0; i < BUF_LOCKS; i++)
mutex_destroy(BUF_HASH_LOCK(i));
kmem_cache_destroy(hdr_full_cache);
kmem_cache_destroy(hdr_full_crypt_cache);
kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@ -1151,7 +1148,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
memset(hdr, 0, HDR_FULL_SIZE);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1163,19 +1159,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
return (0);
}
static int
hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
{
(void) unused;
arc_buf_hdr_t *hdr = vbuf;
hdr_full_cons(vbuf, unused, kmflag);
memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr));
arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
return (0);
}
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
@ -1211,7 +1194,6 @@ hdr_full_dest(void *vbuf, void *unused)
arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
cv_destroy(&hdr->b_l1hdr.b_cv);
zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
@ -1220,16 +1202,6 @@ hdr_full_dest(void *vbuf, void *unused)
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
static void
hdr_full_crypt_dest(void *vbuf, void *unused)
{
(void) vbuf, (void) unused;
hdr_full_dest(vbuf, unused);
arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr),
ARC_SPACE_HDRS);
}
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
@ -1285,9 +1257,6 @@ buf_init(void)
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
NULL, NULL, NULL, 0);
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
NULL, NULL, 0);
@ -1995,7 +1964,6 @@ arc_buf_untransform_in_place(arc_buf_t *buf)
arc_buf_size(buf));
buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
hdr->b_crypt_hdr.b_ebufcnt -= 1;
}
/*
@ -2230,7 +2198,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2270,7 +2237,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2386,7 +2352,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
l2hdr = &hdr->b_l2hdr;
if (l1hdr) {
abi->abi_bufcnt = l1hdr->b_bufcnt;
abi->abi_bufcnt = 0;
for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
abi->abi_bufcnt++;
abi->abi_access = l1hdr->b_arc_access;
abi->abi_mru_hits = l1hdr->b_mru_hits;
abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
@ -2414,7 +2382,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
{
arc_state_t *old_state;
int64_t refcnt;
uint32_t bufcnt;
boolean_t update_old, update_new;
arc_buf_contents_t type = arc_buf_type(hdr);
@ -2428,19 +2395,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (HDR_HAS_L1HDR(hdr)) {
old_state = hdr->b_l1hdr.b_state;
refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
HDR_HAS_RABD(hdr));
update_old = (hdr->b_l1hdr.b_buf != NULL ||
hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
IMPLY(GHOST_STATE(old_state), bufcnt == 0);
IMPLY(GHOST_STATE(new_state), bufcnt == 0);
IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
IMPLY(old_state == arc_anon, bufcnt <= 1);
IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
} else {
old_state = arc_l2c_only;
refcnt = 0;
bufcnt = 0;
update_old = B_FALSE;
}
update_new = update_old;
@ -2488,14 +2452,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (update_new && new_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(new_state)) {
ASSERT0(bufcnt);
/*
* When moving a header to a ghost state, we first
* remove all arc buffers. Thus, we'll have a
* bufcnt of zero, and no arc buffer to use for
* the reference. As a result, we use the arc
* header pointer for the reference.
* remove all arc buffers. Thus, we'll have no arc
* buffer to use for the reference. As a result, we
* use the arc header pointer for the reference.
*/
(void) zfs_refcount_add_many(
&new_state->arcs_size[type],
@ -2503,7 +2465,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@ -2512,8 +2473,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
@ -2529,7 +2488,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&new_state->arcs_size[type],
arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_add_many(
@ -2548,7 +2506,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (update_old && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2564,7 +2521,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&old_state->arcs_size[type],
HDR_GET_LSIZE(hdr), hdr);
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@ -2573,8 +2529,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
@ -2590,7 +2544,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&old_state->arcs_size[type],
arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
HDR_HAS_RABD(hdr));
@ -2838,9 +2791,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
if (encrypted)
hdr->b_crypt_hdr.b_ebufcnt += 1;
/*
* If the user wants the data from the hdr, we need to either copy or
@ -3082,8 +3032,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
}
buf->b_next = NULL;
ASSERT3P(lastbuf, !=, buf);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
return (lastbuf);
@ -3122,22 +3070,20 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
buf->b_data = NULL;
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
hdr->b_l1hdr.b_bufcnt -= 1;
if (ARC_BUF_ENCRYPTED(buf)) {
hdr->b_crypt_hdr.b_ebufcnt -= 1;
/*
* If we have no more encrypted buffers and we've
* already gotten a copy of the decrypted data we can
* free b_rabd to save some space.
*/
if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
!HDR_IO_IN_PROGRESS(hdr)) {
arc_hdr_free_abd(hdr, B_TRUE);
/*
* If we have no more encrypted buffers and we've already
* gotten a copy of the decrypted data we can free b_rabd
* to save some space.
*/
if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
arc_buf_t *b;
for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
if (b != buf && ARC_BUF_ENCRYPTED(b))
break;
}
if (b == NULL)
arc_hdr_free_abd(hdr, B_TRUE);
}
}
@ -3298,11 +3244,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
arc_buf_hdr_t *hdr;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
if (protected) {
hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
} else {
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
}
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(HDR_EMPTY(hdr));
#ifdef ZFS_DEBUG
@ -3325,7 +3267,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@ -3351,16 +3292,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
(old == hdr_l2only_cache && new == hdr_full_cache));
/*
* if the caller wanted a new full header and the header is to be
* encrypted we will actually allocate the header from the full crypt
* cache instead. The same applies to freeing from the old cache.
*/
if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
new = hdr_full_crypt_cache;
if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
old = hdr_full_crypt_cache;
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
@ -3368,7 +3299,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
if (new == hdr_full_cache) {
arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
* arc_access and arc_change_state need to be aware that a
@ -3382,7 +3313,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT(!HDR_HAS_RABD(hdr));
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
#endif
@ -3448,126 +3378,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
return (nhdr);
}
/*
* This function allows an L1 header to be reallocated as a crypt
* header and vice versa. If we are going to a crypt header, the
* new fields will be zeroed out.
*/
static arc_buf_hdr_t *
arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
{
arc_buf_hdr_t *nhdr;
arc_buf_t *buf;
kmem_cache_t *ncache, *ocache;
/*
* This function requires that hdr is in the arc_anon state.
* Therefore it won't have any L2ARC data for us to worry
* about copying.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (need_crypt) {
ncache = hdr_full_crypt_cache;
ocache = hdr_full_cache;
} else {
ncache = hdr_full_cache;
ocache = hdr_full_crypt_cache;
}
nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
/*
* Copy all members that aren't locks or condvars to the new header.
* No lists are pointing to us (as we asserted above), so we don't
* need to worry about the list nodes.
*/
nhdr->b_dva = hdr->b_dva;
nhdr->b_birth = hdr->b_birth;
nhdr->b_type = hdr->b_type;
nhdr->b_flags = hdr->b_flags;
nhdr->b_psize = hdr->b_psize;
nhdr->b_lsize = hdr->b_lsize;
nhdr->b_spa = hdr->b_spa;
#ifdef ZFS_DEBUG
nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
#endif
nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
/*
* This zfs_refcount_add() exists only to ensure that the individual
* arc buffers always point to a header that is referenced, avoiding
* a small race condition that could trigger ASSERTs.
*/
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next)
buf->b_hdr = nhdr;
zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
(void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
if (need_crypt) {
arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
} else {
arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
}
/* unset all members of the original hdr */
memset(&hdr->b_dva, 0, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_type = 0;
hdr->b_flags = 0;
hdr->b_psize = 0;
hdr->b_lsize = 0;
hdr->b_spa = 0;
#ifdef ZFS_DEBUG
hdr->b_l1hdr.b_freeze_cksum = NULL;
#endif
hdr->b_l1hdr.b_buf = NULL;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_byteswap = 0;
hdr->b_l1hdr.b_state = NULL;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_acb = NULL;
hdr->b_l1hdr.b_pabd = NULL;
if (ocache == hdr_full_crypt_cache) {
ASSERT(!HDR_HAS_RABD(hdr));
hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
hdr->b_crypt_hdr.b_ebufcnt = 0;
hdr->b_crypt_hdr.b_dsobj = 0;
memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN);
memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN);
memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN);
}
buf_discard_identity(hdr);
kmem_cache_free(ocache, hdr);
return (nhdr);
}
/*
* This function is used by the send / receive code to convert a newly
* allocated arc_buf_t to one that is suitable for a raw encrypted write. It
@ -3587,8 +3397,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
if (!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
@ -3789,8 +3598,6 @@ static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(hdr->b_l1hdr.b_buf == NULL ||
hdr->b_l1hdr.b_bufcnt > 0);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
}
@ -3854,12 +3661,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
#endif
if (!HDR_PROTECTED(hdr)) {
kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_full_crypt_cache, hdr);
}
kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_l2only_cache, hdr);
}
@ -3871,7 +3673,8 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
arc_buf_hdr_t *hdr = buf->b_hdr;
if (hdr->b_l1hdr.b_state == arc_anon) {
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
VERIFY0(remove_reference(hdr, tag));
return;
@ -3881,7 +3684,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
mutex_enter(hash_lock);
ASSERT3P(hdr, ==, buf->b_hdr);
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
ASSERT3P(buf->b_data, !=, NULL);
@ -3924,7 +3727,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
@ -5586,13 +5388,6 @@ arc_read_done(zio_t *zio)
buf_hash_remove(hdr);
}
/*
* Broadcast before we drop the hash_lock to avoid the possibility
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
cv_broadcast(&hdr->b_l1hdr.b_cv);
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
(void) remove_reference(hdr, hdr);
@ -5787,8 +5582,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
}
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
if (hdr->b_l1hdr.b_acb)
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb = acb;
}
mutex_exit(hash_lock);
@ -5928,8 +5722,28 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
* and so the performance impact shouldn't
* matter.
*/
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
arc_callback_t *acb = kmem_zalloc(
sizeof (arc_callback_t), KM_SLEEP);
acb->acb_wait = B_TRUE;
mutex_init(&acb->acb_wait_lock, NULL,
MUTEX_DEFAULT, NULL);
cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
NULL);
acb->acb_zio_head =
hdr->b_l1hdr.b_acb->acb_zio_head;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
mutex_enter(&acb->acb_wait_lock);
while (acb->acb_wait) {
cv_wait(&acb->acb_wait_cv,
&acb->acb_wait_lock);
}
mutex_exit(&acb->acb_wait_lock);
mutex_destroy(&acb->acb_wait_lock);
cv_destroy(&acb->acb_wait_cv);
kmem_free(acb, sizeof (arc_callback_t));
goto top;
}
}
@ -6310,7 +6124,8 @@ arc_release(arc_buf_t *buf, const void *tag)
ASSERT(!HDR_IN_HASH_TABLE(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
@ -6361,7 +6176,7 @@ arc_release(arc_buf_t *buf, const void *tag)
/*
* Do we have more than one buf?
*/
if (hdr->b_l1hdr.b_bufcnt > 1) {
if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
arc_buf_hdr_t *nhdr;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
@ -6442,10 +6257,6 @@ arc_release(arc_buf_t *buf, const void *tag)
arc_buf_size(buf), buf);
}
hdr->b_l1hdr.b_bufcnt -= 1;
if (ARC_BUF_ENCRYPTED(buf))
hdr->b_crypt_hdr.b_ebufcnt -= 1;
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@ -6458,15 +6269,11 @@ arc_release(arc_buf_t *buf, const void *tag)
nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
compress, hdr->b_complevel, type);
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
VERIFY3U(nhdr->b_type, ==, type);
ASSERT(!HDR_SHARED_DATA(nhdr));
nhdr->b_l1hdr.b_buf = buf;
nhdr->b_l1hdr.b_bufcnt = 1;
if (ARC_BUF_ENCRYPTED(buf))
nhdr->b_crypt_hdr.b_ebufcnt = 1;
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
@ -6517,7 +6324,7 @@ arc_write_ready(zio_t *zio)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
/*
* If we're reexecuting this zio because the pool suspended, then
@ -6552,13 +6359,9 @@ arc_write_ready(zio_t *zio)
add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
}
if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
if (BP_IS_PROTECTED(bp)) {
/* ZIL blocks are written through zio_rewrite */
ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
ASSERT(HDR_PROTECTED(hdr));
if (BP_SHOULD_BYTESWAP(bp)) {
if (BP_GET_LEVEL(bp) > 0) {
@ -6571,11 +6374,14 @@ arc_write_ready(zio_t *zio)
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
}
arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
hdr->b_crypt_hdr.b_iv);
zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
}
/*
@ -6656,7 +6462,8 @@ arc_write_ready(zio_t *zio)
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
arc_share_buf(hdr, buf);
}
@ -6737,7 +6544,8 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
@ -6778,7 +6586,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
if (uncached)
arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
else if (l2arc)

View file

@ -205,11 +205,6 @@ static const uint32_t metaslab_min_search_count = 100;
*/
static int metaslab_df_use_largest_segment = B_FALSE;
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
int metaslab_load_pct = 50;
/*
* These tunables control how long a metaslab will remain loaded after the
* last allocation from it. A metaslab can't be unloaded until at least
@ -854,9 +849,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
}
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
return (mg);
}
@ -872,7 +864,6 @@ metaslab_group_destroy(metaslab_group_t *mg)
*/
ASSERT(mg->mg_activation_count <= 0);
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_disabled_lock);
@ -963,7 +954,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
* allocations from taking place and any changes to the vdev tree.
*/
spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
taskq_wait_outstanding(mg->mg_taskq, 0);
taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
@ -3571,10 +3562,8 @@ metaslab_group_preload(metaslab_group_t *mg)
avl_tree_t *t = &mg->mg_metaslab_tree;
int m = 0;
if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
taskq_wait_outstanding(mg->mg_taskq, 0);
if (spa_shutting_down(spa) || !metaslab_preload_enabled)
return;
}
mutex_enter(&mg->mg_lock);
@ -3594,8 +3583,9 @@ metaslab_group_preload(metaslab_group_t *mg)
continue;
}
VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
msp, TQ_SLEEP) != TASKQID_INVALID);
VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
!= TASKQID_INVALID);
}
mutex_exit(&mg->mg_lock);
}
@ -6224,6 +6214,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
"Preload potential metaslabs during reassessment");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
"Max number of metaslabs per group to preload");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
"Delay in txgs after metaslab was last used before unloading");

View file

@ -169,6 +169,11 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type,
const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
/*
* Percentage of all CPUs that can be used by the metaslab preload taskq.
*/
static uint_t metaslab_preload_pct = 50;
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
@ -1399,6 +1404,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1, INT_MAX, 0);
/*
* The taskq to preload metaslabs.
*/
spa->spa_metaslab_taskq = taskq_create("z_metaslab",
metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
/*
* Taskq dedicated to prefetcher threads: this is used to prevent the
* pool traverse code from monopolizing the global (and limited)
@ -1434,6 +1446,11 @@ spa_deactivate(spa_t *spa)
spa->spa_zvol_taskq = NULL;
}
if (spa->spa_metaslab_taskq) {
taskq_destroy(spa->spa_metaslab_taskq);
spa->spa_metaslab_taskq = NULL;
}
if (spa->spa_prefetch_taskq) {
taskq_destroy(spa->spa_prefetch_taskq);
spa->spa_prefetch_taskq = NULL;
@ -1706,13 +1723,7 @@ spa_unload(spa_t *spa)
* This ensures that there is no async metaslab prefetching
* while we attempt to unload the spa.
*/
if (spa->spa_root_vdev != NULL) {
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
if (vc->vdev_mg != NULL)
taskq_wait(vc->vdev_mg->mg_taskq);
}
}
taskq_wait(spa->spa_metaslab_taskq);
if (spa->spa_mmp.mmp_thread)
mmp_thread_stop(spa);
@ -3922,6 +3933,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
rvd = mrvd;
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* If 'zpool import' used a cached config, then the on-disk hostid and
* hostname may differ from those in the cached config in ways that should
* prevent import. Userspace can't discover this without a scan, but
* we know, so we add these values to LOAD_INFO so the caller can know
* the difference.
*
* Note that we have to do this before the config is regenerated,
* because the new config will have the hostid and hostname for this
* host, in readiness for import.
*/
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
/*
* We will use spa_config if we decide to reload the spa or if spa_load
* fails and we rewind. We must thus regenerate the config using the
@ -10134,6 +10163,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run a metaslab preload taskq");
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
"log2 fraction of arc that can be used by inflight I/Os when "

View file

@ -1958,26 +1958,28 @@ zil_max_log_data(zilog_t *zilog, size_t hdrsize)
/*
* Maximum amount of log space we agree to waste to reduce number of
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%).
*/
static inline uint64_t
zil_max_waste_space(zilog_t *zilog)
{
return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16);
}
/*
* Maximum amount of write data for WR_COPIED. For correctness, consumers
* must fall back to WR_NEED_COPY if we can't fit the entire record into one
* maximum sized log block, because each WR_COPIED record must fit in a
* single log block. For space efficiency, we want to fit two records into a
* max-sized log block.
* single log block. Below that it is a tradeoff between an additional memory
* copy (and possibly worse log space efficiency) and an additional range
* lock/unlock.
*/
static uint_t zil_maxcopied = 7680;
uint64_t
zil_max_copied_data(zilog_t *zilog)
{
return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
sizeof (lr_write_t));
uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
return (MIN(max_data, zil_maxcopied));
}
/*
@ -4226,3 +4228,6 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
"Limit in bytes of ZIL log block size");
ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
"Limit in bytes WR_COPIED size");

View file

@ -522,7 +522,7 @@ systemctl --system daemon-reload >/dev/null || true
%config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example
%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*
%config(noreplace) %{_sysconfdir}/bash_completion.d/zfs
%config(noreplace) %{_bashcompletiondir}/zfs
%files -n libzpool5
%{_libdir}/libzpool.so.*

View file

@ -415,6 +415,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
'zpool_import_encrypted', 'zpool_import_encrypted_load',
'zpool_import_errata3', 'zpool_import_errata4',
'zpool_import_hostid_changed',
'zpool_import_hostid_changed_unclean_export',
'zpool_import_hostid_changed_cachefile',
'zpool_import_hostid_changed_cachefile_unclean_export',
'import_cachefile_device_added',
'import_cachefile_device_removed',
'import_cachefile_device_replaced',

View file

@ -1104,6 +1104,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \
functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \
functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \
functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \
functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \
functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \

View file

@ -52,6 +52,8 @@ log_must set_tunable64 TXG_TIMEOUT 5000
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must sync_pool $TESTPOOL true
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288

View file

@ -26,6 +26,7 @@
#
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4
export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5
export ALTER_ROOT=/alter_import-test
export HOSTID_FILE="/etc/hostid"
export HOSTID1=01234567
export HOSTID2=89abcdef

View file

@ -11,6 +11,7 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib

View file

@ -0,0 +1,59 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable without force even if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Export the pool.
# 4. Change the hostid.
# 5. Verify that importing the pool without force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Export the pool.
log_must zpool export $TESTPOOL1
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool without force succeeds.
log_must zpool import -d $DEVICE_DIR $TESTPOOL1
log_pass "zpool import can import cleanly exported pool when hostid changes."

View file

@ -0,0 +1,65 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable from a cachefile
# without force even if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool with a cachefile.
# 3. Backup the cachefile.
# 4. Export the pool.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile succeeds
# without force.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Export the pool.
log_must zpool export $TESTPOOL1
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile succeeds without force.
log_must zpool import -c $CPATHBKP $TESTPOOL1
log_pass "zpool import can import cleanly exported pool from cachefile " \
"when hostid changes."

View file

@ -0,0 +1,75 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable from a cachefile
# without force if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Backup the cachefile.
# 4. Simulate the pool being torn down without export:
# 4.1. Copy the underlying device state.
# 4.2. Export the pool.
# 4.3. Restore the device state from the copy.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile fails.
# 7. Verify that importing the pool from the cachefile with force
# succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile fails.
log_mustnot zpool import -c $CPATHBKP $TESTPOOL1
# 7. Verify that importing the pool from the cachefile with force succeeds.
log_must zpool import -f -c $CPATHBKP $TESTPOOL1
log_pass "zpool import from cachefile requires force if not cleanly " \
"exported and hostid changes."

View file

@ -0,0 +1,70 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable without force if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Simulate the pool being torn down without export:
# 3.1. Copy the underlying device state.
# 3.2. Export the pool.
# 3.3. Restore the device state from the copy.
# 4. Change the hostid.
# 5. Verify that importing the pool fails.
# 6. Verify that importing the pool with force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool fails.
log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1
# 6. Verify that importing the pool with force succeeds.
log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1
log_pass "zpool import requires force if not cleanly exported " \
"and hostid changed."