Notable upstream pull request merges:
 #15290 54b1b1d89 import: require force when cachefile hostid doesn't
                  match on-disk
 #15319 342357cd9 Reduce number of metaslab preload taskq threads
 #15340 2a6c62109 ARC: Remove b_cv from struct l1arc_buf_hdr
 #15347 75a2eb7fa ARC: Drop different size headers for crypto
 #15350 96b9cf42e ARC: Remove b_bufcnt/b_ebufcnt from ARC headers
 #15353 66b81b349 ZIL: Reduce maximum size of WR_COPIED to 7.5K
 #15362 5b8688e62 zfsconcepts: add description of block cloning

Obtained from:	OpenZFS
OpenZFS commit:	66b81b3497
Martin Matuska 2023-10-08 09:43:15 +02:00
commit b2526e8bfe
25 changed files with 512 additions and 332 deletions

View file

@ -0,0 +1,21 @@
env:
CIRRUS_CLONE_DEPTH: 1
ARCH: amd64
build_task:
matrix:
freebsd_instance:
image_family: freebsd-12-4
freebsd_instance:
image_family: freebsd-13-2
freebsd_instance:
image_family: freebsd-14-0-snap
prepare_script:
- pkg install -y autoconf automake libtool gettext-runtime gmake ksh93 py39-packaging py39-cffi py39-sysctl
configure_script:
- env MAKE=gmake ./autogen.sh
- env MAKE=gmake ./configure --with-config="user" --with-python=3.9
build_script:
- gmake -j `sysctl -n kern.smp.cpus`
install_script:
- gmake install

View file

@ -42,6 +42,7 @@
!udev/**
!.editorconfig
!.cirrus.yml
!.gitignore
!.gitmodules
!AUTHORS
@ -60,7 +61,6 @@
!TEST
!zfs.release.in
#
# Normal rules
#

View file

@ -3122,12 +3122,21 @@ zfs_force_import_required(nvlist_t *config)
nvlist_t *nvinfo;
state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE);
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
/*
* The hostid on LOAD_INFO comes from the MOS label via
* spa_tryimport(). If it's not there then we're likely talking to an
* older kernel, so use the top one, which will be from the label
* discovered in zpool_find_import(), or if a cachefile is in use, the
* local hostid.
*/
if (nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_HOSTID, &hostid) != 0)
nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid);
if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid())
return (B_TRUE);
nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) {
mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_MMP_STATE);
@ -3198,7 +3207,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
time_t timestamp = 0;
uint64_t hostid = 0;
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(nvinfo,
ZPOOL_CONFIG_HOSTNAME);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
hostname = fnvlist_lookup_string(config,
ZPOOL_CONFIG_HOSTNAME);
@ -3206,7 +3218,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts,
timestamp = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_TIMESTAMP);
if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
if (nvlist_exists(nvinfo, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(nvinfo,
ZPOOL_CONFIG_HOSTID);
else if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
hostid = fnvlist_lookup_uint64(config,
ZPOOL_CONFIG_HOSTID);

View file

@ -358,6 +358,9 @@ AC_DEFUN([ZFS_AC_RPM], [
AS_IF([test -n "$udevruledir" ], [
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"'
])
AS_IF([test -n "$bashcompletiondir" ], [
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"'
])
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)'
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)'
RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)'

View file

@ -51,7 +51,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_bufcnt)
__field(arc_buf_contents_t, hdr_type)
__field(uint16_t, hdr_psize)
__field(uint16_t, hdr_lsize)
@ -70,7 +69,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
__entry->hdr_birth = ab->b_birth;
__entry->hdr_flags = ab->b_flags;
__entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt;
__entry->hdr_psize = ab->b_psize;
__entry->hdr_lsize = ab->b_lsize;
__entry->hdr_spa = ab->b_spa;
@ -84,12 +82,12 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu "
"flags 0x%x type %u psize %u lsize %u spa %llu "
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize,
__entry->hdr_type, __entry->hdr_psize,
__entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type,
__entry->hdr_access, __entry->hdr_mru_hits,
__entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits,
@ -192,7 +190,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__array(uint64_t, hdr_dva_word, 2)
__field(uint64_t, hdr_birth)
__field(uint32_t, hdr_flags)
__field(uint32_t, hdr_bufcnt)
__field(arc_buf_contents_t, hdr_type)
__field(uint16_t, hdr_psize)
__field(uint16_t, hdr_lsize)
@ -223,7 +220,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
__entry->hdr_birth = hdr->b_birth;
__entry->hdr_flags = hdr->b_flags;
__entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt;
__entry->hdr_psize = hdr->b_psize;
__entry->hdr_lsize = hdr->b_lsize;
__entry->hdr_spa = hdr->b_spa;
@ -255,7 +251,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
__entry->zb_blkid = zb->zb_blkid;
),
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
"flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u "
"flags 0x%x psize %u lsize %u spa %llu state_type %u "
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
@ -264,7 +260,7 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
"blkid %llu }",
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
__entry->hdr_birth, __entry->hdr_flags,
__entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize,
__entry->hdr_psize, __entry->hdr_lsize,
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,

View file

@ -159,10 +159,6 @@ struct arc_write_callback {
* these two allocation states.
*/
typedef struct l1arc_buf_hdr {
/* for waiting on reads to complete */
kcondvar_t b_cv;
uint8_t b_byteswap;
/* protected by arc state mutex */
arc_state_t *b_state;
multilist_node_t b_arc_node;
@ -173,7 +169,7 @@ typedef struct l1arc_buf_hdr {
uint32_t b_mru_ghost_hits;
uint32_t b_mfu_hits;
uint32_t b_mfu_ghost_hits;
uint32_t b_bufcnt;
uint8_t b_byteswap;
arc_buf_t *b_buf;
/* self protecting */
@ -436,12 +432,12 @@ typedef struct l2arc_dev {
*/
typedef struct arc_buf_hdr_crypt {
abd_t *b_rabd; /* raw encrypted data */
dmu_object_type_t b_ot; /* object type */
uint32_t b_ebufcnt; /* count of encrypted buffers */
/* dsobj for looking up encryption key for l2arc encryption */
uint64_t b_dsobj;
dmu_object_type_t b_ot; /* object type */
/* encryption parameters */
uint8_t b_salt[ZIO_DATA_SALT_LEN];
uint8_t b_iv[ZIO_DATA_IV_LEN];

View file

@ -250,7 +250,6 @@ struct metaslab_group {
int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;

View file

@ -424,7 +424,9 @@ struct spa {
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */
uint64_t spa_multihost; /* multihost aware (mmp) */
mmp_thread_t spa_mmp; /* multihost mmp thread */
list_t spa_leaf_list; /* list of leaf vdevs */
@ -448,8 +450,6 @@ struct spa {
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
zfs_refcount_t spa_refcount; /* number of opens */
taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */
};
extern char *spa_config_path;

View file

@ -402,6 +402,12 @@ Practical upper limit of total metaslabs per top-level vdev.
.It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Enable metaslab group preloading.
.
.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint
Maximum number of metaslabs per group to preload.
.
.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint
Percentage of CPUs to run a metaslab preload taskq.
.
.It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
Give more weight to metaslabs with lower LBAs,
assuming they have greater bandwidth,
@ -2144,6 +2150,11 @@ On very fragmented pools, lowering this
.Pq typically to Sy 36 KiB
can improve performance.
.
.It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint
This sets the maximum number of write bytes logged via WR_COPIED.
It trades an additional memory copy and possibly worse log space efficiency
against an additional range lock/unlock.
.
.It Sy zil_min_commit_timeout Ns = Ns Sy 5000 Pq u64
This sets the minimum delay in nanoseconds for which the ZIL will bother to
delay a block commit, waiting for more records.

View file

@ -28,8 +28,9 @@
.\" Copyright 2019 Richard Laager. All rights reserved.
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\" Copyright 2023 Klara, Inc.
.\"
.Dd June 30, 2019
.Dd October 6, 2023
.Dt ZFSCONCEPTS 7
.Os
.
@ -205,3 +206,40 @@ practices, such as regular backups.
Consider using the
.Sy compression
property as a less resource-intensive alternative.
.Ss Block cloning
Block cloning is a facility that allows a file (or parts of a file) to be
.Qq cloned ,
that is, a shallow copy made where the existing data blocks are referenced
rather than copied.
Later modifications to the data will cause a copy of the data block to be taken
and that copy modified.
This facility is used to implement
.Qq reflinks
or
.Qq file-level copy-on-write .
.Pp
Cloned blocks are tracked in a special on-disk structure called the Block
Reference Table
.Po BRT
.Pc .
Unlike deduplication, this table has minimal overhead, so it can be enabled at
all times.
.Pp
Also unlike deduplication, cloning must be requested by a user program.
Many common file copying programs, including newer versions of
.Nm /bin/cp ,
will try to create clones automatically.
Look for
.Qq clone ,
.Qq dedupe
or
.Qq reflink
in the documentation for more information.
.Pp
There are some limitations to block cloning.
Only whole blocks can be cloned, and blocks cannot be cloned if they are not
yet written to disk, if they are encrypted, or if the source and destination
.Sy recordsize
properties differ.
The OS may add additional restrictions;
for example, most versions of Linux will not allow clones across datasets.
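For illustration, a user program might request a clone along these lines (a
minimal sketch, not part of this change; the use of copy_file_range(2) is an
assumption about how a copier asks for one, and whether the kernel services it
via the BRT depends on the OS, the pool, and the constraints above):

#define _GNU_SOURCE	/* needed for copy_file_range() with glibc; harmless on FreeBSD */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
		return (1);
	}

	int srcfd = open(argv[1], O_RDONLY);
	if (srcfd == -1) {
		perror("open src");
		return (1);
	}

	struct stat st;
	if (fstat(srcfd, &st) == -1) {
		perror("fstat");
		return (1);
	}

	int dstfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (dstfd == -1) {
		perror("open dst");
		return (1);
	}

	/*
	 * Let the kernel do the copy.  On OpenZFS this may be satisfied by
	 * block cloning (BRT) when the source blocks are already on disk,
	 * aligned, and cloning is permitted; otherwise it degrades to an
	 * ordinary copy with the same result.
	 */
	off_t left = st.st_size;
	while (left > 0) {
		ssize_t n = copy_file_range(srcfd, NULL, dstfd, NULL,
		    (size_t)left, 0);
		if (n == -1) {
			perror("copy_file_range");
			return (1);
		}
		if (n == 0)
			break;
		left -= n;
	}

	close(srcfd);
	close(dstfd);
	return (0);
}

On Linux, a whole-file clone can also be requested explicitly with the FICLONE
ioctl, which returns an error instead of falling back to a plain copy when
cloning is not possible.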

View file

@ -614,28 +614,6 @@ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct,
" space map to continue allocations in a first-fit fashion");
/* END CSTYLED */
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
extern int metaslab_load_pct;
/* BEGIN CSTYLED */
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct,
CTLFLAG_RWTUN, &metaslab_load_pct, 0,
"Percentage of cpus that can be used by the metaslab taskq");
/* END CSTYLED */
/*
* Max number of metaslabs per group to preload.
*/
extern uint_t metaslab_preload_limit;
/* BEGIN CSTYLED */
SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit,
CTLFLAG_RWTUN, &metaslab_preload_limit, 0,
"Max number of metaslabs per group to preload");
/* END CSTYLED */
/* mmp.c */
int

View file

@ -748,8 +748,7 @@ taskq_t *arc_prune_taskq;
* Other sizes
*/
#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
/*
@ -1113,7 +1112,6 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
*/
static kmem_cache_t *hdr_full_cache;
static kmem_cache_t *hdr_full_crypt_cache;
static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
@ -1134,7 +1132,6 @@ buf_fini(void)
for (int i = 0; i < BUF_LOCKS; i++)
mutex_destroy(BUF_HASH_LOCK(i));
kmem_cache_destroy(hdr_full_cache);
kmem_cache_destroy(hdr_full_crypt_cache);
kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@ -1151,7 +1148,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
memset(hdr, 0, HDR_FULL_SIZE);
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
@ -1163,19 +1159,6 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag)
return (0);
}
static int
hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
{
(void) unused;
arc_buf_hdr_t *hdr = vbuf;
hdr_full_cons(vbuf, unused, kmflag);
memset(&hdr->b_crypt_hdr, 0, sizeof (hdr->b_crypt_hdr));
arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
return (0);
}
static int
hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
{
@ -1211,7 +1194,6 @@ hdr_full_dest(void *vbuf, void *unused)
arc_buf_hdr_t *hdr = vbuf;
ASSERT(HDR_EMPTY(hdr));
cv_destroy(&hdr->b_l1hdr.b_cv);
zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
#ifdef ZFS_DEBUG
mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
@ -1220,16 +1202,6 @@ hdr_full_dest(void *vbuf, void *unused)
arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
static void
hdr_full_crypt_dest(void *vbuf, void *unused)
{
(void) vbuf, (void) unused;
hdr_full_dest(vbuf, unused);
arc_space_return(sizeof (((arc_buf_hdr_t *)NULL)->b_crypt_hdr),
ARC_SPACE_HDRS);
}
static void
hdr_l2only_dest(void *vbuf, void *unused)
{
@ -1285,9 +1257,6 @@ buf_init(void)
hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
NULL, NULL, NULL, 0);
hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
NULL, NULL, 0);
@ -1995,7 +1964,6 @@ arc_buf_untransform_in_place(arc_buf_t *buf)
arc_buf_size(buf));
buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
hdr->b_crypt_hdr.b_ebufcnt -= 1;
}
/*
@ -2230,7 +2198,6 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2270,7 +2237,6 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(state)) {
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2386,7 +2352,9 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
l2hdr = &hdr->b_l2hdr;
if (l1hdr) {
abi->abi_bufcnt = l1hdr->b_bufcnt;
abi->abi_bufcnt = 0;
for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next)
abi->abi_bufcnt++;
abi->abi_access = l1hdr->b_arc_access;
abi->abi_mru_hits = l1hdr->b_mru_hits;
abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
@ -2414,7 +2382,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
{
arc_state_t *old_state;
int64_t refcnt;
uint32_t bufcnt;
boolean_t update_old, update_new;
arc_buf_contents_t type = arc_buf_type(hdr);
@ -2428,19 +2395,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (HDR_HAS_L1HDR(hdr)) {
old_state = hdr->b_l1hdr.b_state;
refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
bufcnt = hdr->b_l1hdr.b_bufcnt;
update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
HDR_HAS_RABD(hdr));
update_old = (hdr->b_l1hdr.b_buf != NULL ||
hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
IMPLY(GHOST_STATE(old_state), bufcnt == 0);
IMPLY(GHOST_STATE(new_state), bufcnt == 0);
IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
IMPLY(old_state == arc_anon, bufcnt <= 1);
IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
} else {
old_state = arc_l2c_only;
refcnt = 0;
bufcnt = 0;
update_old = B_FALSE;
}
update_new = update_old;
@ -2488,14 +2452,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (update_new && new_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(new_state)) {
ASSERT0(bufcnt);
/*
* When moving a header to a ghost state, we first
* remove all arc buffers. Thus, we'll have a
* bufcnt of zero, and no arc buffer to use for
* the reference. As a result, we use the arc
* header pointer for the reference.
* remove all arc buffers. Thus, we'll have no arc
* buffer to use for the reference. As a result, we
* use the arc header pointer for the reference.
*/
(void) zfs_refcount_add_many(
&new_state->arcs_size[type],
@ -2503,7 +2465,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@ -2512,8 +2473,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
@ -2529,7 +2488,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&new_state->arcs_size[type],
arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
if (hdr->b_l1hdr.b_pabd != NULL) {
(void) zfs_refcount_add_many(
@ -2548,7 +2506,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
if (update_old && old_state != arc_l2c_only) {
ASSERT(HDR_HAS_L1HDR(hdr));
if (GHOST_STATE(old_state)) {
ASSERT0(bufcnt);
ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
ASSERT(!HDR_HAS_RABD(hdr));
@ -2564,7 +2521,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&old_state->arcs_size[type],
HDR_GET_LSIZE(hdr), hdr);
} else {
uint32_t buffers = 0;
/*
* Each individual buffer holds a unique reference,
@ -2573,8 +2529,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
*/
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
ASSERT3U(bufcnt, !=, 0);
buffers++;
/*
* When the arc_buf_t is sharing the data
@ -2590,7 +2544,6 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
&old_state->arcs_size[type],
arc_buf_size(buf), buf);
}
ASSERT3U(bufcnt, ==, buffers);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
HDR_HAS_RABD(hdr));
@ -2838,9 +2791,6 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
VERIFY3P(buf->b_data, !=, NULL);
hdr->b_l1hdr.b_buf = buf;
hdr->b_l1hdr.b_bufcnt += 1;
if (encrypted)
hdr->b_crypt_hdr.b_ebufcnt += 1;
/*
* If the user wants the data from the hdr, we need to either copy or
@ -3082,8 +3032,6 @@ arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
}
buf->b_next = NULL;
ASSERT3P(lastbuf, !=, buf);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
return (lastbuf);
@ -3122,22 +3070,20 @@ arc_buf_destroy_impl(arc_buf_t *buf)
}
buf->b_data = NULL;
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
hdr->b_l1hdr.b_bufcnt -= 1;
if (ARC_BUF_ENCRYPTED(buf)) {
hdr->b_crypt_hdr.b_ebufcnt -= 1;
/*
* If we have no more encrypted buffers and we've
* already gotten a copy of the decrypted data we can
* free b_rabd to save some space.
*/
if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
!HDR_IO_IN_PROGRESS(hdr)) {
arc_hdr_free_abd(hdr, B_TRUE);
/*
* If we have no more encrypted buffers and we've already
* gotten a copy of the decrypted data we can free b_rabd
* to save some space.
*/
if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
arc_buf_t *b;
for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
if (b != buf && ARC_BUF_ENCRYPTED(b))
break;
}
if (b == NULL)
arc_hdr_free_abd(hdr, B_TRUE);
}
}
@ -3298,11 +3244,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
arc_buf_hdr_t *hdr;
VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
if (protected) {
hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
} else {
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
}
hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
ASSERT(HDR_EMPTY(hdr));
#ifdef ZFS_DEBUG
@ -3325,7 +3267,6 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_buf = NULL;
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
@ -3351,16 +3292,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
(old == hdr_l2only_cache && new == hdr_full_cache));
/*
* if the caller wanted a new full header and the header is to be
* encrypted we will actually allocate the header from the full crypt
* cache instead. The same applies to freeing from the old cache.
*/
if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
new = hdr_full_crypt_cache;
if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
old = hdr_full_crypt_cache;
nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
@ -3368,7 +3299,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
if (new == hdr_full_cache) {
arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
* arc_access and arc_change_state need to be aware that a
@ -3382,7 +3313,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
ASSERT(!HDR_HAS_RABD(hdr));
} else {
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(hdr->b_l1hdr.b_bufcnt);
#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
#endif
@ -3448,126 +3378,6 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
return (nhdr);
}
/*
* This function allows an L1 header to be reallocated as a crypt
* header and vice versa. If we are going to a crypt header, the
* new fields will be zeroed out.
*/
static arc_buf_hdr_t *
arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
{
arc_buf_hdr_t *nhdr;
arc_buf_t *buf;
kmem_cache_t *ncache, *ocache;
/*
* This function requires that hdr is in the arc_anon state.
* Therefore it won't have any L2ARC data for us to worry
* about copying.
*/
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
if (need_crypt) {
ncache = hdr_full_crypt_cache;
ocache = hdr_full_cache;
} else {
ncache = hdr_full_cache;
ocache = hdr_full_crypt_cache;
}
nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
/*
* Copy all members that aren't locks or condvars to the new header.
* No lists are pointing to us (as we asserted above), so we don't
* need to worry about the list nodes.
*/
nhdr->b_dva = hdr->b_dva;
nhdr->b_birth = hdr->b_birth;
nhdr->b_type = hdr->b_type;
nhdr->b_flags = hdr->b_flags;
nhdr->b_psize = hdr->b_psize;
nhdr->b_lsize = hdr->b_lsize;
nhdr->b_spa = hdr->b_spa;
#ifdef ZFS_DEBUG
nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
#endif
nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
/*
* This zfs_refcount_add() exists only to ensure that the individual
* arc buffers always point to a header that is referenced, avoiding
* a small race condition that could trigger ASSERTs.
*/
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next)
buf->b_hdr = nhdr;
zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
(void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
if (need_crypt) {
arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
} else {
arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
}
/* unset all members of the original hdr */
memset(&hdr->b_dva, 0, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_type = 0;
hdr->b_flags = 0;
hdr->b_psize = 0;
hdr->b_lsize = 0;
hdr->b_spa = 0;
#ifdef ZFS_DEBUG
hdr->b_l1hdr.b_freeze_cksum = NULL;
#endif
hdr->b_l1hdr.b_buf = NULL;
hdr->b_l1hdr.b_bufcnt = 0;
hdr->b_l1hdr.b_byteswap = 0;
hdr->b_l1hdr.b_state = NULL;
hdr->b_l1hdr.b_arc_access = 0;
hdr->b_l1hdr.b_mru_hits = 0;
hdr->b_l1hdr.b_mru_ghost_hits = 0;
hdr->b_l1hdr.b_mfu_hits = 0;
hdr->b_l1hdr.b_mfu_ghost_hits = 0;
hdr->b_l1hdr.b_acb = NULL;
hdr->b_l1hdr.b_pabd = NULL;
if (ocache == hdr_full_crypt_cache) {
ASSERT(!HDR_HAS_RABD(hdr));
hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
hdr->b_crypt_hdr.b_ebufcnt = 0;
hdr->b_crypt_hdr.b_dsobj = 0;
memset(hdr->b_crypt_hdr.b_salt, 0, ZIO_DATA_SALT_LEN);
memset(hdr->b_crypt_hdr.b_iv, 0, ZIO_DATA_IV_LEN);
memset(hdr->b_crypt_hdr.b_mac, 0, ZIO_DATA_MAC_LEN);
}
buf_discard_identity(hdr);
kmem_cache_free(ocache, hdr);
return (nhdr);
}
/*
* This function is used by the send / receive code to convert a newly
* allocated arc_buf_t to one that is suitable for a raw encrypted write. It
@ -3587,8 +3397,7 @@ arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
if (!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_dsobj = dsobj;
hdr->b_crypt_hdr.b_ot = ot;
hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
@ -3789,8 +3598,6 @@ static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
if (HDR_HAS_L1HDR(hdr)) {
ASSERT(hdr->b_l1hdr.b_buf == NULL ||
hdr->b_l1hdr.b_bufcnt > 0);
ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
}
@ -3854,12 +3661,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
#ifdef ZFS_DEBUG
ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
#endif
if (!HDR_PROTECTED(hdr)) {
kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_full_crypt_cache, hdr);
}
kmem_cache_free(hdr_full_cache, hdr);
} else {
kmem_cache_free(hdr_l2only_cache, hdr);
}
@ -3871,7 +3673,8 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
arc_buf_hdr_t *hdr = buf->b_hdr;
if (hdr->b_l1hdr.b_state == arc_anon) {
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
VERIFY0(remove_reference(hdr, tag));
return;
@ -3881,7 +3684,7 @@ arc_buf_destroy(arc_buf_t *buf, const void *tag)
mutex_enter(hash_lock);
ASSERT3P(hdr, ==, buf->b_hdr);
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
ASSERT3P(buf->b_data, !=, NULL);
@ -3924,7 +3727,6 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT0(hdr->b_l1hdr.b_bufcnt);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
@ -5586,13 +5388,6 @@ arc_read_done(zio_t *zio)
buf_hash_remove(hdr);
}
/*
* Broadcast before we drop the hash_lock to avoid the possibility
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
cv_broadcast(&hdr->b_l1hdr.b_cv);
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
(void) remove_reference(hdr, hdr);
@ -5787,8 +5582,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
}
acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
if (hdr->b_l1hdr.b_acb)
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb = acb;
}
mutex_exit(hash_lock);
@ -5928,8 +5722,28 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
* and so the performance impact shouldn't
* matter.
*/
cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
arc_callback_t *acb = kmem_zalloc(
sizeof (arc_callback_t), KM_SLEEP);
acb->acb_wait = B_TRUE;
mutex_init(&acb->acb_wait_lock, NULL,
MUTEX_DEFAULT, NULL);
cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
NULL);
acb->acb_zio_head =
hdr->b_l1hdr.b_acb->acb_zio_head;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb->acb_prev = acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
mutex_enter(&acb->acb_wait_lock);
while (acb->acb_wait) {
cv_wait(&acb->acb_wait_cv,
&acb->acb_wait_lock);
}
mutex_exit(&acb->acb_wait_lock);
mutex_destroy(&acb->acb_wait_lock);
cv_destroy(&acb->acb_wait_cv);
kmem_free(acb, sizeof (arc_callback_t));
goto top;
}
}
@ -6310,7 +6124,8 @@ arc_release(arc_buf_t *buf, const void *tag)
ASSERT(!HDR_IN_HASH_TABLE(hdr));
ASSERT(!HDR_HAS_L2HDR(hdr));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
@ -6361,7 +6176,7 @@ arc_release(arc_buf_t *buf, const void *tag)
/*
* Do we have more than one buf?
*/
if (hdr->b_l1hdr.b_bufcnt > 1) {
if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
arc_buf_hdr_t *nhdr;
uint64_t spa = hdr->b_spa;
uint64_t psize = HDR_GET_PSIZE(hdr);
@ -6442,10 +6257,6 @@ arc_release(arc_buf_t *buf, const void *tag)
arc_buf_size(buf), buf);
}
hdr->b_l1hdr.b_bufcnt -= 1;
if (ARC_BUF_ENCRYPTED(buf))
hdr->b_crypt_hdr.b_ebufcnt -= 1;
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
@ -6458,15 +6269,11 @@ arc_release(arc_buf_t *buf, const void *tag)
nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
compress, hdr->b_complevel, type);
ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
ASSERT0(nhdr->b_l1hdr.b_bufcnt);
ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
VERIFY3U(nhdr->b_type, ==, type);
ASSERT(!HDR_SHARED_DATA(nhdr));
nhdr->b_l1hdr.b_buf = buf;
nhdr->b_l1hdr.b_bufcnt = 1;
if (ARC_BUF_ENCRYPTED(buf))
nhdr->b_crypt_hdr.b_ebufcnt = 1;
(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
@ -6517,7 +6324,7 @@ arc_write_ready(zio_t *zio)
ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
/*
* If we're reexecuting this zio because the pool suspended, then
@ -6552,13 +6359,9 @@ arc_write_ready(zio_t *zio)
add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
}
if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
if (BP_IS_PROTECTED(bp)) {
/* ZIL blocks are written through zio_rewrite */
ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
ASSERT(HDR_PROTECTED(hdr));
if (BP_SHOULD_BYTESWAP(bp)) {
if (BP_GET_LEVEL(bp) > 0) {
@ -6571,11 +6374,14 @@ arc_write_ready(zio_t *zio)
hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
}
arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
hdr->b_crypt_hdr.b_iv);
zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
} else {
arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
}
/*
@ -6656,7 +6462,8 @@ arc_write_ready(zio_t *zio)
} else {
ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
ASSERT(ARC_BUF_LAST(buf));
arc_share_buf(hdr, buf);
}
@ -6737,7 +6544,8 @@ arc_write_done(zio_t *zio)
(void *)hdr, (void *)exists);
} else {
/* Dedup */
ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
@ -6778,7 +6586,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
ASSERT(!HDR_IO_ERROR(hdr));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
if (uncached)
arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
else if (l2arc)

View file

@ -205,11 +205,6 @@ static const uint32_t metaslab_min_search_count = 100;
*/
static int metaslab_df_use_largest_segment = B_FALSE;
/*
* Percentage of all cpus that can be used by the metaslab taskq.
*/
int metaslab_load_pct = 50;
/*
* These tunables control how long a metaslab will remain loaded after the
* last allocation from it. A metaslab can't be unloaded until at least
@ -854,9 +849,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
}
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
return (mg);
}
@ -872,7 +864,6 @@ metaslab_group_destroy(metaslab_group_t *mg)
*/
ASSERT(mg->mg_activation_count <= 0);
taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
mutex_destroy(&mg->mg_ms_disabled_lock);
@ -963,7 +954,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
* allocations from taking place and any changes to the vdev tree.
*/
spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
taskq_wait_outstanding(mg->mg_taskq, 0);
taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
metaslab_group_alloc_update(mg);
for (int i = 0; i < mg->mg_allocators; i++) {
@ -3571,10 +3562,8 @@ metaslab_group_preload(metaslab_group_t *mg)
avl_tree_t *t = &mg->mg_metaslab_tree;
int m = 0;
if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
taskq_wait_outstanding(mg->mg_taskq, 0);
if (spa_shutting_down(spa) || !metaslab_preload_enabled)
return;
}
mutex_enter(&mg->mg_lock);
@ -3594,8 +3583,9 @@ metaslab_group_preload(metaslab_group_t *mg)
continue;
}
VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
msp, TQ_SLEEP) != TASKQID_INVALID);
VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
!= TASKQID_INVALID);
}
mutex_exit(&mg->mg_lock);
}
@ -6224,6 +6214,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
"Preload potential metaslabs during reassessment");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
"Max number of metaslabs per group to preload");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
"Delay in txgs after metaslab was last used before unloading");

View file

@ -169,6 +169,11 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type,
const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
/*
* Percentage of all CPUs that can be used by the metaslab preload taskq.
*/
static uint_t metaslab_preload_pct = 50;
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
@ -1399,6 +1404,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1, INT_MAX, 0);
/*
* The taskq to preload metaslabs.
*/
spa->spa_metaslab_taskq = taskq_create("z_metaslab",
metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
/*
* Taskq dedicated to prefetcher threads: this is used to prevent the
* pool traverse code from monopolizing the global (and limited)
@ -1434,6 +1446,11 @@ spa_deactivate(spa_t *spa)
spa->spa_zvol_taskq = NULL;
}
if (spa->spa_metaslab_taskq) {
taskq_destroy(spa->spa_metaslab_taskq);
spa->spa_metaslab_taskq = NULL;
}
if (spa->spa_prefetch_taskq) {
taskq_destroy(spa->spa_prefetch_taskq);
spa->spa_prefetch_taskq = NULL;
@ -1706,13 +1723,7 @@ spa_unload(spa_t *spa)
* This ensures that there is no async metaslab prefetching
* while we attempt to unload the spa.
*/
if (spa->spa_root_vdev != NULL) {
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
if (vc->vdev_mg != NULL)
taskq_wait(vc->vdev_mg->mg_taskq);
}
}
taskq_wait(spa->spa_metaslab_taskq);
if (spa->spa_mmp.mmp_thread)
mmp_thread_stop(spa);
@ -3922,6 +3933,24 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
rvd = mrvd;
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* If 'zpool import' used a cached config, then the on-disk hostid and
* hostname may differ from those in the cached config in ways that should
* prevent import. Userspace can't discover this without a scan, but
* we know, so we add these values to LOAD_INFO so the caller can know
* the difference.
*
* Note that we have to do this before the config is regenerated,
* because the new config will have the hostid and hostname for this
* host, in readiness for import.
*/
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
/*
* We will use spa_config if we decide to reload the spa or if spa_load
* fails and we rewind. We must thus regenerate the config using the
@ -10134,6 +10163,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
"Percentage of CPUs to run a metaslab preload taskq");
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
"log2 fraction of arc that can be used by inflight I/Os when "

View file

@ -1958,26 +1958,28 @@ zil_max_log_data(zilog_t *zilog, size_t hdrsize)
/*
* Maximum amount of log space we agree to waste to reduce number of
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%).
*/
static inline uint64_t
zil_max_waste_space(zilog_t *zilog)
{
return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16);
}
/*
* Maximum amount of write data for WR_COPIED. For correctness, consumers
* must fall back to WR_NEED_COPY if we can't fit the entire record into one
* maximum sized log block, because each WR_COPIED record must fit in a
* single log block. For space efficiency, we want to fit two records into a
* max-sized log block.
* single log block. Below that it is a tradeoff between an additional memory
* copy (and possibly worse log space efficiency) and an additional range
* lock/unlock.
*/
static uint_t zil_maxcopied = 7680;
uint64_t
zil_max_copied_data(zilog_t *zilog)
{
return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
sizeof (lr_write_t));
uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
return (MIN(max_data, zil_maxcopied));
}
/*
@ -4226,3 +4228,6 @@ ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
"Limit in bytes of ZIL log block size");
ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
"Limit in bytes WR_COPIED size");

View file

@ -522,7 +522,7 @@ systemctl --system daemon-reload >/dev/null || true
%config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example
%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*
%config(noreplace) %{_sysconfdir}/bash_completion.d/zfs
%config(noreplace) %{_bashcompletiondir}/zfs
%files -n libzpool5
%{_libdir}/libzpool.so.*

View file

@ -415,6 +415,10 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
'zpool_import_encrypted', 'zpool_import_encrypted_load',
'zpool_import_errata3', 'zpool_import_errata4',
'zpool_import_hostid_changed',
'zpool_import_hostid_changed_unclean_export',
'zpool_import_hostid_changed_cachefile',
'zpool_import_hostid_changed_cachefile_unclean_export',
'import_cachefile_device_added',
'import_cachefile_device_removed',
'import_cachefile_device_replaced',

View file

@ -1104,6 +1104,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \
functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \
functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \
functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \
functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \
functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \
functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \

View file

@ -52,6 +52,8 @@ log_must set_tunable64 TXG_TIMEOUT 5000
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must sync_pool $TESTPOOL true
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288

View file

@ -26,6 +26,7 @@
#
# Copyright (c) 2012, 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib
@ -63,3 +64,7 @@ export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4
export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5
export ALTER_ROOT=/alter_import-test
export HOSTID_FILE="/etc/hostid"
export HOSTID1=01234567
export HOSTID2=89abcdef

View file

@ -11,6 +11,7 @@
#
# Copyright (c) 2016 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/include/libtest.shlib

View file

@ -0,0 +1,59 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable without force even if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Export the pool.
# 4. Change the hostid.
# 5. Verify that importing the pool without force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Export the pool.
log_must zpool export $TESTPOOL1
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool without force succeeds.
log_must zpool import -d $DEVICE_DIR $TESTPOOL1
log_pass "zpool import can import cleanly exported pool when hostid changes."

View file

@ -0,0 +1,65 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that was cleanly exported should be importable from a cachefile
# without force even if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool with a cachefile.
# 3. Backup the cachefile.
# 4. Export the pool.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile succeeds
# without force.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Export the pool.
log_must zpool export $TESTPOOL1
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile succeeds without force.
log_must zpool import -c $CPATHBKP $TESTPOOL1
log_pass "zpool import can import cleanly exported pool from cachefile " \
"when hostid changes."

View file

@ -0,0 +1,75 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable from a cachefile
# without force if the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Backup the cachefile.
# 4. Simulate the pool being torn down without export:
# 4.1. Copy the underlying device state.
# 4.2. Export the pool.
# 4.3. Restore the device state from the copy.
# 5. Change the hostid.
# 6. Verify that importing the pool from the cachefile fails.
# 7. Verify that importing the pool from the cachefile with force
# succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $CPATH $CPATHBKP $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create -o cachefile=$CPATH $TESTPOOL1 $VDEV0
# 3. Backup the cachefile.
log_must cp $CPATH $CPATHBKP
# 4. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 5. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 6. Verify that importing the pool from the cachefile fails.
log_mustnot zpool import -c $CPATHBKP $TESTPOOL1
# 7. Verify that importing the pool from the cachefile with force succeeds.
log_must zpool import -f -c $CPATHBKP $TESTPOOL1
log_pass "zpool import from cachefile requires force if not cleanly " \
"exported and hostid changes."

View file

@ -0,0 +1,70 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2021 by Delphix. All rights reserved.
# Copyright (c) 2023 by Klara, Inc.
#
. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
#
# DESCRIPTION:
# A pool that wasn't cleanly exported should not be importable without force if
# the local hostid doesn't match the on-disk hostid.
#
# STRATEGY:
# 1. Set a hostid.
# 2. Create a pool.
# 3. Simulate the pool being torn down without export:
# 3.1. Copy the underlying device state.
# 3.2. Export the pool.
# 3.3. Restore the device state from the copy.
# 4. Change the hostid.
# 5. Verify that importing the pool fails.
# 6. Verify that importing the pool with force succeeds.
#
verify_runnable "global"
function custom_cleanup
{
rm -f $HOSTID_FILE $VDEV0.bak
cleanup
}
log_onexit custom_cleanup
# 1. Set a hostid.
log_must zgenhostid -f $HOSTID1
# 2. Create a pool.
log_must zpool create $TESTPOOL1 $VDEV0
# 3. Simulate the pool being torn down without export.
log_must cp $VDEV0 $VDEV0.bak
log_must zpool export $TESTPOOL1
log_must cp -f $VDEV0.bak $VDEV0
log_must rm -f $VDEV0.bak
# 4. Change the hostid.
log_must zgenhostid -f $HOSTID2
# 5. Verify that importing the pool fails.
log_mustnot zpool import -d $DEVICE_DIR $TESTPOOL1
# 6. Verify that importing the pool with force succeeds.
log_must zpool import -d $DEVICE_DIR -f $TESTPOOL1
log_pass "zpool import requires force if not cleanly exported " \
"and hostid changed."