Notable upstream pull request merges:
  #14833 Update compatibility.d files
  #14841 ZIL: Reduce scope of per-dataset zl_issuer_lock
  #14863 zil: Add some more statistics
  #14866 btree: Implement faster binary search algorithm
  #14894 Fix inconsistent definition of zfs_scrub_error_blocks_per_txg
  #14892 Fix concurrent resilvers initiated at same time
  #14903 Fix NULL pointer dereference when doing concurrent 'send' operations
  #14910 ZIL: Allow to replay blocks of any size
  #14939 Fix the L2ARC write size calculating logic
  #14934 Introduce zfs_refcount_(add|remove)_few()
  #14946 Improve l2arc reporting in arc_summary
  #14953 Finally drop long disabled vdev cache
  #14954 Fix the L2ARC write size calculating logic (2)
  #14955 Use list_remove_head() where possible
  #14959 ZIL: Fix race introduced by f63811f072

Obtained from:	OpenZFS
OpenZFS commit:	feff9dfed3
Commit:	4e8d558c9d
Author:	Martin Matuska
Date:	2023-06-10 19:31:17 +02:00
97 changed files with 1693 additions and 1220 deletions

View File

@ -135,7 +135,6 @@ KERNEL_C = \
uberblock.c \
unique.c \
vdev.c \
vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_file.c \

View File

@ -326,7 +326,6 @@ contrib/openzfs/module/zfs/txg.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_draid.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_draid_rand.c optional zfs compile-with "${ZFS_C}"
contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}"

View File

@ -6,5 +6,5 @@ Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.2
Linux-Maximum: 6.3
Linux-Minimum: 3.10

View File

@ -64,7 +64,6 @@ SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')'
SECTION_PATHS = {'arc': 'arcstats',
'dmu': 'dmu_tx',
'l2arc': 'arcstats', # L2ARC stuff lives in arcstats
'vdev': 'vdev_cache_stats',
'zfetch': 'zfetchstats',
'zil': 'zil'}
@ -90,8 +89,6 @@ if sys.platform.startswith('freebsd'):
# Requires py36-sysctl on FreeBSD
import sysctl
VDEV_CACHE_SIZE = 'vdev.cache_size'
def is_value(ctl):
return ctl.type != sysctl.CTLTYPE_NODE
@ -135,8 +132,6 @@ elif sys.platform.startswith('linux'):
SPL_PATH = '/sys/module/spl/parameters'
TUNABLES_PATH = '/sys/module/zfs/parameters'
VDEV_CACHE_SIZE = 'zfs_vdev_cache_size'
def load_kstats(section):
path = os.path.join(KSTAT_PATH, section)
with open(path) as f:
@ -842,7 +837,8 @@ def section_l2arc(kstats_dict):
('Free on write:', 'l2_free_on_write'),
('R/W clashes:', 'l2_rw_clash'),
('Bad checksums:', 'l2_cksum_bad'),
('I/O errors:', 'l2_io_error'))
('Read errors:', 'l2_io_error'),
('Write errors:', 'l2_writes_error'))
for title, value in l2_todo:
prt_i1(title, f_hits(arc_stats[value]))
@ -878,28 +874,20 @@ def section_l2arc(kstats_dict):
prt_i2('Miss ratio:',
f_perc(arc_stats['l2_misses'], l2_access_total),
f_hits(arc_stats['l2_misses']))
prt_i1('Feeds:', f_hits(arc_stats['l2_feeds']))
print()
print('L2ARC writes:')
if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']:
prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent']))
prt_i2('Done ratio:',
f_perc(arc_stats['l2_writes_done'],
arc_stats['l2_writes_sent']),
f_hits(arc_stats['l2_writes_done']))
prt_i2('Error ratio:',
f_perc(arc_stats['l2_writes_error'],
arc_stats['l2_writes_sent']),
f_hits(arc_stats['l2_writes_error']))
else:
prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent']))
print('L2ARC I/O:')
prt_i2('Reads:',
f_bytes(arc_stats['l2_read_bytes']),
f_hits(arc_stats['l2_hits']))
prt_i2('Writes:',
f_bytes(arc_stats['l2_write_bytes']),
f_hits(arc_stats['l2_writes_sent']))
print()
print('L2ARC evicts:')
prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry']))
prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading']))
prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached']))
prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading']))
print()
@ -959,35 +947,6 @@ def section_tunables(*_):
print()
def section_vdev(kstats_dict):
"""Collect information on VDEV caches"""
# Currently [Nov 2017] the VDEV cache is disabled, because it is actually
# harmful. When this is the case, we just skip the whole entry. See
# https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c
# for details
tunables = get_vdev_params()
if tunables[VDEV_CACHE_SIZE] == '0':
print('VDEV cache disabled, skipping section\n')
return
vdev_stats = isolate_section('vdev_cache_stats', kstats_dict)
vdev_cache_total = int(vdev_stats['hits']) +\
int(vdev_stats['misses']) +\
int(vdev_stats['delegations'])
prt_1('VDEV cache summary:', f_hits(vdev_cache_total))
prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total),
f_hits(vdev_stats['hits']))
prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total),
f_hits(vdev_stats['misses']))
prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total),
f_hits(vdev_stats['delegations']))
print()
def section_zil(kstats_dict):
"""Collect information on the ZFS Intent Log. Some of the information
taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h
@ -1015,7 +974,6 @@ section_calls = {'arc': section_arc,
'l2arc': section_l2arc,
'spl': section_spl,
'tunables': section_tunables,
'vdev': section_vdev,
'zil': section_zil}

View File

@ -33,6 +33,7 @@
* under sponsorship from the FreeBSD Foundation.
* Copyright (c) 2021 Allan Jude
* Copyright (c) 2021 Toomas Soome <tsoome@me.com>
* Copyright (c) 2023, Klara Inc.
*/
#include <stdio.h>
@ -326,7 +327,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
int err;
struct sublivelist_verify *sv = args;
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
sizeof (sublivelist_verify_block_refcnt_t));
err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
@ -390,7 +391,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
{
(void) args;
sublivelist_verify_t sv;
zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
int err = sublivelist_verify_func(&sv, dle);
zfs_btree_clear(&sv.sv_leftover);
@ -682,7 +683,7 @@ livelist_metaslab_validate(spa_t *spa)
(void) printf("Verifying deleted livelist entries\n");
sublivelist_verify_t sv;
zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
iterate_deleted_livelists(spa, livelist_verify, &sv);
@ -716,7 +717,7 @@ livelist_metaslab_validate(spa_t *spa)
mv.mv_start = m->ms_start;
mv.mv_end = m->ms_start + m->ms_size;
zfs_btree_create(&mv.mv_livelist_allocs,
livelist_block_compare,
livelist_block_compare, NULL,
sizeof (sublivelist_verify_block_t));
mv_populate_livelist_allocs(&mv, &sv);
@ -789,6 +790,9 @@ usage(void)
"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]]\n"
"\t%s [-AdiPv] [-e [-V] [-p <path> ...]] [-U <cache>] [-K <key>]\n"
"\t\t[<poolname>[/<dataset | objset id>] [<object | range> ...]\n"
"\t%s -B [-e [-V] [-p <path> ...]] [-I <inflight I/Os>]\n"
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
"\t%s [-v] <bookmark>\n"
"\t%s -C [-A] [-U <cache>]\n"
"\t%s -l [-Aqu] <device>\n"
@ -802,7 +806,7 @@ usage(void)
"\t%s -S [-AP] [-e [-V] [-p <path> ...]] [-U <cache>] "
"<poolname>\n\n",
cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname,
cmdname, cmdname, cmdname, cmdname);
cmdname, cmdname, cmdname, cmdname, cmdname);
(void) fprintf(stderr, " Dataset name must include at least one "
"separator character '/' or '@'\n");
@ -825,6 +829,8 @@ usage(void)
(void) fprintf(stderr, " Options to control amount of output:\n");
(void) fprintf(stderr, " -b --block-stats "
"block statistics\n");
(void) fprintf(stderr, " -B --backup "
"backup stream\n");
(void) fprintf(stderr, " -c --checksum "
"checksum all metadata (twice for all data) blocks\n");
(void) fprintf(stderr, " -C --config "
@ -4875,6 +4881,81 @@ dump_path(char *ds, char *path, uint64_t *retobj)
return (err);
}
static int
dump_backup_bytes(objset_t *os, void *buf, int len, void *arg)
{
const char *p = (const char *)buf;
ssize_t nwritten;
(void) os;
(void) arg;
/* Write the data out, handling short writes and signals. */
while ((nwritten = write(STDOUT_FILENO, p, len)) < len) {
if (nwritten < 0) {
if (errno == EINTR)
continue;
return (errno);
}
p += nwritten;
len -= nwritten;
}
return (0);
}
static void
dump_backup(const char *pool, uint64_t objset_id, const char *flagstr)
{
boolean_t embed = B_FALSE;
boolean_t large_block = B_FALSE;
boolean_t compress = B_FALSE;
boolean_t raw = B_FALSE;
const char *c;
for (c = flagstr; c != NULL && *c != '\0'; c++) {
switch (*c) {
case 'e':
embed = B_TRUE;
break;
case 'L':
large_block = B_TRUE;
break;
case 'c':
compress = B_TRUE;
break;
case 'w':
raw = B_TRUE;
break;
default:
fprintf(stderr, "dump_backup: invalid flag "
"'%c'\n", *c);
return;
}
}
if (isatty(STDOUT_FILENO)) {
fprintf(stderr, "dump_backup: stream cannot be written "
"to a terminal\n");
return;
}
offset_t off = 0;
dmu_send_outparams_t out = {
.dso_outfunc = dump_backup_bytes,
.dso_dryrun = B_FALSE,
};
int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed,
large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO,
&off, &out);
if (err != 0) {
fprintf(stderr, "dump_backup: dmu_send_obj: %s\n",
strerror(err));
return;
}
}
static int
zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile)
{
@ -8465,9 +8546,9 @@ zdb_read_block(char *thing, spa_t *spa)
*/
zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd,
psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
ZIO_FLAG_OPTIONAL, NULL, NULL));
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL,
NULL, NULL));
}
error = zio_wait(zio);
@ -8561,7 +8642,6 @@ zdb_read_block(char *thing, spa_t *spa)
zio_nowait(zio_vdev_child_io(czio, bp, vd,
offset, pabd, psize, ZIO_TYPE_READ,
ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW |
@ -8695,6 +8775,7 @@ main(int argc, char **argv)
struct option long_options[] = {
{"ignore-assertions", no_argument, NULL, 'A'},
{"block-stats", no_argument, NULL, 'b'},
{"backup", no_argument, NULL, 'B'},
{"checksum", no_argument, NULL, 'c'},
{"config", no_argument, NULL, 'C'},
{"datasets", no_argument, NULL, 'd'},
@ -8736,10 +8817,11 @@ main(int argc, char **argv)
};
while ((c = getopt_long(argc, argv,
"AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
"AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ",
long_options, NULL)) != -1) {
switch (c) {
case 'b':
case 'B':
case 'c':
case 'C':
case 'd':
@ -8887,7 +8969,7 @@ main(int argc, char **argv)
verbose = MAX(verbose, 1);
for (c = 0; c < 256; c++) {
if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL)
if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL)
dump_opt[c] = 1;
if (dump_opt[c])
dump_opt[c] += verbose;
@ -9073,7 +9155,8 @@ main(int argc, char **argv)
checkpoint_pool, error);
}
} else if (target_is_spa || dump_opt['R'] || objset_id == 0) {
} else if (target_is_spa || dump_opt['R'] || dump_opt['B'] ||
objset_id == 0) {
zdb_set_skip_mmp(target);
error = spa_open_rewind(target, &spa, FTAG, policy,
NULL);
@ -9209,7 +9292,10 @@ main(int argc, char **argv)
strerror(errno));
}
}
if (os != NULL) {
if (dump_opt['B']) {
dump_backup(target, objset_id,
argc > 0 ? argv[0] : NULL);
} else if (os != NULL) {
dump_objset(os);
} else if (zopt_object_args > 0 && !dump_opt['m']) {
dump_objset(spa->spa_meta_objset);

View File

@ -369,9 +369,7 @@ zfs_agent_consumer_thread(void *arg)
return (NULL);
}
if ((event = (list_head(&agent_events))) != NULL) {
list_remove(&agent_events, event);
if ((event = list_remove_head(&agent_events)) != NULL) {
(void) pthread_mutex_unlock(&agent_lock);
/* dispatch to all event subscribers */
@ -434,8 +432,7 @@ zfs_agent_fini(void)
(void) pthread_join(g_agents_tid, NULL);
/* drain any pending events */
while ((event = (list_head(&agent_events))) != NULL) {
list_remove(&agent_events, event);
while ((event = list_remove_head(&agent_events)) != NULL) {
nvlist_free(event->ae_nvl);
free(event);
}

View File

@ -1288,17 +1288,14 @@ zfs_slm_fini(void)
tpool_destroy(g_tpool);
}
while ((pool = (list_head(&g_pool_list))) != NULL) {
list_remove(&g_pool_list, pool);
while ((pool = list_remove_head(&g_pool_list)) != NULL) {
zpool_close(pool->uap_zhp);
free(pool);
}
list_destroy(&g_pool_list);
while ((device = (list_head(&g_device_list))) != NULL) {
list_remove(&g_device_list, device);
while ((device = list_remove_head(&g_device_list)) != NULL)
free(device);
}
list_destroy(&g_device_list);
libzfs_fini(g_zfshdl);

View File

@ -36,31 +36,49 @@ import argparse
from argparse import RawTextHelpFormatter
cols = {
# hdr: [size, scale, kstat name]
# hdr: [size, scale, kstat name]
"time": [8, -1, "time"],
"pool": [12, -1, "pool"],
"ds": [12, -1, "dataset_name"],
"obj": [12, -1, "objset"],
"zcc": [10, 1000, "zil_commit_count"],
"zcwc": [10, 1000, "zil_commit_writer_count"],
"ziic": [10, 1000, "zil_itx_indirect_count"],
"zic": [10, 1000, "zil_itx_count"],
"ziib": [10, 1024, "zil_itx_indirect_bytes"],
"zicc": [10, 1000, "zil_itx_copied_count"],
"zicb": [10, 1024, "zil_itx_copied_bytes"],
"zinc": [10, 1000, "zil_itx_needcopy_count"],
"zinb": [10, 1024, "zil_itx_needcopy_bytes"],
"zimnc": [10, 1000, "zil_itx_metaslab_normal_count"],
"zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"],
"zimsc": [10, 1000, "zil_itx_metaslab_slog_count"],
"zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"],
"cc": [5, 1000, "zil_commit_count"],
"cwc": [5, 1000, "zil_commit_writer_count"],
"ic": [5, 1000, "zil_itx_count"],
"iic": [5, 1000, "zil_itx_indirect_count"],
"iib": [5, 1024, "zil_itx_indirect_bytes"],
"icc": [5, 1000, "zil_itx_copied_count"],
"icb": [5, 1024, "zil_itx_copied_bytes"],
"inc": [5, 1000, "zil_itx_needcopy_count"],
"inb": [5, 1024, "zil_itx_needcopy_bytes"],
"idc": [5, 1000, "icc+inc"],
"idb": [5, 1024, "icb+inb"],
"iwc": [5, 1000, "iic+idc"],
"iwb": [5, 1024, "iib+idb"],
"imnc": [6, 1000, "zil_itx_metaslab_normal_count"],
"imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"],
"imnw": [6, 1024, "zil_itx_metaslab_normal_write"],
"imna": [6, 1024, "zil_itx_metaslab_normal_alloc"],
"imsc": [6, 1000, "zil_itx_metaslab_slog_count"],
"imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"],
"imsw": [6, 1024, "zil_itx_metaslab_slog_write"],
"imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"],
"imc": [5, 1000, "imnc+imsc"],
"imb": [5, 1024, "imnb+imsb"],
"imw": [5, 1024, "imnw+imsw"],
"ima": [5, 1024, "imna+imsa"],
"se%": [3, 100, "imb/ima"],
"sen%": [4, 100, "imnb/imna"],
"ses%": [4, 100, "imsb/imsa"],
"te%": [3, 100, "imb/imw"],
"ten%": [4, 100, "imnb/imnw"],
"tes%": [4, 100, "imsb/imsw"],
}
hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \
"zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"]
hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb",
"zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"]
ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib",
"imnc", "imnw", "imsc", "imsw"]
cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]")
@ -105,7 +123,7 @@ def print_header():
global sep
for col in hdr:
new_col = col
if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']:
if interval > 0 and cols[col][1] > 100:
new_col += "/s"
sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep))
sys.stdout.write("\n")
@ -115,7 +133,7 @@ def print_values(v):
global sep
for col in hdr:
val = v[cols[col][2]]
if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0:
if interval > 0 and cols[col][1] > 100:
val = v[cols[col][2]] // interval
sys.stdout.write("%s%s" % (
prettynum(cols[col][0], cols[col][1], val), sep))
@ -237,9 +255,7 @@ def init():
invalid = []
for ele in hdr:
if gFlag and ele not in ghdr:
invalid.append(ele)
elif ele not in cols:
if ele not in cols:
invalid.append(ele)
if len(invalid) > 0:
@ -403,17 +419,17 @@ def calculate_diff():
diff = copy.deepcopy(curr)
for pool in curr:
for objset in curr[pool]:
for col in hdr:
if col not in ['time', 'pool', 'ds', 'obj']:
key = cols[col][2]
# If prev is NULL, this is the
# first time we are here
if not prev:
diff[pool][objset][key] = 0
else:
diff[pool][objset][key] \
= curr[pool][objset][key] \
- prev[pool][objset][key]
for key in curr[pool][objset]:
if not isinstance(diff[pool][objset][key], int):
continue
# If prev is NULL, this is the
# first time we are here
if not prev:
diff[pool][objset][key] = 0
else:
diff[pool][objset][key] \
= curr[pool][objset][key] \
- prev[pool][objset][key]
def zil_build_dict(pool = "GLOBAL"):
global kstat
@ -425,10 +441,77 @@ def zil_build_dict(pool = "GLOBAL"):
if objset not in curr[pool]:
curr[pool][objset] = dict()
curr[pool][objset][key] = val
curr[pool][objset]["pool"] = pool
curr[pool][objset]["objset"] = objset
curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \
time.localtime())
def zil_extend_dict():
global diff
for pool in diff:
for objset in diff[pool]:
diff[pool][objset]["pool"] = pool
diff[pool][objset]["objset"] = objset
diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \
time.localtime())
diff[pool][objset]["icc+inc"] = \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["icb+inb"] = \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["iic+idc"] = \
diff[pool][objset]["zil_itx_indirect_count"] + \
diff[pool][objset]["zil_itx_copied_count"] + \
diff[pool][objset]["zil_itx_needcopy_count"]
diff[pool][objset]["iib+idb"] = \
diff[pool][objset]["zil_itx_indirect_bytes"] + \
diff[pool][objset]["zil_itx_copied_bytes"] + \
diff[pool][objset]["zil_itx_needcopy_bytes"]
diff[pool][objset]["imnc+imsc"] = \
diff[pool][objset]["zil_itx_metaslab_normal_count"] + \
diff[pool][objset]["zil_itx_metaslab_slog_count"]
diff[pool][objset]["imnb+imsb"] = \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"]
diff[pool][objset]["imnw+imsw"] = \
diff[pool][objset]["zil_itx_metaslab_normal_write"] + \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
diff[pool][objset]["imna+imsa"] = \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
if diff[pool][objset]["imna+imsa"] > 0:
diff[pool][objset]["imb/ima"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imna+imsa"]
else:
diff[pool][objset]["imb/ima"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imna"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_alloc"]
else:
diff[pool][objset]["imnb/imna"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsa"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_alloc"]
else:
diff[pool][objset]["imsb/imsa"] = 100
if diff[pool][objset]["imnw+imsw"] > 0:
diff[pool][objset]["imb/imw"] = 100 * \
diff[pool][objset]["imnb+imsb"] // \
diff[pool][objset]["imnw+imsw"]
else:
diff[pool][objset]["imb/imw"] = 100
if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0:
diff[pool][objset]["imnb/imnw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_normal_write"]
else:
diff[pool][objset]["imnb/imnw"] = 100
if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0:
diff[pool][objset]["imsb/imsw"] = 100 * \
diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \
diff[pool][objset]["zil_itx_metaslab_slog_write"]
else:
diff[pool][objset]["imsb/imsw"] = 100
def sign_handler_epipe(sig, frame):
print("Caught EPIPE signal: " + str(frame))
@ -437,30 +520,31 @@ def sign_handler_epipe(sig, frame):
def main():
global interval
global curr
global curr, diff
hprint = False
init()
signal.signal(signal.SIGINT, signal.SIG_DFL)
signal.signal(signal.SIGPIPE, sign_handler_epipe)
zil_process_kstat()
if not curr:
print ("Error: No stats to show")
sys.exit(0)
print_header()
if interval > 0:
time.sleep(interval)
while True:
calculate_diff()
if not diff:
print ("Error: No stats to show")
sys.exit(0)
if hprint == False:
print_header()
hprint = True
zil_extend_dict()
print_dict(diff)
time.sleep(interval)
else:
zil_process_kstat()
if not curr:
print ("Error: No stats to show")
sys.exit(0)
print_header()
print_dict(curr)
diff = curr
zil_extend_dict()
print_dict(diff)
if __name__ == '__main__':
main()

View File

@ -145,6 +145,7 @@ dist_zpoolcompat_DATA = \
%D%/compatibility.d/openzfs-2.0-linux \
%D%/compatibility.d/openzfs-2.1-freebsd \
%D%/compatibility.d/openzfs-2.1-linux \
%D%/compatibility.d/openzfs-2.2 \
%D%/compatibility.d/openzfsonosx-1.7.0 \
%D%/compatibility.d/openzfsonosx-1.8.1 \
%D%/compatibility.d/openzfsonosx-1.9.3 \
@ -168,12 +169,20 @@ zpoolcompatlinks = \
"freebsd-11.3 freebsd-12.0" \
"freebsd-11.3 freebsd-12.1" \
"freebsd-11.3 freebsd-12.2" \
"freebsd-11.3 freebsd-12.3" \
"freebsd-11.3 freebsd-12.4" \
"openzfs-2.1-freebsd freebsd-13.0" \
"openzfs-2.1-freebsd freebsd-13.1" \
"openzfs-2.1-freebsd freebsd-13.2" \
"freebsd-11.3 freenas-11.3" \
"freenas-11.0 freenas-11.1" \
"openzfsonosx-1.9.3 openzfsonosx-1.9.4" \
"openzfs-2.0-freebsd truenas-12.0" \
"zol-0.7 ubuntu-18.04" \
"zol-0.8 ubuntu-20.04"
"zol-0.8 ubuntu-20.04" \
"openzfs-2.1-linux ubuntu-22.04" \
"openzfs-2.2 openzfs-2.2-linux" \
"openzfs-2.2 openzfs-2.2-freebsd"
zpoolconfdir = $(sysconfdir)/zfs/zpool.d
INSTALL_DATA_HOOKS += zpool-install-data-hook

View File

@ -8,5 +8,7 @@ extensible_dataset
filesystem_limits
hole_birth
large_blocks
livelist
lz4_compress
spacemap_histogram
zpool_checkpoint

View File

@ -0,0 +1,40 @@
# Features supported by OpenZFS 2.2 on Linux and FreeBSD
allocation_classes
async_destroy
blake3
block_cloning
bookmark_v2
bookmark_written
bookmarks
device_rebuild
device_removal
draid
edonr
embedded_data
empty_bpobj
enabled_txg
encryption
extensible_dataset
filesystem_limits
head_errlog
hole_birth
large_blocks
large_dnode
livelist
log_spacemap
lz4_compress
multi_vdev_crash_dump
obsolete_counts
project_quota
redacted_datasets
redaction_bookmarks
resilver_defer
sha512
skein
spacemap_histogram
spacemap_v2
userobj_accounting
vdev_zaps_v2
zilsaxattr
zpool_checkpoint
zstd_compress

View File

@ -0,0 +1,26 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [
dnl #
dnl # 6.4 API change
dnl # The reclaimed_slab of struct reclaim_state
dnl # is renamed to reclaimed
dnl #
ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [
#include <linux/swap.h>
static const struct reclaim_state
rs __attribute__ ((unused)) = {
.reclaimed = 100,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [
AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field])
ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1,
[struct reclaim_state has reclaimed])
],[
AC_MSG_RESULT(no)
])
])
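
A rough sketch of how such a configure result is typically consumed (the helper below is hypothetical and not part of this commit; only the HAVE_RECLAIM_STATE_RECLAIMED define and the two field names come from the check above):

/* Hypothetical consumer: pick whichever field name the running kernel provides. */
static inline void
account_reclaimed_pages(struct reclaim_state *rs, unsigned long nr)
{
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
	rs->reclaimed += nr;		/* Linux 6.4+: renamed field */
#else
	rs->reclaimed_slab += nr;	/* older kernels */
#endif
}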

View File

@ -153,6 +153,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_IATTR_VFSID
ZFS_AC_KERNEL_SRC_FILEMAP
ZFS_AC_KERNEL_SRC_WRITEPAGE_T
ZFS_AC_KERNEL_SRC_RECLAIMED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@ -285,6 +286,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_IATTR_VFSID
ZFS_AC_KERNEL_FILEMAP
ZFS_AC_KERNEL_WRITEPAGE_T
ZFS_AC_KERNEL_RECLAIMED
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE

View File

@ -7,8 +7,8 @@ NAME := $(shell awk '$$1 == "Name:" { print $$2; }' META)
LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META)
LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." $$3+1}' META)
DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \
COPYRIGHT LICENSE README.md
DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \
COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md
ifndef KVERS
KVERS=$(shell uname -r)

View File

@ -344,7 +344,7 @@ mount_fs()
# Need the _original_ datasets mountpoint!
mountpoint=$(get_fs_value "$fs" mountpoint)
ZFS_CMD="mount.zfs -o zfsutil"
ZFS_CMD="mount -o zfsutil -t zfs"
if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then
# Can't use the mountpoint property. Might be one of our
# clones. Check the 'org.zol:mountpoint' property set in
@ -361,7 +361,7 @@ mount_fs()
fi
# Don't use mount.zfs -o zfsutils for legacy mountpoint
if [ "$mountpoint" = "legacy" ]; then
ZFS_CMD="mount.zfs"
ZFS_CMD="mount -t zfs"
fi
# Last hail-mary: Hope 'rootmnt' is set!
mountpoint=""
@ -944,7 +944,7 @@ mountroot()
echo " not specified on the kernel command line."
echo ""
echo "Manually mount the root filesystem on $rootmnt and then exit."
echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt"
echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt"
shell
fi

View File

@ -67,6 +67,7 @@ pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...)
#include <sys/mman.h>
static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok";
static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok";
static libzfs_handle_t *g_zfs;
@ -160,10 +161,10 @@ pw_free(pw_password_t *pw)
}
static pw_password_t *
pw_fetch(pam_handle_t *pamh)
pw_fetch(pam_handle_t *pamh, int tok)
{
const char *token;
if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) {
if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) {
pam_syslog(pamh, LOG_ERR,
"couldn't get password from PAM stack");
return (NULL);
@ -177,13 +178,13 @@ pw_fetch(pam_handle_t *pamh)
}
static const pw_password_t *
pw_fetch_lazy(pam_handle_t *pamh)
pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name)
{
pw_password_t *pw = pw_fetch(pamh);
pw_password_t *pw = pw_fetch(pamh, tok);
if (pw == NULL) {
return (NULL);
}
int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw);
int ret = pam_set_data(pamh, var_name, pw, destroy_pw);
if (ret != PAM_SUCCESS) {
pw_free(pw);
pam_syslog(pamh, LOG_ERR, "pam_set_data failed");
@ -193,23 +194,23 @@ pw_fetch_lazy(pam_handle_t *pamh)
}
static const pw_password_t *
pw_get(pam_handle_t *pamh)
pw_get(pam_handle_t *pamh, int tok, const char *var_name)
{
const pw_password_t *authtok = NULL;
int ret = pam_get_data(pamh, PASSWORD_VAR_NAME,
int ret = pam_get_data(pamh, var_name,
(const void**)(&authtok));
if (ret == PAM_SUCCESS)
return (authtok);
if (ret == PAM_NO_MODULE_DATA)
return (pw_fetch_lazy(pamh));
return (pw_fetch_lazy(pamh, tok, var_name));
pam_syslog(pamh, LOG_ERR, "password not available");
return (NULL);
}
static int
pw_clear(pam_handle_t *pamh)
pw_clear(pam_handle_t *pamh, const char *var_name)
{
int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL);
int ret = pam_set_data(pamh, var_name, NULL, NULL);
if (ret != PAM_SUCCESS) {
pam_syslog(pamh, LOG_ERR, "clearing password failed");
return (-1);
@ -386,7 +387,7 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name,
int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value,
WRAPPING_KEY_LEN);
pw_free(key);
if (ret) {
if (ret && ret != EEXIST) {
pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret);
zfs_close(ds);
return (-1);
@ -406,14 +407,14 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name,
}
static int
unmount_unload(pam_handle_t *pamh, const char *ds_name)
unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force)
{
zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM);
if (ds == NULL) {
pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name);
return (-1);
}
int ret = zfs_unmount(ds, NULL, 0);
int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0);
if (ret) {
pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret);
zfs_close(ds);
@ -435,9 +436,13 @@ typedef struct {
char *runstatedir;
char *homedir;
char *dsname;
uid_t uid_min;
uid_t uid_max;
uid_t uid;
const char *username;
int unmount_and_unload;
boolean_t unmount_and_unload;
boolean_t force_unmount;
boolean_t recursive_homes;
} zfs_key_config_t;
static int
@ -469,9 +474,13 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config,
free(config->homes_prefix);
return (PAM_USER_UNKNOWN);
}
config->uid_min = 1000;
config->uid_max = MAXUID;
config->uid = entry->pw_uid;
config->username = name;
config->unmount_and_unload = 1;
config->unmount_and_unload = B_TRUE;
config->force_unmount = B_FALSE;
config->recursive_homes = B_FALSE;
config->dsname = NULL;
config->homedir = NULL;
for (int c = 0; c < argc; c++) {
@ -481,8 +490,16 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config,
} else if (strncmp(argv[c], "runstatedir=", 12) == 0) {
free(config->runstatedir);
config->runstatedir = strdup(argv[c] + 12);
} else if (strncmp(argv[c], "uid_min=", 8) == 0) {
sscanf(argv[c] + 8, "%u", &config->uid_min);
} else if (strncmp(argv[c], "uid_max=", 8) == 0) {
sscanf(argv[c] + 8, "%u", &config->uid_max);
} else if (strcmp(argv[c], "nounmount") == 0) {
config->unmount_and_unload = 0;
config->unmount_and_unload = B_FALSE;
} else if (strcmp(argv[c], "forceunmount") == 0) {
config->force_unmount = B_TRUE;
} else if (strcmp(argv[c], "recursive_homes") == 0) {
config->recursive_homes = B_TRUE;
} else if (strcmp(argv[c], "prop_mountpoint") == 0) {
if (config->homedir == NULL)
config->homedir = strdup(entry->pw_dir);
@ -517,8 +534,12 @@ find_dsname_by_prop_value(zfs_handle_t *zhp, void *data)
(void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint,
sizeof (mountpoint), NULL, NULL, 0, B_FALSE);
if (strcmp(target->homedir, mountpoint) != 0) {
if (target->recursive_homes) {
(void) zfs_iter_filesystems_v2(zhp, 0,
find_dsname_by_prop_value, target);
}
zfs_close(zhp);
return (0);
return (target->dsname != NULL);
}
target->dsname = strdup(zfs_get_name(zhp));
@ -531,17 +552,23 @@ zfs_key_config_get_dataset(zfs_key_config_t *config)
{
if (config->homedir != NULL &&
config->homes_prefix != NULL) {
zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix,
ZFS_TYPE_FILESYSTEM);
if (zhp == NULL) {
pam_syslog(NULL, LOG_ERR, "dataset %s not found",
config->homes_prefix);
return (NULL);
}
if (strcmp(config->homes_prefix, "*") == 0) {
(void) zfs_iter_root(g_zfs,
find_dsname_by_prop_value, config);
} else {
zfs_handle_t *zhp = zfs_open(g_zfs,
config->homes_prefix, ZFS_TYPE_FILESYSTEM);
if (zhp == NULL) {
pam_syslog(NULL, LOG_ERR,
"dataset %s not found",
config->homes_prefix);
return (NULL);
}
(void) zfs_iter_filesystems_v2(zhp, 0,
find_dsname_by_prop_value, config);
zfs_close(zhp);
(void) zfs_iter_filesystems_v2(zhp, 0,
find_dsname_by_prop_value, config);
zfs_close(zhp);
}
char *dsname = config->dsname;
config->dsname = NULL;
return (dsname);
@ -655,8 +682,13 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags,
if (config_err != PAM_SUCCESS) {
return (config_err);
}
if (config.uid < config.uid_min || config.uid > config.uid_max) {
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
const pw_password_t *token = pw_fetch_lazy(pamh);
const pw_password_t *token = pw_fetch_lazy(pamh,
PAM_AUTHTOK, PASSWORD_VAR_NAME);
if (token == NULL) {
zfs_key_config_free(&config);
return (PAM_AUTH_ERR);
@ -706,10 +738,12 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags,
if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) {
return (PAM_SERVICE_ERR);
}
if (config.uid < 1000) {
if (config.uid < config.uid_min || config.uid > config.uid_max) {
zfs_key_config_free(&config);
return (PAM_SUCCESS);
return (PAM_SERVICE_ERR);
}
const pw_password_t *old_token = pw_get(pamh,
PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME);
{
if (pam_zfs_init(pamh) != 0) {
zfs_key_config_free(&config);
@ -721,49 +755,62 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags,
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
int key_loaded = is_key_loaded(pamh, dataset);
if (key_loaded == -1) {
if (!old_token) {
pam_syslog(pamh, LOG_ERR,
"old password from PAM stack is null");
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
free(dataset);
pam_zfs_free();
if (! key_loaded) {
if (decrypt_mount(pamh, dataset,
old_token->value, B_TRUE) == -1) {
pam_syslog(pamh, LOG_ERR,
"key not loaded, returning try_again");
"old token mismatch");
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
return (PAM_PERM_DENIED);
}
}
if ((flags & PAM_UPDATE_AUTHTOK) != 0) {
const pw_password_t *token = pw_get(pamh);
const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK,
PASSWORD_VAR_NAME);
if (token == NULL) {
pam_syslog(pamh, LOG_ERR, "new password unavailable");
pam_zfs_free();
zfs_key_config_free(&config);
return (PAM_SERVICE_ERR);
}
if (pam_zfs_init(pamh) != 0) {
zfs_key_config_free(&config);
pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
return (PAM_SERVICE_ERR);
}
char *dataset = zfs_key_config_get_dataset(&config);
if (!dataset) {
pam_zfs_free();
zfs_key_config_free(&config);
pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
pw_clear(pamh, PASSWORD_VAR_NAME);
return (PAM_SERVICE_ERR);
}
if (change_key(pamh, dataset, token->value) == -1) {
int was_loaded = is_key_loaded(pamh, dataset);
if (!was_loaded && decrypt_mount(pamh, dataset,
old_token->value, B_FALSE) == -1) {
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
pw_clear(pamh, OLD_PASSWORD_VAR_NAME);
pw_clear(pamh, PASSWORD_VAR_NAME);
return (PAM_SERVICE_ERR);
}
int changed = change_key(pamh, dataset, token->value);
if (!was_loaded) {
unmount_unload(pamh, dataset, config.force_unmount);
}
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
if (pw_clear(pamh) == -1) {
if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 ||
pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) {
return (PAM_SERVICE_ERR);
}
} else {
@ -788,7 +835,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags,
return (PAM_SESSION_ERR);
}
if (config.uid < 1000) {
if (config.uid < config.uid_min || config.uid > config.uid_max) {
zfs_key_config_free(&config);
return (PAM_SUCCESS);
}
@ -799,7 +846,8 @@ pam_sm_open_session(pam_handle_t *pamh, int flags,
return (PAM_SUCCESS);
}
const pw_password_t *token = pw_get(pamh);
const pw_password_t *token = pw_get(pamh,
PAM_AUTHTOK, PASSWORD_VAR_NAME);
if (token == NULL) {
zfs_key_config_free(&config);
return (PAM_SESSION_ERR);
@ -823,7 +871,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags,
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);
if (pw_clear(pamh) == -1) {
if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) {
return (PAM_SERVICE_ERR);
}
return (PAM_SUCCESS);
@ -846,7 +894,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags,
if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) {
return (PAM_SESSION_ERR);
}
if (config.uid < 1000) {
if (config.uid < config.uid_min || config.uid > config.uid_max) {
zfs_key_config_free(&config);
return (PAM_SUCCESS);
}
@ -868,7 +916,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags,
zfs_key_config_free(&config);
return (PAM_SESSION_ERR);
}
if (unmount_unload(pamh, dataset) == -1) {
if (unmount_unload(pamh, dataset, config.force_unmount) == -1) {
free(dataset);
pam_zfs_free();
zfs_key_config_free(&config);

View File

@ -75,7 +75,7 @@ typedef struct kmem_cache {
extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache);
extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
__attribute__((alloc_size(1)))
__attribute__((malloc, alloc_size(1)))
void *zfs_kmem_alloc(size_t size, int kmflags);
void zfs_kmem_free(void *buf, size_t size);
uint64_t kmem_size(void);
@ -83,6 +83,7 @@ kmem_cache_t *kmem_cache_create(const char *name, size_t bufsize, size_t align,
int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
void kmem_cache_destroy(kmem_cache_t *cache);
__attribute__((malloc))
void *kmem_cache_alloc(kmem_cache_t *cache, int flags);
void kmem_cache_free(kmem_cache_t *cache, void *buf);
boolean_t kmem_cache_reap_active(void);

View File

@ -68,7 +68,6 @@ enum scope_prefix_types {
zfs_trim,
zfs_txg,
zfs_vdev,
zfs_vdev_cache,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_vnops,

View File

@ -31,10 +31,10 @@
#include <linux/vmalloc.h>
extern int kmem_debugging(void);
extern char *kmem_vasprintf(const char *fmt, va_list ap)
__attribute__((format(printf, 1, 0)));
extern char *kmem_asprintf(const char *fmt, ...)
__attribute__((format(printf, 1, 2)));
__attribute__((format(printf, 1, 0)))
extern char *kmem_vasprintf(const char *fmt, va_list ap);
__attribute__((format(printf, 1, 2)))
extern char *kmem_asprintf(const char *fmt, ...);
extern char *kmem_strdup(const char *str);
extern void kmem_strfree(char *str);
@ -186,10 +186,10 @@ extern unsigned int spl_kmem_alloc_max;
#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
#define kmem_cache_reap_active spl_kmem_cache_reap_active
extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line)
__attribute__((alloc_size(1)));
extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line)
__attribute__((alloc_size(1)));
__attribute__((malloc, alloc_size(1)))
extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
__attribute__((malloc, alloc_size(1)))
extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
extern void spl_kmem_free(const void *ptr, size_t sz);
/*

View File

@ -91,8 +91,10 @@ typedef struct vmem { } vmem_t;
#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line)
__attribute__((malloc, alloc_size(1)));
extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line)
__attribute__((malloc, alloc_size(1)));
extern void spl_vmem_free(const void *ptr, size_t sz);
int spl_vmem_init(void);

View File

@ -215,6 +215,39 @@ DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \
TP_ARGS(zilog, zcw))
DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error);
/*
* Generic support for three argument tracepoints of the form:
*
* DTRACE_PROBE3(...,
* zilog_t *, ...,
* uint64_t, ...,
* uint64_t, ...);
*/
/* BEGIN CSTYLED */
DECLARE_EVENT_CLASS(zfs_zil_block_size_class,
TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1),
TP_ARGS(zilog, res, s1),
TP_STRUCT__entry(
ZILOG_TP_STRUCT_ENTRY
__field(uint64_t, res)
__field(uint64_t, s1)
),
TP_fast_assign(
ZILOG_TP_FAST_ASSIGN
__entry->res = res;
__entry->s1 = s1;
),
TP_printk(
ZILOG_TP_PRINTK_FMT " res %llu s1 %llu",
ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1)
);
#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \
DEFINE_EVENT(zfs_zil_block_size_class, name, \
TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \
TP_ARGS(zilog, res, s1))
DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size);
#endif /* _TRACE_ZIL_H */
#undef TRACE_INCLUDE_PATH
@ -228,6 +261,7 @@ DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error);
DEFINE_DTRACE_PROBE2(zil__process__commit__itx);
DEFINE_DTRACE_PROBE2(zil__process__normal__itx);
DEFINE_DTRACE_PROBE2(zil__commit__io__error);
DEFINE_DTRACE_PROBE3(zil__block__size);
#endif /* HAVE_DECLARE_EVENT_CLASS */
#endif /* _KERNEL */
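
Schematically, the new event class is fired through the usual DTRACE_PROBE3 wrapper with the types the class fixes; the argument names below are placeholders, not the actual call site in zil.c:

/* Placeholder arguments; only the types (zilog_t *, uint64_t, uint64_t) are prescribed. */
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
    uint64_t, new_size, uint64_t, old_size);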

View File

@ -86,10 +86,15 @@ extern int zfs_abd_scatter_enabled;
* Allocations and deallocations
*/
__attribute__((malloc))
abd_t *abd_alloc(size_t, boolean_t);
__attribute__((malloc))
abd_t *abd_alloc_linear(size_t, boolean_t);
__attribute__((malloc))
abd_t *abd_alloc_gang(void);
__attribute__((malloc))
abd_t *abd_alloc_for_io(size_t, boolean_t);
__attribute__((malloc))
abd_t *abd_alloc_sametype(abd_t *, size_t);
boolean_t abd_size_alloc_linear(size_t);
void abd_gang_add(abd_t *, abd_t *, boolean_t);

View File

@ -105,8 +105,13 @@ typedef struct zfs_btree_index {
boolean_t bti_before;
} zfs_btree_index_t;
typedef struct btree {
typedef struct btree zfs_btree_t;
typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
const void *, zfs_btree_index_t *);
struct btree {
int (*bt_compar) (const void *, const void *);
bt_find_in_buf_f bt_find_in_buf;
size_t bt_elem_size;
size_t bt_leaf_size;
uint32_t bt_leaf_cap;
@ -115,7 +120,54 @@ typedef struct btree {
uint64_t bt_num_nodes;
zfs_btree_hdr_t *bt_root;
zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
} zfs_btree_t;
};
/*
* Implementation of Shar's algorithm designed to accelerate binary search by
* eliminating impossible to predict branches.
*
* For optimality, this should be used to generate the search function in the
* same file as the comparator and the comparator should be marked
* `__attribute__((always_inline)) inline` so that the compiler will inline it.
*
* Arguments are:
*
* NAME - The function name for this instance of the search function. Use it
* in a subsequent call to zfs_btree_create().
* T - The element type stored inside the B-Tree.
* COMP - A comparator to compare two nodes, it must return exactly: -1, 0,
* or +1 -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
* TREE_CMP() from avl.h can be used in a boilerplate function.
*/
/* BEGIN CSTYLED */
#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \
static void * \
NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \
const void *value, zfs_btree_index_t *where) \
{ \
T *i = (T *)buf; \
(void) tree; \
_Pragma("GCC unroll 9") \
while (nelems > 1) { \
uint32_t half = nelems / 2; \
nelems -= half; \
i += (COMP(&i[half - 1], value) < 0) * half; \
} \
\
int comp = COMP(i, value); \
where->bti_offset = (i - (T *)buf) + (comp < 0); \
where->bti_before = (comp != 0); \
\
if (comp == 0) { \
return (i); \
} \
\
return (NULL); \
} \
_Pragma("GCC diagnostic pop")
/* END CSTYLED */
/*
* Allocate and deallocate caches for btree nodes.
@ -129,13 +181,19 @@ void zfs_btree_fini(void);
* tree - the tree to be initialized
* compar - function to compare two nodes, it must return exactly: -1, 0, or +1
* -1 for <, 0 for ==, and +1 for >
* find - optional function to accelerate searches inside B-Tree nodes
* through Shar's algorithm and comparator inlining. Setting this to
* NULL will use a generic function. The function should be created
* using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
* compar should be marked `__attribute__((always_inline)) inline` or
* performance is unlikely to improve very much.
* size - the value of sizeof(struct my_type)
* lsize - custom leaf size
*/
void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
size_t);
bt_find_in_buf_f, size_t);
void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
size_t, size_t);
bt_find_in_buf_f, size_t, size_t);
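
A minimal sketch of the intended wiring (the element type, comparator, and init function below are made-up names for illustration, not part of this change):

typedef struct my_node {
	uint64_t mn_key;
} my_node_t;

/* Comparator is always_inline so the generated search function can inline it. */
static __attribute__((always_inline)) inline int
my_node_compare(const void *a, const void *b)
{
	const my_node_t *na = a, *nb = b;
	return (TREE_CMP(na->mn_key, nb->mn_key));
}

/* Emits my_node_find(), a branch-light in-buffer search specialized for my_node_t. */
ZFS_BTREE_FIND_IN_BUF_FUNC(my_node_find, my_node_t, my_node_compare)

static void
my_tree_init(zfs_btree_t *tree)
{
	/* Passing NULL instead of my_node_find falls back to the generic search. */
	zfs_btree_create(tree, my_node_compare, my_node_find,
	    sizeof (my_node_t));
}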
/*
* Find a node with a matching value in the tree. Returns the matching node

View File

@ -1174,10 +1174,6 @@ extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep,
zbookmark_phys_t *zb);
extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);
/* vdev cache */
extern void vdev_cache_stat_init(void);
extern void vdev_cache_stat_fini(void);
/* vdev mirror */
extern void vdev_mirror_stat_init(void);
extern void vdev_mirror_stat_fini(void);

View File

@ -158,12 +158,6 @@ extern boolean_t vdev_allocatable(vdev_t *vd);
extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
extern boolean_t vdev_cache_read(zio_t *zio);
extern void vdev_cache_write(zio_t *zio);
extern void vdev_cache_purge(vdev_t *vd);
extern void vdev_queue_init(vdev_t *vd);
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);

View File

@ -57,8 +57,6 @@ extern "C" {
* Forward declarations that lots of things need.
*/
typedef struct vdev_queue vdev_queue_t;
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
struct abd;
extern uint_t zfs_vdev_queue_depth_pct;
@ -132,23 +130,6 @@ typedef const struct vdev_ops {
/*
* Virtual device properties
*/
struct vdev_cache_entry {
struct abd *ve_abd;
uint64_t ve_offset;
clock_t ve_lastused;
avl_node_t ve_offset_node;
avl_node_t ve_lastused_node;
uint32_t ve_hits;
uint16_t ve_missed_update;
zio_t *ve_fill_io;
};
struct vdev_cache {
avl_tree_t vc_offset_tree;
avl_tree_t vc_lastused_tree;
kmutex_t vc_lock;
};
typedef struct vdev_queue_class {
uint32_t vqc_active;
@ -443,7 +424,6 @@ struct vdev {
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */

View File

@ -73,13 +73,15 @@ int64_t zfs_refcount_count(zfs_refcount_t *);
int64_t zfs_refcount_add(zfs_refcount_t *, const void *);
int64_t zfs_refcount_remove(zfs_refcount_t *, const void *);
/*
* Note that (add|remove)_many add/remove one reference with "number" N,
* _not_ make N references with "number" 1, which is what vanilla
* zfs_refcount_(add|remove) would do if called N times.
* Note that (add|remove)_many adds/removes one reference with "number" N,
* _not_ N references with "number" 1, which is what (add|remove)_few does,
* or what vanilla zfs_refcount_(add|remove) called N times would do.
*
* Attempting to remove a reference with number N when none exists is a
* panic on debug kernels with reference_tracking enabled.
*/
void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *);
void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *);
int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *);
int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *);
void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *);
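
A small illustration of the distinction drawn above (rc and tag are hypothetical):

static void
refcount_few_vs_many_example(zfs_refcount_t *rc, const void *tag)
{
	/* Three references held by tag, each with "number" 1. */
	zfs_refcount_add_few(rc, 3, tag);
	/* ...later, released the same way they were taken. */
	zfs_refcount_remove_few(rc, 3, tag);

	/* One reference held by tag with "number" 3. */
	(void) zfs_refcount_add_many(rc, 3, tag);
	(void) zfs_refcount_remove_many(rc, 3, tag);
}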
@ -108,6 +110,10 @@ typedef struct refcount {
#define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count)
#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
#define zfs_refcount_add_few(rc, number, holder) \
atomic_add_64(&(rc)->rc_count, number)
#define zfs_refcount_remove_few(rc, number, holder) \
atomic_add_64(&(rc)->rc_count, -number)
#define zfs_refcount_add_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, number)
#define zfs_refcount_remove_many(rc, number, holder) \

View File

@ -158,6 +158,7 @@ extern "C" {
#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48)
extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
#ifdef _KERNEL
#include <sys/zfs_znode_impl.h>
@ -280,7 +281,6 @@ extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
extern void zfs_remove_op_tables(void);
extern int zfs_create_op_tables(void);
extern dev_t zfs_cmpldev(uint64_t);
extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value);
extern int zfs_get_stats(objset_t *os, nvlist_t *nv);
extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os);
extern void zfs_znode_dmu_fini(znode_t *);

View File

@ -489,18 +489,22 @@ typedef struct zil_stats {
* Transactions which have been allocated to the "normal"
* (i.e. not slog) storage pool. Note that "bytes" accumulate
* the actual log record sizes - which do not include the actual
* data in case of indirect writes.
* data in case of indirect writes. bytes <= write <= alloc.
*/
kstat_named_t zil_itx_metaslab_normal_count;
kstat_named_t zil_itx_metaslab_normal_bytes;
kstat_named_t zil_itx_metaslab_normal_write;
kstat_named_t zil_itx_metaslab_normal_alloc;
/*
* Transactions which have been allocated to the "slog" storage pool.
* If there are no separate log devices, this is the same as the
* "normal" pool.
* "normal" pool. bytes <= write <= alloc.
*/
kstat_named_t zil_itx_metaslab_slog_count;
kstat_named_t zil_itx_metaslab_slog_bytes;
kstat_named_t zil_itx_metaslab_slog_write;
kstat_named_t zil_itx_metaslab_slog_alloc;
} zil_kstat_values_t;
typedef struct zil_sums {
@ -515,8 +519,12 @@ typedef struct zil_sums {
wmsum_t zil_itx_needcopy_bytes;
wmsum_t zil_itx_metaslab_normal_count;
wmsum_t zil_itx_metaslab_normal_bytes;
wmsum_t zil_itx_metaslab_normal_write;
wmsum_t zil_itx_metaslab_normal_alloc;
wmsum_t zil_itx_metaslab_slog_count;
wmsum_t zil_itx_metaslab_slog_bytes;
wmsum_t zil_itx_metaslab_slog_write;
wmsum_t zil_itx_metaslab_slog_alloc;
} zil_sums_t;
#define ZIL_STAT_INCR(zil, stat, val) \

View File

@ -44,7 +44,7 @@ extern "C" {
* must be held.
*
* After the lwb is "opened", it can transition into the "issued" state
* via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
*
* After the lwb's write zio completes, it transitions into the "write
@ -93,20 +93,23 @@ typedef struct lwb {
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */
list_t lwb_itxs; /* list of itx's */
list_t lwb_waiters; /* list of zil_commit_waiter's */
avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
} lwb_t;
/*

View File

@ -190,7 +190,6 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_SPECULATIVE (1ULL << 8)
#define ZIO_FLAG_CONFIG_WRITER (1ULL << 9)
#define ZIO_FLAG_DONT_RETRY (1ULL << 10)
#define ZIO_FLAG_DONT_CACHE (1ULL << 11)
#define ZIO_FLAG_NODATA (1ULL << 12)
#define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13)
#define ZIO_FLAG_IO_ALLOCATING (1ULL << 14)

View File

@ -83,7 +83,7 @@ const char *_umem_debug_init(void);
const char *_umem_options_init(void);
const char *_umem_logging_init(void);
__attribute__((alloc_size(1)))
__attribute__((malloc, alloc_size(1)))
static inline void *
umem_alloc(size_t size, int flags)
{
@ -96,7 +96,7 @@ umem_alloc(size_t size, int flags)
return (ptr);
}
__attribute__((alloc_size(1)))
__attribute__((malloc, alloc_size(1)))
static inline void *
umem_alloc_aligned(size_t size, size_t align, int flags)
{
@ -118,7 +118,7 @@ umem_alloc_aligned(size_t size, size_t align, int flags)
return (ptr);
}
__attribute__((alloc_size(1)))
__attribute__((malloc, alloc_size(1)))
static inline void *
umem_zalloc(size_t size, int flags)
{
@ -188,6 +188,7 @@ umem_cache_destroy(umem_cache_t *cp)
umem_free(cp, sizeof (umem_cache_t));
}
__attribute__((malloc))
static inline void *
umem_cache_alloc(umem_cache_t *cp, int flags)
{

View File

@ -135,7 +135,6 @@ nodist_libzpool_la_SOURCES = \
module/zfs/uberblock.c \
module/zfs/unique.c \
module/zfs/vdev.c \
module/zfs/vdev_cache.c \
module/zfs/vdev_draid.c \
module/zfs/vdev_draid_rand.c \
module/zfs/vdev_indirect.c \

View File

@ -2028,21 +2028,6 @@ Max vdev I/O aggregation size.
.It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
Max vdev I/O aggregation size for non-rotating media.
.
.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint
Shift size to inflate reads to.
.
.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint
Inflate reads smaller than this value to meet the
.Sy zfs_vdev_cache_bshift
size
.Pq default Sy 64 KiB .
.
.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint
Total size of the per-disk cache in bytes.
.Pp
Currently this feature is disabled, as it has been found to not be helpful
for performance and in some cases harmful.
.
.It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int
A number by which the balancing algorithm increments the load calculation for
the purpose of selecting the least busy mirror member when an I/O operation

View File

@ -228,8 +228,10 @@ extensible_dataset
filesystem_limits
hole_birth
large_blocks
livelist
lz4_compress
spacemap_histogram
zpool_checkpoint
.No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev
.Ed

View File

@ -14,7 +14,7 @@
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
.\"
.Dd October 7, 2020
.Dd June 4, 2023
.Dt ZDB 8
.Os
.
@ -41,6 +41,13 @@
.Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID
.Op Ar object Ns | Ns Ar range Ns
.Nm
.Fl B
.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns
.Op Fl U Ar cache
.Op Fl K Ar key
.Ar poolname Ns Ar / Ns Ar objset-ID
.Op Ar backup-flags
.Nm
.Fl C
.Op Fl A
.Op Fl U Ar cache
@ -123,6 +130,22 @@ Display options:
Display statistics regarding the number, size
.Pq logical, physical and allocated
and deduplication of blocks.
.It Fl B , -backup
Generate a backup stream, similar to
.Nm zfs Cm send ,
but for the numeric objset ID, and without opening the dataset.
This can be useful in recovery scenarios if dataset metadata has become
corrupted but the dataset itself is readable.
The optional
.Ar flags
argument is a string of one or more of the letters
.Sy e ,
.Sy L ,
.Sy c ,
and
.Sy w ,
which correspond to the same flags in
.Xr zfs-send 8 .
.It Fl c , -checksum
Verify the checksum of all metadata blocks while printing block statistics
.Po see

View File

@ -234,14 +234,11 @@ if the volume is not sparse.
Print verbose information about the created dataset.
.El
.El
.Ss ZFS Volumes as Swap
ZFS volumes may be used as swap devices.
After creating the volume with the
.Nm zfs Cm create Fl V
enable the swap area using the
.Xr swapon 8
command.
Swapping to files on ZFS filesystems is not supported.
.Ss ZFS for Swap
Swapping to a ZFS volume is prone to deadlock and not recommended.
See OpenZFS FAQ.
.Pp
Swapping to a file on a ZFS filesystem is not supported.
.
.Sh EXAMPLES
.\" These are, respectively, examples 1, 10 from zfs.8

View File

@ -456,7 +456,6 @@ ZIO_FLAG_CANFAIL:0x00000080
ZIO_FLAG_SPECULATIVE:0x00000100
ZIO_FLAG_CONFIG_WRITER:0x00000200
ZIO_FLAG_DONT_RETRY:0x00000400
ZIO_FLAG_DONT_CACHE:0x00000800
ZIO_FLAG_NODATA:0x00001000
ZIO_FLAG_INDUCE_DAMAGE:0x00002000

View File

@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y)
ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
endif
# Generated binary search code is particularly bad with this optimization.
# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
# is not affected when unrolling is done.
# Disable it until the following upstream issue is resolved:
# https://github.com/llvm/llvm-project/issues/62790
ifeq ($(CONFIG_X86),y)
ifeq ($(CONFIG_CC_IS_CLANG),y)
CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
endif
endif
ifneq ($(KBUILD_EXTMOD),)
@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
@ -368,7 +382,6 @@ ZFS_OBJS := \
uberblock.o \
unique.o \
vdev.o \
vdev_cache.o \
vdev_draid.o \
vdev_draid_rand.o \
vdev_indirect.o \

View File

@ -308,7 +308,6 @@ SRCS+= abd.c \
uberblock.c \
unique.c \
vdev.c \
vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_indirect.c \
@ -400,6 +399,20 @@ beforeinstall:
.include <bsd.kmod.mk>
# Generated binary search code is particularly bad with this optimization.
# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
# is not affected when unrolling is done.
# Disable it until the following upstream issue is resolved:
# https://github.com/llvm/llvm-project/issues/62790
.if ${CC} == "clang"
.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64"
CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false
CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false
.endif
.endif
CFLAGS.sysctl_os.c= -include ../zfs_config.h
CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h

View File

@ -872,8 +872,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip,
"Enable to bypass vdev_validate().");
/* END CSTYLED */
/* vdev_cache.c */
/* vdev_mirror.c */
/* vdev_queue.c */

View File

@ -495,10 +495,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp)
{
zfs_acl_node_t *aclnode;
while ((aclnode = list_head(&aclp->z_acl))) {
list_remove(&aclp->z_acl, aclnode);
while ((aclnode = list_remove_head(&aclp->z_acl)))
zfs_acl_node_free(aclnode);
}
aclp->z_acl_count = 0;
aclp->z_acl_bytes = 0;
}
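The list_remove_head() form used above pops and returns the first element in one call, returning NULL once the list is empty. A minimal sketch of the same drain idiom with a hypothetical node type (my_node_t and my_list_drain() are illustrations, not part of this change):

/*
 * Hypothetical example of the list_remove_head() idiom; my_node_t and
 * my_list_drain() do not exist in the tree.
 */
typedef struct my_node {
	list_node_t	mn_link;
	uint64_t	mn_data;
} my_node_t;

static void
my_list_drain(list_t *l)
{
	my_node_t *n;

	/* No separate list_head()/list_remove() pair is needed. */
	while ((n = list_remove_head(l)) != NULL)
		kmem_free(n, sizeof (*n));
	list_destroy(l);
}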

View File

@ -2220,92 +2220,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (0);
}
/*
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
uint64_t *cached_copy = NULL;
/*
* Figure out where in the objset_t the cached copy would live, if it
* is available for the requested property.
*/
if (os != NULL) {
switch (prop) {
case ZFS_PROP_VERSION:
cached_copy = &os->os_version;
break;
case ZFS_PROP_NORMALIZE:
cached_copy = &os->os_normalization;
break;
case ZFS_PROP_UTF8ONLY:
cached_copy = &os->os_utf8only;
break;
case ZFS_PROP_CASE:
cached_copy = &os->os_casesensitivity;
break;
default:
break;
}
}
if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
*value = *cached_copy;
return (0);
}
/*
* If the property wasn't cached, look up the file system's value for
* the property. For the version property, we look up a slightly
* different string.
*/
const char *pname;
int error = ENOENT;
if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
} else {
pname = zfs_prop_to_name(prop);
}
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
if (error == ENOENT) {
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
*value = ZFS_CASE_SENSITIVE;
break;
case ZFS_PROP_ACLTYPE:
*value = ZFS_ACLTYPE_NFSV4;
break;
default:
return (error);
}
error = 0;
}
/*
* If one of the methods for getting the property value above worked,
* copy it into the objset_t's cache.
*/
if (error == 0 && cached_copy != NULL) {
*cached_copy = *value;
}
return (error);
}
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.

View File

@ -2069,6 +2069,93 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
return (error);
}
/*
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
uint64_t *cached_copy = NULL;
/*
* Figure out where in the objset_t the cached copy would live, if it
* is available for the requested property.
*/
if (os != NULL) {
switch (prop) {
case ZFS_PROP_VERSION:
cached_copy = &os->os_version;
break;
case ZFS_PROP_NORMALIZE:
cached_copy = &os->os_normalization;
break;
case ZFS_PROP_UTF8ONLY:
cached_copy = &os->os_utf8only;
break;
case ZFS_PROP_CASE:
cached_copy = &os->os_casesensitivity;
break;
default:
break;
}
}
if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
*value = *cached_copy;
return (0);
}
/*
* If the property wasn't cached, look up the file system's value for
* the property. For the version property, we look up a slightly
* different string.
*/
const char *pname;
int error = ENOENT;
if (prop == ZFS_PROP_VERSION) {
pname = ZPL_VERSION_STR;
} else {
pname = zfs_prop_to_name(prop);
}
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
if (error == ENOENT) {
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
*value = ZFS_CASE_SENSITIVE;
break;
case ZFS_PROP_ACLTYPE:
*value = ZFS_ACLTYPE_NFSV4;
break;
default:
return (error);
}
error = 0;
}
/*
* If one of the methods for getting the property value above worked,
* copy it into the objset_t's cache.
*/
if (error == 0 && cached_copy != NULL) {
*cached_copy = *value;
}
return (error);
}
void
zfs_znode_update_vfs(znode_t *zp)

View File

@ -182,8 +182,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
* of that infrastructure we are responsible for incrementing it.
*/
if (current->reclaim_state)
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
current->reclaim_state->reclaimed += size >> PAGE_SHIFT;
#else
current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
#endif
vfree(ptr);
}
@ -1012,9 +1015,19 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
ASSERT0(flags & ~KM_PUBLIC_MASK);
ASSERT(skc->skc_magic == SKC_MAGIC);
ASSERT((skc->skc_flags & KMC_SLAB) == 0);
might_sleep();
*obj = NULL;
/*
* Since we can't sleep, attempt an emergency allocation to satisfy
* the request. The only alternative is to fail the allocation, but
* it's preferable to try. The use of KM_NOSLEEP is expected to be rare.
*/
if (flags & KM_NOSLEEP)
return (spl_emergency_alloc(skc, flags, obj));
might_sleep();
/*
* Before allocating a new slab wait for any reaping to complete and
* then return so the local magazine can be rechecked for new objects.
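A caller-side sketch of what the early KM_NOSLEEP return above permits; the cache handle and error handling here are hypothetical, not taken from the change:

/*
 * Hypothetical atomic-context caller.  A KM_NOSLEEP request that reaches
 * spl_cache_grow() is now routed to spl_emergency_alloc() instead of
 * asserting that sleeping is allowed.
 */
static int
my_nosleep_alloc(spl_kmem_cache_t *skc, void **objp)
{
	*objp = spl_kmem_cache_alloc(skc, KM_NOSLEEP);
	return (*objp == NULL ? SET_ERROR(ENOMEM) : 0);
}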

View File

@ -219,7 +219,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
arc_reduce_target_size(ptob(sc->nr_to_scan));
arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
if (current->reclaim_state != NULL)
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
current->reclaim_state->reclaimed += sc->nr_to_scan;
#else
current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
#endif
/*
* We are experiencing memory pressure which the arc_evict_zthr was

View File

@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp)
{
zfs_acl_node_t *aclnode;
while ((aclnode = list_head(&aclp->z_acl))) {
list_remove(&aclp->z_acl, aclnode);
while ((aclnode = list_remove_head(&aclp->z_acl)))
zfs_acl_node_free(aclnode);
}
aclp->z_acl_count = 0;
aclp->z_acl_bytes = 0;
}

View File

@ -2052,91 +2052,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
return (0);
}
/*
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
uint64_t *cached_copy = NULL;
/*
* Figure out where in the objset_t the cached copy would live, if it
* is available for the requested property.
*/
if (os != NULL) {
switch (prop) {
case ZFS_PROP_VERSION:
cached_copy = &os->os_version;
break;
case ZFS_PROP_NORMALIZE:
cached_copy = &os->os_normalization;
break;
case ZFS_PROP_UTF8ONLY:
cached_copy = &os->os_utf8only;
break;
case ZFS_PROP_CASE:
cached_copy = &os->os_casesensitivity;
break;
default:
break;
}
}
if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
*value = *cached_copy;
return (0);
}
/*
* If the property wasn't cached, look up the file system's value for
* the property. For the version property, we look up a slightly
* different string.
*/
const char *pname;
int error = ENOENT;
if (prop == ZFS_PROP_VERSION)
pname = ZPL_VERSION_STR;
else
pname = zfs_prop_to_name(prop);
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
if (error == ENOENT) {
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
*value = ZFS_CASE_SENSITIVE;
break;
case ZFS_PROP_ACLTYPE:
*value = ZFS_ACLTYPE_OFF;
break;
default:
return (error);
}
error = 0;
}
/*
* If one of the methods for getting the property value above worked,
* copy it into the objset_t's cache.
*/
if (error == 0 && cached_copy != NULL) {
*cached_copy = *value;
}
return (error);
}
/*
* Return true if the corresponding vfs's unmounted flag is set.
* Otherwise return false.

View File

@ -2254,6 +2254,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
return (error);
}
/*
* Read a property stored within the master node.
*/
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
uint64_t *cached_copy = NULL;
/*
* Figure out where in the objset_t the cached copy would live, if it
* is available for the requested property.
*/
if (os != NULL) {
switch (prop) {
case ZFS_PROP_VERSION:
cached_copy = &os->os_version;
break;
case ZFS_PROP_NORMALIZE:
cached_copy = &os->os_normalization;
break;
case ZFS_PROP_UTF8ONLY:
cached_copy = &os->os_utf8only;
break;
case ZFS_PROP_CASE:
cached_copy = &os->os_casesensitivity;
break;
default:
break;
}
}
if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
*value = *cached_copy;
return (0);
}
/*
* If the property wasn't cached, look up the file system's value for
* the property. For the version property, we look up a slightly
* different string.
*/
const char *pname;
int error = ENOENT;
if (prop == ZFS_PROP_VERSION)
pname = ZPL_VERSION_STR;
else
pname = zfs_prop_to_name(prop);
if (os != NULL) {
ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
}
if (error == ENOENT) {
/* No value set, use the default value */
switch (prop) {
case ZFS_PROP_VERSION:
*value = ZPL_VERSION;
break;
case ZFS_PROP_NORMALIZE:
case ZFS_PROP_UTF8ONLY:
*value = 0;
break;
case ZFS_PROP_CASE:
*value = ZFS_CASE_SENSITIVE;
break;
case ZFS_PROP_ACLTYPE:
*value = ZFS_ACLTYPE_OFF;
break;
default:
return (error);
}
error = 0;
}
/*
* If one of the methods for getting the property value above worked,
* copy it into the objset_t's cache.
*/
if (error == 0 && cached_copy != NULL) {
*cached_copy = *value;
}
return (error);
}
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

View File

@ -965,7 +965,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
l2arc_dev_t *dev);
/* L2ARC persistence write I/O routines. */
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
l2arc_write_callback_t *cb);
/* L2ARC persistence auxiliary routines. */
@ -6106,8 +6106,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
asize, abd,
ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
zio_flags | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
acb->acb_zio_head = rzio;
@ -7866,8 +7865,7 @@ arc_fini(void)
taskq_destroy(arc_prune_taskq);
mutex_enter(&arc_prune_mtx);
while ((p = list_head(&arc_prune_list)) != NULL) {
list_remove(&arc_prune_list, p);
while ((p = list_remove_head(&arc_prune_list)) != NULL) {
zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
zfs_refcount_destroy(&p->p_refcnt);
kmem_free(p, sizeof (*p));
@ -8175,7 +8173,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
static uint64_t
l2arc_write_size(l2arc_dev_t *dev)
{
uint64_t size, dev_size, tsize;
uint64_t size;
/*
* Make sure our globals have meaningful values in case the user
@ -8192,35 +8190,45 @@ l2arc_write_size(l2arc_dev_t *dev)
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
/*
* Make sure the write size does not exceed the size of the cache
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
dev_size = dev->l2ad_end - dev->l2ad_start;
/* We need to add in the worst case scenario of log block overhead. */
tsize = size + l2arc_log_blk_overhead(size, dev);
size += l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
/*
* Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
* times the writesize, whichever is greater.
*/
tsize += MAX(64 * 1024 * 1024,
(tsize * l2arc_trim_ahead) / 100);
size += MAX(64 * 1024 * 1024,
(size * l2arc_trim_ahead) / 100);
}
if (tsize >= dev_size) {
/*
* Make sure the write size does not exceed the size of the cache
* device. This is important in l2arc_evict(), otherwise infinite
* iteration can occur.
*/
if (size > dev->l2ad_end - dev->l2ad_start) {
cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
"plus the overhead of log blocks (persistent L2ARC, "
"%llu bytes) exceeds the size of the cache device "
"(guid %llu), resetting them to the default (%d)",
(u_longlong_t)l2arc_log_blk_overhead(size, dev),
(u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
if (l2arc_trim_ahead > 1) {
cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1");
l2arc_trim_ahead = 1;
}
if (arc_warm == B_FALSE)
size += l2arc_write_boost;
size += l2arc_log_blk_overhead(size, dev);
if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
size += MAX(64 * 1024 * 1024,
(size * l2arc_trim_ahead) / 100);
}
}
return (size);
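A worked pass through the sizing above, with hypothetical numbers (not from the change):

/*
 * Hypothetical example: l2arc_write_max = 32 MiB on a cold cache
 * (arc_warm == B_FALSE) with l2arc_write_boost = 32 MiB starts at 64 MiB;
 * l2arc_log_blk_overhead() is added, then, if TRIM is enabled,
 * MAX(64 MiB, size * l2arc_trim_ahead / 100).  Only when that total does
 * not fit between l2ad_start and l2ad_end are the tunables reset to
 * L2ARC_WRITE_SIZE and the size recomputed before being returned.
 */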
@ -8319,20 +8327,14 @@ l2arc_dev_get_next(void)
static void
l2arc_do_free_on_write(void)
{
list_t *buflist;
l2arc_data_free_t *df, *df_prev;
l2arc_data_free_t *df;
mutex_enter(&l2arc_free_on_write_mtx);
buflist = l2arc_free_on_write;
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
while ((df = list_remove_head(l2arc_free_on_write)) != NULL) {
ASSERT3P(df->l2df_abd, !=, NULL);
abd_free(df->l2df_abd);
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
mutex_exit(&l2arc_free_on_write_mtx);
}
@ -8845,7 +8847,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
top:
rerun = B_FALSE;
if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
if (dev->l2ad_hand + distance > dev->l2ad_end) {
/*
* When there is no space to accommodate upcoming writes,
* evict to the end. Then bump the write and evict hands
@ -9039,7 +9041,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
*/
ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
if (!dev->l2ad_first)
ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
}
}
@ -9299,7 +9301,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
psize);
if ((write_asize + asize) > target_sz) {
/*
* If the allocated size of this buffer plus the max
* size for the pending log block exceeds the evicted
* target size, terminate writing buffers for this run.
*/
if (write_asize + asize +
sizeof (l2arc_log_blk_phys_t) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
break;
@ -9413,8 +9421,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
* arcstat_l2_{size,asize} kstats are updated
* internally.
*/
if (l2arc_log_blk_insert(dev, hdr))
l2arc_log_blk_commit(dev, pio, cb);
if (l2arc_log_blk_insert(dev, hdr)) {
/*
* l2ad_hand will be adjusted in
* l2arc_log_blk_commit().
*/
write_asize +=
l2arc_log_blk_commit(dev, pio, cb);
}
zio_nowait(wzio);
}
@ -10173,8 +10187,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev)
err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_SPECULATIVE, B_FALSE));
abd_free(abd);
@ -10494,11 +10507,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
cb->l2rcb_abd = abd_get_from_buf(lb, asize);
pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY);
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
return (pio);
@ -10564,7 +10576,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev)
* This function allocates some memory to temporarily hold the serialized
* buffer to be written. This is then released in l2arc_write_done.
*/
static void
static uint64_t
l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
{
l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
@ -10675,6 +10687,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
return (asize);
}
/*

View File

@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
while ((bpe = list_head(&bpl->bpl_list))) {
while ((bpe = list_remove_head(&bpl->bpl_list))) {
bplist_iterate_last_removed = bpe;
list_remove(&bpl->bpl_list, bpe);
mutex_exit(&bpl->bpl_lock);
func(arg, &bpe->bpe_blk, tx);
kmem_free(bpe, sizeof (*bpe));
@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl)
bplist_entry_t *bpe;
mutex_enter(&bpl->bpl_lock);
while ((bpe = list_head(&bpl->bpl_list))) {
bplist_iterate_last_removed = bpe;
list_remove(&bpl->bpl_list, bpe);
while ((bpe = list_remove_head(&bpl->bpl_list)))
kmem_free(bpe, sizeof (*bpe));
}
mutex_exit(&bpl->bpl_lock);
}

View File

@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
void
zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
size_t size)
bt_find_in_buf_f bt_find_in_buf, size_t size)
{
zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
BTREE_LEAF_SIZE);
}
static void *
zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
const void *value, zfs_btree_index_t *where);
void
zfs_btree_create_custom(zfs_btree_t *tree,
int (*compar) (const void *, const void *),
bt_find_in_buf_f bt_find_in_buf,
size_t size, size_t lsize)
{
size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree,
ASSERT3U(size, <=, esize / 2);
memset(tree, 0, sizeof (*tree));
tree->bt_compar = compar;
tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
zfs_btree_find_in_buf : bt_find_in_buf;
tree->bt_elem_size = size;
tree->bt_leaf_size = lsize;
tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
* element in the last leaf, it's in the last leaf or
* it's not in the tree.
*/
void *d = zfs_btree_find_in_buf(tree,
void *d = tree->bt_find_in_buf(tree,
last_leaf->btl_elems +
last_leaf->btl_hdr.bth_first * size,
last_leaf->btl_hdr.bth_count, value, &idx);
@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
ASSERT3P(node, !=, NULL);
void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
void *d = tree->bt_find_in_buf(tree, node->btc_elems,
node->btc_hdr.bth_count, value, &idx);
EQUIV(d != NULL, !idx.bti_before);
if (d != NULL) {
@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
*/
zfs_btree_leaf_t *leaf = (depth == 0 ?
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
leaf->btl_hdr.bth_first * size,
leaf->btl_hdr.bth_count, value, &idx);
@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
zfs_btree_index_t idx;
ASSERT(zfs_btree_is_core(par_hdr));
VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
par_hdr->bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
uint32_t offset = idx.bti_offset;
@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
}
zfs_btree_index_t idx;
zfs_btree_core_t *parent = hdr->bth_parent;
VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
ASSERT(idx.bti_before);
ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
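The bt_find_in_buf hook above lets each tree supply a binary search specialized for its comparator; a minimal sketch of registering one for a plain uint64_t tree (the my_u64_* names are hypothetical), mirroring the dsl_scan.c, metaslab.c, range_tree.c and zap_micro.c hunks later in this commit:

__attribute__((always_inline)) inline
static int
my_u64_compare(const void *x1, const void *x2)
{
	/* Hypothetical comparator; TREE_CMP() yields -1, 0 or 1. */
	const uint64_t *a = x1;
	const uint64_t *b = x2;

	return (TREE_CMP(*a, *b));
}

/* Expands to a static find-in-buf routine with the comparator inlined. */
ZFS_BTREE_FIND_IN_BUF_FUNC(my_u64_find_in_buf, uint64_t, my_u64_compare)

static void
my_tree_create(zfs_btree_t *bt)
{
	zfs_btree_create(bt, my_u64_compare, my_u64_find_in_buf,
	    sizeof (uint64_t));
}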

View File

@ -49,8 +49,12 @@ static dataset_kstat_values_t empty_dataset_kstats = {
{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }
{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }
}
};

View File

@ -1755,9 +1755,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
while ((dr = list_head(list)) != NULL) {
while ((dr = list_remove_head(list)) != NULL) {
ASSERT0(dr->dr_dbuf->db_level);
list_remove(list, dr);
zio_nowait(dr->dr_zio);
}

View File

@ -1371,8 +1371,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
dnode_t *dn;
abd_t *abd = rrd->abd;
zio_cksum_t bp_cksum = bp->blk_cksum;
zio_flag_t flags = ZIO_FLAG_SPECULATIVE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL;
zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY |
ZIO_FLAG_CANFAIL;
if (rwa->raw)
flags |= ZIO_FLAG_RAW;

View File

@ -1955,7 +1955,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
{
dsl_dataset_t *to_ds = dspp->to_ds;
dsl_pool_t *dp = dspp->dp;
#ifdef _KERNEL
if (dmu_objset_type(os) == DMU_OST_ZFS) {
uint64_t version;
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
@ -1964,7 +1964,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
if (version >= ZPL_VERSION_SA)
*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
#endif
/* raw sends imply large_block_ok */
if ((dspp->rawok || dspp->large_block_ok) &&
@ -2793,6 +2792,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
}
if (err == 0) {
owned = B_TRUE;
err = zap_lookup(dspp.dp->dp_meta_objset,
dspp.to_ds->ds_object,
DS_FIELD_RESUME_TOGUID, 8, 1,
@ -2806,21 +2806,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
sizeof (dspp.saved_toname),
dspp.saved_toname);
}
if (err != 0)
/* Only disown if there was an error in the lookups */
if (owned && (err != 0))
dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
kmem_strfree(name);
} else {
err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
FTAG, &dspp.to_ds);
if (err == 0)
owned = B_TRUE;
}
owned = B_TRUE;
} else {
err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
&dspp.to_ds);
}
if (err != 0) {
/* Note: dsl dataset is not owned at this point */
dsl_pool_rele(dspp.dp, FTAG);
return (err);
}

View File

@ -1396,8 +1396,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error)
{
dmu_tx_callback_t *dcb;
while ((dcb = list_tail(cb_list)) != NULL) {
list_remove(cb_list, dcb);
while ((dcb = list_remove_tail(cb_list)) != NULL) {
dcb->dcb_func(dcb->dcb_data, error);
kmem_free(dcb, sizeof (dmu_tx_callback_t));
}

View File

@ -520,8 +520,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
issued = pf_end - pf_start + ipf_end - ipf_start;
if (issued > 1) {
/* More references on top of taken in dmu_zfetch_prepare(). */
for (int i = 0; i < issued - 1; i++)
zfs_refcount_add(&zs->zs_refs, NULL);
zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
} else if (issued == 0) {
/* Some other thread has done our work, so drop the ref. */
if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)

View File

@ -3782,8 +3782,7 @@ snaplist_destroy(list_t *l, const void *tag)
if (l == NULL || !list_link_active(&l->list_head))
return;
while ((snap = list_tail(l)) != NULL) {
list_remove(l, snap);
while ((snap = list_remove_tail(l)) != NULL) {
dsl_dataset_rele(snap->ds, tag);
kmem_free(snap, sizeof (*snap));
}

View File

@ -1490,7 +1490,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
if (tr_cookie == NULL)
return;
while ((tr = list_head(tr_list)) != NULL) {
while ((tr = list_remove_head(tr_list)) != NULL) {
if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
@ -1500,7 +1500,6 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
} else {
arc_tempreserve_clear(tr->tr_size);
}
list_remove(tr_list, tr);
kmem_free(tr, sizeof (struct tempreserve));
}

View File

@ -234,7 +234,7 @@ static int zfs_resilver_disable_defer = B_FALSE;
static int zfs_free_bpobj_enabled = 1;
/* Error blocks to be scrubbed in one txg. */
uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12;
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
@ -3437,10 +3437,8 @@ scan_io_queues_run_one(void *arg)
* If we were suspended in the middle of processing,
* requeue any unfinished sios and exit.
*/
while ((sio = list_head(&sio_list)) != NULL) {
list_remove(&sio_list, sio);
while ((sio = list_remove_head(&sio_list)) != NULL)
scan_io_queue_insert_impl(queue, sio);
}
queue->q_zio = NULL;
mutex_exit(q_lock);
@ -4877,6 +4875,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
* with single operation. Plus it makes scrubs more sequential and reduces
* chances that minor extent change move it within the B-tree.
*/
__attribute__((always_inline)) inline
static int
ext_size_compare(const void *x, const void *y)
{
@ -4885,13 +4884,17 @@ ext_size_compare(const void *x, const void *y)
return (TREE_CMP(*a, *b));
}
ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
ext_size_compare)
static void
ext_size_create(range_tree_t *rt, void *arg)
{
(void) rt;
zfs_btree_t *size_tree = arg;
zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
sizeof (uint64_t));
}
static void

View File

@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev)
list_remove(&zevent_list, ev);
/* Remove references to this event in all private file data */
while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
list_remove(&ev->ev_ze_list, ze);
while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) {
ze->ze_zevent = NULL;
ze->ze_dropped++;
}

View File

@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
* Comparison function for the private size-ordered tree using 32-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
__attribute__((always_inline)) inline
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
if (likely(cmp))
return (cmp);
return (TREE_CMP(r1->rs_start, r2->rs_start));
return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
/*
* Comparison function for the private size-ordered tree using 64-bit
* ranges. Tree is sorted by size, larger sizes at the end of the tree.
*/
__attribute__((always_inline)) inline
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
uint64_t rs_size2 = r2->rs_end - r2->rs_start;
int cmp = TREE_CMP(rs_size1, rs_size2);
if (likely(cmp))
return (cmp);
return (TREE_CMP(r1->rs_start, r2->rs_start));
return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
}
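/*
 * Note (not part of the change): the folded return above is a branchless
 * tiebreak.  TREE_CMP() yields -1, 0 or 1, so when the sizes differ !cmp
 * is 0 and the start-offset term vanishes; when they are equal cmp is 0
 * and the result is TREE_CMP(r1->rs_start, r2->rs_start).  For example,
 * with equal sizes and starts of 100 vs 200: 0 + 1 * (-1) = -1.  Keeping
 * the comparator free of the early-return branch suits the always_inline
 * specialization consumed by the bt_find_in_buf search.
 */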
typedef struct metaslab_rt_arg {
zfs_btree_t *mra_bt;
uint32_t mra_floor_shift;
@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}
ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
range_seg32_t, metaslab_rangesize32_compare)
ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
range_seg64_t, metaslab_rangesize64_compare)
/*
* Create any block allocator specific components. The current allocators
* rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
size_t size;
int (*compare) (const void *, const void *);
bt_find_in_buf_f bt_find;
switch (rt->rt_type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = metaslab_rangesize32_compare;
bt_find = metaslab_rt_find_rangesize32_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = metaslab_rangesize64_compare;
bt_find = metaslab_rt_find_rangesize64_in_buf;
break;
default:
panic("Invalid range seg type %d", rt->rt_type);
}
zfs_btree_create(size_tree, compare, size);
zfs_btree_create(size_tree, compare, bt_find, size);
mrap->mra_floor_shift = metaslab_by_size_min_shift;
}
@ -5641,8 +5650,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
* We reserve the slots individually so that we can unreserve
* them individually when an I/O completes.
*/
for (int d = 0; d < slots; d++)
zfs_refcount_add(&mca->mca_alloc_slots, zio);
zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
return (B_TRUE);
}
@ -5656,8 +5664,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled);
for (int d = 0; d < slots; d++)
zfs_refcount_remove(&mca->mca_alloc_slots, zio);
zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio);
}
static int

View File

@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
rt->rt_histogram[idx]--;
}
__attribute__((always_inline)) inline
static int
range_tree_seg32_compare(const void *x1, const void *x2)
{
@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
__attribute__((always_inline)) inline
static int
range_tree_seg64_compare(const void *x1, const void *x2)
{
@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
__attribute__((always_inline)) inline
static int
range_tree_seg_gap_compare(const void *x1, const void *x2)
{
@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
}
ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
range_tree_seg32_compare)
ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
range_tree_seg64_compare)
ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
range_tree_seg_gap_compare)
range_tree_t *
range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
size_t size;
int (*compare) (const void *, const void *);
bt_find_in_buf_f bt_find;
switch (type) {
case RANGE_SEG32:
size = sizeof (range_seg32_t);
compare = range_tree_seg32_compare;
bt_find = range_tree_seg32_find_in_buf;
break;
case RANGE_SEG64:
size = sizeof (range_seg64_t);
compare = range_tree_seg64_compare;
bt_find = range_tree_seg64_find_in_buf;
break;
case RANGE_SEG_GAP:
size = sizeof (range_seg_gap_t);
compare = range_tree_seg_gap_compare;
bt_find = range_tree_seg_gap_find_in_buf;
break;
default:
panic("Invalid range seg type %d", type);
}
zfs_btree_create(&rt->rt_root, compare, size);
zfs_btree_create(&rt->rt_root, compare, bt_find, size);
rt->rt_ops = ops;
rt->rt_gap = gap;

View File

@ -88,14 +88,11 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
reference_t *ref;
ASSERT3U(rc->rc_count, ==, number);
while ((ref = list_head(&rc->rc_list))) {
list_remove(&rc->rc_list, ref);
while ((ref = list_remove_head(&rc->rc_list)))
kmem_cache_free(reference_cache, ref);
}
list_destroy(&rc->rc_list);
while ((ref = list_head(&rc->rc_removed))) {
list_remove(&rc->rc_removed, ref);
while ((ref = list_remove_head(&rc->rc_removed))) {
kmem_cache_free(reference_history_cache, ref->ref_removed);
kmem_cache_free(reference_cache, ref);
}
@ -151,6 +148,15 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
return (zfs_refcount_add_many(rc, 1, holder));
}
void
zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
if (!rc->rc_tracked)
(void) zfs_refcount_add_many(rc, number, holder);
else for (; number > 0; number--)
(void) zfs_refcount_add(rc, holder);
}
int64_t
zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
const void *holder)
@ -204,6 +210,15 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
return (zfs_refcount_remove_many(rc, 1, holder));
}
void
zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
{
if (!rc->rc_tracked)
(void) zfs_refcount_remove_many(rc, number, holder);
else for (; number > 0; number--)
(void) zfs_refcount_remove(rc, holder);
}
void
zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
{

View File

@ -33,6 +33,7 @@
* Copyright 2017 Joyent, Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
* Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
*/
/*
@ -1608,16 +1609,16 @@ spa_unload_log_sm_metadata(spa_t *spa)
{
void *cookie = NULL;
spa_log_sm_t *sls;
log_summary_entry_t *e;
while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
&cookie)) != NULL) {
VERIFY0(sls->sls_mscount);
kmem_free(sls, sizeof (spa_log_sm_t));
}
for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
e != NULL; e = list_head(&spa->spa_log_summary)) {
while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
VERIFY0(e->lse_mscount);
list_remove(&spa->spa_log_summary, e);
kmem_free(e, sizeof (log_summary_entry_t));
}
@ -6874,9 +6875,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
if (dsl_scan_resilvering(spa_get_dsl(spa)))
if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
}
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,

View File

@ -814,8 +814,7 @@ spa_remove(spa_t *spa)
if (spa->spa_root)
spa_strfree(spa->spa_root);
while ((dp = list_head(&spa->spa_config_list)) != NULL) {
list_remove(&spa->spa_config_list, dp);
while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path != NULL)
spa_strfree(dp->scd_path);
kmem_free(dp, sizeof (spa_config_dirent_t));
@ -2439,7 +2438,6 @@ spa_init(spa_mode_t mode)
zio_init();
dmu_init();
zil_init();
vdev_cache_stat_init();
vdev_mirror_stat_init();
vdev_raidz_math_init();
vdev_file_init();
@ -2463,7 +2461,6 @@ spa_fini(void)
spa_evict_all();
vdev_file_fini();
vdev_cache_stat_fini();
vdev_mirror_stat_fini();
vdev_raidz_math_fini();
chksum_fini();

View File

@ -29,7 +29,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2019, Datto Inc. All rights reserved.
* Copyright (c) 2021, Klara Inc.
* Copyright [2021] Hewlett Packard Enterprise Development LP
* Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
*/
#include <sys/zfs_context.h>
@ -715,7 +715,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
offsetof(struct vdev, vdev_dtl_node));
vd->vdev_stat.vs_timestamp = gethrtime();
vdev_queue_init(vd);
vdev_cache_init(vd);
return (vd);
}
@ -1096,7 +1095,6 @@ vdev_free(vdev_t *vd)
* Clean up vdev structure.
*/
vdev_queue_fini(vd);
vdev_cache_fini(vd);
if (vd->vdev_path)
spa_strfree(vd->vdev_path);
@ -1720,8 +1718,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
ZIO_FLAG_TRYHARD;
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@ -2612,8 +2609,6 @@ vdev_close(vdev_t *vd)
vd->vdev_ops->vdev_op_close(vd);
vdev_cache_purge(vd);
/*
* We record the previous state before we close it, so that if we are
* doing a reopen(), we don't generate FMA ereports if we notice that
@ -2699,6 +2694,17 @@ vdev_reopen(vdev_t *vd)
(void) vdev_validate(vd);
}
/*
* Recheck if resilver is still needed and cancel any
* scheduled resilver if resilver is unneeded.
*/
if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
mutex_exit(&spa->spa_async_lock);
}
/*
* Reassess parent vdev's health.
*/

View File

@ -1,436 +0,0 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/kstat.h>
#include <sys/abd.h>
/*
* Virtual device read-ahead caching.
*
* This file implements a simple LRU read-ahead cache. When the DMU reads
* a given block, it will often want other, nearby blocks soon thereafter.
* We take advantage of this by reading a larger disk region and caching
* the result. In the best case, this can turn 128 back-to-back 512-byte
* reads into a single 64k read followed by 127 cache hits; this reduces
* latency dramatically. In the worst case, it can turn an isolated 512-byte
* read into a 64k read, which doesn't affect latency all that much but is
* terribly wasteful of bandwidth. A more intelligent version of the cache
* could keep track of access patterns and not do read-ahead unless it sees
* at least two temporally close I/Os to the same region. Currently, only
* metadata I/O is inflated. A further enhancement could take advantage of
* more semantic information about the I/O. And it could use something
* faster than an AVL tree; that was chosen solely for convenience.
*
* There are five cache operations: allocate, fill, read, write, evict.
*
* (1) Allocate. This reserves a cache entry for the specified region.
* We separate the allocate and fill operations so that multiple threads
* don't generate I/O for the same cache miss.
*
* (2) Fill. When the I/O for a cache miss completes, the fill routine
* places the data in the previously allocated cache entry.
*
* (3) Read. Read data from the cache.
*
* (4) Write. Update cache contents after write completion.
*
* (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
* if the total cache size exceeds zfs_vdev_cache_size.
*/
/*
* These tunables are for performance analysis.
*/
/*
* All i/os smaller than zfs_vdev_cache_max will be turned into
* 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
* track buffer). At most zfs_vdev_cache_size bytes will be kept in each
* vdev's vdev_cache.
*
* TODO: Note that with the current ZFS code, it turns out that the
* vdev cache is not helpful, and in some cases actually harmful. It
* is better if we disable this. Once some time has passed, we should
* actually remove this to simplify the code. For now we just disable
* it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
* has made these same changes.
*/
static uint_t zfs_vdev_cache_max = 1 << 14; /* 16KB */
static uint_t zfs_vdev_cache_size = 0;
static uint_t zfs_vdev_cache_bshift = 16;
#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
static kstat_t *vdc_ksp = NULL;
typedef struct vdc_stats {
kstat_named_t vdc_stat_delegations;
kstat_named_t vdc_stat_hits;
kstat_named_t vdc_stat_misses;
} vdc_stats_t;
static vdc_stats_t vdc_stats = {
{ "delegations", KSTAT_DATA_UINT64 },
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 }
};
#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
static inline int
vdev_cache_offset_compare(const void *a1, const void *a2)
{
const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
}
static int
vdev_cache_lastused_compare(const void *a1, const void *a2)
{
const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
if (likely(cmp))
return (cmp);
/*
* Among equally old entries, sort by offset to ensure uniqueness.
*/
return (vdev_cache_offset_compare(a1, a2));
}
/*
* Evict the specified entry from the cache.
*/
static void
vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
{
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT3P(ve->ve_fill_io, ==, NULL);
ASSERT3P(ve->ve_abd, !=, NULL);
avl_remove(&vc->vc_lastused_tree, ve);
avl_remove(&vc->vc_offset_tree, ve);
abd_free(ve->ve_abd);
kmem_free(ve, sizeof (vdev_cache_entry_t));
}
/*
* Allocate an entry in the cache. At this point we don't have the data,
* we're just creating a placeholder so that multiple threads don't all
* go off and read the same blocks.
*/
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
vdev_cache_t *vc = &zio->io_vd->vdev_cache;
uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
vdev_cache_entry_t *ve;
ASSERT(MUTEX_HELD(&vc->vc_lock));
if (zfs_vdev_cache_size == 0)
return (NULL);
/*
* If adding a new entry would exceed the cache size,
* evict the oldest entry (LRU).
*/
if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
zfs_vdev_cache_size) {
ve = avl_first(&vc->vc_lastused_tree);
if (ve->ve_fill_io != NULL)
return (NULL);
ASSERT3U(ve->ve_hits, !=, 0);
vdev_cache_evict(vc, ve);
}
ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
ve->ve_offset = offset;
ve->ve_lastused = ddi_get_lbolt();
ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
avl_add(&vc->vc_offset_tree, ve);
avl_add(&vc->vc_lastused_tree, ve);
return (ve);
}
static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
ASSERT(MUTEX_HELD(&vc->vc_lock));
ASSERT3P(ve->ve_fill_io, ==, NULL);
if (ve->ve_lastused != ddi_get_lbolt()) {
avl_remove(&vc->vc_lastused_tree, ve);
ve->ve_lastused = ddi_get_lbolt();
avl_add(&vc->vc_lastused_tree, ve);
}
ve->ve_hits++;
abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
}
/*
* Fill a previously allocated cache entry with data.
*/
static void
vdev_cache_fill(zio_t *fio)
{
vdev_t *vd = fio->io_vd;
vdev_cache_t *vc = &vd->vdev_cache;
vdev_cache_entry_t *ve = fio->io_private;
zio_t *pio;
ASSERT3U(fio->io_size, ==, VCBS);
/*
* Add data to the cache.
*/
mutex_enter(&vc->vc_lock);
ASSERT3P(ve->ve_fill_io, ==, fio);
ASSERT3U(ve->ve_offset, ==, fio->io_offset);
ASSERT3P(ve->ve_abd, ==, fio->io_abd);
ve->ve_fill_io = NULL;
/*
* Even if this cache line was invalidated by a missed write update,
* any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it.
*/
zio_link_t *zl = NULL;
while ((pio = zio_walk_parents(fio, &zl)) != NULL)
vdev_cache_hit(vc, ve, pio);
if (fio->io_error || ve->ve_missed_update)
vdev_cache_evict(vc, ve);
mutex_exit(&vc->vc_lock);
}
/*
* Read data from the cache. Returns B_TRUE on cache hit, B_FALSE on miss.
*/
boolean_t
vdev_cache_read(zio_t *zio)
{
vdev_cache_t *vc = &zio->io_vd->vdev_cache;
vdev_cache_entry_t *ve, ve_search;
uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
zio_t *fio;
uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS);
ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
if (zfs_vdev_cache_size == 0)
return (B_FALSE);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
return (B_FALSE);
if (zio->io_size > zfs_vdev_cache_max)
return (B_FALSE);
/*
* If the I/O straddles two or more cache blocks, don't cache it.
*/
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
return (B_FALSE);
ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
mutex_enter(&vc->vc_lock);
ve_search.ve_offset = cache_offset;
ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
if (ve != NULL) {
if (ve->ve_missed_update) {
mutex_exit(&vc->vc_lock);
return (B_FALSE);
}
if ((fio = ve->ve_fill_io) != NULL) {
zio_vdev_io_bypass(zio);
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_delegations);
return (B_TRUE);
}
vdev_cache_hit(vc, ve, zio);
zio_vdev_io_bypass(zio);
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_hits);
return (B_TRUE);
}
ve = vdev_cache_allocate(zio);
if (ve == NULL) {
mutex_exit(&vc->vc_lock);
return (B_FALSE);
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
zio_vdev_io_bypass(zio);
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock);
zio_nowait(fio);
VDCSTAT_BUMP(vdc_stat_misses);
return (B_TRUE);
}
/*
* Update cache contents upon write completion.
*/
void
vdev_cache_write(zio_t *zio)
{
vdev_cache_t *vc = &zio->io_vd->vdev_cache;
vdev_cache_entry_t *ve, ve_search;
uint64_t io_start = zio->io_offset;
uint64_t io_end = io_start + zio->io_size;
uint64_t min_offset = P2ALIGN(io_start, VCBS);
uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
avl_index_t where;
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
mutex_enter(&vc->vc_lock);
ve_search.ve_offset = min_offset;
ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
if (ve == NULL)
ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
while (ve != NULL && ve->ve_offset < max_offset) {
uint64_t start = MAX(ve->ve_offset, io_start);
uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
if (ve->ve_fill_io != NULL) {
ve->ve_missed_update = 1;
} else {
abd_copy_off(ve->ve_abd, zio->io_abd,
start - ve->ve_offset, start - io_start,
end - start);
}
ve = AVL_NEXT(&vc->vc_offset_tree, ve);
}
mutex_exit(&vc->vc_lock);
}
void
vdev_cache_purge(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
vdev_cache_entry_t *ve;
mutex_enter(&vc->vc_lock);
while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
vdev_cache_evict(vc, ve);
mutex_exit(&vc->vc_lock);
}
void
vdev_cache_init(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
sizeof (vdev_cache_entry_t),
offsetof(struct vdev_cache_entry, ve_offset_node));
avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
sizeof (vdev_cache_entry_t),
offsetof(struct vdev_cache_entry, ve_lastused_node));
}
void
vdev_cache_fini(vdev_t *vd)
{
vdev_cache_t *vc = &vd->vdev_cache;
vdev_cache_purge(vd);
avl_destroy(&vc->vc_offset_tree);
avl_destroy(&vc->vc_lastused_tree);
mutex_destroy(&vc->vc_lock);
}
void
vdev_cache_stat_init(void)
{
vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (vdc_ksp != NULL) {
vdc_ksp->ks_data = &vdc_stats;
kstat_install(vdc_ksp);
}
}
void
vdev_cache_stat_fini(void)
{
if (vdc_ksp != NULL) {
kstat_delete(vdc_ksp);
vdc_ksp = NULL;
}
}
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, ZMOD_RW,
"Inflate reads small than max");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD,
"Total size of the per-disk cache");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW,
"Shift size to inflate reads too");

View File

@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio)
indirect_vsd_t *iv = zio->io_vsd;
indirect_split_t *is;
while ((is = list_head(&iv->iv_splits)) != NULL) {
while ((is = list_remove_head(&iv->iv_splits)) != NULL) {
for (int c = 0; c < is->is_children; c++) {
indirect_child_t *ic = &is->is_child[c];
if (ic->ic_data != NULL)
abd_free(ic->ic_data);
}
list_remove(&iv->iv_splits, is);
indirect_child_t *ic;
while ((ic = list_head(&is->is_unique_child)) != NULL)
list_remove(&is->is_unique_child, ic);
while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
;
list_destroy(&is->is_unique_child);
@ -1659,8 +1658,8 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
for (indirect_split_t *is = list_head(&iv->iv_splits);
is != NULL; is = list_next(&iv->iv_splits, is)) {
indirect_child_t *ic;
while ((ic = list_head(&is->is_unique_child)) != NULL)
list_remove(&is->is_unique_child, ic);
while ((ic = list_remove_head(&is->is_unique_child)) != NULL)
;
is->is_unique_children = 0;
}

View File

@ -748,8 +748,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
abd, size, first->io_type, zio->io_priority,
flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
vdev_queue_agg_io_done, NULL);
flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL);
aio->io_timestamp = first->io_timestamp;
nio = first;
@ -907,7 +906,7 @@ vdev_queue_io(zio_t *zio)
ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
}
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
zio->io_timestamp = gethrtime();
mutex_enter(&vq->vq_lock);

View File

@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size)
}
}
__attribute__((always_inline)) inline
static int
mze_compare(const void *arg1, const void *arg2)
{
@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2)
(uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
}
ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
mze_compare)
static void
mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
{
@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
* 62 entries before we have to add 2KB B-tree core node.
*/
zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
sizeof (mzap_ent_t), 512);
mze_find_in_buf, sizeof (mzap_ent_t), 512);
zap_name_t *zn = zap_name_alloc(zap);
for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {

View File

@ -1522,9 +1522,8 @@ zfs_ereport_fini(void)
{
recent_events_node_t *entry;
while ((entry = list_head(&recent_events_list)) != NULL) {
while ((entry = list_remove_head(&recent_events_list)) != NULL) {
avl_remove(&recent_events_tree, entry);
list_remove(&recent_events_list, entry);
kmem_free(entry, sizeof (*entry));
}
avl_destroy(&recent_events_tree);

View File

@ -699,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
zfs_fuid_t *zfuid;
zfs_fuid_domain_t *zdomain;
while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
list_remove(&fuidp->z_fuids, zfuid);
while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL)
kmem_free(zfuid, sizeof (zfs_fuid_t));
}
if (fuidp->z_domain_table != NULL)
kmem_free(fuidp->z_domain_table,
(sizeof (char *)) * fuidp->z_domain_cnt);
while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
list_remove(&fuidp->z_domains, zdomain);
while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL)
kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
}
kmem_free(fuidp, sizeof (zfs_fuid_info_t));
}

View File

@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo)
zfs_onexit_action_node_t *ap;
mutex_enter(&zo->zo_lock);
while ((ap = list_head(&zo->zo_actions)) != NULL) {
list_remove(&zo->zo_actions, ap);
while ((ap = list_remove_head(&zo->zo_actions)) != NULL) {
mutex_exit(&zo->zo_lock);
ap->za_func(ap->za_data);
kmem_free(ap, sizeof (zfs_onexit_action_node_t));

View File

@ -116,8 +116,12 @@ static zil_kstat_values_t zil_stats = {
{ "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
{ "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 },
};
static zil_sums_t zil_sums_global;
@ -146,6 +150,9 @@ static uint64_t zil_slog_bulk = 768 * 1024;
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;
static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
static itx_t *zil_itx_clone(itx_t *oitx);
static int
zil_bp_compare(const void *x1, const void *x2)
{
@ -241,11 +248,10 @@ zil_kstats_global_update(kstat_t *ksp, int rw)
*/
static int
zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
blkptr_t *nbp, void *dst, char **end)
blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
{
zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
zbookmark_phys_t zb;
int error;
@ -262,7 +268,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
&abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
if (error == 0) {
zio_cksum_t cksum = bp->blk_cksum;
@ -277,23 +283,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
*/
cksum.zc_word[ZIL_ZC_SEQ]++;
uint64_t size = BP_GET_LSIZE(bp);
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = abuf->b_data;
zil_chain_t *zilc = (*abuf)->b_data;
char *lr = (char *)(zilc + 1);
uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
zilc->zc_nused < sizeof (*zilc) ||
zilc->zc_nused > size) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
memcpy(dst, lr, len);
*end = (char *)dst + len;
*begin = lr;
*end = lr + zilc->zc_nused - sizeof (*zilc);
*nbp = zilc->zc_next_blk;
}
} else {
char *lr = abuf->b_data;
uint64_t size = BP_GET_LSIZE(bp);
char *lr = (*abuf)->b_data;
zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
@ -301,15 +307,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
ASSERT3U(zilc->zc_nused, <=,
SPA_OLD_MAXBLOCKSIZE);
memcpy(dst, lr, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*begin = lr;
*end = lr + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
}
}
arc_buf_destroy(abuf, &abuf);
}
return (error);
@ -375,8 +377,12 @@ zil_sums_init(zil_sums_t *zs)
wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
}
void
@ -393,8 +399,12 @@ zil_sums_fini(zil_sums_t *zs)
wmsum_fini(&zs->zil_itx_needcopy_bytes);
wmsum_fini(&zs->zil_itx_metaslab_normal_count);
wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
wmsum_fini(&zs->zil_itx_metaslab_normal_write);
wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
wmsum_fini(&zs->zil_itx_metaslab_slog_count);
wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
wmsum_fini(&zs->zil_itx_metaslab_slog_write);
wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
}
void
@ -422,10 +432,18 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
zs->zil_itx_metaslab_normal_bytes.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
zs->zil_itx_metaslab_normal_write.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
zs->zil_itx_metaslab_normal_alloc.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
zs->zil_itx_metaslab_slog_count.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
zs->zil_itx_metaslab_slog_bytes.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
zs->zil_itx_metaslab_slog_write.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
zs->zil_itx_metaslab_slog_alloc.value.ui64 =
wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
}
/*
@ -445,7 +463,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
uint64_t blk_count = 0;
uint64_t lr_count = 0;
blkptr_t blk, next_blk = {{{{0}}}};
char *lrbuf, *lrp;
int error = 0;
/*
@ -463,13 +480,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
int reclen;
char *end = NULL;
char *lrp, *end;
arc_buf_t *abuf = NULL;
if (blk_seq > claim_blk_seq)
break;
@ -485,8 +502,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
break;
error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
lrbuf, &end);
&lrp, &end, &abuf);
if (error != 0) {
if (abuf)
arc_buf_destroy(abuf, &abuf);
if (claimed) {
char name[ZFS_MAX_DATASET_NAME_LEN];
@ -499,7 +518,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
break;
}
for (lrp = lrbuf; lrp < end; lrp += reclen) {
for (; lrp < end; lrp += reclen) {
lr_t *lr = (lr_t *)lrp;
reclen = lr->lrc_reclen;
ASSERT3U(reclen, >=, sizeof (lr_t));
@ -513,6 +532,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
max_lr_seq = lr->lrc_seq;
lr_count++;
}
arc_buf_destroy(abuf, &abuf);
}
done:
zilog->zl_parse_error = error;
@ -522,7 +542,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
zilog->zl_parse_lr_count = lr_count;
zil_bp_tree_fini(zilog);
zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
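The replay loop in zil_parse() above walks a borrowed [begin, end) window of variable-length records, each of which carries its own length in its header, advancing by that length each iteration. A minimal stand-alone sketch of that walking pattern; the record layout, field names and sizes here are illustrative, not the on-disk ZIL format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical record header: every record starts with its total length. */
typedef struct rec_hdr {
	uint32_t reclen;	/* total record size, header included */
	uint32_t type;		/* payload type, opaque to the walker */
} rec_hdr_t;

/* Walk records in [begin, end); stop on a malformed length. */
static int
walk_records(const char *begin, const char *end)
{
	for (const char *p = begin; p < end; ) {
		const rec_hdr_t *hdr = (const rec_hdr_t *)p;
		if ((size_t)(end - p) < sizeof (*hdr) ||
		    hdr->reclen < sizeof (*hdr) ||
		    hdr->reclen > (size_t)(end - p))
			return (-1);	/* corrupt chain, bail out */
		printf("record type %u, %u bytes\n", hdr->type, hdr->reclen);
		p += hdr->reclen;	/* advance by the record's own length */
	}
	return (0);
}

int
main(void)
{
	char buf[64];
	rec_hdr_t a = { .reclen = 24, .type = 1 };
	rec_hdr_t b = { .reclen = 16, .type = 2 };
	memcpy(buf, &a, sizeof (a));
	memcpy(buf + 24, &b, sizeof (b));
	return (walk_records(buf, buf + 40) == 0 ? 0 : 1);
}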
@ -747,20 +766,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
lwb->lwb_blk = *bp;
lwb->lwb_fastwrite = fastwrite;
lwb->lwb_slog = slog;
lwb->lwb_indirect = B_FALSE;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
lwb->lwb_nused = lwb->lwb_nfilled = 0;
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
lwb->lwb_state = LWB_STATE_CLOSED;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
lwb->lwb_write_zio = NULL;
lwb->lwb_root_zio = NULL;
lwb->lwb_issued_timestamp = 0;
lwb->lwb_issued_txg = 0;
if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
lwb->lwb_nused = sizeof (zil_chain_t);
lwb->lwb_sz = BP_GET_LSIZE(bp);
} else {
lwb->lwb_nused = 0;
lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
}
lwb->lwb_max_txg = txg;
mutex_enter(&zilog->zl_lock);
list_insert_tail(&zilog->zl_lwb_list, lwb);
@ -1373,9 +1393,14 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zil_commit_waiter_t *zcw;
itx_t *itx;
uint64_t txg;
list_t itxs, waiters;
spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
list_create(&waiters, sizeof (zil_commit_waiter_t),
offsetof(zil_commit_waiter_t, zcw_node));
hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
mutex_enter(&zilog->zl_lock);
@ -1384,9 +1409,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
lwb->lwb_root_zio = NULL;
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
lwb->lwb_state = LWB_STATE_FLUSH_DONE;
if (zilog->zl_last_lwb_opened == lwb) {
/*
* Remember the highest committed log sequence number
@ -1397,13 +1419,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}
while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
zil_itx_destroy(itx);
list_move_tail(&itxs, &lwb->lwb_itxs);
list_move_tail(&waiters, &lwb->lwb_waiters);
while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
lwb->lwb_state = LWB_STATE_FLUSH_DONE;
mutex_exit(&zilog->zl_lock);
while ((itx = list_remove_head(&itxs)) != NULL)
zil_itx_destroy(itx);
list_destroy(&itxs);
while ((zcw = list_remove_head(&waiters)) != NULL) {
mutex_enter(&zcw->zcw_lock);
ASSERT3P(zcw->zcw_lwb, ==, lwb);
zcw->zcw_lwb = NULL;
/*
* We expect any ZIO errors from child ZIOs to have been
@ -1428,8 +1458,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
mutex_exit(&zcw->zcw_lock);
}
mutex_exit(&zilog->zl_lock);
list_destroy(&waiters);
mutex_enter(&zilog->zl_lwb_io_lock);
txg = lwb->lwb_issued_txg;
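The rewrite of zil_lwb_flush_vdevs_done() above drains lwb_itxs and lwb_waiters onto local lists while zl_lock is held and only destroys the itxs and wakes the waiters after the lock has been dropped. A minimal sketch of that drain-then-process pattern in portable C; the pthread mutex and the hypothetical singly linked node type stand in for zl_lock and the kernel list_t:

#include <pthread.h>
#include <stdlib.h>

typedef struct node {
	struct node *next;
	int payload;
} node_t;

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static node_t *shared_head;	/* protected by 'lock' */

static void
process(node_t *n)
{
	/* Expensive or lock-acquiring work goes here, outside 'lock'. */
	free(n);
}

static void
drain_and_process(void)
{
	node_t *local, *n;

	/* Steal the whole list while holding the lock... */
	pthread_mutex_lock(&lock);
	local = shared_head;
	shared_head = NULL;
	pthread_mutex_unlock(&lock);

	/* ...then walk it with the lock dropped. */
	while ((n = local) != NULL) {
		local = n->next;
		process(n);
	}
}

int
main(void)
{
	for (int i = 0; i < 3; i++) {
		node_t *n = malloc(sizeof (*n));
		n->payload = i;
		n->next = shared_head;
		shared_head = n;
	}
	drain_and_process();
	return (0);
}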
@ -1666,46 +1695,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
if (lwb->lwb_root_zio != NULL)
return;
lwb->lwb_root_zio = zio_root(zilog->zl_spa,
zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
/* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
mutex_enter(&zilog->zl_lock);
if (lwb->lwb_root_zio == NULL) {
abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
BP_GET_LSIZE(&lwb->lwb_blk));
if (!lwb->lwb_fastwrite) {
metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
lwb->lwb_fastwrite = 1;
}
if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
prio = ZIO_PRIORITY_SYNC_WRITE;
else
prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_root_zio = zio_root(zilog->zl_spa,
zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
lwb->lwb_state = LWB_STATE_OPENED;
zil_lwb_set_zio_dependency(zilog, lwb);
zilog->zl_last_lwb_opened = lwb;
if (!lwb->lwb_fastwrite) {
metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
lwb->lwb_fastwrite = 1;
}
mutex_exit(&zilog->zl_lock);
ASSERT3P(lwb->lwb_root_zio, !=, NULL);
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0,
&lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
zil_lwb_write_done, lwb, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb);
lwb->lwb_state = LWB_STATE_OPENED;
zil_lwb_set_zio_dependency(zilog, lwb);
zilog->zl_last_lwb_opened = lwb;
mutex_exit(&zilog->zl_lock);
}
/*
@ -1736,11 +1760,11 @@ static const struct {
static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
* Close the log block for being issued and allocate the next one.
* Has to be called under zl_issuer_lock to chain more lwbs.
*/
static lwb_t *
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb)
{
lwb_t *nlwb = NULL;
zil_chain_t *zilc;
@ -1748,7 +1772,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
blkptr_t *bp;
dmu_tx_t *tx;
uint64_t txg;
uint64_t zil_blksz, wsz;
uint64_t zil_blksz;
int i, error;
boolean_t slog;
@ -1757,16 +1781,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
ASSERT3P(lwb->lwb_write_zio, !=, NULL);
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
zilc = (zil_chain_t *)lwb->lwb_buf;
bp = &zilc->zc_next_blk;
} else {
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
bp = &zilc->zc_next_blk;
/*
* If this lwb includes indirect writes, we have to commit before
* creating the transaction, otherwise we may end up in a deadlock.
*/
if (lwb->lwb_indirect) {
for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
}
ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
/*
* Allocate the next block and save its address in this block
* before writing it in order to establish the log chain.
@ -1814,19 +1839,18 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
uint64_t, zil_blksz,
uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2)
zilc = (zil_chain_t *)lwb->lwb_buf;
else
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
bp = &zilc->zc_next_blk;
BP_ZERO(bp);
error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
if (slog) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
} else {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
lwb->lwb_nused);
}
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@ -1838,17 +1862,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
}
lwb->lwb_state = LWB_STATE_ISSUED;
dmu_tx_commit(tx);
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
}
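zil_lwb_write_close() above picks the next log block size from a short history of recent sizes: it records the latest demand into a small ring (zl_prev_blks), takes the maximum over the ring, and rounds up to an allowed size. A stand-alone sketch of that smoothing heuristic, assuming an illustrative ring depth and bucket table rather than the real ZIL values:

#include <stdint.h>
#include <stdio.h>

#define	PREV_BLKS	16	/* must be a power of two for the mask below */

static uint64_t prev_blks[PREV_BLKS];
static int prev_rotor;

/* Allowed sizes, smallest to largest; 0 terminates the table. */
static const uint64_t buckets[] = { 4096, 8192, 32768, 131072, 0 };

static uint64_t
next_block_size(uint64_t wanted)
{
	uint64_t sz = wanted;

	/* Remember this request and take the max over recent history. */
	prev_blks[prev_rotor] = wanted;
	for (int i = 0; i < PREV_BLKS; i++)
		if (prev_blks[i] > sz)
			sz = prev_blks[i];
	prev_rotor = (prev_rotor + 1) & (PREV_BLKS - 1);

	/* Round up to the first bucket that fits; clamp to the largest. */
	for (int i = 0; buckets[i] != 0; i++)
		if (sz <= buckets[i])
			return (buckets[i]);
	return (buckets[sizeof (buckets) / sizeof (buckets[0]) - 2]);
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)next_block_size(6000));
	printf("%llu\n", (unsigned long long)next_block_size(1000));
	return (0);
}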
/*
* Finalize previously closed block and issue the write zio.
* Does not require locking.
*/
static void
zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
{
zil_chain_t *zilc;
int wsz;
/* Actually fill the lwb with the data if not yet. */
if (!lwb->lwb_indirect) {
for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
itx = list_next(&lwb->lwb_itxs, itx))
zil_lwb_commit(zilog, lwb, itx);
lwb->lwb_nused = lwb->lwb_nfilled;
}
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
/* For Slim ZIL only write what is used. */
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
ASSERT3U(wsz, <=, lwb->lwb_sz);
wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int);
ASSERT3S(wsz, <=, lwb->lwb_sz);
zio_shrink(lwb->lwb_write_zio, wsz);
wsz = lwb->lwb_write_zio->io_size;
zilc = (zil_chain_t *)lwb->lwb_buf;
} else {
wsz = lwb->lwb_sz;
zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
}
zilc->zc_pad = 0;
zilc->zc_nused = lwb->lwb_nused;
zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
@ -1858,22 +1912,28 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
*/
memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
if (lwb->lwb_slog) {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
lwb->lwb_nused);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
wsz);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
BP_GET_LSIZE(&lwb->lwb_blk));
} else {
ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
lwb->lwb_nused);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
wsz);
ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
BP_GET_LSIZE(&lwb->lwb_blk));
}
spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
zil_lwb_add_block(lwb, &lwb->lwb_blk);
lwb->lwb_issued_timestamp = gethrtime();
lwb->lwb_state = LWB_STATE_ISSUED;
zio_nowait(lwb->lwb_root_zio);
zio_nowait(lwb->lwb_write_zio);
dmu_tx_commit(tx);
/*
* If there was an allocation failure then nlwb will be null which
* forces a txg_wait_synced().
*/
return (nlwb);
}
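For Slim ZIL blocks, zil_lwb_write_issue() above shrinks the write to the used portion rounded up to ZIL_MIN_BLKSZ and zeroes the padding so stale buffer contents never reach disk. A tiny sketch of that round-up-and-pad step; the 512-byte granularity and the names here are assumptions for illustration only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	MIN_BLKSZ	512	/* illustrative write granularity */

/* Round 'x' up to the next multiple of the power-of-two 'align'. */
static uint64_t
roundup_p2(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	char buf[4096];
	uint64_t nused = 1300;			/* bytes of real records */
	uint64_t wsz = roundup_p2(nused, MIN_BLKSZ);

	/* Zero the tail so stale bytes never hit the disk. */
	memset(buf + nused, 0, wsz - nused);
	printf("writing %llu of %zu bytes\n",
	    (unsigned long long)wsz, sizeof (buf));
	return (0);
}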
/*
@ -1909,13 +1969,19 @@ zil_max_copied_data(zilog_t *zilog)
sizeof (lr_write_t));
}
/*
* Estimate space needed in the lwb for the itx. Allocate more lwbs or
* split the itx as needed, but don't touch the actual transaction data.
* Has to be called under zl_issuer_lock to call zil_lwb_write_close()
* to chain more lwbs.
*/
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
{
lr_t *lrcb, *lrc;
lr_write_t *lrwb, *lrw;
char *lr_buf;
uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data;
itx_t *citx;
lr_t *lr, *clr;
lr_write_t *lrw;
uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
ASSERT3P(lwb, !=, NULL);
@ -1923,8 +1989,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
zil_lwb_write_open(zilog, lwb);
lrc = &itx->itx_lr;
lrw = (lr_write_t *)lrc;
lr = &itx->itx_lr;
lrw = (lr_write_t *)lr;
/*
* A commit itx doesn't represent any on-disk state; instead
@ -1938,24 +2004,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
*
* For more details, see the comment above zil_commit().
*/
if (lrc->lrc_txtype == TX_COMMIT) {
if (lr->lrc_txtype == TX_COMMIT) {
mutex_enter(&zilog->zl_lock);
zil_commit_waiter_link_lwb(itx->itx_private, lwb);
itx->itx_private = NULL;
mutex_exit(&zilog->zl_lock);
list_insert_tail(&lwb->lwb_itxs, itx);
return (lwb);
}
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
dpad = dlen - lrw->lr_length;
} else {
dlen = dpad = 0;
dlen = 0;
}
reclen = lrc->lrc_reclen;
reclen = lr->lrc_reclen;
zilog->zl_cur_used += (reclen + dlen);
txg = lrc->lrc_txg;
cont:
/*
@ -1968,7 +2033,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
lwb_sp < zil_max_waste_space(zilog) &&
(dlen % max_log_data == 0 ||
lwb_sp < reclen + dlen % max_log_data))) {
lwb = zil_lwb_write_issue(zilog, lwb);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb);
if (lwb == NULL)
return (NULL);
zil_lwb_write_open(zilog, lwb);
@ -1987,19 +2053,99 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
}
dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused;
memcpy(lr_buf, lrc, reclen);
lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
if (dlen > dnow) {
ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
citx = zil_itx_clone(itx);
clr = &citx->itx_lr;
lr_write_t *clrw = (lr_write_t *)clr;
clrw->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
} else {
citx = itx;
clr = lr;
}
/*
* We're actually making an entry, so update lrc_seq to be the
* log record sequence number. Note that this is generally not
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
clr->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
zil_lwb_add_txg(lwb, lr->lrc_txg);
list_insert_tail(&lwb->lwb_itxs, citx);
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
goto cont;
}
/*
* We really have to issue all queued LWBs before we may have to
* wait for a txg sync. Otherwise we may end up in a deadlock.
*/
if (lr->lrc_txtype == TX_WRITE) {
boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa);
if (frozen || itx->itx_wr_state == WR_INDIRECT) {
lwb_t *tlwb;
while ((tlwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, tlwb);
}
if (itx->itx_wr_state == WR_INDIRECT)
lwb->lwb_indirect = B_TRUE;
if (frozen)
txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
}
return (lwb);
}
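When a WR_NEED_COPY record is larger than the space left in the current block, zil_lwb_assign() above clones the itx, trims the clone to what fits, and advances the original's offset and length before looping to the next block. A minimal sketch of that split-and-carry idea on a hypothetical write descriptor (the struct and names are illustrative, only loosely modeled on lr_write_t):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical write descriptor. */
typedef struct wr {
	uint64_t offset;	/* file offset of the data */
	uint64_t length;	/* bytes still to log */
} wr_t;

/*
 * Assign 'w' into blocks of 'blk_space' bytes each; emit one entry per
 * block and carry the remainder forward until nothing is left.
 */
static void
assign_split(wr_t *w, uint64_t blk_space)
{
	while (w->length > 0) {
		uint64_t now = w->length < blk_space ? w->length : blk_space;
		wr_t clone = *w;	/* the piece that goes in this block */
		clone.length = now;
		printf("block entry: offset %llu, length %llu\n",
		    (unsigned long long)clone.offset,
		    (unsigned long long)clone.length);
		w->offset += now;	/* carry the rest to the next block */
		w->length -= now;
	}
}

int
main(void)
{
	wr_t w = { .offset = 0, .length = 300000 };
	assign_split(&w, 131072);
	return (0);
}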
/*
* Fill the actual transaction data into the lwb, following zil_lwb_assign().
* Does not require locking.
*/
static void
zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
{
lr_t *lr, *lrb;
lr_write_t *lrw, *lrwb;
char *lr_buf;
uint64_t dlen, reclen;
lr = &itx->itx_lr;
lrw = (lr_write_t *)lr;
if (lr->lrc_txtype == TX_COMMIT)
return;
if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
dlen = P2ROUNDUP_TYPED(
lrw->lr_length, sizeof (uint64_t), uint64_t);
} else {
dlen = 0;
}
reclen = lr->lrc_reclen;
ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
memcpy(lr_buf, lr, reclen);
lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */
lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */
ZIL_STAT_BUMP(zilog, zil_itx_count);
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
*/
if (lrc->lrc_txtype == TX_WRITE) {
if (txg > spa_freeze_txg(zilog->zl_spa))
txg_wait_synced(zilog->zl_dmu_pool, txg);
if (lr->lrc_txtype == TX_WRITE) {
if (itx->itx_wr_state == WR_COPIED) {
ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
@ -2010,14 +2156,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
lrcb->lrc_reclen += dnow;
if (lrwb->lr_length > dnow)
lrwb->lr_length = dnow;
lrw->lr_offset += dnow;
lrw->lr_length -= dnow;
lrb->lrc_reclen += dlen;
ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
dnow);
dlen);
} else {
ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
dbuf = NULL;
@ -2044,9 +2186,11 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
error = zilog->zl_get_data(itx->itx_private,
itx->itx_gen, lrwb, dbuf, lwb,
lwb->lwb_write_zio);
if (dbuf != NULL && error == 0 && dnow == dlen)
if (dbuf != NULL && error == 0) {
/* Zero any padding bytes in the last block. */
memset((char *)dbuf + lrwb->lr_length, 0, dpad);
memset((char *)dbuf + lrwb->lr_length, 0,
dlen - lrwb->lr_length);
}
/*
* Typically, the only return values we should see from
@ -2074,39 +2218,26 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
error);
zfs_fallthrough;
case EIO:
txg_wait_synced(zilog->zl_dmu_pool, txg);
if (lwb->lwb_indirect) {
txg_wait_synced(zilog->zl_dmu_pool,
lr->lrc_txg);
} else {
lwb->lwb_write_zio->io_error = error;
}
zfs_fallthrough;
case ENOENT:
zfs_fallthrough;
case EEXIST:
zfs_fallthrough;
case EALREADY:
return (lwb);
return;
}
}
}
/*
* We're actually making an entry, so update lrc_seq to be the
* log record sequence number. Note that this is generally not
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
lrcb->lrc_seq = ++zilog->zl_lr_seq;
lwb->lwb_nused += reclen + dnow;
zil_lwb_add_txg(lwb, txg);
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
dlen -= dnow;
if (dlen > 0) {
zilog->zl_cur_used += reclen;
goto cont;
}
return (lwb);
lwb->lwb_nfilled += reclen + dlen;
ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
}
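The assign/commit split above relies on two cursors per block: lwb_nused reserves space up front under zl_issuer_lock, while lwb_nfilled tracks how much of that reservation has actually been copied in later, without the lock. A small sketch of that reserve-then-fill bookkeeping; the buffer size and names are illustrative assumptions:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define	BLK_SZ	4096

typedef struct blk {
	char	buf[BLK_SZ];
	size_t	nused;		/* space handed out so far (reservation) */
	size_t	nfilled;	/* bytes actually copied so far */
} blk_t;

/* Phase 1: reserve 'len' bytes; done while holding the writer's lock. */
static int
blk_reserve(blk_t *b, size_t len)
{
	if (b->nused + len > BLK_SZ)
		return (-1);	/* does not fit, caller opens a new block */
	b->nused += len;
	return (0);
}

/* Phase 2: copy payloads in reservation order; no lock required. */
static void
blk_fill(blk_t *b, const void *data, size_t len)
{
	assert(b->nfilled + len <= b->nused);
	memcpy(b->buf + b->nfilled, data, len);
	b->nfilled += len;
}

int
main(void)
{
	blk_t b = { .nused = 0, .nfilled = 0 };
	assert(blk_reserve(&b, 8) == 0);
	assert(blk_reserve(&b, 16) == 0);
	blk_fill(&b, "reserved", 8);
	blk_fill(&b, "then-filled-here", 16);
	assert(b.nfilled == b.nused);
	return (0);
}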
itx_t *
@ -2131,6 +2262,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize)
return (itx);
}
static itx_t *
zil_itx_clone(itx_t *oitx)
{
itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
memcpy(itx, oitx, oitx->itx_size);
itx->itx_callback = NULL;
itx->itx_callback_data = NULL;
return (itx);
}
void
zil_itx_destroy(itx_t *itx)
{
@ -2162,7 +2303,7 @@ zil_itxg_clean(void *arg)
/*
* In the general case, commit itxs will not be found
* here, as they'll be committed to an lwb via
* zil_lwb_commit(), and free'd in that function. Having
* zil_lwb_assign(), and free'd in that function. Having
* said that, it is still possible for commit itxs to be
* found here, due to the following race:
*
@ -2561,7 +2702,7 @@ zil_commit_writer_stall(zilog_t *zilog)
* lwb will be issued to the zio layer to be written to disk.
*/
static void
zil_process_commit_list(zilog_t *zilog)
zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
{
spa_t *spa = zilog->zl_spa;
list_t nolwb_itxs;
@ -2663,18 +2804,23 @@ zil_process_commit_list(zilog_t *zilog)
*/
if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
if (lwb != NULL) {
lwb = zil_lwb_commit(zilog, itx, lwb);
if (lwb == NULL)
lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
if (lwb == NULL) {
list_insert_tail(&nolwb_itxs, itx);
else
list_insert_tail(&lwb->lwb_itxs, itx);
} else if ((zcw->zcw_lwb != NULL &&
zcw->zcw_lwb != lwb) || zcw->zcw_done) {
/*
* Our lwb is done, leave the rest of
* the itx list to somebody else who cares.
*/
first = B_FALSE;
break;
}
} else {
if (lrc->lrc_txtype == TX_COMMIT) {
zil_commit_waiter_link_nolwb(
itx->itx_private, &nolwb_waiters);
}
list_insert_tail(&nolwb_itxs, itx);
}
} else {
@ -2690,6 +2836,8 @@ zil_process_commit_list(zilog_t *zilog)
* the ZIL write pipeline; see the comment within
* zil_commit_writer_stall() for more details.
*/
while ((lwb = list_remove_head(ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
/*
@ -2735,13 +2883,13 @@ zil_process_commit_list(zilog_t *zilog)
* on the system, such that this function will be
* immediately called again (not necessarily by the same
* thread) and this lwb's zio will be issued via
* zil_lwb_commit(). This way, the lwb is guaranteed to
* zil_lwb_assign(). This way, the lwb is guaranteed to
* be "full" when it is issued to disk, and we'll make
* use of the lwb's size the best we can.
*
* 2. If there isn't sufficient ZIL activity occurring on
* the system, such that this lwb's zio isn't issued via
* zil_lwb_commit(), zil_commit_waiter() will issue the
* zil_lwb_assign(), zil_commit_waiter() will issue the
* lwb's zio. If this occurs, the lwb is not guaranteed
* to be "full" by the time its zio is issued, and means
* the size of the lwb was "too large" given the amount
@ -2773,10 +2921,15 @@ zil_process_commit_list(zilog_t *zilog)
zfs_commit_timeout_pct / 100;
if (sleep < zil_min_commit_timeout ||
lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) {
lwb = zil_lwb_write_issue(zilog, lwb);
list_insert_tail(ilwbs, lwb);
lwb = zil_lwb_write_close(zilog, lwb);
zilog->zl_cur_used = 0;
if (lwb == NULL)
if (lwb == NULL) {
while ((lwb = list_remove_head(ilwbs))
!= NULL)
zil_lwb_write_issue(zilog, lwb);
zil_commit_writer_stall(zilog);
}
}
}
}
@ -2799,9 +2952,13 @@ zil_process_commit_list(zilog_t *zilog)
static void
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
list_t ilwbs;
lwb_t *lwb;
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa));
list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
mutex_enter(&zilog->zl_issuer_lock);
if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
@ -2828,10 +2985,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
zil_get_commit_list(zilog);
zil_prune_commit_list(zilog);
zil_process_commit_list(zilog);
zil_process_commit_list(zilog, zcw, &ilwbs);
out:
mutex_exit(&zilog->zl_issuer_lock);
while ((lwb = list_remove_head(&ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
list_destroy(&ilwbs);
}
static void
@ -2858,7 +3018,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
return;
/*
* In order to call zil_lwb_write_issue() we must hold the
* In order to call zil_lwb_write_close() we must hold the
* zilog's "zl_issuer_lock". We can't simply acquire that lock,
* since we're already holding the commit waiter's "zcw_lock",
* and those two locks are acquired in the opposite order
@ -2876,8 +3036,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* the waiter is marked "done"), so without this check we could
* wind up with a use-after-free error below.
*/
if (zcw->zcw_done)
if (zcw->zcw_done) {
lwb = NULL;
goto out;
}
ASSERT3P(lwb, ==, zcw->zcw_lwb);
@ -2896,15 +3058,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* if it's ISSUED or OPENED, and block any other threads that might
* attempt to issue this lwb. For that reason we hold the
* zl_issuer_lock when checking the lwb_state; we must not call
* zil_lwb_write_issue() if the lwb had already been issued.
* zil_lwb_write_close() if the lwb had already been issued.
*
* See the comment above the lwb_state_t structure definition for
* more details on the lwb states, and locking requirements.
*/
if (lwb->lwb_state == LWB_STATE_ISSUED ||
lwb->lwb_state == LWB_STATE_WRITE_DONE ||
lwb->lwb_state == LWB_STATE_FLUSH_DONE)
lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
lwb = NULL;
goto out;
}
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
@ -2914,7 +3078,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* since we've reached the commit waiter's timeout and it still
* hasn't been issued.
*/
lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
lwb_t *nlwb = zil_lwb_write_close(zilog, lwb);
ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
@ -2934,7 +3098,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
if (nlwb == NULL) {
/*
* When zil_lwb_write_issue() returns NULL, this
* When zil_lwb_write_close() returns NULL, this
* indicates zio_alloc_zil() failed to allocate the
* "next" lwb on-disk. When this occurs, the ZIL write
* pipeline must be stalled; see the comment within the
@ -2956,12 +3120,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* lock, which occurs prior to calling dmu_tx_commit()
*/
mutex_exit(&zcw->zcw_lock);
zil_lwb_write_issue(zilog, lwb);
lwb = NULL;
zil_commit_writer_stall(zilog);
mutex_enter(&zcw->zcw_lock);
}
out:
mutex_exit(&zilog->zl_issuer_lock);
if (lwb)
zil_lwb_write_issue(zilog, lwb);
ASSERT(MUTEX_HELD(&zcw->zcw_lock));
}
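zil_commit_waiter_timeout() above needs zl_issuer_lock while already holding zcw_lock, but the documented order is issuer lock first; it therefore drops zcw_lock, takes the issuer lock, re-takes zcw_lock, and revalidates the waiter state before acting. A compact sketch of that drop-and-revalidate pattern with two pthread mutexes; the names and the boolean flag are illustrative assumptions:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t issuer_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t waiter_lock = PTHREAD_MUTEX_INITIALIZER;
static bool waiter_done;	/* protected by waiter_lock */

/*
 * Called with waiter_lock held.  The lock order is issuer_lock before
 * waiter_lock, so drop, reacquire in order, and revalidate.
 */
static void
timeout_path(void)
{
	pthread_mutex_unlock(&waiter_lock);
	pthread_mutex_lock(&issuer_lock);
	pthread_mutex_lock(&waiter_lock);

	/* The world may have changed while neither lock was held. */
	if (waiter_done) {
		pthread_mutex_unlock(&issuer_lock);
		return;	/* still holding waiter_lock, as the caller expects */
	}

	/* ... safe to act here with both locks held, in the right order ... */
	pthread_mutex_unlock(&issuer_lock);
}

int
main(void)
{
	pthread_mutex_lock(&waiter_lock);
	timeout_path();
	pthread_mutex_unlock(&waiter_lock);
	return (0);
}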
@ -2976,7 +3144,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
* waited "long enough" and the lwb is still in the "open" state.
*
* Given a sufficient amount of itxs being generated and written using
* the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
* the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
* function. If this does not occur, this secondary responsibility will
* ensure the lwb is issued even if there is no other synchronous
* activity on the system.
@ -3656,7 +3824,7 @@ zil_close(zilog_t *zilog)
/*
* zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
* on the time when the dmu_tx transaction is assigned in
* zil_lwb_write_issue().
* zil_lwb_write_close().
*/
mutex_enter(&zilog->zl_lwb_io_lock);
txg = MAX(zilog->zl_lwb_max_issued_txg, txg);

View File

@ -1617,12 +1617,6 @@ zio_read_bp_init(zio_t *zio)
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
}
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
@ -3955,9 +3949,6 @@ zio_vdev_io_start(zio_t *zio)
zio->io_type == ZIO_TYPE_WRITE ||
zio->io_type == ZIO_TYPE_TRIM)) {
if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
return (zio);
if ((zio = vdev_queue_io(zio)) == NULL)
return (NULL);
@ -3994,9 +3985,6 @@ zio_vdev_io_done(zio_t *zio)
vd->vdev_ops != &vdev_draid_spare_ops) {
vdev_queue_io_done(zio);
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_device_injections(vd, zio,
EIO, EILSEQ);
@ -4106,8 +4094,7 @@ zio_vdev_io_assess(zio_t *zio)
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
zio->io_error = 0;
zio->io_flags |= ZIO_FLAG_IO_RETRY |
ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE;
zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
zio_requeue_io_start_cut_in_line);

View File

@ -1203,8 +1203,7 @@ zvol_create_minors_recursive(const char *name)
* Prefetch is completed, we can do zvol_os_create_minor
* sequentially.
*/
while ((job = list_head(&minors_list)) != NULL) {
list_remove(&minors_list, job);
while ((job = list_remove_head(&minors_list)) != NULL) {
if (!job->error)
(void) zvol_os_create_minor(job->name);
kmem_strfree(job->name);
@ -1311,10 +1310,8 @@ zvol_remove_minors_impl(const char *name)
rw_exit(&zvol_state_lock);
/* Drop zvol_state_lock before calling zvol_free() */
while ((zv = list_head(&free_list)) != NULL) {
list_remove(&free_list, zv);
while ((zv = list_remove_head(&free_list)) != NULL)
zvol_os_free(zv);
}
}
/* Remove minor for this specific volume only */

View File

@ -128,7 +128,7 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2']
'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
pre =
post =
tags = ['functional', 'cli_root', 'zdb']
@ -472,7 +472,8 @@ tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
tags = ['functional', 'cli_root', 'zpool_replace']
[tests/functional/cli_root/zpool_resilver]
tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart']
tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
'zpool_resilver_concurrent']
tags = ['functional', 'cli_root', 'zpool_resilver']
[tests/functional/cli_root/zpool_scrub]

View File

@ -25,3 +25,8 @@ tags = ['functional']
[tests/functional/cli_root/zfs_jail:FreeBSD]
tests = ['zfs_jail_001_pos']
tags = ['functional', 'cli_root', 'zfs_jail']
[tests/functional/pam:FreeBSD]
tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
'pam_short_password']
tags = ['functional', 'pam']

View File

@ -140,7 +140,8 @@ tests = ['umount_unlinked_drain']
tags = ['functional', 'mount']
[tests/functional/pam:Linux]
tests = ['pam_basic', 'pam_nounmount', 'pam_short_password']
tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive',
'pam_short_password']
tags = ['functional', 'pam']
[tests/functional/procfs:Linux]

View File

@ -152,6 +152,7 @@ known = {
['FAIL', rewind_reason],
'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053],
'privilege/setup': ['SKIP', na_reason],
'refreserv/refreserv_004_pos': ['FAIL', known_reason],
'rootpool/setup': ['SKIP', na_reason],
@ -163,6 +164,8 @@ if sys.platform.startswith('freebsd'):
known.update({
'cli_root/zfs_receive/receive-o-x_props_override':
['FAIL', known_reason],
'cli_root/zpool_resilver/zpool_resilver_concurrent':
['SKIP', na_reason],
'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
@ -277,6 +280,8 @@ elif sys.platform.startswith('linux'):
'mmp/mmp_inactive_import': ['FAIL', known_reason],
'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621],
'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason],
'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872],
'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872],
'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason],

View File

@ -501,7 +501,7 @@ main(int argc, char *argv[])
srandom(seed);
zfs_btree_init();
zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
/*
* This runs the named negative test. None of them should

View File

@ -572,6 +572,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zdb/zdb_006_pos.ksh \
functional/cli_root/zdb/zdb_args_neg.ksh \
functional/cli_root/zdb/zdb_args_pos.ksh \
functional/cli_root/zdb/zdb_backup.ksh \
functional/cli_root/zdb/zdb_block_size_histogram.ksh \
functional/cli_root/zdb/zdb_checksum.ksh \
functional/cli_root/zdb/zdb_decompress.ksh \
@ -1142,6 +1143,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_resilver/setup.ksh \
functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
functional/cli_root/zpool_scrub/cleanup.ksh \
functional/cli_root/zpool_scrub/setup.ksh \
functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \

View File

@ -0,0 +1,55 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2023, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
write_count=8
blksize=131072
tmpfile=$TEST_BASE_DIR/tmpfile
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
rm $tmpfile.1 $tmpfile.2
}
log_onexit cleanup
log_assert "Verify that zfs send and zdb -B produce the same stream"
verify_runnable "global"
verify_disk_count "$DISKS" 2
default_mirror_setup_noexit $DISKS
file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count
snap=$TESTPOOL/$TESTFS@snap
log_must zfs snapshot $snap
typeset -i objsetid=$(zfs get -Ho value objsetid $snap)
sync_pool $TESTPOOL
log_must eval "zfs send -ecL $snap > $tmpfile.1"
log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2"
typeset sum1=$(cat $tmpfile.1 | md5sum)
typeset sum2=$(cat $tmpfile.2 | md5sum)
log_must test "$sum1" = "$sum2"
log_pass "zfs send and zdb -B produce the same stream"

View File

@ -0,0 +1,101 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib
#
# DESCRIPTION:
# Verify 'zpool clear' doesn't cause concurrent resilvers
#
# STRATEGY:
# 1. Create N(10) virtual disk files.
# 2. Create draid pool based on the virtual disk files.
# 3. Fill the filesystem with directories and files.
# 4. Force-fault 2 vdevs and verify distributed spare is kicked in.
# 5. Free the distributed spare by replacing the faulty drive.
# 6. Run zpool clear and verify that it does not initiate 2 resilvers
# concurrently while distributed spare gets kicked in.
#
verify_runnable "global"
typeset -ir devs=10
typeset -ir nparity=1
typeset -ir ndata=8
typeset -ir dspare=1
function cleanup
{
poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL"
for i in {0..$devs}; do
log_must rm -f "$BASEDIR/vdev$i"
done
for dir in $BASEDIR; do
if [[ -d $dir ]]; then
log_must rm -rf $dir
fi
done
zed_stop
zed_cleanup
}
log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers"
log_onexit cleanup
setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs
# ZED needed for sequential resilver
zed_setup
log_must zed_start
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5
log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
log_must zpool wait -t resilver $TESTPOOL
log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6
log_must zpool labelclear -f $BASEDIR/vdev5
log_must zpool labelclear -f $BASEDIR/vdev6
log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5
sync_pool $TESTPOOL
log_must zpool events -c
log_must zpool clear $TESTPOOL
log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60
log_must zpool wait -t resilver $TESTPOOL
log_must zpool wait -t scrub $TESTPOOL
nof_resilver=$(zpool events | grep -c resilver_start)
if [ $nof_resilver = 1 ] ; then
log_must verify_pool $TESTPOOL
log_pass "zpool clear on draid pool doesn't cause concurrent resilvers"
else
log_fail "FAIL: sequential and healing resilver initiated concurrently"
fi

View File

@ -25,7 +25,7 @@
is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing"
set -A args "" "-s \",\"" "-v" \
"-f time,zcwc,zimnb,zimsb"
"-f time,cwc,imnb,imsb"
log_assert "zilstat generates output and doesn't return an error code"

View File

@ -25,5 +25,6 @@
rmconfig
destroy_pool $TESTPOOL
del_user ${username}
del_user ${username}rec
del_group pamtestgroup
log_must rm -rf "$runstatedir" $TESTDIRS

View File

@ -0,0 +1,55 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/tests/functional/pam/utilities.kshlib
if [ -n "$ASAN_OPTIONS" ]; then
export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
fi
log_mustnot ismounted "$TESTPOOL/pam/${username}"
keystatus unavailable
genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}"
printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok
log_mustnot ismounted "$TESTPOOL/pam/${username}"
keystatus unavailable
echo "secondpass" | pamtester ${pamservice} ${username} open_session
references 1
log_must ismounted "$TESTPOOL/pam/${username}"
keystatus available
printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok
log_must ismounted "$TESTPOOL/pam/${username}"
keystatus available
log_must pamtester ${pamservice} ${username} close_session
references 0
log_mustnot ismounted "$TESTPOOL/pam/${username}"
keystatus unavailable
log_pass "done."

View File

@ -0,0 +1,72 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
. $STF_SUITE/tests/functional/pam/utilities.kshlib
if [ -n "$ASAN_OPTIONS" ]; then
export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}')
fi
username="${username}rec"
# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests,
# and a user which references that mountpoint
log_must zfs create "$TESTPOOL/pampam"
log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam"
echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \
-o keylocation=prompt "$TESTPOOL/pampam/pam/${username}"
log_must zfs unmount "$TESTPOOL/pampam/pam/${username}"
log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}"
log_must add_user pamtestgroup ${username} "$TESTDIR/rec"
function keystatus {
log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ]
}
log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
keystatus unavailable
function test_session {
echo "recurpass" | pamtester ${pamservice} ${username} open_session
references 1
log_must ismounted "$TESTPOOL/pampam/pam/${username}"
keystatus available
log_must pamtester ${pamservice} ${username} close_session
references 0
log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}"
keystatus unavailable
}
genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}"
test_session
genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}"
test_session
genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}"
test_session
genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}"
test_session
log_pass "done."

View File

@ -52,7 +52,7 @@ log_must ismounted "$TESTPOOL/pam/${username}"
keystatus available
# Change user and dataset password to short one.
printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok
printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok
# Unmount and unload key.
log_must pamtester ${pamservice} ${username} close_session

View File

@ -38,6 +38,8 @@
verify_runnable "global"
log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053"
function test_cleanup
{
# reset memory limit to 16M

View File

@ -45,6 +45,15 @@ fi
if ! is_linux ; then
log_unsupported "Only linux supports dd with oflag=dsync for FUA writes"
else
if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi
# Disabled for the CentOS 9 kernel
if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi
fi
typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)"

View File

@ -44,6 +44,15 @@
verify_runnable "global"
if is_linux ; then
if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi
# Disabled for the CentOS 9 kernel
if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then
log_unsupported "Disabled while issue #14872 is being worked"
fi
# We need '--force' here since the prior tests may leave a filesystem
# on the zvol, and blkdiscard will see that filesystem and print a
# warning unless you force it.
@ -123,7 +132,6 @@ log_must zfs set compression=off $TESTPOOL/$TESTVOL
# Remove old data from previous tests
log_must $trimcmd $zvolpath
set_blk_mq 1
log_must_busy zpool export $TESTPOOL
log_must zpool import $TESTPOOL

View File

@ -295,7 +295,6 @@ SRCS+= abd.c \
uberblock.c \
unique.c \
vdev.c \
vdev_cache.c \
vdev_draid.c \
vdev_draid_rand.c \
vdev_indirect.c \

View File

@ -653,6 +653,9 @@
/* qat is enabled and existed */
/* #undef HAVE_QAT */
/* struct reclaim_state has reclaimed */
/* #undef HAVE_RECLAIM_STATE_RECLAIMED */
/* register_shrinker is vararg */
/* #undef HAVE_REGISTER_SHRINKER_VARARG */
@ -1048,7 +1051,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gad0a55461"
#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gfeff9dfed"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -1057,7 +1060,7 @@
/* #undef ZFS_META_DATA */
/* Define the maximum compatible kernel version. */
#define ZFS_META_KVER_MAX "6.2"
#define ZFS_META_KVER_MAX "6.3"
/* Define the minimum compatible kernel version. */
#define ZFS_META_KVER_MIN "3.10"
@ -1078,7 +1081,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "FreeBSD_gad0a55461"
#define ZFS_META_RELEASE "FreeBSD_gfeff9dfed"
/* Define the project version. */
#define ZFS_META_VERSION "2.1.99"

View File

@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.1.99-1955-gad0a55461"
#define ZFS_META_GITREV "zfs-2.1.99-1993-gfeff9dfed"