diff --git a/cddl/lib/libzpool/Makefile b/cddl/lib/libzpool/Makefile index 5dbeb9cb1721..217159611e97 100644 --- a/cddl/lib/libzpool/Makefile +++ b/cddl/lib/libzpool/Makefile @@ -135,7 +135,6 @@ KERNEL_C = \ uberblock.c \ unique.c \ vdev.c \ - vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_file.c \ diff --git a/sys/conf/files b/sys/conf/files index a5f55b49451c..726c4c7c8b2c 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -326,7 +326,6 @@ contrib/openzfs/module/zfs/txg.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/uberblock.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/unique.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev.c optional zfs compile-with "${ZFS_C}" -contrib/openzfs/module/zfs/vdev_cache.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_draid.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_draid_rand.c optional zfs compile-with "${ZFS_C}" contrib/openzfs/module/zfs/vdev_indirect.c optional zfs compile-with "${ZFS_C}" diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 8779e512f7be..e4b476aff112 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.2 +Linux-Maximum: 6.3 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary index 5d10e903fcba..426e0207052d 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -64,7 +64,6 @@ SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats - 'vdev': 'vdev_cache_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} @@ -90,8 +89,6 @@ if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl - VDEV_CACHE_SIZE = 'vdev.cache_size' - def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE @@ -135,8 +132,6 @@ elif sys.platform.startswith('linux'): SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' - VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' - def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: @@ -842,7 +837,8 @@ def section_l2arc(kstats_dict): ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), - ('I/O errors:', 'l2_io_error')) + ('Read errors:', 'l2_io_error'), + ('Write errors:', 'l2_writes_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) @@ -878,28 +874,20 @@ def section_l2arc(kstats_dict): prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) - prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() - print('L2ARC writes:') - - if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: - prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) - prt_i2('Done ratio:', - f_perc(arc_stats['l2_writes_done'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_done'])) - prt_i2('Error ratio:', - f_perc(arc_stats['l2_writes_error'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_error'])) - else: - prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + print('L2ARC I/O:') + prt_i2('Reads:', + f_bytes(arc_stats['l2_read_bytes']), + f_hits(arc_stats['l2_hits'])) + prt_i2('Writes:', + f_bytes(arc_stats['l2_write_bytes']), + f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') - prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) - prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) + prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) + prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) print() @@ -959,35 +947,6 @@ def section_tunables(*_): print() -def section_vdev(kstats_dict): - """Collect information on VDEV caches""" - - # Currently [Nov 2017] the VDEV cache is disabled, because it is actually - # harmful. When this is the case, we just skip the whole entry. See - # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c - # for details - tunables = get_vdev_params() - - if tunables[VDEV_CACHE_SIZE] == '0': - print('VDEV cache disabled, skipping section\n') - return - - vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) - - vdev_cache_total = int(vdev_stats['hits']) +\ - int(vdev_stats['misses']) +\ - int(vdev_stats['delegations']) - - prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) - prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), - f_hits(vdev_stats['hits'])) - prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), - f_hits(vdev_stats['misses'])) - prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), - f_hits(vdev_stats['delegations'])) - print() - - def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h @@ -1015,7 +974,6 @@ section_calls = {'arc': section_arc, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, - 'vdev': section_vdev, 'zil': section_zil} diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 5ab13b470dc0..04a10c4eedd7 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -33,6 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome + * Copyright (c) 2023, Klara Inc. */ #include @@ -326,7 +327,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) int err; struct sublivelist_verify *sv = args; - zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, @@ -390,7 +391,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); @@ -682,7 +683,7 @@ livelist_metaslab_validate(spa_t *spa) (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); @@ -716,7 +717,7 @@ livelist_metaslab_validate(spa_t *spa) mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, - livelist_block_compare, + livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); @@ -789,6 +790,9 @@ usage(void) "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" + "\t%s -B [-e [-V] [-p ...]] [-I ]\n" + "\t\t[-o =]... [-t ] [-U ] [-x ]\n" + "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" @@ -802,7 +806,7 @@ usage(void) "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -825,6 +829,8 @@ usage(void) (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); + (void) fprintf(stderr, " -B --backup " + "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " @@ -4875,6 +4881,81 @@ dump_path(char *ds, char *path, uint64_t *retobj) return (err); } +static int +dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) +{ + const char *p = (const char *)buf; + ssize_t nwritten; + + (void) os; + (void) arg; + + /* Write the data out, handling short writes and signals. */ + while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { + if (nwritten < 0) { + if (errno == EINTR) + continue; + return (errno); + } + p += nwritten; + len -= nwritten; + } + + return (0); +} + +static void +dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) +{ + boolean_t embed = B_FALSE; + boolean_t large_block = B_FALSE; + boolean_t compress = B_FALSE; + boolean_t raw = B_FALSE; + + const char *c; + for (c = flagstr; c != NULL && *c != '\0'; c++) { + switch (*c) { + case 'e': + embed = B_TRUE; + break; + case 'L': + large_block = B_TRUE; + break; + case 'c': + compress = B_TRUE; + break; + case 'w': + raw = B_TRUE; + break; + default: + fprintf(stderr, "dump_backup: invalid flag " + "'%c'\n", *c); + return; + } + } + + if (isatty(STDOUT_FILENO)) { + fprintf(stderr, "dump_backup: stream cannot be written " + "to a terminal\n"); + return; + } + + offset_t off = 0; + dmu_send_outparams_t out = { + .dso_outfunc = dump_backup_bytes, + .dso_dryrun = B_FALSE, + }; + + int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, + large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, + &off, &out); + if (err != 0) { + fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", + strerror(err)); + return; + } +} + static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { @@ -8465,9 +8546,9 @@ zdb_read_block(char *thing, spa_t *spa) */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | - ZIO_FLAG_OPTIONAL, NULL, NULL)); + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, + NULL, NULL)); } error = zio_wait(zio); @@ -8561,7 +8642,6 @@ zdb_read_block(char *thing, spa_t *spa) zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | @@ -8695,6 +8775,7 @@ main(int argc, char **argv) struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, + {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, @@ -8736,10 +8817,11 @@ main(int argc, char **argv) }; while ((c = getopt_long(argc, argv, - "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", + "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': + case 'B': case 'c': case 'C': case 'd': @@ -8887,7 +8969,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL) + if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -9073,7 +9155,8 @@ main(int argc, char **argv) checkpoint_pool, error); } - } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { + } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || + objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); @@ -9209,7 +9292,10 @@ main(int argc, char **argv) strerror(errno)); } } - if (os != NULL) { + if (dump_opt['B']) { + dump_backup(target, objset_id, + argc > 0 ? argv[0] : NULL); + } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c index a2daa77a61fe..8fabb8d081a5 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -369,9 +369,7 @@ zfs_agent_consumer_thread(void *arg) return (NULL); } - if ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); - + if ((event = list_remove_head(&agent_events)) != NULL) { (void) pthread_mutex_unlock(&agent_lock); /* dispatch to all event subscribers */ @@ -434,8 +432,7 @@ zfs_agent_fini(void) (void) pthread_join(g_agents_tid, NULL); /* drain any pending events */ - while ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); + while ((event = list_remove_head(&agent_events)) != NULL) { nvlist_free(event->ae_nvl); free(event); } diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index 1c82bd4f0010..b07a02712295 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -1288,17 +1288,14 @@ zfs_slm_fini(void) tpool_destroy(g_tpool); } - while ((pool = (list_head(&g_pool_list))) != NULL) { - list_remove(&g_pool_list, pool); + while ((pool = list_remove_head(&g_pool_list)) != NULL) { zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); - while ((device = (list_head(&g_device_list))) != NULL) { - list_remove(&g_device_list, device); + while ((device = list_remove_head(&g_device_list)) != NULL) free(device); - } list_destroy(&g_device_list); libzfs_fini(g_zfshdl); diff --git a/sys/contrib/openzfs/cmd/zilstat.in b/sys/contrib/openzfs/cmd/zilstat.in index cf4e2e0dd0c8..e8678e20cafa 100755 --- a/sys/contrib/openzfs/cmd/zilstat.in +++ b/sys/contrib/openzfs/cmd/zilstat.in @@ -36,31 +36,49 @@ import argparse from argparse import RawTextHelpFormatter cols = { - # hdr: [size, scale, kstat name] + # hdr: [size, scale, kstat name] "time": [8, -1, "time"], "pool": [12, -1, "pool"], "ds": [12, -1, "dataset_name"], "obj": [12, -1, "objset"], - "zcc": [10, 1000, "zil_commit_count"], - "zcwc": [10, 1000, "zil_commit_writer_count"], - "ziic": [10, 1000, "zil_itx_indirect_count"], - "zic": [10, 1000, "zil_itx_count"], - "ziib": [10, 1024, "zil_itx_indirect_bytes"], - "zicc": [10, 1000, "zil_itx_copied_count"], - "zicb": [10, 1024, "zil_itx_copied_bytes"], - "zinc": [10, 1000, "zil_itx_needcopy_count"], - "zinb": [10, 1024, "zil_itx_needcopy_bytes"], - "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"], - "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"], - "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"], - "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"], + "cc": [5, 1000, "zil_commit_count"], + "cwc": [5, 1000, "zil_commit_writer_count"], + "ic": [5, 1000, "zil_itx_count"], + "iic": [5, 1000, "zil_itx_indirect_count"], + "iib": [5, 1024, "zil_itx_indirect_bytes"], + "icc": [5, 1000, "zil_itx_copied_count"], + "icb": [5, 1024, "zil_itx_copied_bytes"], + "inc": [5, 1000, "zil_itx_needcopy_count"], + "inb": [5, 1024, "zil_itx_needcopy_bytes"], + "idc": [5, 1000, "icc+inc"], + "idb": [5, 1024, "icb+inb"], + "iwc": [5, 1000, "iic+idc"], + "iwb": [5, 1024, "iib+idb"], + "imnc": [6, 1000, "zil_itx_metaslab_normal_count"], + "imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"], + "imnw": [6, 1024, "zil_itx_metaslab_normal_write"], + "imna": [6, 1024, "zil_itx_metaslab_normal_alloc"], + "imsc": [6, 1000, "zil_itx_metaslab_slog_count"], + "imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"], + "imsw": [6, 1024, "zil_itx_metaslab_slog_write"], + "imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"], + "imc": [5, 1000, "imnc+imsc"], + "imb": [5, 1024, "imnb+imsb"], + "imw": [5, 1024, "imnw+imsw"], + "ima": [5, 1024, "imna+imsa"], + "se%": [3, 100, "imb/ima"], + "sen%": [4, 100, "imnb/imna"], + "ses%": [4, 100, "imsb/imsa"], + "te%": [3, 100, "imb/imw"], + "ten%": [4, 100, "imnb/imnw"], + "tes%": [4, 100, "imsb/imsw"], } -hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \ - "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] -ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb", - "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]") @@ -105,7 +123,7 @@ def print_header(): global sep for col in hdr: new_col = col - if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']: + if interval > 0 and cols[col][1] > 100: new_col += "/s" sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep)) sys.stdout.write("\n") @@ -115,7 +133,7 @@ def print_values(v): global sep for col in hdr: val = v[cols[col][2]] - if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0: + if interval > 0 and cols[col][1] > 100: val = v[cols[col][2]] // interval sys.stdout.write("%s%s" % ( prettynum(cols[col][0], cols[col][1], val), sep)) @@ -237,9 +255,7 @@ def init(): invalid = [] for ele in hdr: - if gFlag and ele not in ghdr: - invalid.append(ele) - elif ele not in cols: + if ele not in cols: invalid.append(ele) if len(invalid) > 0: @@ -403,17 +419,17 @@ def calculate_diff(): diff = copy.deepcopy(curr) for pool in curr: for objset in curr[pool]: - for col in hdr: - if col not in ['time', 'pool', 'ds', 'obj']: - key = cols[col][2] - # If prev is NULL, this is the - # first time we are here - if not prev: - diff[pool][objset][key] = 0 - else: - diff[pool][objset][key] \ - = curr[pool][objset][key] \ - - prev[pool][objset][key] + for key in curr[pool][objset]: + if not isinstance(diff[pool][objset][key], int): + continue + # If prev is NULL, this is the + # first time we are here + if not prev: + diff[pool][objset][key] = 0 + else: + diff[pool][objset][key] \ + = curr[pool][objset][key] \ + - prev[pool][objset][key] def zil_build_dict(pool = "GLOBAL"): global kstat @@ -425,10 +441,77 @@ def zil_build_dict(pool = "GLOBAL"): if objset not in curr[pool]: curr[pool][objset] = dict() curr[pool][objset][key] = val - curr[pool][objset]["pool"] = pool - curr[pool][objset]["objset"] = objset - curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \ - time.localtime()) + +def zil_extend_dict(): + global diff + for pool in diff: + for objset in diff[pool]: + diff[pool][objset]["pool"] = pool + diff[pool][objset]["objset"] = objset + diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \ + time.localtime()) + diff[pool][objset]["icc+inc"] = \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["icb+inb"] = \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["iic+idc"] = \ + diff[pool][objset]["zil_itx_indirect_count"] + \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["iib+idb"] = \ + diff[pool][objset]["zil_itx_indirect_bytes"] + \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["imnc+imsc"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_count"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_count"] + diff[pool][objset]["imnb+imsb"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] + diff[pool][objset]["imnw+imsw"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + diff[pool][objset]["imna+imsa"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + if diff[pool][objset]["imna+imsa"] > 0: + diff[pool][objset]["imb/ima"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imna+imsa"] + else: + diff[pool][objset]["imb/ima"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imna"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + else: + diff[pool][objset]["imnb/imna"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsa"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + else: + diff[pool][objset]["imsb/imsa"] = 100 + if diff[pool][objset]["imnw+imsw"] > 0: + diff[pool][objset]["imb/imw"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imnw+imsw"] + else: + diff[pool][objset]["imb/imw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imnw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + else: + diff[pool][objset]["imnb/imnw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + else: + diff[pool][objset]["imsb/imsw"] = 100 def sign_handler_epipe(sig, frame): print("Caught EPIPE signal: " + str(frame)) @@ -437,30 +520,31 @@ def sign_handler_epipe(sig, frame): def main(): global interval - global curr + global curr, diff hprint = False init() signal.signal(signal.SIGINT, signal.SIG_DFL) signal.signal(signal.SIGPIPE, sign_handler_epipe) + zil_process_kstat() + if not curr: + print ("Error: No stats to show") + sys.exit(0) + print_header() if interval > 0: + time.sleep(interval) while True: calculate_diff() if not diff: print ("Error: No stats to show") sys.exit(0) - if hprint == False: - print_header() - hprint = True + zil_extend_dict() print_dict(diff) time.sleep(interval) else: - zil_process_kstat() - if not curr: - print ("Error: No stats to show") - sys.exit(0) - print_header() - print_dict(curr) + diff = curr + zil_extend_dict() + print_dict(diff) if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am index 3c7c8a9aebe2..d08b8e1791b6 100644 --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -145,6 +145,7 @@ dist_zpoolcompat_DATA = \ %D%/compatibility.d/openzfs-2.0-linux \ %D%/compatibility.d/openzfs-2.1-freebsd \ %D%/compatibility.d/openzfs-2.1-linux \ + %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -168,12 +169,20 @@ zpoolcompatlinks = \ "freebsd-11.3 freebsd-12.0" \ "freebsd-11.3 freebsd-12.1" \ "freebsd-11.3 freebsd-12.2" \ + "freebsd-11.3 freebsd-12.3" \ + "freebsd-11.3 freebsd-12.4" \ + "openzfs-2.1-freebsd freebsd-13.0" \ + "openzfs-2.1-freebsd freebsd-13.1" \ + "openzfs-2.1-freebsd freebsd-13.2" \ "freebsd-11.3 freenas-11.3" \ "freenas-11.0 freenas-11.1" \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ - "zol-0.8 ubuntu-20.04" + "zol-0.8 ubuntu-20.04" \ + "openzfs-2.1-linux ubuntu-22.04" \ + "openzfs-2.2 openzfs-2.2-linux" \ + "openzfs-2.2 openzfs-2.2-freebsd" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 index 4e8f21362554..fec73a269a78 100644 --- a/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/grub2 @@ -8,5 +8,7 @@ extensible_dataset filesystem_limits hole_birth large_blocks +livelist lz4_compress spacemap_histogram +zpool_checkpoint diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 new file mode 100644 index 000000000000..c9491cd8dc42 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.2 @@ -0,0 +1,40 @@ +# Features supported by OpenZFS 2.2 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress diff --git a/sys/contrib/openzfs/config/kernel-reclaim_state.m4 b/sys/contrib/openzfs/config/kernel-reclaim_state.m4 new file mode 100644 index 000000000000..9936b3c1001f --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-reclaim_state.m4 @@ -0,0 +1,26 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [ + dnl # + dnl # 6.4 API change + dnl # The reclaimed_slab of struct reclaim_state + dnl # is renamed to reclaimed + dnl # + ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [ + #include + static const struct reclaim_state + rs __attribute__ ((unused)) = { + .reclaimed = 100, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [ + AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field]) + ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1, + [struct reclaim_state has reclaimed]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index 439ffdf5a898..cb7e736c9a43 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -153,6 +153,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T + ZFS_AC_KERNEL_SRC_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -285,6 +286,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T + ZFS_AC_KERNEL_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/sys/contrib/openzfs/contrib/debian/rules.in b/sys/contrib/openzfs/contrib/debian/rules.in index 63892c6ca243..f0791cfabd38 100755 --- a/sys/contrib/openzfs/contrib/debian/rules.in +++ b/sys/contrib/openzfs/contrib/debian/rules.in @@ -7,8 +7,8 @@ NAME := $(shell awk '$$1 == "Name:" { print $$2; }' META) LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META) LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." $$3+1}' META) -DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \ - COPYRIGHT LICENSE README.md +DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \ + COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md ifndef KVERS KVERS=$(shell uname -r) diff --git a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs index 7f977a30f75b..0a2bd2efda7a 100644 --- a/sys/contrib/openzfs/contrib/initramfs/scripts/zfs +++ b/sys/contrib/openzfs/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ mount_fs() # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount.zfs -o zfsutil" + ZFS_CMD="mount -o zfsutil -t zfs" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -361,7 +361,7 @@ mount_fs() fi # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount.zfs" + ZFS_CMD="mount -t zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" @@ -944,7 +944,7 @@ mountroot() echo " not specified on the kernel command line." echo "" echo "Manually mount the root filesystem on $rootmnt and then exit." - echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" + echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" shell fi diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index 979546ab3090..08a8640669b3 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -67,6 +67,7 @@ pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; +static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok"; static libzfs_handle_t *g_zfs; @@ -160,10 +161,10 @@ pw_free(pw_password_t *pw) } static pw_password_t * -pw_fetch(pam_handle_t *pamh) +pw_fetch(pam_handle_t *pamh, int tok) { const char *token; - if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { + if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); @@ -177,13 +178,13 @@ pw_fetch(pam_handle_t *pamh) } static const pw_password_t * -pw_fetch_lazy(pam_handle_t *pamh) +pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name) { - pw_password_t *pw = pw_fetch(pamh); + pw_password_t *pw = pw_fetch(pamh, tok); if (pw == NULL) { return (NULL); } - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); + int ret = pam_set_data(pamh, var_name, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); @@ -193,23 +194,23 @@ pw_fetch_lazy(pam_handle_t *pamh) } static const pw_password_t * -pw_get(pam_handle_t *pamh) +pw_get(pam_handle_t *pamh, int tok, const char *var_name) { const pw_password_t *authtok = NULL; - int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, + int ret = pam_get_data(pamh, var_name, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) - return (pw_fetch_lazy(pamh)); + return (pw_fetch_lazy(pamh, tok, var_name)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int -pw_clear(pam_handle_t *pamh) +pw_clear(pam_handle_t *pamh, const char *var_name) { - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); + int ret = pam_set_data(pamh, var_name, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); @@ -386,7 +387,7 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); - if (ret) { + if (ret && ret != EEXIST) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); @@ -406,14 +407,14 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, } static int -unmount_unload(pam_handle_t *pamh, const char *ds_name) +unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } - int ret = zfs_unmount(ds, NULL, 0); + int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); zfs_close(ds); @@ -435,9 +436,13 @@ typedef struct { char *runstatedir; char *homedir; char *dsname; + uid_t uid_min; + uid_t uid_max; uid_t uid; const char *username; - int unmount_and_unload; + boolean_t unmount_and_unload; + boolean_t force_unmount; + boolean_t recursive_homes; } zfs_key_config_t; static int @@ -469,9 +474,13 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, free(config->homes_prefix); return (PAM_USER_UNKNOWN); } + config->uid_min = 1000; + config->uid_max = MAXUID; config->uid = entry->pw_uid; config->username = name; - config->unmount_and_unload = 1; + config->unmount_and_unload = B_TRUE; + config->force_unmount = B_FALSE; + config->recursive_homes = B_FALSE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -481,8 +490,16 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); + } else if (strncmp(argv[c], "uid_min=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_min); + } else if (strncmp(argv[c], "uid_max=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_max); } else if (strcmp(argv[c], "nounmount") == 0) { - config->unmount_and_unload = 0; + config->unmount_and_unload = B_FALSE; + } else if (strcmp(argv[c], "forceunmount") == 0) { + config->force_unmount = B_TRUE; + } else if (strcmp(argv[c], "recursive_homes") == 0) { + config->recursive_homes = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); @@ -517,8 +534,12 @@ find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { + if (target->recursive_homes) { + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, target); + } zfs_close(zhp); - return (0); + return (target->dsname != NULL); } target->dsname = strdup(zfs_get_name(zhp)); @@ -531,17 +552,23 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { - zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, - ZFS_TYPE_FILESYSTEM); - if (zhp == NULL) { - pam_syslog(NULL, LOG_ERR, "dataset %s not found", - config->homes_prefix); - return (NULL); - } + if (strcmp(config->homes_prefix, "*") == 0) { + (void) zfs_iter_root(g_zfs, + find_dsname_by_prop_value, config); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + config->homes_prefix, ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + pam_syslog(NULL, LOG_ERR, + "dataset %s not found", + config->homes_prefix); + return (NULL); + } - (void) zfs_iter_filesystems_v2(zhp, 0, - find_dsname_by_prop_value, config); - zfs_close(zhp); + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, config); + zfs_close(zhp); + } char *dsname = config->dsname; config->dsname = NULL; return (dsname); @@ -655,8 +682,13 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, if (config_err != PAM_SUCCESS) { return (config_err); } + if (config.uid < config.uid_min || config.uid > config.uid_max) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } - const pw_password_t *token = pw_fetch_lazy(pamh); + const pw_password_t *token = pw_fetch_lazy(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_AUTH_ERR); @@ -706,10 +738,12 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); - return (PAM_SUCCESS); + return (PAM_SERVICE_ERR); } + const pw_password_t *old_token = pw_get(pamh, + PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME); { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); @@ -721,49 +755,62 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - int key_loaded = is_key_loaded(pamh, dataset); - if (key_loaded == -1) { + if (!old_token) { + pam_syslog(pamh, LOG_ERR, + "old password from PAM stack is null"); free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - free(dataset); - pam_zfs_free(); - if (! key_loaded) { + if (decrypt_mount(pamh, dataset, + old_token->value, B_TRUE) == -1) { pam_syslog(pamh, LOG_ERR, - "key not loaded, returning try_again"); + "old token mismatch"); + free(dataset); + pam_zfs_free(); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, + PASSWORD_VAR_NAME); if (token == NULL) { + pam_syslog(pamh, LOG_ERR, "new password unavailable"); + pam_zfs_free(); zfs_key_config_free(&config); - return (PAM_SERVICE_ERR); - } - if (pam_zfs_init(pamh) != 0) { - zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - if (change_key(pamh, dataset, token->value) == -1) { + int was_loaded = is_key_loaded(pamh, dataset); + if (!was_loaded && decrypt_mount(pamh, dataset, + old_token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } + int changed = change_key(pamh, dataset, token->value); + if (!was_loaded) { + unmount_unload(pamh, dataset, config.force_unmount); + } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 || + pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) { return (PAM_SERVICE_ERR); } } else { @@ -788,7 +835,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -799,7 +846,8 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SUCCESS); } - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); @@ -823,7 +871,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) { return (PAM_SERVICE_ERR); } return (PAM_SUCCESS); @@ -846,7 +894,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -868,7 +916,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SESSION_ERR); } - if (unmount_unload(pamh, dataset) == -1) { + if (unmount_unload(pamh, dataset, config.force_unmount) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h index 27d290863c0b..c633799318d5 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/kmem.h @@ -75,7 +75,7 @@ typedef struct kmem_cache { extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); @@ -83,6 +83,7 @@ kmem_cache_t *kmem_cache_create(const char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); +__attribute__((malloc)) void *kmem_cache_alloc(kmem_cache_t *cache, int flags); void kmem_cache_free(kmem_cache_t *cache, void *buf); boolean_t kmem_cache_reap_active(void); diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h index 09d109d191bf..8e20a9613539 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h @@ -68,7 +68,6 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, - zfs_vdev_cache, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h index 594425f7b297..8a203f7bb8e2 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/kmem.h @@ -31,10 +31,10 @@ #include extern int kmem_debugging(void); -extern char *kmem_vasprintf(const char *fmt, va_list ap) - __attribute__((format(printf, 1, 0))); -extern char *kmem_asprintf(const char *fmt, ...) - __attribute__((format(printf, 1, 2))); +__attribute__((format(printf, 1, 0))) +extern char *kmem_vasprintf(const char *fmt, va_list ap); +__attribute__((format(printf, 1, 2))) +extern char *kmem_asprintf(const char *fmt, ...); extern char *kmem_strdup(const char *str); extern void kmem_strfree(char *str); @@ -186,10 +186,10 @@ extern unsigned int spl_kmem_alloc_max; #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) #define kmem_cache_reap_active spl_kmem_cache_reap_active -extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); -extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); extern void spl_kmem_free(const void *ptr, size_t sz); /* diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h index e77af2a7a48c..92585a17e263 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/vmem.h @@ -91,8 +91,10 @@ typedef struct vmem { } vmem_t; #define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) #define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) -extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); -extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h index 7bddd9d1f469..afa1a274e43c 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_zil.h @@ -215,6 +215,39 @@ DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \ TP_ARGS(zilog, zcw)) DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * zilog_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_zil_block_size_class, + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), + TP_ARGS(zilog, res, s1), + TP_STRUCT__entry( + ZILOG_TP_STRUCT_ENTRY + __field(uint64_t, res) + __field(uint64_t, s1) + ), + TP_fast_assign( + ZILOG_TP_FAST_ASSIGN + __entry->res = res; + __entry->s1 = s1; + ), + TP_printk( + ZILOG_TP_PRINTK_FMT " res %llu s1 %llu", + ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1) +); + +#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \ +DEFINE_EVENT(zfs_zil_block_size_class, name, \ + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \ + TP_ARGS(zilog, res, s1)) +DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size); + #endif /* _TRACE_ZIL_H */ #undef TRACE_INCLUDE_PATH @@ -228,6 +261,7 @@ DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); DEFINE_DTRACE_PROBE2(zil__process__commit__itx); DEFINE_DTRACE_PROBE2(zil__process__normal__itx); DEFINE_DTRACE_PROBE2(zil__commit__io__error); +DEFINE_DTRACE_PROBE3(zil__block__size); #endif /* HAVE_DECLARE_EVENT_CLASS */ #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/include/sys/abd.h b/sys/contrib/openzfs/include/sys/abd.h index 82c51cb05cbc..750f9986c1da 100644 --- a/sys/contrib/openzfs/include/sys/abd.h +++ b/sys/contrib/openzfs/include/sys/abd.h @@ -86,10 +86,15 @@ extern int zfs_abd_scatter_enabled; * Allocations and deallocations */ +__attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_gang(void); +__attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); diff --git a/sys/contrib/openzfs/include/sys/btree.h b/sys/contrib/openzfs/include/sys/btree.h index 883abb5181c9..6e05eee8f01d 100644 --- a/sys/contrib/openzfs/include/sys/btree.h +++ b/sys/contrib/openzfs/include/sys/btree.h @@ -105,8 +105,13 @@ typedef struct zfs_btree_index { boolean_t bti_before; } zfs_btree_index_t; -typedef struct btree { +typedef struct btree zfs_btree_t; +typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, + const void *, zfs_btree_index_t *); + +struct btree { int (*bt_compar) (const void *, const void *); + bt_find_in_buf_f bt_find_in_buf; size_t bt_elem_size; size_t bt_leaf_size; uint32_t bt_leaf_cap; @@ -115,7 +120,54 @@ typedef struct btree { uint64_t bt_num_nodes; zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading -} zfs_btree_t; +}; + +/* + * Implementation of Shar's algorithm designed to accelerate binary search by + * eliminating impossible to predict branches. + * + * For optimality, this should be used to generate the search function in the + * same file as the comparator and the comparator should be marked + * `__attribute__((always_inline) inline` so that the compiler will inline it. + * + * Arguments are: + * + * NAME - The function name for this instance of the search function. Use it + * in a subsequent call to zfs_btree_create(). + * T - The element type stored inside the B-Tree. + * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, + * or +1 -1 for <, 0 for ==, and +1 for >. For trivial comparisons, + * TREE_CMP() from avl.h can be used in a boilerplate function. + */ +/* BEGIN CSTYLED */ +#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ +_Pragma("GCC diagnostic push") \ +_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ +static void * \ +NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ + const void *value, zfs_btree_index_t *where) \ +{ \ + T *i = (T *)buf; \ + (void) tree; \ + _Pragma("GCC unroll 9") \ + while (nelems > 1) { \ + uint32_t half = nelems / 2; \ + nelems -= half; \ + i += (COMP(&i[half - 1], value) < 0) * half; \ + } \ + \ + int comp = COMP(i, value); \ + where->bti_offset = (i - (T *)buf) + (comp < 0); \ + where->bti_before = (comp != 0); \ + \ + if (comp == 0) { \ + return (i); \ + } \ + \ + return (NULL); \ +} \ +_Pragma("GCC diagnostic pop") +/* END CSTYLED */ /* * Allocate and deallocate caches for btree nodes. @@ -129,13 +181,19 @@ void zfs_btree_fini(void); * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > + * find - optional function to accelerate searches inside B-Tree nodes + * through Shar's algorithm and comparator inlining. Setting this to + * NULL will use a generic function. The function should be created + * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar. + * compar should be marked `__attribute__((always_inline)) inline` or + * performance is unlikely to improve very much. * size - the value of sizeof(struct my_type) * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), - size_t); + bt_find_in_buf_f, size_t); void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), - size_t, size_t); + bt_find_in_buf_f, size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node diff --git a/sys/contrib/openzfs/include/sys/spa.h b/sys/contrib/openzfs/include/sys/spa.h index ed752967cca6..1fa2044008dc 100644 --- a/sys/contrib/openzfs/include/sys/spa.h +++ b/sys/contrib/openzfs/include/sys/spa.h @@ -1174,10 +1174,6 @@ extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h index d529bbcdd9a4..26c834ff57cf 100644 --- a/sys/contrib/openzfs/include/sys/vdev.h +++ b/sys/contrib/openzfs/include/sys/vdev.h @@ -158,12 +158,6 @@ extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index ea3043c82a39..74b3737d8ee5 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -57,8 +57,6 @@ extern "C" { * Forward declarations that lots of things need. */ typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; @@ -132,23 +130,6 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - clock_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - typedef struct vdev_queue_class { uint32_t vqc_active; @@ -443,7 +424,6 @@ struct vdev { boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ diff --git a/sys/contrib/openzfs/include/sys/zfs_refcount.h b/sys/contrib/openzfs/include/sys/zfs_refcount.h index 42f846b8920a..4efa266a53c5 100644 --- a/sys/contrib/openzfs/include/sys/zfs_refcount.h +++ b/sys/contrib/openzfs/include/sys/zfs_refcount.h @@ -73,13 +73,15 @@ int64_t zfs_refcount_count(zfs_refcount_t *); int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); /* - * Note that (add|remove)_many add/remove one reference with "number" N, - * _not_ make N references with "number" 1, which is what vanilla - * zfs_refcount_(add|remove) would do if called N times. + * Note that (add|remove)_many adds/removes one reference with "number" N, + * _not_ N references with "number" 1, which is what (add|remove)_few does, + * or what vanilla zfs_refcount_(add|remove) called N times would do. * * Attempting to remove a reference with number N when none exists is a * panic on debug kernels with reference_tracking enabled. */ +void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *); +void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); @@ -108,6 +110,10 @@ typedef struct refcount { #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) +#define zfs_refcount_add_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, number) +#define zfs_refcount_remove_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, -number) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h index 012e7403e2a6..2f266f53247e 100644 --- a/sys/contrib/openzfs/include/sys/zfs_znode.h +++ b/sys/contrib/openzfs/include/sys/zfs_znode.h @@ -158,6 +158,7 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); #ifdef _KERNEL #include @@ -280,7 +281,6 @@ extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); diff --git a/sys/contrib/openzfs/include/sys/zil.h b/sys/contrib/openzfs/include/sys/zil.h index cff8ebcad819..4747ecc067a9 100644 --- a/sys/contrib/openzfs/include/sys/zil.h +++ b/sys/contrib/openzfs/include/sys/zil.h @@ -489,18 +489,22 @@ typedef struct zil_stats { * Transactions which have been allocated to the "normal" * (i.e. not slog) storage pool. Note that "bytes" accumulate * the actual log record sizes - which do not include the actual - * data in case of indirect writes. + * data in case of indirect writes. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_normal_count; kstat_named_t zil_itx_metaslab_normal_bytes; + kstat_named_t zil_itx_metaslab_normal_write; + kstat_named_t zil_itx_metaslab_normal_alloc; /* * Transactions which have been allocated to the "slog" storage pool. * If there are no separate log devices, this is the same as the - * "normal" pool. + * "normal" pool. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_slog_count; kstat_named_t zil_itx_metaslab_slog_bytes; + kstat_named_t zil_itx_metaslab_slog_write; + kstat_named_t zil_itx_metaslab_slog_alloc; } zil_kstat_values_t; typedef struct zil_sums { @@ -515,8 +519,12 @@ typedef struct zil_sums { wmsum_t zil_itx_needcopy_bytes; wmsum_t zil_itx_metaslab_normal_count; wmsum_t zil_itx_metaslab_normal_bytes; + wmsum_t zil_itx_metaslab_normal_write; + wmsum_t zil_itx_metaslab_normal_alloc; wmsum_t zil_itx_metaslab_slog_count; wmsum_t zil_itx_metaslab_slog_bytes; + wmsum_t zil_itx_metaslab_slog_write; + wmsum_t zil_itx_metaslab_slog_alloc; } zil_sums_t; #define ZIL_STAT_INCR(zil, stat, val) \ diff --git a/sys/contrib/openzfs/include/sys/zil_impl.h b/sys/contrib/openzfs/include/sys/zil_impl.h index bb85bf6d1eb1..03a409c5257c 100644 --- a/sys/contrib/openzfs/include/sys/zil_impl.h +++ b/sys/contrib/openzfs/include/sys/zil_impl.h @@ -44,7 +44,7 @@ extern "C" { * must be held. * * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must + * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must * be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write @@ -93,20 +93,23 @@ typedef struct lwb { blkptr_t lwb_blk; /* on disk address of this log blk */ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ + int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ + hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ + list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ } lwb_t; /* diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 695bc09e6cb7..6b1352a72b9a 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -190,7 +190,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) -#define ZIO_FLAG_DONT_CACHE (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) diff --git a/sys/contrib/openzfs/lib/libspl/include/umem.h b/sys/contrib/openzfs/lib/libspl/include/umem.h index 77c216721253..9039212baf14 100644 --- a/sys/contrib/openzfs/lib/libspl/include/umem.h +++ b/sys/contrib/openzfs/lib/libspl/include/umem.h @@ -83,7 +83,7 @@ const char *_umem_debug_init(void); const char *_umem_options_init(void); const char *_umem_logging_init(void); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc(size_t size, int flags) { @@ -96,7 +96,7 @@ umem_alloc(size_t size, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { @@ -118,7 +118,7 @@ umem_alloc_aligned(size_t size, size_t align, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_zalloc(size_t size, int flags) { @@ -188,6 +188,7 @@ umem_cache_destroy(umem_cache_t *cp) umem_free(cp, sizeof (umem_cache_t)); } +__attribute__((malloc)) static inline void * umem_cache_alloc(umem_cache_t *cp, int flags) { diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index ceac2963e647..58d7f07527aa 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -135,7 +135,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/uberblock.c \ module/zfs/unique.c \ module/zfs/vdev.c \ - module/zfs/vdev_cache.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ module/zfs/vdev_indirect.c \ diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4 index 9ec940a94488..5fbd9d7db93f 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -2028,21 +2028,6 @@ Max vdev I/O aggregation size. .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . -.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint -Shift size to inflate reads to. -. -.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint -Inflate reads smaller than this value to meet the -.Sy zfs_vdev_cache_bshift -size -.Pq default Sy 64 KiB . -. -.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint -Total size of the per-disk cache in bytes. -.Pp -Currently this feature is disabled, as it has been found to not be helpful -for performance and in some cases harmful. -. .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index 2b7dcb63829c..b901ce6c2935 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -228,8 +228,10 @@ extensible_dataset filesystem_limits hole_birth large_blocks +livelist lz4_compress spacemap_histogram +zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 index 26c67dabd705..031953c543a1 100644 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd October 7, 2020 +.Dd June 4, 2023 .Dt ZDB 8 .Os . @@ -41,6 +41,13 @@ .Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm +.Fl B +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … +.Op Fl U Ar cache +.Op Fl K Ar key +.Ar poolname Ns Ar / Ns Ar objset-ID +.Op Ar backup-flags +.Nm .Fl C .Op Fl A .Op Fl U Ar cache @@ -123,6 +130,22 @@ Display options: Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. +.It Fl B , -backup +Generate a backup stream, similar to +.Nm zfs Cm send , +but for the numeric objset ID, and without opening the dataset. +This can be useful in recovery scenarios if dataset metadata has become +corrupted but the dataset itself is readable. +The optional +.Ar flags +argument is a string of one or more of the letters +.Sy e , +.Sy L , +.Sy c , +and +.Sy w , +which correspond to the same flags in +.Xr zfs-send 8 . .It Fl c , -checksum Verify the checksum of all metadata blocks while printing block statistics .Po see diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8 index a7b6097c37f1..b3997d32767c 100644 --- a/sys/contrib/openzfs/man/man8/zfs-create.8 +++ b/sys/contrib/openzfs/man/man8/zfs-create.8 @@ -234,14 +234,11 @@ if the volume is not sparse. Print verbose information about the created dataset. .El .El -.Ss ZFS Volumes as Swap -ZFS volumes may be used as swap devices. -After creating the volume with the -.Nm zfs Cm create Fl V -enable the swap area using the -.Xr swapon 8 -command. -Swapping to files on ZFS filesystems is not supported. +.Ss ZFS for Swap +Swapping to a ZFS volume is prone to deadlock and not recommended. +See OpenZFS FAQ. +.Pp +Swapping to a file on a ZFS filesystem is not supported. . .Sh EXAMPLES .\" These are, respectively, examples 1, 10 from zfs.8 diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 index 0ba93e4166e7..341f902fe66e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -456,7 +456,6 @@ ZIO_FLAG_CANFAIL:0x00000080 ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 -ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 8d29f56c2fb8..485331ac655e 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. +# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +ifeq ($(CONFIG_X86),y) +ifeq ($(CONFIG_CC_IS_CLANG),y) +CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false +endif +endif + ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ @@ -368,7 +382,6 @@ ZFS_OBJS := \ uberblock.o \ unique.o \ vdev.o \ - vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 365609fb8585..0c4d8bfe1159 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -308,7 +308,6 @@ SRCS+= abd.c \ uberblock.c \ unique.c \ vdev.c \ - vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_indirect.c \ @@ -400,6 +399,20 @@ beforeinstall: .include +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. +# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +.if ${CC} == "clang" +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64" +CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false +CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false +CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false +CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false +.endif +.endif + CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index cc616f33db96..8ae2f23c3ecf 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -872,8 +872,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, "Enable to bypass vdev_validate()."); /* END CSTYLED */ -/* vdev_cache.c */ - /* vdev_mirror.c */ /* vdev_queue.c */ diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c index a077076927a1..20466aeaaa05 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c @@ -495,10 +495,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index b63899ddede0..49b97ae8f590 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -2220,92 +2220,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_NFSV4; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index d26d89544e7c..c4f2b722ef4e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -2069,6 +2069,93 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_NFSV4; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + + void zfs_znode_update_vfs(znode_t *zp) diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c index 963e7a1ec96a..3c30dfc577b4 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c @@ -182,8 +182,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) * of that infrastructure we are responsible for incrementing it. */ if (current->reclaim_state) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += size >> PAGE_SHIFT; +#else current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - +#endif vfree(ptr); } @@ -1012,9 +1015,19 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); + *obj = NULL; + /* + * Since we can't sleep attempt an emergency allocation to satisfy + * the request. The only alterative is to fail the allocation but + * it's preferable try. The use of KM_NOSLEEP is expected to be rare. + */ + if (flags & KM_NOSLEEP) + return (spl_emergency_alloc(skc, flags, obj)); + + might_sleep(); + /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index b7d6053529b4..29a8802b8367 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -219,7 +219,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += sc->nr_to_scan; +#else current->reclaim_state->reclaimed_slab += sc->nr_to_scan; +#endif /* * We are experiencing memory pressure which the arc_evict_zthr was diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c index ff26f47f2e04..a1fd3c9856cc 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c @@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 48945b8af8c1..6b6293b9e482 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -2052,91 +2052,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index c104cd661bf5..02b1af3edc4f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -2254,6 +2254,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index a78f664c4fe8..a23715309f2b 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -965,7 +965,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -6106,8 +6106,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -7866,8 +7865,7 @@ arc_fini(void) taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); + while ((p = list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); @@ -8175,7 +8173,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8192,35 +8190,45 @@ l2arc_write_size(l2arc_dev_t *dev) if (arc_warm == B_FALSE) size += l2arc_write_boost; - /* - * Make sure the write size does not exceed the size of the cache - * device. This is important in l2arc_evict(), otherwise infinite - * iteration can occur. - */ - dev_size = dev->l2ad_end - dev->l2ad_start; - /* We need to add in the worst case scenario of log block overhead. */ - tsize = size + l2arc_log_blk_overhead(size, dev); + size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); } - if (tsize >= dev_size) { + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. + */ + if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + if (l2arc_trim_ahead > 1) { + cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); + l2arc_trim_ahead = 1; + } + if (arc_warm == B_FALSE) size += l2arc_write_boost; + + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } } return (size); @@ -8319,20 +8327,14 @@ l2arc_dev_get_next(void) static void l2arc_do_free_on_write(void) { - list_t *buflist; - l2arc_data_free_t *df, *df_prev; + l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); + while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); - list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } - mutex_exit(&l2arc_free_on_write_mtx); } @@ -8845,7 +8847,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9039,7 +9041,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9299,7 +9301,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -9413,8 +9421,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * arcstat_l2_{size,asize} kstats are updated * internally. */ - if (l2arc_log_blk_insert(dev, hdr)) - l2arc_log_blk_commit(dev, pio, cb); + if (l2arc_log_blk_insert(dev, hdr)) { + /* + * l2ad_hand will be adjusted in + * l2arc_log_blk_commit(). + */ + write_asize += + l2arc_log_blk_commit(dev, pio, cb); + } zio_nowait(wzio); } @@ -10173,8 +10187,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10494,11 +10507,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); @@ -10564,7 +10576,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ -static void +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10675,6 +10687,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; + + return (asize); } /* diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c index 1c1f7892bb7d..da7360f8ce10 100644 --- a/sys/contrib/openzfs/module/zfs/bplist.c +++ b/sys/contrib/openzfs/module/zfs/bplist.c @@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { + while ((bpe = list_remove_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); + while ((bpe = list_remove_head(&bpl->bpl_list))) kmem_free(bpe, sizeof (*bpe)); - } mutex_exit(&bpl->bpl_lock); } diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c index 4c25afaa8199..af2b94a850be 100644 --- a/sys/contrib/openzfs/module/zfs/btree.c +++ b/sys/contrib/openzfs/module/zfs/btree.c @@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - size_t size) + bt_find_in_buf_f bt_find_in_buf, size_t size) { - zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); + zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, + BTREE_LEAF_SIZE); } +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, + const void *value, zfs_btree_index_t *where); + void zfs_btree_create_custom(zfs_btree_t *tree, int (*compar) (const void *, const void *), + bt_find_in_buf_f bt_find_in_buf, size_t size, size_t lsize) { size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); @@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree, ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; + tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? + zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; tree->bt_leaf_cap = P2ALIGN(esize / size, 2); @@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, + void *d = tree->bt_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); @@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) */ zfs_btree_leaf_t *leaf = (depth == 0 ? (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + + void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); @@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; ASSERT(zfs_btree_is_core(par_hdr)); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint32_t offset = idx.bti_offset; @@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c index 57b8faf213eb..767a461e0026 100644 --- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c +++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c @@ -49,8 +49,12 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 } + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } } }; diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index c19ebf424953..778b18817eef 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -1755,9 +1755,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { + while ((dr = list_remove_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); zio_nowait(dr->dr_zio); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index c22a95f8647f..2fdd7c1ece73 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -1371,8 +1371,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; - zio_flag_t flags = ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 5b7f5543ad09..2d37ed2cdfb5 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c @@ -1955,7 +1955,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; -#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1964,7 +1964,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } -#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && @@ -2793,6 +2792,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (err == 0) { + owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2806,21 +2806,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (dspp.saved_toname), dspp.saved_toname); } - if (err != 0) + /* Only disown if there was an error in the lookups */ + if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); + if (err == 0) + owned = B_TRUE; } - owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { + /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index c4e274bd4c42..0eb8c17e331a 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -1396,8 +1396,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; - while ((dcb = list_tail(cb_list)) != NULL) { - list_remove(cb_list, dcb); + while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index ffc012e6c217..b70459380c24 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -520,8 +520,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - for (int i = 0; i < issued - 1; i++) - zfs_refcount_add(&zs->zs_refs, NULL); + zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 14e7ced4007c..d6db61729223 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -3782,8 +3782,7 @@ snaplist_destroy(list_t *l, const void *tag) if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); + while ((snap = list_remove_tail(l)) != NULL) { dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c index eac9828a204a..bbe6a03d620f 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dir.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c @@ -1490,7 +1490,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) if (tr_cookie == NULL) return; - while ((tr = list_head(tr_list)) != NULL) { + while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1500,7 +1500,6 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) } else { arc_tempreserve_clear(tr->tr_size); } - list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 07a527e332bc..1dd44171c10e 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -234,7 +234,7 @@ static int zfs_resilver_disable_defer = B_FALSE; static int zfs_free_bpobj_enabled = 1; /* Error blocks to be scrubbed in one txg. */ -uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; +static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { @@ -3437,10 +3437,8 @@ scan_io_queues_run_one(void *arg) * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); + while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); - } queue->q_zio = NULL; mutex_exit(q_lock); @@ -4877,6 +4875,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * with single operation. Plus it makes scrubs more sequential and reduces * chances that minor extent change move it within the B-tree. */ +__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { @@ -4885,13 +4884,17 @@ ext_size_compare(const void *x, const void *y) return (TREE_CMP(*a, *b)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, + ext_size_compare) + static void ext_size_create(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; - zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t)); + zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, + sizeof (uint64_t)); } static void diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c index 76956572f8bd..77d87b694a43 100644 --- a/sys/contrib/openzfs/module/zfs/fm.c +++ b/sys/contrib/openzfs/module/zfs/fm.c @@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev) list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_head(&ev->ev_ze_list)) != NULL) { - list_remove(&ev->ev_ze_list, ze); + while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { ze->ze_zevent = NULL; ze->ze_dropped++; } diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 24d52a74933f..176247d63b76 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } + typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt) range_tree_walk(rt, metaslab_size_sorted_add, &arg); } + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, + range_seg32_t, metaslab_rangesize32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, + range_seg64_t, metaslab_rangesize64_compare) + /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. @@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg) size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; + bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; + bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } @@ -5641,8 +5650,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) - zfs_refcount_add(&mca->mca_alloc_slots, zio); + zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } @@ -5656,8 +5664,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - for (int d = 0; d < slots; d++) - zfs_refcount_remove(&mca->mca_alloc_slots, zio); + zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c index 894c30fcae16..5174e2c46633 100644 --- a/sys/contrib/openzfs/module/zfs/range_tree.c +++ b/sys/contrib/openzfs/module/zfs/range_tree.c @@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } +__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, + range_tree_seg32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, + range_tree_seg64_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, + range_tree_seg_gap_compare) + range_tree_t * range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap) @@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; + bt_find = range_tree_seg32_find_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = range_tree_seg64_compare; + bt_find = range_tree_seg64_find_in_buf; break; case RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); compare = range_tree_seg_gap_compare; + bt_find = range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } - zfs_btree_create(&rt->rt_root, compare, size); + zfs_btree_create(&rt->rt_root, compare, bt_find, size); rt->rt_ops = ops; rt->rt_gap = gap; diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c index 62ec03e1035a..601d27f8c47a 100644 --- a/sys/contrib/openzfs/module/zfs/refcount.c +++ b/sys/contrib/openzfs/module/zfs/refcount.c @@ -88,14 +88,11 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) reference_t *ref; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_head(&rc->rc_list))) { - list_remove(&rc->rc_list, ref); + while ((ref = list_remove_head(&rc->rc_list))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_list); - while ((ref = list_head(&rc->rc_removed))) { - list_remove(&rc->rc_removed, ref); + while ((ref = list_remove_head(&rc->rc_removed))) { kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); } @@ -151,6 +148,15 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_add_many(rc, 1, holder)); } +void +zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (!rc->rc_tracked) + (void) zfs_refcount_add_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_add(rc, holder); +} + int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) @@ -204,6 +210,15 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_remove_many(rc, 1, holder)); } +void +zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (!rc->rc_tracked) + (void) zfs_refcount_remove_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_remove(rc, holder); +} + void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 1fc2c5e8c55d..88ee4ea9f458 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -33,6 +33,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ /* @@ -1608,16 +1609,16 @@ spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; + log_summary_entry_t *e; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); - list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } @@ -6874,9 +6875,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa))) + if (dsl_scan_resilvering(spa_get_dsl(spa)) || + dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); + } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 89e1ce7165db..9ef948e9e434 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -814,8 +814,7 @@ spa_remove(spa_t *spa) if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); + while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); @@ -2439,7 +2438,6 @@ spa_init(spa_mode_t mode) zio_init(); dmu_init(); zil_init(); - vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); @@ -2463,7 +2461,6 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); - vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index c243dddb7e61..612e66c3a8a8 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. * Copyright (c) 2021, Klara Inc. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include @@ -715,7 +715,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); - vdev_cache_init(vd); return (vd); } @@ -1096,7 +1095,6 @@ vdev_free(vdev_t *vd) * Clean up vdev structure. */ vdev_queue_fini(vd); - vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1720,8 +1718,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -2612,8 +2609,6 @@ vdev_close(vdev_t *vd) vd->vdev_ops->vdev_op_close(vd); - vdev_cache_purge(vd); - /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that @@ -2699,6 +2694,17 @@ vdev_reopen(vdev_t *vd) (void) vdev_validate(vd); } + /* + * Recheck if resilver is still needed and cancel any + * scheduled resilver if resilver is unneeded. + */ + if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa->spa_async_tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; + mutex_exit(&spa->spa_async_lock); + } + /* * Reassess parent vdev's health. */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c deleted file mode 100644 index f0a17600d58e..000000000000 --- a/sys/contrib/openzfs/module/zfs/vdev_cache.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A further enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). - */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - zio_t *fio; - uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zfs_vdev_cache_size == 0) - return (B_FALSE); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. - */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. - */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, ZMOD_RW, - "Inflate reads small than max"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD, - "Total size of the per-disk cache"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW, - "Shift size to inflate reads too"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index a16ad2f4e7cf..89667585345d 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio) indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { + while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } - list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; list_destroy(&is->is_unique_child); @@ -1659,8 +1658,8 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; is->is_unique_children = 0; } diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index 1a75d68abd9e..abb7d0662b8c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -748,8 +748,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -907,7 +906,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c index d6ad8b2b8bc5..085d9cd8b4b6 100644 --- a/sys/contrib/openzfs/module/zfs/zap_micro.c +++ b/sys/contrib/openzfs/module/zfs/zap_micro.c @@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size) } } +__attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { @@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2) (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, + mze_compare) + static void mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { @@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * 62 entries before we have to add 2KB B-tree core node. */ zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, - sizeof (mzap_ent_t), 512); + mze_find_in_buf, sizeof (mzap_ent_t), 512); zap_name_t *zn = zap_name_alloc(zap); for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index bdd0e96c327a..c42ef048dd74 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -1522,9 +1522,8 @@ zfs_ereport_fini(void) { recent_events_node_t *entry; - while ((entry = list_head(&recent_events_list)) != NULL) { + while ((entry = list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); - list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c index 44aaae9c1264..add4241dcc99 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fuid.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c @@ -699,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); + while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL) kmem_free(zfuid, sizeof (zfs_fuid_t)); - } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char *)) * fuidp->z_domain_cnt); - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); + while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL) kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c index 63acf7ab2e4d..7bf804b67790 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_onexit.c +++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c @@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo) zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); + while ((ap = list_remove_head(&zo->zo_actions)) != NULL) { mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index d887e4900d1d..8c1fe5f66838 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -116,8 +116,12 @@ static zil_kstat_values_t zil_stats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; @@ -146,6 +150,9 @@ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); + static int zil_bp_compare(const void *x1, const void *x2) { @@ -241,11 +248,10 @@ zil_kstats_global_update(kstat_t *ksp, int rw) */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, - blkptr_t *nbp, void *dst, char **end) + blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -262,7 +268,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, - &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -277,23 +283,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, */ cksum.zc_word[ZIL_ZC_SEQ]++; + uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + zilc->zc_nused < sizeof (*zilc) || + zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - memcpy(dst, lr, len); - *end = (char *)dst + len; + *begin = lr; + *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); + char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, @@ -301,15 +307,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - memcpy(dst, lr, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; + *begin = lr; + *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } - - arc_buf_destroy(abuf, &abuf); } return (error); @@ -375,8 +377,12 @@ zil_sums_init(zil_sums_t *zs) wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void @@ -393,8 +399,12 @@ zil_sums_fini(zil_sums_t *zs) wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_write); + wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); + wmsum_fini(&zs->zil_itx_metaslab_slog_write); + wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void @@ -422,10 +432,18 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); + zs->zil_itx_metaslab_normal_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); + zs->zil_itx_metaslab_normal_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); + zs->zil_itx_metaslab_slog_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); + zs->zil_itx_metaslab_slog_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* @@ -445,7 +463,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; - char *lrbuf, *lrp; int error = 0; /* @@ -463,13 +480,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -485,8 +502,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); + &lrp, &end, &abuf); if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -499,7 +518,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); @@ -513,6 +532,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -522,7 +542,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } @@ -747,20 +766,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb->lwb_blk = *bp; lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; + lwb->lwb_indirect = B_FALSE; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = lwb->lwb_nfilled = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_max_txg = txg; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); @@ -1373,9 +1393,14 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zil_commit_waiter_t *zcw; itx_t *itx; uint64_t txg; + list_t itxs, waiters; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1384,9 +1409,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_root_zio = NULL; - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1397,13 +1419,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) - zil_itx_destroy(itx); + list_move_tail(&itxs, &lwb->lwb_itxs); + list_move_tail(&waiters, &lwb->lwb_waiters); - while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + + mutex_exit(&zilog->zl_lock); + + while ((itx = list_remove_head(&itxs)) != NULL) + zil_itx_destroy(itx); + list_destroy(&itxs); + + while ((zcw = list_remove_head(&waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been @@ -1428,8 +1458,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - - mutex_exit(&zilog->zl_lock); + list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; @@ -1666,46 +1695,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + if (lwb->lwb_root_zio != NULL) + return; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - - lwb->lwb_state = LWB_STATE_OPENED; - - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; } - mutex_exit(&zilog->zl_lock); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, + &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); + + lwb->lwb_state = LWB_STATE_OPENED; + + zil_lwb_set_zio_dependency(zilog, lwb); + zilog->zl_last_lwb_opened = lwb; + mutex_exit(&zilog->zl_lock); } /* @@ -1736,11 +1760,11 @@ static const struct { static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; @@ -1748,7 +1772,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; - uint64_t zil_blksz, wsz; + uint64_t zil_blksz; int i, error; boolean_t slog; @@ -1757,16 +1781,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; + /* + * If this lwb includes indirect writes, we have to commit before + * creating the transaction, otherwise we may end up in dead lock. + */ + if (lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; } - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. @@ -1814,19 +1839,18 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, + uint64_t, zil_blksz, + uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; BP_ZERO(bp); error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, - lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, - lwb->lwb_nused); - } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1838,17 +1862,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } + lwb->lwb_state = LWB_STATE_ISSUED; + + dmu_tx_commit(tx); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +/* + * Finalize previously closed block and issue the write zio. + * Does not require locking. + */ +static void +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + zil_chain_t *zilc; + int wsz; + + /* Actually fill the lwb with the data if not yet. */ + if (!lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + } + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); + ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; + zilc = (zil_chain_t *)lwb->lwb_buf; } else { wsz = lwb->lwb_sz; + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); } - zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; @@ -1858,22 +1912,28 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) */ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); + } spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); - - dmu_tx_commit(tx); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); } /* @@ -1909,13 +1969,19 @@ zil_max_copied_data(zilog_t *zilog) sizeof (lr_write_t)); } +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. + */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1923,8 +1989,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1938,24 +2004,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { + if (lr->lrc_txtype == TX_COMMIT) { mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; } else { - dlen = dpad = 0; + dlen = 0; } - reclen = lrc->lrc_reclen; + reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; cont: /* @@ -1968,7 +2033,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); @@ -1987,19 +2053,99 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - memcpy(lr_buf, lrc, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + /* + * We have to really issue all queued LWBs before we may have to + * wait for a txg sync. Otherwise we may end up in a dead lock. + */ + if (lr->lrc_txtype == TX_WRITE) { + boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); + if (frozen || itx->itx_wr_state == WR_INDIRECT) { + lwb_t *tlwb; + while ((tlwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, tlwb); + } + if (itx->itx_wr_state == WR_INDIRECT) + lwb->lwb_indirect = B_TRUE; + if (frozen) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + } + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. + */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_COMMIT) + return; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lr->lrc_reclen; + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, @@ -2010,14 +2156,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; + lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, - dnow); + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; @@ -2044,9 +2186,11 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ - memset((char *)dbuf + lrwb->lr_length, 0, dpad); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); + } /* * Typically, the only return values we should see from @@ -2074,39 +2218,26 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error); zfs_fallthrough; case EIO: - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lwb->lwb_indirect) { + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + } else { + lwb->lwb_write_zio->io_error = error; + } zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return (lwb); + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. - */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -2131,6 +2262,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { @@ -2162,7 +2303,7 @@ zil_itxg_clean(void *arg) /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -2561,7 +2702,7 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; @@ -2663,18 +2804,23 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + first = B_FALSE; + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } } else { @@ -2690,6 +2836,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2735,13 +2883,13 @@ zil_process_commit_list(zilog_t *zilog) * on the system, such that this function will be * immediately called again (not necessarily by the same * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to + * zil_lwb_assign(). This way, the lwb is guaranteed to * be "full" when it is issued to disk, and we'll make * use of the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the + * zil_lwb_assign(), zil_commit_waiter() will issue the * lwb's zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount @@ -2773,10 +2921,15 @@ zil_process_commit_list(zilog_t *zilog) zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); zilog->zl_cur_used = 0; - if (lwb == NULL) + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) + != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); + } } } } @@ -2799,9 +2952,13 @@ zil_process_commit_list(zilog_t *zilog) static void zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2828,10 +2985,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); } static void @@ -2858,7 +3018,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2876,8 +3036,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) + if (zcw->zcw_done) { + lwb = NULL; goto out; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2896,15 +3058,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * if it's ISSUED or OPENED, and block any other threads that might * attempt to issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + lwb->lwb_state == LWB_STATE_FLUSH_DONE) { + lwb = NULL; goto out; + } ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -2914,7 +3078,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); @@ -2934,7 +3098,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the @@ -2956,12 +3120,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * lock, which occurs prior to calling dmu_tx_commit() */ mutex_exit(&zcw->zcw_lock); + zil_lwb_write_issue(zilog, lwb); + lwb = NULL; zil_commit_writer_stall(zilog); mutex_enter(&zcw->zcw_lock); } out: mutex_exit(&zilog->zl_issuer_lock); + if (lwb) + zil_lwb_write_issue(zilog, lwb); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); } @@ -2976,7 +3144,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() + * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. @@ -3656,7 +3824,7 @@ zil_close(zilog_t *zilog) /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_issue(). + * zil_lwb_write_close(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index c17ca5e1d651..d7b2217623e6 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1617,12 +1617,6 @@ zio_read_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -3955,9 +3949,6 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (zio); - if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -3994,9 +3985,6 @@ zio_vdev_io_done(zio_t *zio) vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); @@ -4106,8 +4094,7 @@ zio_vdev_io_assess(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 06bc75c634a6..cd4e6f0c7558 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1203,8 +1203,7 @@ zvol_create_minors_recursive(const char *name) * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ - while ((job = list_head(&minors_list)) != NULL) { - list_remove(&minors_list, job); + while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); @@ -1311,10 +1310,8 @@ zvol_remove_minors_impl(const char *name) rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_head(&free_list)) != NULL) { - list_remove(&free_list, zv); + while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); - } } /* Remove minor for this specific volume only */ diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run index 9ed1a6d37a97..342f56d50d04 100644 --- a/sys/contrib/openzfs/tests/runfiles/common.run +++ b/sys/contrib/openzfs/tests/runfiles/common.run @@ -128,7 +128,7 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] + 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] pre = post = tags = ['functional', 'cli_root', 'zdb'] @@ -472,7 +472,8 @@ tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] -tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', + 'zpool_resilver_concurrent'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] diff --git a/sys/contrib/openzfs/tests/runfiles/freebsd.run b/sys/contrib/openzfs/tests/runfiles/freebsd.run index c7ca1d769fc3..13696d645850 100644 --- a/sys/contrib/openzfs/tests/runfiles/freebsd.run +++ b/sys/contrib/openzfs/tests/runfiles/freebsd.run @@ -25,3 +25,8 @@ tags = ['functional'] [tests/functional/cli_root/zfs_jail:FreeBSD] tests = ['zfs_jail_001_pos'] tags = ['functional', 'cli_root', 'zfs_jail'] + +[tests/functional/pam:FreeBSD] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', + 'pam_short_password'] +tags = ['functional', 'pam'] diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index 4df770d61f07..618eeb934017 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -140,7 +140,8 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_nounmount', 'pam_short_password'] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', + 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in index 3f7498f5c6bf..9517ce8073a5 100755 --- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in +++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in @@ -152,6 +152,7 @@ known = { ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], + 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], @@ -163,6 +164,8 @@ if sys.platform.startswith('freebsd'): known.update({ 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], + 'cli_root/zpool_resilver/zpool_resilver_concurrent': + ['SKIP', na_reason], 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], @@ -277,6 +280,8 @@ elif sys.platform.startswith('linux'): 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], + 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c index 9a34bf559be0..fda9229915ce 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/btree_test.c @@ -501,7 +501,7 @@ main(int argc, char *argv[]) srandom(seed); zfs_btree_init(); - zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t)); + zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t)); /* * This runs the named negative test. None of them should diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index ad4aec543299..ff65dc1ac2b0 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -572,6 +572,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ + functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ @@ -1142,6 +1143,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ + functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh new file mode 100755 index 000000000000..d98ab86ab667 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +write_count=8 +blksize=131072 + +tmpfile=$TEST_BASE_DIR/tmpfile + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm $tmpfile.1 $tmpfile.2 +} + +log_onexit cleanup + +log_assert "Verify that zfs send and zdb -B produce the same stream" + +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count + +snap=$TESTPOOL/$TESTFS@snap +log_must zfs snapshot $snap +typeset -i objsetid=$(zfs get -Ho value objsetid $snap) + +sync_pool $TESTPOOL + +log_must eval "zfs send -ecL $snap > $tmpfile.1" +log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2" + +typeset sum1=$(cat $tmpfile.1 | md5sum) +typeset sum2=$(cat $tmpfile.2 | md5sum) + +log_must test "$sum1" = "$sum2" + +log_pass "zfs send and zdb -B produce the same stream" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh new file mode 100755 index 000000000000..4c3b09796869 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 Hewlett Packard Enterprise Development LP. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify 'zpool clear' doesn't cause concurrent resilvers +# +# STRATEGY: +# 1. Create N(10) virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Force-fault 2 vdevs and verify distributed spare is kicked in. +# 5. Free the distributed spare by replacing the faulty drive. +# 6. Run zpool clear and verify that it does not initiate 2 resilvers +# concurrently while distributed spare gets kicked in. +# + +verify_runnable "global" + +typeset -ir devs=10 +typeset -ir nparity=1 +typeset -ir ndata=8 +typeset -ir dspare=1 + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$BASEDIR/vdev$i" + done + + for dir in $BASEDIR; do + if [[ -d $dir ]]; then + log_must rm -rf $dir + fi + done + + zed_stop + zed_cleanup +} + +log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers" +log_onexit cleanup + +setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs + +# ZED needed for sequential resilver +zed_setup +log_must zed_start + +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5 +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6 + +log_must zpool labelclear -f $BASEDIR/vdev5 +log_must zpool labelclear -f $BASEDIR/vdev6 + +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5 +sync_pool $TESTPOOL + +log_must zpool events -c +log_must zpool clear $TESTPOOL +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +nof_resilver=$(zpool events | grep -c resilver_start) +if [ $nof_resilver = 1 ] ; then + log_must verify_pool $TESTPOOL + log_pass "zpool clear on draid pool doesn't cause concurrent resilvers" +else + log_fail "FAIL: sequential and healing resilver initiated concurrently" +fi diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh index 9bf6a94cfc84..9deee67a56ca 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh @@ -25,7 +25,7 @@ is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" set -A args "" "-s \",\"" "-v" \ - "-f time,zcwc,zimnb,zimsb" + "-f time,cwc,imnb,imsb" log_assert "zilstat generates output and doesn't return an error code" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh index 971c7fce64e5..dbcb175ed069 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -25,5 +25,6 @@ rmconfig destroy_pool $TESTPOOL del_user ${username} +del_user ${username}rec del_group pamtestgroup log_must rm -rf "$runstatedir" $TESTDIRS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh new file mode 100755 index 000000000000..91b202f7609d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}" + +printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +echo "secondpass" | pamtester ${pamservice} ${username} open_session +references 1 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok + +log_must ismounted "$TESTPOOL/pam/${username}" +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +log_must pamtester ${pamservice} ${username} close_session +references 0 +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +log_pass "done." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh new file mode 100755 index 000000000000..3714b179b852 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +username="${username}rec" + +# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests, +# and a user which references that mountpoint +log_must zfs create "$TESTPOOL/pampam" +log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam" +echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \ + -o keylocation=prompt "$TESTPOOL/pampam/pam/${username}" +log_must zfs unmount "$TESTPOOL/pampam/pam/${username}" +log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}" +log_must add_user pamtestgroup ${username} "$TESTDIR/rec" + +function keystatus { + log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ] +} + +log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" +keystatus unavailable + +function test_session { + echo "recurpass" | pamtester ${pamservice} ${username} open_session + references 1 + log_must ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus available + + log_must pamtester ${pamservice} ${username} close_session + references 0 + log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus unavailable +} + +genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +log_pass "done." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh index 443e07d7f003..079608583a72 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh @@ -52,7 +52,7 @@ log_must ismounted "$TESTPOOL/pam/${username}" keystatus available # Change user and dataset password to short one. -printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok +printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok # Unmount and unload key. log_must pamtester ${pamservice} ${username} close_session diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index f970935f5bd0..087aef9027ea 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -38,6 +38,8 @@ verify_runnable "global" +log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" + function test_cleanup { # reset memory limit to 16M diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 9ebd5b149118..619d8d0e8f07 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -45,6 +45,15 @@ fi if ! is_linux ; then log_unsupported "Only linux supports dd with oflag=dsync for FUA writes" +else + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi fi typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index 46cac3ecb6c2..c0b191aafd45 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -44,6 +44,15 @@ verify_runnable "global" if is_linux ; then + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + # We need '--force' here since the prior tests may leave a filesystem # on the zvol, and blkdiscard will see that filesystem and print a # warning unless you force it. @@ -123,7 +132,6 @@ log_must zfs set compression=off $TESTPOOL/$TESTVOL # Remove old data from previous tests log_must $trimcmd $zvolpath - set_blk_mq 1 log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL diff --git a/sys/modules/zfs/Makefile b/sys/modules/zfs/Makefile index c72becff924e..2cfdf06f8f01 100644 --- a/sys/modules/zfs/Makefile +++ b/sys/modules/zfs/Makefile @@ -295,7 +295,6 @@ SRCS+= abd.c \ uberblock.c \ unique.c \ vdev.c \ - vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_indirect.c \ diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index d0ad587ed327..dc40483fe785 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -653,6 +653,9 @@ /* qat is enabled and existed */ /* #undef HAVE_QAT */ +/* struct reclaim_state has reclaimed */ +/* #undef HAVE_RECLAIM_STATE_RECLAIMED */ + /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ @@ -1048,7 +1051,7 @@ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gad0a55461" +#define ZFS_META_ALIAS "zfs-2.1.99-FreeBSD_gfeff9dfed" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" @@ -1057,7 +1060,7 @@ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ -#define ZFS_META_KVER_MAX "6.2" +#define ZFS_META_KVER_MAX "6.3" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" @@ -1078,7 +1081,7 @@ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "FreeBSD_gad0a55461" +#define ZFS_META_RELEASE "FreeBSD_gfeff9dfed" /* Define the project version. */ #define ZFS_META_VERSION "2.1.99" diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 5c7ebbec1dc5..932bf9730c7a 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.1.99-1955-gad0a55461" +#define ZFS_META_GITREV "zfs-2.1.99-1993-gfeff9dfed"