Notable upstream pull request merges:
 #15509 b1e46f869 Add ashift validation when adding devices to a pool
 #15927 45e23abed Update resume token at object receive
 #15941 bf8f72359 BRT: Skip duplicate BRT prefetches
 #15950 8cd8ccca5 BRT: Skip getting length in brt_entry_lookup()
 #15951 80cc51629 ZAP: Massively switch to _by_dnode() interfaces
 #15954 2c01cae8b BRT: Change brt_pending_tree sorting order
 #15955 4616b96a6 BRT: Relax brt_pending_apply() locking
 #15959 5c4a4f82c zio: update ZIO type x stage documentation
 #15962 493fcce9b Provide macros for setting and getting blkptr birth times
 #15963 90ff73235 freebsd: fix missing headers in distribution tarball
 #15967 f68bde723 BRT: Make BRT block sizes configurable
 #15976 c28f94f32 ZAP: Some cleanups/micro-optimizations
 #15995 cfb96c772 vdev_disk: clean up spa/bdev mode conversion
 #16006 c0aab8b8f zvols: prevent overflow of minor device numbers
 #16007 a89d209bb BRT: Fix holes cloning
 #16008 c9d8f6c59 Fix option string, adding -e and fixing order

Obtained from:	OpenZFS
OpenZFS commit:	39be46f43f
Martin Matuska 2024-03-30 22:14:52 +01:00
commit 783d3ff6d7
88 changed files with 1801 additions and 841 deletions

View file

@ -199,7 +199,8 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
break;
sublivelist_verify_block_t svb = {
.svb_dva = bp->blk_dva[i],
.svb_allocated_txg = bp->blk_birth
.svb_allocated_txg =
BP_GET_LOGICAL_BIRTH(bp)
};
if (zfs_btree_find(&sv->sv_leftover, &svb,
@ -2340,7 +2341,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
(int)BPE_GET_ETYPE(bp),
(u_longlong_t)BPE_GET_LSIZE(bp),
(u_longlong_t)BPE_GET_PSIZE(bp),
(u_longlong_t)bp->blk_birth);
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
return;
}
@ -2358,7 +2359,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
buflen - strlen(blkbuf),
"%llxL B=%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)bp->blk_birth);
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
} else {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf),
@ -2366,8 +2367,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
(u_longlong_t)BP_GET_BIRTH(bp));
if (bp_freed)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), " %s", "FREE");
@ -2417,7 +2418,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
{
int err = 0;
if (bp->blk_birth == 0)
if (BP_GET_LOGICAL_BIRTH(bp) == 0)
return (0);
print_indirect(spa, bp, zb, dnp);
@ -2605,7 +2606,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
(void) arg, (void) tx;
char blkbuf[BP_SPRINTF_LEN];
if (bp->blk_birth != 0) {
if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
}
@ -2646,7 +2647,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
(void) arg, (void) tx;
char blkbuf[BP_SPRINTF_LEN];
ASSERT(bp->blk_birth != 0);
ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
(void) printf("\t%s\n", blkbuf);
return (0);
@ -5788,7 +5789,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (zb->zb_level == ZB_DNODE_LEVEL)
return (0);
if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("objset %llu object %llu "

View file

@ -173,8 +173,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
@ -186,7 +186,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
(void) printf("%s<hole>\n", tab_prefix);
return;
}
if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
(void) printf("%s<block already committed>\n",
tab_prefix);
return;
@ -237,8 +237,8 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)
if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
}
@ -473,7 +473,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
if (claim_txg != 0)
claim = "already claimed";
else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
claim = "will claim";
else
claim = "won't claim";

View file

@ -612,8 +612,8 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
* Uberblock root block pointer has valid birth TXG.
* Copying it to the label NVlist
*/
if (ub->ub_rootbp.blk_birth != 0) {
const uint64_t txg = ub->ub_rootbp.blk_birth;
if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
ub->ub_txg = txg;
if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {

View file

@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path)
}
/* Remove trailing newline */
if (buf[count - 1] == '\n')
if (count > 0 && buf[count - 1] == '\n')
buf[count - 1] = 0;
close(fd);
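The guard added above matters when read() returns zero bytes: without it, buf[count - 1] evaluates buf[-1], an out-of-bounds access. A minimal sketch of the corrected pattern (buf, fd and count stand in for the locals in zpool_sysfs_gets()):

	char buf[4096] = {0};
	ssize_t count = read(fd, buf, sizeof (buf) - 1);
	/* Strip the trailing newline only if something was read. */
	if (count > 0 && buf[count - 1] == '\n')
		buf[count - 1] = '\0';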

View file

@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright (c) 2012 by Frederik Wessels. All rights reserved.
* Copyright (c) 2012 by Cyril Plisko. All rights reserved.
* Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved.
@ -131,6 +131,13 @@ static int zpool_do_help(int argc, char **argv);
static zpool_compat_status_t zpool_do_load_compat(
const char *, boolean_t *);
enum zpool_options {
ZPOOL_OPTION_POWER = 1024,
ZPOOL_OPTION_ALLOW_INUSE,
ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH,
ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH
};
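Long-only options need getopt_long() return values that cannot collide with any short-option character, hence the 1024 base above. A generic sketch of the idiom, with illustrative names:

	enum { OPT_FOO = 1024, OPT_BAR };	/* above any char value */
	static const struct option lopts[] = {
		{"foo", no_argument, NULL, OPT_FOO},
		{"bar", no_argument, NULL, OPT_BAR},
		{0, 0, 0, 0}
	};
	/* getopt_long(argc, argv, "fg", lopts, NULL) returns OPT_FOO for
	 * --foo, OPT_BAR for --bar, and 'f'/'g' for the short options. */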
/*
* These libumem hooks provide a reasonable set of defaults for the allocator's
* debugging facilities.
@ -347,7 +354,7 @@ get_usage(zpool_help_t idx)
{
switch (idx) {
case HELP_ADD:
return (gettext("\tadd [-fgLnP] [-o property=value] "
return (gettext("\tadd [-afgLnP] [-o property=value] "
"<pool> <vdev> ...\n"));
case HELP_ATTACH:
return (gettext("\tattach [-fsw] [-o property=value] "
@ -413,7 +420,7 @@ get_usage(zpool_help_t idx)
"[<device> ...]\n"));
case HELP_STATUS:
return (gettext("\tstatus [--power] [-c [script1,script2,...]] "
"[-igLpPstvxD] [-T d|u] [pool] ... \n"
"[-DegiLpPstvx] [-T d|u] [pool] ...\n"
"\t [interval [count]]\n"));
case HELP_UPGRADE:
return (gettext("\tupgrade\n"
@ -1009,8 +1016,9 @@ add_prop_list_default(const char *propname, const char *propval,
}
/*
* zpool add [-fgLnP] [-o property=value] <pool> <vdev> ...
* zpool add [-afgLnP] [-o property=value] <pool> <vdev> ...
*
* -a Disable the ashift validation checks
* -f Force addition of devices, even if they appear in use
* -g Display guid for individual vdev name.
* -L Follow links when resolving vdev path name.
@ -1026,8 +1034,11 @@ add_prop_list_default(const char *propname, const char *propval,
int
zpool_do_add(int argc, char **argv)
{
boolean_t force = B_FALSE;
boolean_t check_replication = B_TRUE;
boolean_t check_inuse = B_TRUE;
boolean_t dryrun = B_FALSE;
boolean_t check_ashift = B_TRUE;
boolean_t force = B_FALSE;
int name_flags = 0;
int c;
nvlist_t *nvroot;
@ -1038,8 +1049,18 @@ zpool_do_add(int argc, char **argv)
nvlist_t *props = NULL;
char *propval;
struct option long_options[] = {
{"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE},
{"allow-replication-mismatch", no_argument, NULL,
ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH},
{"allow-ashift-mismatch", no_argument, NULL,
ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH},
{0, 0, 0, 0}
};
/* check options */
while ((c = getopt(argc, argv, "fgLno:P")) != -1) {
while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL))
!= -1) {
switch (c) {
case 'f':
force = B_TRUE;
@ -1069,6 +1090,15 @@ zpool_do_add(int argc, char **argv)
case 'P':
name_flags |= VDEV_NAME_PATH;
break;
case ZPOOL_OPTION_ALLOW_INUSE:
check_inuse = B_FALSE;
break;
case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH:
check_replication = B_FALSE;
break;
case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH:
check_ashift = B_FALSE;
break;
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -1089,6 +1119,19 @@ zpool_do_add(int argc, char **argv)
usage(B_FALSE);
}
if (force) {
if (!check_inuse || !check_replication || !check_ashift) {
(void) fprintf(stderr, gettext("'-f' option is not "
"allowed with '--allow-replication-mismatch', "
"'--allow-ashift-mismatch', or "
"'--allow-in-use'\n"));
usage(B_FALSE);
}
check_inuse = B_FALSE;
check_replication = B_FALSE;
check_ashift = B_FALSE;
}
poolname = argv[0];
argc--;
@ -1119,8 +1162,8 @@ zpool_do_add(int argc, char **argv)
}
/* pass off to make_root_vdev for processing */
nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun,
argc, argv);
nvroot = make_root_vdev(zhp, props, !check_inuse,
check_replication, B_FALSE, dryrun, argc, argv);
if (nvroot == NULL) {
zpool_close(zhp);
return (1);
@ -1224,7 +1267,7 @@ zpool_do_add(int argc, char **argv)
ret = 0;
} else {
ret = (zpool_add(zhp, nvroot) != 0);
ret = (zpool_add(zhp, nvroot, check_ashift) != 0);
}
nvlist_free(props);
@ -7081,7 +7124,6 @@ zpool_do_split(int argc, char **argv)
return (ret);
}
#define POWER_OPT 1024
/*
* zpool online [--power] <pool> <device> ...
@ -7099,7 +7141,7 @@ zpool_do_online(int argc, char **argv)
int flags = 0;
boolean_t is_power_on = B_FALSE;
struct option long_options[] = {
{"power", no_argument, NULL, POWER_OPT},
{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
{0, 0, 0, 0}
};
@ -7109,7 +7151,7 @@ zpool_do_online(int argc, char **argv)
case 'e':
flags |= ZFS_ONLINE_EXPAND;
break;
case POWER_OPT:
case ZPOOL_OPTION_POWER:
is_power_on = B_TRUE;
break;
case '?':
@ -7222,7 +7264,7 @@ zpool_do_offline(int argc, char **argv)
boolean_t is_power_off = B_FALSE;
struct option long_options[] = {
{"power", no_argument, NULL, POWER_OPT},
{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
{0, 0, 0, 0}
};
@ -7235,7 +7277,7 @@ zpool_do_offline(int argc, char **argv)
case 't':
istmp = B_TRUE;
break;
case POWER_OPT:
case ZPOOL_OPTION_POWER:
is_power_off = B_TRUE;
break;
case '?':
@ -7335,7 +7377,7 @@ zpool_do_clear(int argc, char **argv)
char *pool, *device;
struct option long_options[] = {
{"power", no_argument, NULL, POWER_OPT},
{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
{0, 0, 0, 0}
};
@ -7352,7 +7394,7 @@ zpool_do_clear(int argc, char **argv)
case 'X':
xtreme_rewind = B_TRUE;
break;
case POWER_OPT:
case ZPOOL_OPTION_POWER:
is_power_on = B_TRUE;
break;
case '?':
@ -9177,22 +9219,22 @@ status_callback(zpool_handle_t *zhp, void *data)
}
/*
* zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ...
* zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ...
* [pool] [interval [count]]
*
* -c CMD For each vdev, run command CMD
* -D Display dedup status (undocumented)
* -e Display only unhealthy vdevs
* -i Display vdev initialization status.
* -g Display guid for individual vdev name.
* -i Display vdev initialization status.
* -L Follow links when resolving vdev path name.
* -p Display values in parsable (exact) format.
* -P Display full path for vdev name.
* -s Display slow IOs column.
* -v Display complete error logs
* -x Display only pools with potential problems
* -D Display dedup status (undocumented)
* -t Display vdev TRIM status.
* -T Display a timestamp in date(1) or Unix format
* -v Display complete error logs
* -x Display only pools with potential problems
* --power Display vdev enclosure slot power status
*
* Describes the health status of all pools or some subset.
@ -9208,12 +9250,12 @@ zpool_do_status(int argc, char **argv)
char *cmd = NULL;
struct option long_options[] = {
{"power", no_argument, NULL, POWER_OPT},
{"power", no_argument, NULL, ZPOOL_OPTION_POWER},
{0, 0, 0, 0}
};
/* check options */
while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options,
while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options,
NULL)) != -1) {
switch (c) {
case 'c':
@ -9240,15 +9282,18 @@ zpool_do_status(int argc, char **argv)
}
cmd = optarg;
break;
case 'D':
cb.cb_dedup_stats = B_TRUE;
break;
case 'e':
cb.cb_print_unhealthy = B_TRUE;
break;
case 'i':
cb.cb_print_vdev_init = B_TRUE;
break;
case 'g':
cb.cb_name_flags |= VDEV_NAME_GUID;
break;
case 'i':
cb.cb_print_vdev_init = B_TRUE;
break;
case 'L':
cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS;
break;
@ -9261,22 +9306,19 @@ zpool_do_status(int argc, char **argv)
case 's':
cb.cb_print_slow_ios = B_TRUE;
break;
case 'v':
cb.cb_verbose = B_TRUE;
break;
case 'x':
cb.cb_explain = B_TRUE;
break;
case 'D':
cb.cb_dedup_stats = B_TRUE;
break;
case 't':
cb.cb_print_vdev_trim = B_TRUE;
break;
case 'T':
get_timestamp_arg(*optarg);
break;
case POWER_OPT:
case 'v':
cb.cb_verbose = B_TRUE;
break;
case 'x':
cb.cb_explain = B_TRUE;
break;
case ZPOOL_OPTION_POWER:
cb.cb_print_power = B_TRUE;
break;
case '?':
@ -9315,7 +9357,6 @@ zpool_do_status(int argc, char **argv)
if (cb.vcdl != NULL)
free_vdev_cmd_data_list(cb.vcdl);
if (argc == 0 && cb.cb_count == 0)
(void) fprintf(stderr, gettext("no pools available\n"));
else if (cb.cb_explain && cb.cb_first && cb.cb_allpools)

View file

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@ -3375,7 +3375,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)
"log" : NULL, raidz_children, zs->zs_mirrors,
1);
error = spa_vdev_add(spa, nvroot);
error = spa_vdev_add(spa, nvroot, B_FALSE);
fnvlist_free(nvroot);
switch (error) {
@ -3438,7 +3438,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id)
nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0,
class, raidz_children, zs->zs_mirrors, 1);
error = spa_vdev_add(spa, nvroot);
error = spa_vdev_add(spa, nvroot, B_FALSE);
fnvlist_free(nvroot);
if (error == ENOSPC)
@ -3545,7 +3545,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id)
*/
nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL,
(ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1);
error = spa_vdev_add(spa, nvroot);
error = spa_vdev_add(spa, nvroot, B_FALSE);
switch (error) {
case 0:

View file

@ -4,6 +4,7 @@ dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [
ZFS_LINUX_TEST_SRC([filemap_range_has_page], [
#include <linux/fs.h>
#include <linux/pagemap.h>
],[
struct address_space *mapping = NULL;
loff_t lstart = 0;

View file

@ -0,0 +1,17 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
ZFS_LINUX_TEST_SRC([page_size], [
#include <linux/mm.h>
],[
unsigned long s;
s = page_size(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
AC_MSG_CHECKING([whether page_size() is available])
ZFS_LINUX_TEST_RESULT([page_size], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
],[
AC_MSG_RESULT(no)
])
])

View file

@ -16,6 +16,9 @@ dnl #
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
dnl # generic_copy_file_range() added to support it
dnl #
dnl # 6.8: generic_copy_file_range() removed, replaced by
dnl # splice_copy_file_range()
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
#include <linux/fs.h>
@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([splice_copy_file_range], [
#include <linux/splice.h>
], [
struct file *src_file __attribute__ ((unused)) = NULL;
loff_t src_off __attribute__ ((unused)) = 0;
struct file *dst_file __attribute__ ((unused)) = NULL;
loff_t dst_off __attribute__ ((unused)) = 0;
size_t len __attribute__ ((unused)) = 0;
splice_copy_file_range(src_file, src_off, dst_file, dst_off,
len);
])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [
AC_MSG_CHECKING([whether splice_copy_file_range() is available])
ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1,
[splice_copy_file_range() is available])
],[
AC_MSG_RESULT(no)
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
#include <linux/fs.h>

View file

@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
@ -166,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
ZFS_AC_KERNEL_SRC_SYNC_BDEV
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@ -266,6 +268,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_VFS_IOV_ITER
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
@ -314,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_COPY_SPLICE_READ
ZFS_AC_KERNEL_SYNC_BDEV
ZFS_AC_KERNEL_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE

View file

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2022 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright Joyent, Inc.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2016, Intel Corporation.
@ -158,6 +158,7 @@ typedef enum zfs_error {
EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */
EZFS_SHAREFAILED, /* filesystem share failed */
EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */
EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */
EZFS_UNKNOWN
} zfs_error_t;
@ -261,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *);
_LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *,
nvlist_t *, nvlist_t *);
_LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *);
_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *);
_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift);
typedef struct splitflags {
/* do not split, but return the config that would be split off */

View file

@ -80,7 +80,9 @@ noinst_HEADERS = \
%D%/spl/sys/zmod.h \
%D%/spl/sys/zone.h \
\
%D%/zfs/sys/arc_os.h \
%D%/zfs/sys/freebsd_crypto.h \
%D%/zfs/sys/freebsd_event.h \
%D%/zfs/sys/vdev_os.h \
%D%/zfs/sys/zfs_bootenv_os.h \
%D%/zfs/sys/zfs_context_os.h \

View file

@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
%D%/kernel/linux/compiler_compat.h \
%D%/kernel/linux/dcache_compat.h \
%D%/kernel/linux/kmap_compat.h \
%D%/kernel/linux/mm_compat.h \
%D%/kernel/linux/mod_compat.h \
%D%/kernel/linux/page_compat.h \
%D%/kernel/linux/percpu_compat.h \

View file

@ -0,0 +1,36 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ZFS_MM_COMPAT_H
#define _ZFS_MM_COMPAT_H
#include <linux/mm.h>
/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
#ifndef HAVE_MM_PAGE_SIZE
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
#endif
#endif /* _ZFS_MM_COMPAT_H */
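With this header in place, callers can use page_size() unconditionally; a one-line sketch (pg is any struct page pointer, purely illustrative):

	/* Native helper on >= 5.4; PAGE_SIZE << compound_order(pg) before. */
	unsigned long nbytes = page_size(pg);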

View file

@ -68,6 +68,7 @@ enum scope_prefix_types {
zfs_trim,
zfs_txg,
zfs_vdev,
zfs_vdev_disk,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_vnops,

View file

@ -79,6 +79,9 @@ typedef struct abd {
typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif
extern int zfs_abd_scatter_enabled;
@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
@ -213,6 +220,8 @@ void abd_fini(void);
/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
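A hedged sketch of the new per-page iteration interface declared above. The callback receives each page along with the offset and length of valid data within it (argument roles inferred from the abd_iter_page_func_t typedef and the iter_page_* fields; the callback and caller below are illustrative, not part of the commit):

	static int
	abd_count_pages_cb(struct page *pg, size_t off, size_t len, void *priv)
	{
		(void) pg; (void) off; (void) len;
		(*(size_t *)priv)++;	/* tally each page spanned */
		return (0);		/* nonzero would stop the walk */
	}

	/* size_t n = 0; */
	/* (void) abd_iterate_page_func(abd, 0, abd_get_size(abd), */
	/*     abd_count_pages_cb, &n); */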

View file

@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#ifndef _ABD_IMPL_H
@ -38,12 +39,30 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;
struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;
struct abd_iter {
/* public interface */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
union {
/* for abd_iter_map()/abd_iter_unmap() */
struct {
/* addr corresponding to iter_pos */
void *iter_mapaddr;
/* length of data valid at mapaddr */
size_t iter_mapsize;
};
/* for abd_iter_page() */
struct {
/* current page */
struct page *iter_page;
/* offset of data in page */
size_t iter_page_doff;
/* size of data in page */
size_t iter_page_dsize;
};
};
/* private */
abd_t *iter_abd; /* ABD being iterated through */
@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);
/*
* Helper macros

View file

@ -752,8 +752,6 @@ void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);
void *dmu_buf_get_user(dmu_buf_t *db);
objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);
/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);
@ -902,6 +900,8 @@ extern uint_t zfs_max_recordsize;
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
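Illustrative use of the new by-dnode variant declared above: callers that already hold a dnode skip the (objset, object) resolution that dmu_prefetch() performs on every call (dn, off and len are placeholders; the priority is one plausible choice):

	dmu_prefetch_by_dnode(dn, 0, off, len, ZIO_PRIORITY_SYNC_READ);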
typedef struct dmu_object_info {

View file

@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@ -1603,6 +1603,7 @@ typedef enum {
ZFS_ERR_RESUME_EXISTS,
ZFS_ERR_CRYPTO_NOTSUP,
ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS,
ZFS_ERR_ASHIFT_MISMATCH,
} zfs_errno_t;
/*

View file

@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2021 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@ -125,15 +125,15 @@ typedef struct zio_cksum_salt {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | pad | vdev1 | GRID | ASIZE |
* 0 | pad | vdev1 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | pad | vdev2 | GRID | ASIZE |
* 2 | pad | vdev2 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 4 | pad | vdev3 | GRID | ASIZE |
* 4 | pad | vdev3 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@ -165,7 +165,6 @@ typedef struct zio_cksum_salt {
* LSIZE logical size
* PSIZE physical size (after compression)
* ASIZE allocated size (including RAID-Z parity and gang block headers)
* GRID RAID-Z layout information (reserved for future use)
* cksum checksum function
* comp compression function
* G gang block indicator
@ -190,11 +189,11 @@ typedef struct zio_cksum_salt {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | vdev1 | GRID | ASIZE |
* 0 | vdev1 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | vdev2 | GRID | ASIZE |
* 2 | vdev2 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@ -355,7 +354,7 @@ typedef enum bp_embedded_type {
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1]))
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
@ -374,8 +373,7 @@ typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
uint64_t blk_pad[2]; /* Extra space for the future */
uint64_t blk_phys_birth; /* txg when block was allocated */
uint64_t blk_birth; /* transaction group at birth */
uint64_t blk_birth_word[2];
uint64_t blk_fill; /* fill count */
zio_cksum_t blk_cksum; /* 256-bit checksum */
} blkptr_t;
@ -395,9 +393,6 @@ typedef struct blkptr {
BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define DVA_SET_VDEV(dva, x) \
BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
@ -480,15 +475,23 @@ typedef struct blkptr {
#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1]
#define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x))
#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0]
#define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x))
#define BP_GET_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \
BP_GET_LOGICAL_BIRTH(bp))
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
BP_SET_LOGICAL_BIRTH(bp, logical); \
BP_SET_PHYSICAL_BIRTH(bp, \
((logical) == (physical) ? 0 : (physical))); \
}
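A short illustration of the new accessors (bp and txg are placeholders); the rest of this commit mechanically converts direct blk_birth/blk_phys_birth access to these macros:

	uint64_t lb = BP_GET_LOGICAL_BIRTH(bp);	/* was bp->blk_birth */
	uint64_t b = BP_GET_BIRTH(bp);		/* was BP_PHYSICAL_BIRTH(bp) */

	/* Physical birth is stored as 0 whenever it equals the logical. */
	BP_SET_BIRTH(bp, txg, txg);
	ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==, 0);
	ASSERT3U(BP_GET_BIRTH(bp), ==, txg);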
#define BP_GET_FILL(bp) \
@ -541,8 +544,8 @@ typedef struct blkptr {
(dva1)->dva_word[0] == (dva2)->dva_word[0])
#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
(bp1)->blk_birth == (bp2)->blk_birth && \
(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \
BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@ -581,8 +584,8 @@ typedef struct blkptr {
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
(bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_birth_word[0] = 0; \
(bp)->blk_birth_word[1] = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
@ -631,7 +634,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else if (BP_IS_EMBEDDED(bp)) { \
len = func(buf + len, size - len, \
"EMBEDDED [L%llu %s] et=%u %s " \
@ -642,14 +645,14 @@ typedef struct blkptr {
compress, \
(u_longlong_t)BPE_GET_LSIZE(bp), \
(u_longlong_t)BPE_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else if (BP_IS_REDACTED(bp)) { \
len += func(buf + len, size - len, \
"REDACTED [L%llu %s] size=%llxL birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
@ -691,8 +694,8 @@ typedef struct blkptr {
ws, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_BIRTH(bp), \
(u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
@ -782,7 +785,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_DETACH_SPARE 0x4000
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check);
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing, int rebuild);
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
@ -1142,9 +1145,9 @@ extern const char *spa_state_to_name(spa_t *spa);
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb,
const uint64_t *birth);
const uint64_t birth);
extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb,
const uint64_t *birth);
uint64_t birth);
extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,

View file

@ -165,7 +165,7 @@ struct uberblock {
* pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
* the value of the field is used to determine which ZIL blocks have
* been allocated according to the ms_sm when we are rewinding to a
* checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
* checkpoint. Specifically, if logical birth > ub_checkpoint_txg, then
* the ZIL block is not allocated [see uses of spa_min_claim_txg()].
*/
uint64_t ub_checkpoint_txg;

View file

@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key,
int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx);
/*
* Set the attribute with the given name to the given value. If an
@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name,
int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
/*
* Get the length (in integers) and the integer size of the specified
@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name,
int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx);
int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, dmu_tx_t *tx);
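A sketch of how the *_by_dnode variants above are called (dn and tx are placeholders); they mirror the objset/object forms but reuse a dnode the caller already holds, avoiding a lookup per operation:

	uint64_t key = 0x1234;
	uint64_t val = 42;

	/* Equivalent to zap_update_uint64(os, zapobj, ...) minus the
	 * per-call dnode resolution. */
	int err = zap_update_uint64_by_dnode(dn, &key, 1,
	    sizeof (val), 1, &val, tx);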
/*
* Returns (in *count) the number of attributes in the specified zap

View file

@ -145,6 +145,7 @@ typedef struct zap {
dmu_buf_user_t zap_dbu;
objset_t *zap_objset;
uint64_t zap_object;
dnode_t *zap_dnode;
struct dmu_buf *zap_dbuf;
krwlock_t zap_rwlock;
boolean_t zap_ismicro;

View file

@ -47,7 +47,7 @@ struct zap_stats;
* entries - header space (2*chunksize)
*/
#define ZAP_LEAF_NUMCHUNKS_BS(bs) \
(((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
(((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
ZAP_LEAF_CHUNKSIZE - 2)
#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs)))
@ -80,7 +80,7 @@ struct zap_stats;
* chunks per entry (3).
*/
#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5)
#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs))
#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs)))
#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs)))
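A note on the 1 to 1U change above: it keeps these shift expressions unsigned (matching l_bs becoming uint_t below) and removes the signed-overflow hazard should the shift count ever approach the width of int. A minimal illustration:

	unsigned bs = 31;	/* hypothetical extreme shift */
	/* (1 << bs) would overflow signed int: undefined behavior. */
	unsigned ok = 1U << bs;	/* well-defined: 0x80000000 */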
@ -163,7 +163,7 @@ typedef struct zap_leaf {
dmu_buf_user_t l_dbu;
krwlock_t l_rwlock;
uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
int l_bs; /* block size shift */
uint_t l_bs; /* block size shift */
dmu_buf_t *l_dbuf;
} zap_leaf_t;
@ -243,7 +243,7 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
*/
extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len);
extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t len);
extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort);
extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l,
struct zap_stats *zs);

View file

@ -25,6 +25,7 @@
/*
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2024, Klara Inc.
*/
#ifndef _ZIO_IMPL_H
@ -39,7 +40,7 @@ extern "C" {
*
* The ZFS I/O pipeline is comprised of various stages which are defined
* in the zio_stage enum below. The individual stages are used to construct
* these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
* these basic I/O operations: Read, Write, Free, Claim, Ioctl, and Trim.
*
* I/O operations: (XXX - provide detail for each of the operations)
*
@ -48,6 +49,7 @@ extern "C" {
* Free:
* Claim:
* Ioctl:
* Trim:
*
* Although the most common pipeline are used by the basic I/O operations
* above, there are some helper pipelines (one could consider them
@ -120,43 +122,43 @@ extern "C" {
* zio pipeline stage definitions
*/
enum zio_stage {
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_OPEN = 1 << 0, /* RWFCIT */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R----- */
ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W---- */
ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F--- */
ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* -WF--T */
ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W---- */
ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */
ZIO_STAGE_ENCRYPT = 1 << 6, /* -W---- */
ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W---- */
ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */
ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */
ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */
ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */
ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */
ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */
ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */
ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */
ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */
ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */
ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */
ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */
ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */
ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */
ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */
ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */
ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */
ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W---- */
ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W---- */
ZIO_STAGE_DVA_FREE = 1 << 18, /* --F--- */
ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C-- */
ZIO_STAGE_READY = 1 << 20, /* RWFCI */
ZIO_STAGE_READY = 1 << 20, /* RWFCIT */
ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */
ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */
ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--IT */
ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW---T */
ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--IT */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */
ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */
ZIO_STAGE_DONE = 1 << 25 /* RWFCI */
ZIO_STAGE_DONE = 1 << 25 /* RWFCIT */
};
#define ZIO_ROOT_PIPELINE \

View file

@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg)
* Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
* it's possible the offsets are equal. In that case, sort by txg
*/
if (l->blk_birth < r->blk_birth) {
if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) {
return (-1);
} else if (l->blk_birth > r->blk_birth) {
} else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) {
return (+1);
}
return (0);

View file

@ -1112,14 +1112,11 @@
<var-decl name='prev' type-id='b03eadb4' visibility='default'/>
</data-member>
</class-decl>
<class-decl name='list' size-in-bits='256' is-struct='yes' visibility='default' id='e824dae9'>
<class-decl name='list' size-in-bits='192' is-struct='yes' visibility='default' id='e824dae9'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='list_size' type-id='b59d7dce' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='64'>
<var-decl name='list_offset' type-id='b59d7dce' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='128'>
<data-member access='public' layout-offset-in-bits='64'>
<var-decl name='list_head' type-id='b0b5e45e' visibility='default'/>
</data-member>
</class-decl>
@ -2832,6 +2829,9 @@
</function-type>
</abi-instr>
<abi-instr address-size='64' path='lib/libzfs/libzfs_crypto.c' language='LANG_C99'>
<array-type-def dimensions='1' type-id='38b51b3c' size-in-bits='832' id='02b72c00'>
<subrange length='13' type-id='7359adad' id='487fded1'/>
</array-type-def>
<array-type-def dimensions='1' type-id='fb7c6451' size-in-bits='256' id='64177143'>
<subrange length='32' type-id='7359adad' id='ae5bde82'/>
</array-type-def>
@ -2844,6 +2844,10 @@
<class-decl name='_IO_codecvt' is-struct='yes' visibility='default' is-declaration-only='yes' id='a4036571'/>
<class-decl name='_IO_marker' is-struct='yes' visibility='default' is-declaration-only='yes' id='010ae0b9'/>
<class-decl name='_IO_wide_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='79bd3751'/>
<class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
<array-type-def dimensions='1' type-id='80f4b756' size-in-bits='832' id='39e6f84a'>
<subrange length='13' type-id='7359adad' id='487fded1'/>
</array-type-def>
<array-type-def dimensions='1' type-id='95e97e5e' size-in-bits='896' id='47394ee0'>
<subrange length='28' type-id='7359adad' id='3db583d7'/>
</array-type-def>
@ -2964,6 +2968,24 @@
<typedef-decl name='__clock_t' type-id='bd54fe1a' id='4d66c6d7'/>
<typedef-decl name='__ssize_t' type-id='bd54fe1a' id='41060289'/>
<typedef-decl name='FILE' type-id='ec1ed955' id='aa12d1ba'/>
<class-decl name='__locale_struct' size-in-bits='1856' is-struct='yes' visibility='default' id='90cc1ce3'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='__locales' type-id='02b72c00' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='832'>
<var-decl name='__ctype_b' type-id='31347b7a' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='896'>
<var-decl name='__ctype_tolower' type-id='6d60f45d' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='960'>
<var-decl name='__ctype_toupper' type-id='6d60f45d' visibility='default'/>
</data-member>
<data-member access='public' layout-offset-in-bits='1024'>
<var-decl name='__names' type-id='39e6f84a' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='__locale_t' type-id='f01e1813' id='b7ac9b5f'/>
<class-decl name='__sigset_t' size-in-bits='1024' is-struct='yes' naming-typedef-id='b9c97942' visibility='default' id='2616147f'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='__val' type-id='d2baa450' visibility='default'/>
@ -2979,6 +3001,7 @@
</data-member>
</union-decl>
<typedef-decl name='__sigval_t' type-id='a094b870' id='eabacd01'/>
<typedef-decl name='locale_t' type-id='b7ac9b5f' id='973a4f8d'/>
<class-decl name='siginfo_t' size-in-bits='1024' is-struct='yes' naming-typedef-id='cb681f62' visibility='default' id='d8149419'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='si_signo' type-id='95e97e5e' visibility='default'/>
@ -3214,9 +3237,13 @@
<pointer-type-def type-id='bb4788fa' size-in-bits='64' id='cecf4ea7'/>
<pointer-type-def type-id='010ae0b9' size-in-bits='64' id='e4c6fa61'/>
<pointer-type-def type-id='79bd3751' size-in-bits='64' id='c65a1f29'/>
<pointer-type-def type-id='23de8b96' size-in-bits='64' id='38b51b3c'/>
<pointer-type-def type-id='90cc1ce3' size-in-bits='64' id='f01e1813'/>
<qualified-type-def type-id='9b23c9ad' restrict='yes' id='8c85230f'/>
<qualified-type-def type-id='80f4b756' restrict='yes' id='9d26089a'/>
<pointer-type-def type-id='80f4b756' size-in-bits='64' id='7d3cd834'/>
<qualified-type-def type-id='95e97e5e' const='yes' id='2448a865'/>
<pointer-type-def type-id='2448a865' size-in-bits='64' id='6d60f45d'/>
<qualified-type-def type-id='aca3bac8' const='yes' id='2498fd78'/>
<pointer-type-def type-id='2498fd78' size-in-bits='64' id='eed6c816'/>
<qualified-type-def type-id='eed6c816' restrict='yes' id='a431a9da'/>
@ -3249,6 +3276,7 @@
<class-decl name='_IO_codecvt' is-struct='yes' visibility='default' is-declaration-only='yes' id='a4036571'/>
<class-decl name='_IO_marker' is-struct='yes' visibility='default' is-declaration-only='yes' id='010ae0b9'/>
<class-decl name='_IO_wide_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='79bd3751'/>
<class-decl name='__locale_data' is-struct='yes' visibility='default' is-declaration-only='yes' id='23de8b96'/>
<function-decl name='zpool_get_prop_int' mangled-name='zpool_get_prop_int' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_get_prop_int'>
<parameter type-id='4c81de99'/>
<parameter type-id='5d0c23fb'/>
@ -3353,6 +3381,10 @@
<function-decl name='dlerror' visibility='default' binding='global' size-in-bits='64'>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='uselocale' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='973a4f8d'/>
<return type-id='973a4f8d'/>
</function-decl>
<function-decl name='PKCS5_PBKDF2_HMAC_SHA1' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<parameter type-id='95e97e5e'/>
@ -3436,8 +3468,9 @@
<parameter type-id='80f4b756'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='strerror' visibility='default' binding='global' size-in-bits='64'>
<function-decl name='strerror_l' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='95e97e5e'/>
<parameter type-id='973a4f8d'/>
<return type-id='26a90f95'/>
</function-decl>
<function-decl name='tcgetattr' visibility='default' binding='global' size-in-bits='64'>
@ -3794,12 +3827,18 @@
<qualified-type-def type-id='9c313c2d' const='yes' id='c3b7ba7d'/>
<pointer-type-def type-id='c3b7ba7d' size-in-bits='64' id='713a56f5'/>
<pointer-type-def type-id='01a1b934' size-in-bits='64' id='566b3f52'/>
<qualified-type-def type-id='566b3f52' restrict='yes' id='c878edd6'/>
<pointer-type-def type-id='566b3f52' size-in-bits='64' id='82d4e9e8'/>
<qualified-type-def type-id='82d4e9e8' restrict='yes' id='aa19c230'/>
<pointer-type-def type-id='7e291ce6' size-in-bits='64' id='ca64ff60'/>
<pointer-type-def type-id='9da381c4' size-in-bits='64' id='cb785ebf'/>
<pointer-type-def type-id='1b055409' size-in-bits='64' id='9d424d31'/>
<pointer-type-def type-id='8e0af06e' size-in-bits='64' id='053457bd'/>
<pointer-type-def type-id='857bb57e' size-in-bits='64' id='75be733c'/>
<pointer-type-def type-id='a63d15a3' size-in-bits='64' id='a195f4a3'/>
<qualified-type-def type-id='a195f4a3' restrict='yes' id='33518961'/>
<pointer-type-def type-id='a195f4a3' size-in-bits='64' id='e80ff3ab'/>
<qualified-type-def type-id='e80ff3ab' restrict='yes' id='8f2c7109'/>
<pointer-type-def type-id='eae6431d' size-in-bits='64' id='0d41d328'/>
<pointer-type-def type-id='7a6844eb' size-in-bits='64' id='18c91f9e'/>
<pointer-type-def type-id='dddf6ca2' size-in-bits='64' id='d915a820'/>
@ -4232,9 +4271,13 @@
<parameter type-id='9d424d31'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='getgrnam' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='566b3f52'/>
<function-decl name='getgrnam_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='9d26089a'/>
<parameter type-id='c878edd6'/>
<parameter type-id='266fe297'/>
<parameter type-id='b59d7dce'/>
<parameter type-id='aa19c230'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='hasmntopt' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='48bea5ec'/>
@ -4258,9 +4301,13 @@
<parameter type-id='18c91f9e'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='getpwnam' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='80f4b756'/>
<return type-id='a195f4a3'/>
<function-decl name='getpwnam_r' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='9d26089a'/>
<parameter type-id='33518961'/>
<parameter type-id='266fe297'/>
<parameter type-id='b59d7dce'/>
<parameter type-id='8f2c7109'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='strtol' visibility='default' binding='global' size-in-bits='64'>
<parameter type-id='9d26089a'/>
@ -6315,6 +6362,7 @@
<function-decl name='zpool_add' mangled-name='zpool_add' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_add'>
<parameter type-id='4c81de99' name='zhp'/>
<parameter type-id='5ce45b60' name='nvroot'/>
<parameter type-id='c19b74c3' name='ashift_check'/>
<return type-id='95e97e5e'/>
</function-decl>
<function-decl name='zpool_export' mangled-name='zpool_export' visibility='default' binding='global' size-in-bits='64' elf-symbol-id='zpool_export'>
@ -6778,7 +6826,7 @@
<enumerator name='LZC_SEND_FLAG_RAW' value='8'/>
<enumerator name='LZC_SEND_FLAG_SAVED' value='16'/>
</enum-decl>
<class-decl name='ddt_key' size-in-bits='320' is-struct='yes' visibility='default' id='e0a4a1cb'>
<class-decl name='ddt_key_t' size-in-bits='320' is-struct='yes' naming-typedef-id='67f6d2cf' visibility='default' id='5fae1718'>
<data-member access='public' layout-offset-in-bits='0'>
<var-decl name='ddk_cksum' type-id='39730d0b' visibility='default'/>
</data-member>
@ -6786,7 +6834,7 @@
<var-decl name='ddk_prop' type-id='9c313c2d' visibility='default'/>
</data-member>
</class-decl>
<typedef-decl name='ddt_key_t' type-id='e0a4a1cb' id='67f6d2cf'/>
<typedef-decl name='ddt_key_t' type-id='5fae1718' id='67f6d2cf'/>
<enum-decl name='dmu_object_type' id='04b3b0b9'>
<underlying-type type-id='9cac1fee'/>
<enumerator name='DMU_OT_NONE' value='0'/>

View file

@ -22,7 +22,7 @@
/*
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2018 Datto Inc.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp)
* necessary verification to ensure that the vdev specification is well-formed.
*/
int
zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift)
{
zfs_cmd_t zc = {"\0"};
int ret;
@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
zcmd_write_conf_nvlist(hdl, &zc, nvroot);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
zc.zc_flags = check_ashift;
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) {
switch (errno) {

View file

@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2020 Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2020 The FreeBSD Foundation
@ -319,6 +319,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
"dataset without force"));
case EZFS_RAIDZ_EXPAND_IN_PROGRESS:
return (dgettext(TEXT_DOMAIN, "raidz expansion in progress"));
case EZFS_ASHIFT_MISMATCH:
return (dgettext(TEXT_DOMAIN, "adding devices with "
"different physical sector sizes is not allowed"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@ -768,6 +771,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS:
zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap);
break;
case ZFS_ERR_ASHIFT_MISMATCH:
zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap);
break;
default:
zfs_error_aux(hdl, "%s", zfs_strerror(error));
zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap);

View file

@ -2,6 +2,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
@ -15,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd July 21, 2023
.Dd January 9, 2024
.Dt ZFS 4
.Os
.
@ -244,12 +245,25 @@ For blocks that could be forced to be a gang block (due to
.Sy metaslab_force_ganging ) ,
force this many of them to be gang blocks.
.
.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
Controls prefetching BRT records for blocks which are going to be cloned.
.
.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
Default BRT ZAP data block size as a power of 2. Note that changing this after
creating a BRT on the pool will not affect existing BRTs, only newly created
ones.
.
.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
Default BRT ZAP indirect block size as a power of 2. Note that changing this
after creating a BRT on the pool will not affect existing BRTs, only newly
created ones.
.
.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP data block size as a power of 2. Note that changing this after
creating a DDT on the pool will not affect existing DDTs, only newly created
ones.
.
.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP indirect block size as a power of 2. Note that changing this
after creating a DDT on the pool will not affect existing DDTs, only newly
created ones.
@ -1362,6 +1376,29 @@ _
4 Driver No driver retries on driver errors.
.TE
.
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
Maximum number of segments to add to a BIO (min 4).
If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
and earlier.
This "classic" method has known issues with highly fragmented IO requests and
is slower on many workloads, but it has been in use for many years and is known
to be very stable.
If you set this parameter, please also open a bug report explaining why you did
so,
including the workload involved and any error messages.
.Pp
This parameter and the classic submission method will be removed once we have
total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
.Pa .zfs/snapshot .

View file

@ -24,8 +24,9 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024 by Delphix. All Rights Reserved.
.\"
.Dd March 16, 2022
.Dd March 8, 2024
.Dt ZPOOL-ADD 8
.Os
.
@ -36,6 +37,7 @@
.Nm zpool
.Cm add
.Op Fl fgLnP
.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool vdev Ns
.
@ -56,7 +58,8 @@ subcommand.
.It Fl f
Forces use of
.Ar vdev Ns s ,
even if they appear in use or specify a conflicting replication level.
even if they appear in use, have conflicting ashift values, or specify
a conflicting replication level.
Not all devices can be overridden in this manner.
.It Fl g
Display
@ -91,6 +94,17 @@ See the
manual page for a list of valid properties that can be set.
The only property supported at the moment is
.Sy ashift .
.It Fl -allow-ashift-mismatch
Disable the ashift validation, allowing mismatched ashift values in the
pool.
Adding top-level
.Ar vdev Ns s
with different sector sizes will prohibit future device removal operations; see
.Xr zpool-remove 8 .
.It Fl -allow-in-use
Allow vdevs to be added even if they might be in use in another pool.
.It Fl -allow-replication-mismatch
Allow vdevs with conflicting replication levels to be added to the pool.
.El
.
.Sh EXAMPLES

View file

@ -25,8 +25,9 @@
.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\" Copyright (c) 2024, Klara Inc.
.\"
.Dd July 11, 2023
.Dd February 28, 2024
.Dt ZPOOL-EVENTS 8
.Os
.
@ -363,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data.
.Sh I/O STAGES
The ZFS I/O pipeline is comprised of various stages which are defined below.
The individual stages are used to construct these basic I/O
operations: Read, Write, Free, Claim, and Ioctl.
operations: Read, Write, Free, Claim, Ioctl and Trim.
These stages may be
set on an event to describe the life cycle of a given I/O request.
.Pp
@ -372,43 +373,43 @@ tab(:);
l l l .
Stage:Bit Mask:Operations
_:_:_
ZIO_STAGE_OPEN:0x00000001:RWFCI
ZIO_STAGE_OPEN:0x00000001:RWFCIT
ZIO_STAGE_READ_BP_INIT:0x00000002:R----
ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W---
ZIO_STAGE_FREE_BP_INIT:0x00000008:--F--
ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF--
ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W---
ZIO_STAGE_READ_BP_INIT:0x00000002:R-----
ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W----
ZIO_STAGE_FREE_BP_INIT:0x00000008:--F---
ZIO_STAGE_ISSUE_ASYNC:0x00000010:-WF--T
ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W----
ZIO_STAGE_ENCRYPT:0x00000040:-W---
ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---
ZIO_STAGE_ENCRYPT:0x00000040:-W----
ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W----
ZIO_STAGE_NOP_WRITE:0x00000100:-W---
ZIO_STAGE_NOP_WRITE:0x00000100:-W----
ZIO_STAGE_BRT_FREE:0x00000200:--F--
ZIO_STAGE_BRT_FREE:0x00000200:--F---
ZIO_STAGE_DDT_READ_START:0x00000400:R----
ZIO_STAGE_DDT_READ_DONE:0x00000800:R----
ZIO_STAGE_DDT_WRITE:0x00001000:-W---
ZIO_STAGE_DDT_FREE:0x00002000:--F--
ZIO_STAGE_DDT_READ_START:0x00000400:R-----
ZIO_STAGE_DDT_READ_DONE:0x00000800:R-----
ZIO_STAGE_DDT_WRITE:0x00001000:-W----
ZIO_STAGE_DDT_FREE:0x00002000:--F---
ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-
ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-
ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC--
ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC--
ZIO_STAGE_DVA_THROTTLE:0x00010000:-W---
ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W---
ZIO_STAGE_DVA_FREE:0x00040000:--F--
ZIO_STAGE_DVA_CLAIM:0x00080000:---C-
ZIO_STAGE_DVA_THROTTLE:0x00010000:-W----
ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W----
ZIO_STAGE_DVA_FREE:0x00040000:--F---
ZIO_STAGE_DVA_CLAIM:0x00080000:---C--
ZIO_STAGE_READY:0x00100000:RWFCI
ZIO_STAGE_READY:0x00100000:RWFCIT
ZIO_STAGE_VDEV_IO_START:0x00200000:RW--I
ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--I
ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--I
ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT
ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T
ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT
ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----
ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R-----
ZIO_STAGE_DONE:0x02000000:RWFCI
ZIO_STAGE_DONE:0x02000000:RWFCIT
.TE
.
.Sh I/O FLAGS

View file

@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm zpool
.Cm status
.Op Fl DeigLpPstvx
.Op Fl DegiLpPstvx
.Op Fl T Sy u Ns | Ns Sy d
.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns
.Oo Ar pool Oc Ns
@ -69,14 +69,20 @@ See the
option of
.Nm zpool Cm iostat
for complete details.
.It Fl D
Display a histogram of deduplication statistics, showing the allocated
.Pq physically present on disk
and referenced
.Pq logically referenced in the pool
block counts and sizes by reference count.
.It Fl e
Only show unhealthy vdevs (not-ONLINE or with errors).
.It Fl i
Display vdev initialization status.
.It Fl g
Display vdev GUIDs instead of the normal device names.
These GUIDs can be used in place of device names for the zpool
detach/offline/remove/replace commands.
.It Fl i
Display vdev initialization status.
.It Fl L
Display real paths for vdevs resolving all symbolic links.
This can be used to look up the current block device name regardless of the
@ -90,12 +96,6 @@ the path.
This can be used in conjunction with the
.Fl L
flag.
.It Fl D
Display a histogram of deduplication statistics, showing the allocated
.Pq physically present on disk
and referenced
.Pq logically referenced in the pool
block counts and sizes by reference count.
.It Fl s
Display the number of leaf vdev slow I/O operations.
This is the number of I/O operations that didn't complete in

View file

@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}
/*

View file

@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
/*
@ -59,7 +60,9 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif
#ifdef _KERNEL
@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
aiter->iter_pos = 0;
if (abd_is_linear(abd)) {
aiter->iter_offset = 0;
aiter->iter_sg = NULL;
} else {
if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}
@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
/*
* Ensure that the last chunk is not in use. abd_iterate_*() must clear
* this state (directly or via abd_iter_unmap()) before advancing.
*/
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
ASSERT3P(aiter->iter_page, ==, NULL);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);
/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
@ -1009,6 +1015,106 @@ abd_cache_reap_now(void)
}
#if defined(_KERNEL)
/*
* Yield the next page struct and data offset and size within it, without
* mapping it into the address space.
*/
void
abd_iter_page(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter)) {
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
return;
}
struct page *page;
size_t doff, dsize;
if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
/* memory address at iter_pos */
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
/* struct page for address */
page = is_vmalloc_addr(paddr) ?
vmalloc_to_page(paddr) : virt_to_page(paddr);
/* offset of address within the page */
doff = offset_in_page(paddr);
/* total data remaining in abd from this position */
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));
/* current scatter page */
page = sg_page(aiter->iter_sg);
/* position within page */
doff = aiter->iter_offset;
/* remaining data in scatterlist */
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
}
ASSERT(page);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
if (PageTail(page)) {
/*
* This page is part of a "compound page", which is a group of
* pages that can be referenced from a single struct page *.
* It's organised as a "head" page, followed by a series of
* "tail" pages.
*
* In OpenZFS, compound pages are allocated using the
* __GFP_COMP flag, which we get from scatter ABDs and SPL
* vmalloc slabs (i.e. >16K allocations). So a great many of the
* IO buffers we get are going to be of this type.
*
* The tail pages are just regular PAGE_SIZE pages, and can be
* safely used as-is. However, the head page has length
* covering itself and all the tail pages. If this ABD chunk
* spans multiple pages, then we can use the head page and a
* >PAGE_SIZE length, which is far more efficient.
*
* To do this, we need to adjust the offset to be counted from
* the head page. The struct pages for a compound page are stored
* contiguously, so we can just adjust by a simple offset.
*
* Before kernel 4.5, compound page heads were refcounted
* separately, such that moving back to the head page would
* require us to take a reference to it and release it once
* we're completely finished with it. In practice, that means
* when our caller is done with the ABD, which we have no
* insight into from here. Rather than contort this API to
* track head page references on such ancient kernels, we just
* compile this block out and use the tail pages directly. This
* is slightly less efficient, but makes everything far
* simpler.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}
#endif
/* final page and position within it */
aiter->iter_page = page;
aiter->iter_page_doff = doff;
/* amount of data in the chunk, up to the end of the page */
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}
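/*
 * Usage sketch (illustrative, not part of this change): a caller walks
 * an ABD page-by-page much like abd_iterate_page_func() does:
 *
 * abd_iter_init(&aiter, abd);
 * while (!abd_iter_at_end(&aiter)) {
 * abd_iter_page(&aiter);
 * (use aiter.iter_page / iter_page_doff / iter_page_dsize)
 * size_t len = aiter.iter_page_dsize;
 * aiter.iter_page = NULL;
 * aiter.iter_page_doff = 0;
 * aiter.iter_page_dsize = 0;
 * abd_iter_advance(&aiter, len);
 * }
 *
 * The page state must be cleared before advancing, since
 * abd_iter_advance() asserts that no chunk is still in use.
 */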
/*
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
* vdev_disk.c.
*/
/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
@ -1163,4 +1269,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif
#endif /* _KERNEL */

View file

@ -24,6 +24,7 @@
* Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
* LLNL-CODE-403049.
* Copyright (c) 2012, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/
#include <sys/zfs_context.h>
@ -66,6 +67,13 @@ typedef struct vdev_disk {
krwlock_t vd_lock;
} vdev_disk_t;
/*
* Maximum number of segments to add to a bio (min 4). If this is higher than
* the maximum allowed by the device queue or the kernel itself, it will be
* clamped. Setting it to zero will cause the kernel's ideal size to be used.
*/
uint_t zfs_vdev_disk_max_segs = 0;
/*
* Unique identifier for the exclusive vdev holder.
*/
@ -83,55 +91,47 @@ static uint_t zfs_vdev_open_timeout_ms = 1000;
*/
#define EFI_MIN_RESV_SIZE (16 * 1024)
/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */
atomic_t dr_ref; /* References */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[]; /* Attached bio's */
} dio_request_t;
/*
* BIO request failfast mask.
*/
static unsigned int zfs_vdev_failfast_mask = 1;
/*
* Convert SPA mode flags into bdev open mode flags.
*/
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
typedef blk_mode_t vdev_bdev_mode_t;
#define VDEV_BDEV_MODE_READ BLK_OPEN_READ
#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE
#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL
#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL)
#else
static fmode_t
typedef fmode_t vdev_bdev_mode_t;
#define VDEV_BDEV_MODE_READ FMODE_READ
#define VDEV_BDEV_MODE_WRITE FMODE_WRITE
#define VDEV_BDEV_MODE_EXCL FMODE_EXCL
#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL)
#endif
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
static vdev_bdev_mode_t
vdev_bdev_mode(spa_mode_t smode)
{
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = 0;
ASSERT3U(smode, !=, SPA_MODE_UNINIT);
ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE));
if (spa_mode & SPA_MODE_READ)
mode |= BLK_OPEN_READ;
vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL;
if (spa_mode & SPA_MODE_WRITE)
mode |= BLK_OPEN_WRITE;
if (smode & SPA_MODE_READ)
bmode |= VDEV_BDEV_MODE_READ;
if (exclusive)
mode |= BLK_OPEN_EXCL;
#else
fmode_t mode = 0;
if (smode & SPA_MODE_WRITE)
bmode |= VDEV_BDEV_MODE_WRITE;
if (spa_mode & SPA_MODE_READ)
mode |= FMODE_READ;
ASSERT(bmode & VDEV_BDEV_MODE_MASK);
ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK);
if (spa_mode & SPA_MODE_WRITE)
mode |= FMODE_WRITE;
if (exclusive)
mode |= FMODE_EXCL;
#endif
return (mode);
return (bmode);
}
/*
@ -238,30 +238,28 @@ vdev_disk_kobj_evt_post(vdev_t *v)
}
static zfs_bdev_handle_t *
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder)
vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder)
{
vdev_bdev_mode_t bmode = vdev_bdev_mode(smode);
#if defined(HAVE_BDEV_OPEN_BY_PATH)
return (bdev_open_by_path(path,
vdev_bdev_mode(mode, B_TRUE), holder, NULL));
return (bdev_open_by_path(path, bmode, holder, NULL));
#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode, B_TRUE), holder, NULL));
return (blkdev_get_by_path(path, bmode, holder, NULL));
#else
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode, B_TRUE), holder));
return (blkdev_get_by_path(path, bmode, holder));
#endif
}
static void
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder)
vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder)
{
#if defined(HAVE_BDEV_RELEASE)
return (bdev_release(bdh));
#elif defined(HAVE_BLKDEV_PUT_HOLDER)
return (blkdev_put(BDH_BDEV(bdh), holder));
#else
return (blkdev_put(BDH_BDEV(bdh),
vdev_bdev_mode(mode, B_TRUE)));
return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode)));
#endif
}
@ -270,11 +268,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
zfs_bdev_handle_t *bdh;
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#else
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE);
#endif
spa_mode_t smode = spa_mode(v->vdev_spa);
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
vdev_disk_t *vd;
@ -325,16 +319,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
reread_part = B_TRUE;
}
vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
}
if (reread_part) {
bdh = vdev_blkdev_get_by_path(disk_name, mode,
bdh = vdev_blkdev_get_by_path(disk_name, smode,
zfs_vdev_holder);
if (!BDH_IS_ERR(bdh)) {
int error =
vdev_bdev_reread_part(BDH_BDEV(bdh));
vdev_blkdev_put(bdh, mode, zfs_vdev_holder);
vdev_blkdev_put(bdh, smode, zfs_vdev_holder);
if (error == 0) {
timeout = MSEC2NSEC(
zfs_vdev_open_timeout_ms * 2);
@ -379,7 +373,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
hrtime_t start = gethrtime();
bdh = BDH_ERR_PTR(-ENXIO);
while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) {
bdh = vdev_blkdev_get_by_path(v->vdev_path, mode,
bdh = vdev_blkdev_get_by_path(v->vdev_path, smode,
zfs_vdev_holder);
if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) {
/*
@ -457,95 +451,15 @@ vdev_disk_close(vdev_t *v)
if (v->vdev_reopening || vd == NULL)
return;
if (vd->vd_bdh != NULL) {
if (vd->vd_bdh != NULL)
vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa),
zfs_vdev_holder);
}
rw_destroy(&vd->vd_lock);
kmem_free(vd, sizeof (vdev_disk_t));
v->vdev_tsd = NULL;
}
static dio_request_t *
vdev_disk_dio_alloc(int bio_count)
{
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
atomic_set(&dr->dr_ref, 0);
dr->dr_bio_count = bio_count;
dr->dr_error = 0;
for (int i = 0; i < dr->dr_bio_count; i++)
dr->dr_bio[i] = NULL;
return (dr);
}
static void
vdev_disk_dio_free(dio_request_t *dr)
{
int i;
for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i])
bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof (dio_request_t) +
sizeof (struct bio *) * dr->dr_bio_count);
}
static void
vdev_disk_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}
static void
vdev_disk_dio_put(dio_request_t *dr)
{
int rc = atomic_dec_return(&dr->dr_ref);
/*
* Free the dio_request when the last reference is dropped and
* ensure zio_interpret is called only once with the correct zio
*/
if (rc == 0) {
zio_t *zio = dr->dr_zio;
int error = dr->dr_error;
vdev_disk_dio_free(dr);
if (zio) {
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
zio_delay_interrupt(zio);
}
}
}
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
dio_request_t *dr = bio->bi_private;
if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
dr->dr_error = BIO_END_IO_ERROR(bio);
#else
if (error)
dr->dr_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
dr->dr_error = EIO;
#endif
}
/* Drop reference acquired by __vdev_disk_physio */
vdev_disk_dio_put(dr);
}
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
@ -697,8 +611,460 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask,
return (bio);
}
static inline uint_t
vdev_bio_max_segs(struct block_device *bdev)
{
/*
* Smallest of the device max segs and the tuneable max segs. Minimum
* 4, so there's room to finish split pages if they come up.
*/
const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev));
const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ?
MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs;
const uint_t max_segs = MIN(tune_max_segs, dev_max_segs);
#ifdef HAVE_BIO_MAX_SEGS
return (bio_max_segs(max_segs));
#else
return (MIN(max_segs, BIO_MAX_PAGES));
#endif
}
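/*
 * Illustrative values (not from the change itself): with a device
 * queue limit of 128 segments, zfs_vdev_disk_max_segs=16 yields 16,
 * =2 is raised to the minimum of 4, and =0 defers to the device's
 * 128, subject to the bio_max_segs()/BIO_MAX_PAGES clamp below.
 */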
static inline uint_t
vdev_bio_max_bytes(struct block_device *bdev)
{
return (queue_max_sectors(bdev_get_queue(bdev)) << 9);
}
/*
* Virtual block IO object (VBIO)
*
* Linux block IO (BIO) objects have a limit on how many data segments (pages)
* they can hold. Depending on how they're allocated and structured, a large
* ZIO can require more than one BIO to be submitted to the kernel, which then
* all have to complete before we can return the completed ZIO back to ZFS.
*
* A VBIO is a wrapper around multiple BIOs, carrying everything needed to
* translate a ZIO down into the kernel block layer and back again.
*
* Note that these are only used for data ZIOs (read/write). Meta-operations
* (flush/trim) don't need multiple BIOs and so can just make the call
* directly.
*/
typedef struct {
zio_t *vbio_zio; /* parent zio */
struct block_device *vbio_bdev; /* blockdev to submit bios to */
abd_t *vbio_abd; /* abd carrying borrowed linear buf */
uint_t vbio_max_segs; /* max segs per bio */
uint_t vbio_max_bytes; /* max bytes per bio */
uint_t vbio_lbs_mask; /* logical block size mask */
uint64_t vbio_offset; /* start offset of next bio */
struct bio *vbio_bio; /* pointer to the current bio */
int vbio_flags; /* bio flags */
} vbio_t;
static vbio_t *
vbio_alloc(zio_t *zio, struct block_device *bdev, int flags)
{
vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP);
vbio->vbio_zio = zio;
vbio->vbio_bdev = bdev;
vbio->vbio_abd = NULL;
vbio->vbio_max_segs = vdev_bio_max_segs(bdev);
vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev);
vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1);
vbio->vbio_offset = zio->io_offset;
vbio->vbio_bio = NULL;
vbio->vbio_flags = flags;
return (vbio);
}
BIO_END_IO_PROTO(vbio_completion, bio, error);
static int
vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset)
{
struct bio *bio = vbio->vbio_bio;
uint_t ssize;
while (size > 0) {
if (bio == NULL) {
/* New BIO, allocate and set up */
bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO,
vbio->vbio_max_segs);
VERIFY(bio);
BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9;
bio_set_op_attrs(bio,
vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ?
WRITE : READ, vbio->vbio_flags);
if (vbio->vbio_bio) {
bio_chain(vbio->vbio_bio, bio);
vdev_submit_bio(vbio->vbio_bio);
}
vbio->vbio_bio = bio;
}
/*
* Only load as much of the current page data as will fit in
* the space left in the BIO, respecting lbs alignment. Older
* kernels will error if we try to overfill the BIO, while
* newer ones will accept it and split the BIO. This ensures
* everything works on older kernels, and avoids additional
* overhead on newer ones.
*/
ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) &
vbio->vbio_lbs_mask);
if (ssize > 0 &&
bio_add_page(bio, page, ssize, offset) == ssize) {
/* Accepted, adjust and load any remaining. */
size -= ssize;
offset += ssize;
continue;
}
/* No room, set up for a new BIO and loop */
vbio->vbio_offset += BIO_BI_SIZE(bio);
/* Signal new BIO allocation wanted */
bio = NULL;
}
return (0);
}
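/*
 * Worked example with illustrative numbers: for vbio_max_bytes of
 * 1 MiB and a 4 KiB logical block size (vbio_lbs_mask = ~0xfff), a
 * BIO already holding 1044480 bytes has 4096 bytes left. A 16 KiB
 * chunk is therefore clipped to ssize = 4096; the next loop pass
 * computes ssize = 0, so the remaining 12 KiB starts a fresh BIO at
 * vbio_offset + BIO_BI_SIZE(bio).
 */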
/* Iterator callback to submit ABD pages to the vbio. */
static int
vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv)
{
vbio_t *vbio = priv;
return (vbio_add_page(vbio, page, len, off));
}
/* Create some BIOs, fill them with data and submit them */
static void
vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size)
{
ASSERT(vbio->vbio_bdev);
/*
* We plug so we can submit the BIOs as we go and only unplug them when
* they are fully created and submitted. This is important; if we don't
* plug, then the kernel may start executing earlier BIOs while we're
* still creating and executing later ones, and if the device goes
* away while that's happening, older kernels can get confused and
* trample memory.
*/
struct blk_plug plug;
blk_start_plug(&plug);
(void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio);
ASSERT(vbio->vbio_bio);
vbio->vbio_bio->bi_end_io = vbio_completion;
vbio->vbio_bio->bi_private = vbio;
vdev_submit_bio(vbio->vbio_bio);
blk_finish_plug(&plug);
vbio->vbio_bio = NULL;
vbio->vbio_bdev = NULL;
}
/* IO completion callback */
BIO_END_IO_PROTO(vbio_completion, bio, error)
{
vbio_t *vbio = bio->bi_private;
zio_t *zio = vbio->vbio_zio;
ASSERT(zio);
/* Capture and log any errors */
#ifdef HAVE_1ARG_BIO_END_IO_T
zio->io_error = BIO_END_IO_ERROR(bio);
#else
zio->io_error = 0;
if (error)
zio->io_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
zio->io_error = EIO;
#endif
ASSERT3U(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
/* Return the BIO to the kernel */
bio_put(bio);
/*
* If we copied the ABD before issuing it, clean up and return the copy
* to the ABD, with changes if appropriate.
*/
if (vbio->vbio_abd != NULL) {
void *buf = abd_to_buf(vbio->vbio_abd);
abd_free(vbio->vbio_abd);
vbio->vbio_abd = NULL;
if (zio->io_type == ZIO_TYPE_READ)
abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
else
abd_return_buf(zio->io_abd, buf, zio->io_size);
}
/* Final cleanup */
kmem_free(vbio, sizeof (vbio_t));
/* All done, submit for processing */
zio_delay_interrupt(zio);
}
/*
* Iterator callback to count ABD pages and check their size & alignment.
*
* On Linux, each BIO segment can take a page pointer, and an offset+length of
* the data within that page. A page can be arbitrarily large ("compound"
* pages) but we still have to ensure the data portion is correctly sized and
* aligned to the logical block size, to ensure that if the kernel wants to
* split the BIO, the two halves will still be properly aligned.
*/
typedef struct {
uint_t bmask;
uint_t npages;
uint_t end;
} vdev_disk_check_pages_t;
static int
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
vdev_disk_check_pages_t *s = priv;
/*
* If we didn't finish on a block size boundary last time, then there
* would be a gap if we tried to use this ABD as-is, so abort.
*/
if (s->end != 0)
return (1);
/*
* Note if we're taking less than a full block, so we can check it
* above on the next call.
*/
s->end = len & s->bmask;
/* All blocks after the first must start on a block size boundary. */
if (s->npages != 0 && (off & s->bmask) != 0)
return (1);
s->npages++;
return (0);
}
/*
* Check if we can submit the pages in this ABD to the kernel as-is.
* Returns B_TRUE if they can, B_FALSE if they need to be copied first.
*/
static boolean_t
vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev)
{
vdev_disk_check_pages_t s = {
.bmask = bdev_logical_block_size(bdev)-1,
.npages = 0,
.end = 0,
};
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
return (B_FALSE);
return (B_TRUE);
}
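/*
 * Example with illustrative sizes: on a 4 KiB logical block size
 * device (bmask = 0xfff), an ABD starting with a 512 B chunk sets
 * s->end = 512 on the first callback, so the second callback returns
 * 1 and the caller falls back to copying into a linear buffer.
 */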
static int
vdev_disk_io_rw(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
int flags = 0;
/*
* Accessing outside the block device is never allowed.
*/
if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) {
vdev_dbgmsg(zio->io_vd,
"Illegal access %llu size %llu, device size %llu",
(u_longlong_t)zio->io_offset,
(u_longlong_t)zio->io_size,
(u_longlong_t)i_size_read(bdev->bd_inode));
return (SET_ERROR(EIO));
}
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
v->vdev_failfast == B_TRUE) {
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
}
/*
* Check alignment of the incoming ABD. If any part of it would require
* submitting a page that is not aligned to the logical block size,
* then we take a copy into a linear buffer and submit that instead.
* This should be impossible on a 512b LBS, and fairly rare on 4K,
* usually requiring abnormally-small data blocks (e.g. gang blocks)
* mixed into the same ABD as larger ones (e.g. aggregated).
*/
abd_t *abd = zio->io_abd;
if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) {
void *buf;
if (zio->io_type == ZIO_TYPE_READ)
buf = abd_borrow_buf(zio->io_abd, zio->io_size);
else
buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
/*
* Wrap the copy in an abd_t, so we can use the same iterators
* to count and fill the vbio later.
*/
abd = abd_get_from_buf(buf, zio->io_size);
/*
* False here would mean the borrowed copy has an invalid
* alignment too, which would mean we've somehow been passed a
* linear ABD with an interior page that has a non-zero offset
* or a size not a multiple of PAGE_SIZE. This is not possible.
* It would mean either zio_buf_alloc() or its underlying
* allocators have done something extremely strange, or our
* math in vdev_disk_check_pages() is wrong. In either case,
* something is seriously wrong and it's not safe to continue.
*/
VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev));
}
/* Allocate vbio, with a pointer to the borrowed ABD if necessary */
vbio_t *vbio = vbio_alloc(zio, bdev, flags);
if (abd != zio->io_abd)
vbio->vbio_abd = abd;
/* Fill it with data pages and submit it to the kernel */
vbio_submit(vbio, abd, zio->io_size);
return (0);
}
/* ========== */
/*
* This is the classic, battle-tested BIO submission code. Until we're totally
* sure that the new code is safe and correct in all cases, this will remain
* available and can be enabled by setting zfs_vdev_disk_classic=1 at module
* load time.
*
* These functions have been renamed to vdev_classic_* to make it clear what
* they belong to, but their implementations are unchanged.
*/
/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */
atomic_t dr_ref; /* References */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[]; /* Attached bio's */
} dio_request_t;
static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
atomic_set(&dr->dr_ref, 0);
dr->dr_bio_count = bio_count;
dr->dr_error = 0;
for (int i = 0; i < dr->dr_bio_count; i++)
dr->dr_bio[i] = NULL;
return (dr);
}
static void
vdev_classic_dio_free(dio_request_t *dr)
{
int i;
for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i])
bio_put(dr->dr_bio[i]);
kmem_free(dr, sizeof (dio_request_t) +
sizeof (struct bio *) * dr->dr_bio_count);
}
static void
vdev_classic_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}
static void
vdev_classic_dio_put(dio_request_t *dr)
{
int rc = atomic_dec_return(&dr->dr_ref);
/*
* Free the dio_request when the last reference is dropped and
* ensure zio_interpret is called only once with the correct zio
*/
if (rc == 0) {
zio_t *zio = dr->dr_zio;
int error = dr->dr_error;
vdev_classic_dio_free(dr);
if (zio) {
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);
zio_delay_interrupt(zio);
}
}
}
BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
dio_request_t *dr = bio->bi_private;
if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
dr->dr_error = BIO_END_IO_ERROR(bio);
#else
if (error)
dr->dr_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
dr->dr_error = EIO;
#endif
}
/* Drop reference acquired by vdev_classic_physio */
vdev_classic_dio_put(dr);
}
static inline unsigned int
vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
bio_size, abd_offset);
@ -711,9 +1077,16 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
}
static int
__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
size_t io_size, uint64_t io_offset, int rw, int flags)
vdev_classic_physio(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
struct block_device *bdev = BDH_BDEV(vd->vd_bdh);
size_t io_size = zio->io_size;
uint64_t io_offset = zio->io_offset;
int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
int flags = 0;
dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
@ -736,7 +1109,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
}
retry:
dr = vdev_disk_dio_alloc(bio_count);
dr = vdev_classic_dio_alloc(bio_count);
if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
zio->io_vd->vdev_failfast == B_TRUE) {
@ -771,23 +1144,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
* this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
vdev_disk_dio_free(dr);
vdev_classic_dio_free(dr);
bio_count *= 2;
goto retry;
}
nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset);
nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_disk_dio_free(dr);
vdev_classic_dio_free(dr);
return (SET_ERROR(ENOMEM));
}
/* Matching put called by vdev_disk_physio_completion */
vdev_disk_dio_get(dr);
/* Matching put called by vdev_classic_physio_completion */
vdev_classic_dio_get(dr);
BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio_set_op_attrs(dr->dr_bio[i], rw, flags);
@ -801,7 +1174,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
}
/* Extra reference to protect dio_request during vdev_submit_bio */
vdev_disk_dio_get(dr);
vdev_classic_dio_get(dr);
if (dr->dr_bio_count > 1)
blk_start_plug(&plug);
@ -815,11 +1188,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio,
if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);
vdev_disk_dio_put(dr);
vdev_classic_dio_put(dr);
return (error);
}
/* ========== */
BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
{
zio_t *zio = bio->bi_private;
@ -928,12 +1303,14 @@ vdev_disk_io_trim(zio_t *zio)
#endif
}
int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;
static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
int rw, error;
int error;
/*
* If the vdev is closed, it's likely in the REMOVED or FAULTED state.
@ -996,13 +1373,6 @@ vdev_disk_io_start(zio_t *zio)
rw_exit(&vd->vd_lock);
zio_execute(zio);
return;
case ZIO_TYPE_WRITE:
rw = WRITE;
break;
case ZIO_TYPE_READ:
rw = READ;
break;
case ZIO_TYPE_TRIM:
zio->io_error = vdev_disk_io_trim(zio);
@ -1015,23 +1385,34 @@ vdev_disk_io_start(zio_t *zio)
#endif
return;
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = vdev_disk_io_rw_fn(zio);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
zio_interrupt(zio);
}
return;
default:
/*
* Getting here means our parent vdev has made a very strange
* request of us, and shouldn't happen. Assert here to force a
* crash in dev builds, but in production return the IO
* unhandled. The pool will likely suspend anyway but that's
* nicer than crashing the kernel.
*/
ASSERT3S(zio->io_type, ==, -1);
rw_exit(&vd->vd_lock);
zio->io_error = SET_ERROR(ENOTSUP);
zio_interrupt(zio);
return;
}
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio,
zio->io_size, zio->io_offset, rw, 0);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
zio_interrupt(zio);
return;
}
__builtin_unreachable();
}
static void
@ -1080,8 +1461,49 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}
/*
* BIO submission method. See comment above about vdev_classic.
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
*/
static uint_t zfs_vdev_disk_classic = 0; /* default new */
/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
int err = param_set_uint(buf, kp);
if (err < 0)
return (SET_ERROR(err));
vdev_disk_io_rw_fn =
zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;
printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
zfs_vdev_disk_classic ? "classic" : "new");
return (0);
}
/*
* At first vdev use, set the submission function from the default value if
* it hasn't been set already.
*/
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
(void) spa;
(void) nv;
(void) tsd;
if (vdev_disk_io_rw_fn == NULL)
vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
vdev_classic_physio : vdev_disk_io_rw;
return (0);
}
vdev_ops_t vdev_disk_ops = {
.vdev_op_init = NULL,
.vdev_op_init = vdev_disk_init,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
@ -1174,3 +1596,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");
ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
"Maximum number of data segments to add to an IO request (min 4)");
ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
"Use classic BIO submission method");

View file

@ -3795,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
err = dmu_tx_assign(tx, TXG_NOWAIT);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err != 0) {
if (err == ERESTART)
dmu_tx_wait(tx);
dmu_tx_abort(tx);
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
filemap_dirty_folio(page_mapping(pp), page_folio(pp));

View file

@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
boolean_t *for_sync = data;
fstrans_cookie_t cookie;
int ret;
ASSERT(PageLocked(pp));
ASSERT(!PageWriteback(pp));
cookie = spl_fstrans_mark();
(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
spl_fstrans_unmark(cookie);
return (0);
return (ret);
}
#ifdef HAVE_WRITEPAGE_T_FOLIO
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
(void) zpl_putpage(&pp->page, wbc, data);
return (0);
return (zpl_putpage(&pp->page, wbc, data));
}
#endif

View file

@ -26,6 +26,9 @@
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE
#include <linux/splice.h>
#endif
#include <sys/file.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
ret = zpl_clone_file_range_impl(src_file, src_off,
dst_file, dst_off, len);
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE)
/*
* Since Linux 5.3 the filesystem driver is responsible for executing
* an appropriate fallback, and a generic fallback function is provided.
@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
ret == -EAGAIN)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE)
/*
* Since Linux 6.8 the fallback function is called splice_copy_file_range
* and has a slightly different signature.
*/
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
ret == -EAGAIN)
ret = splice_copy_file_range(src_file, src_off, dst_file,
dst_off, len);
#else
/*
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off,
*/
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
ret = -EOPNOTSUPP;
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */
return (ret);
}

View file

@ -1314,6 +1314,13 @@ zvol_os_create_minor(const char *name)
if (idx < 0)
return (SET_ERROR(-idx));
minor = idx << ZVOL_MINOR_BITS;
if (MINOR(minor) != minor) {
/* too many partitions can cause an overflow */
zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
name, minor, MINOR(minor));
ida_simple_remove(&zvol_ida, idx);
return (SET_ERROR(EINVAL));
}
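/*
 * Illustrative numbers, assuming ZVOL_MINOR_BITS is 4 and Linux's
 * 20-bit minor space: idx 65536 shifts to 1048576 (1 << 20), which
 * MINOR() truncates to 0, so the mismatch is caught here rather than
 * colliding with an existing device node.
 */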
zv = zvol_find_by_name_hash(name, hash, RW_NONE);
if (zv) {

View file

@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}
#if defined(__linux__) && defined(_KERNEL)
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
abd_iter_page_func_t *func, void *private)
{
struct abd_iter aiter;
int ret = 0;
if (size == 0)
return (0);
abd_verify(abd);
ASSERT3U(off + size, <=, abd->abd_size);
abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
while (size > 0) {
IMPLY(abd_is_gang(abd), c_abd != NULL);
abd_iter_page(&aiter);
size_t len = MIN(aiter.iter_page_dsize, size);
ASSERT3U(len, >, 0);
ret = func(aiter.iter_page, aiter.iter_page_doff,
len, private);
aiter.iter_page = NULL;
aiter.iter_page_doff = 0;
aiter.iter_page_dsize = 0;
if (ret != 0)
break;
size -= len;
c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
}
return (ret);
}
#endif
struct buf_arg {
void *arg_buf;
};

View file

@ -1014,7 +1014,7 @@ static arc_buf_hdr_t *
buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
const dva_t *dva = BP_IDENTITY(bp);
uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t birth = BP_GET_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
arc_buf_hdr_t *hdr;
@ -2183,7 +2183,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
* (and generate an ereport) before leaving the ARC.
*/
ret = SET_ERROR(EIO);
spa_log_error(spa, zb, &buf->b_hdr->b_birth);
spa_log_error(spa, zb, buf->b_hdr->b_birth);
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0);
}
@ -5251,7 +5251,7 @@ arc_read_done(zio_t *zio)
if (HDR_IN_HASH_TABLE(hdr)) {
arc_buf_hdr_t *found;
ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp));
ASSERT3U(hdr->b_dva.dva_word[0], ==,
BP_IDENTITY(zio->io_bp)->dva_word[0]);
ASSERT3U(hdr->b_dva.dva_word[1], ==,
@ -5354,7 +5354,7 @@ arc_read_done(zio_t *zio)
error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(zio->io_spa, &acb->acb_zb,
&zio->io_bp->blk_birth);
BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
zio->io_spa, NULL, &acb->acb_zb, zio, 0);
@ -5639,7 +5639,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
*/
rc = SET_ERROR(EIO);
if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, zb, &hdr->b_birth);
spa_log_error(spa, zb, hdr->b_birth);
(void) zfs_ereport_post(
FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, zb, NULL, 0);
@ -5686,12 +5686,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
* embedded data.
*/
arc_buf_hdr_t *exists = NULL;
hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
hdr = arc_hdr_alloc(guid, psize, lsize,
BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
if (!embedded_bp) {
hdr->b_dva = *BP_IDENTITY(bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
hdr->b_birth = BP_GET_BIRTH(bp);
exists = buf_hash_insert(hdr, &hash_lock);
}
if (exists != NULL) {
@ -6557,7 +6557,7 @@ arc_write_done(zio_t *zio)
buf_discard_identity(hdr);
} else {
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
hdr->b_birth = BP_GET_BIRTH(zio->io_bp);
}
} else {
ASSERT(HDR_EMPTY(hdr));

View file

@ -893,7 +893,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
*/
memset(&stored_bp, 0, sizeof (stored_bp));
stored_bp.blk_prop = bp->blk_prop;
stored_bp.blk_birth = bp->blk_birth;
BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp));
} else if (!BP_GET_DEDUP(bp)) {
/* The bpobj will compress better without the checksum */
memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum));
@ -953,7 +953,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
(void) bp_freed, (void) tx;
struct space_range_arg *sra = arg;
if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg &&
BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) {
if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
sra->used += bp_get_dsize_sync(sra->spa, bp);
else
@ -985,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
/*
* Return the amount of space in the bpobj which is:
* mintxg < blk_birth <= maxtxg
* mintxg < logical birth <= maxtxg
*/
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,

View file

@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache;
/*
* Enable/disable prefetching of BRT entries that we are going to modify.
*/
int zfs_brt_prefetch = 1;
static int brt_zap_prefetch = 1;
#ifdef ZFS_DEBUG
#define BRT_DEBUG(...) do { \
@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1;
#define BRT_DEBUG(...) do { } while (0)
#endif
int brt_zap_leaf_blockshift = 12;
int brt_zap_indirect_blockshift = 12;
static int brt_zap_default_bs = 12;
static int brt_zap_default_ibs = 12;
static kstat_t *brt_ksp;
@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
0, tx);
brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx);
VERIFY(brtvd->bv_mos_entries != 0);
BRT_DEBUG("MOS entries created, object=%llu",
(u_longlong_t)brtvd->bv_mos_entries);
@ -901,7 +900,6 @@ static int
brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
{
uint64_t mos_entries;
uint64_t one, physsize;
int error;
ASSERT(RW_LOCK_HELD(&brt->brt_lock));
@ -919,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
brt_unlock(brt);
error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
BRT_KEY_WORDS, &one, &physsize);
if (error == 0) {
ASSERT3U(one, ==, 1);
ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
error = zap_lookup_uint64(brt->brt_mos, mos_entries,
&bre->bre_offset, BRT_KEY_WORDS, 1,
sizeof (bre->bre_refcount), &bre->bre_refcount);
BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
"count=%llu error=%d", (u_longlong_t)mos_entries,
(u_longlong_t)brtvd->bv_vdevid,
(u_longlong_t)bre->bre_offset,
error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
}
error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount);
brt_wlock(brt);
@ -955,52 +940,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
if (mos_entries == 0)
return;
BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
(u_longlong_t)mos_entries, (u_longlong_t)vdevid,
(u_longlong_t)bre->bre_offset);
(void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
}
static int
brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
int error;
ASSERT(RW_LOCK_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);
ASSERT(bre->bre_refcount > 0);
error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
(u_longlong_t)bre->bre_refcount, error);
return (error);
}
static int
brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
{
int error;
ASSERT(RW_LOCK_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);
ASSERT0(bre->bre_refcount);
error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
(uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
"error=%d", (u_longlong_t)brtvd->bv_mos_entries,
(u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
(u_longlong_t)bre->bre_refcount, error);
return (error);
}
/*
* Return TRUE if we _can_ have BRT entry for this bp. It might be false
* positive, but gives us quick answer if we should look into BRT, which
@ -1405,7 +1348,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp)
ASSERT(bp != NULL);
if (!zfs_brt_prefetch)
if (!brt_zap_prefetch)
return;
brt_entry_fill(bp, &bre, &vdevid);
@ -1420,13 +1363,13 @@ brt_pending_entry_compare(const void *x1, const void *x2)
const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
int cmp;
cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
DVA_GET_VDEV(&bp2->blk_dva[0]));
if (cmp == 0) {
cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
DVA_GET_VDEV(&bp2->blk_dva[0]));
if (cmp == 0) {
cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
DVA_GET_OFFSET(&bp2->blk_dva[0]));
cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
DVA_GET_OFFSET(&bp2->blk_dva[0]));
if (unlikely(cmp == 0)) {
cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2));
}
}
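/*
 * Comparing by vdev and offset first (birth TXG only breaks ties)
 * keeps pending entries for the same vdev adjacent in the tree, which
 * should give the BRT ZAP prefetches and the syncing-context updates
 * better locality than the previous birth-time ordering.
 */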
@ -1471,10 +1414,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
kmem_cache_free(brt_pending_entry_cache, newbpe);
} else {
ASSERT(bpe == NULL);
}
/* Prefetch BRT entry, as we will need it in the syncing context. */
brt_prefetch(brt, bp);
/* Prefetch BRT entry for the syncing context. */
brt_prefetch(brt, bp);
}
}
void
@ -1514,26 +1457,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
void
brt_pending_apply(spa_t *spa, uint64_t txg)
{
brt_t *brt;
brt_t *brt = spa->spa_brt;
brt_pending_entry_t *bpe;
avl_tree_t *pending_tree;
kmutex_t *pending_lock;
void *c;
ASSERT3U(txg, !=, 0);
brt = spa->spa_brt;
/*
* We are in syncing context, so no other brt_pending_tree accesses
* are possible for the TXG. Don't need to acquire brt_pending_lock.
*/
pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
mutex_enter(pending_lock);
c = NULL;
while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
boolean_t added_to_ddt;
mutex_exit(pending_lock);
for (int i = 0; i < bpe->bpe_count; i++) {
/*
* If the block has DEDUP bit set, it means that it
@ -1551,31 +1491,20 @@ brt_pending_apply(spa_t *spa, uint64_t txg)
}
kmem_cache_free(brt_pending_entry_cache, bpe);
mutex_enter(pending_lock);
}
mutex_exit(pending_lock);
}
static void
brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx)
{
ASSERT(RW_WRITE_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_mos_entries != 0);
if (bre->bre_refcount == 0) {
int error;
error = brt_entry_remove(brt, brtvd, bre, tx);
ASSERT(error == 0 || error == ENOENT);
/*
* If error == ENOENT then zfs_clone_range() was done from a
* removed (but opened) file (open(), unlink()).
*/
ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset,
BRT_KEY_WORDS, tx);
VERIFY(error == 0 || error == ENOENT);
} else {
VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset,
BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount),
&bre->bre_refcount, tx));
}
}
@ -1584,6 +1513,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
{
brt_vdev_t *brtvd;
brt_entry_t *bre;
dnode_t *dn;
uint64_t vdevid;
void *c;
@ -1607,14 +1537,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx)
if (brtvd->bv_mos_brtvdev == 0)
brt_vdev_create(brt, brtvd, tx);
VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries,
FTAG, &dn));
c = NULL;
while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
brt_sync_entry(brt, brtvd, bre, tx);
brt_sync_entry(dn, bre, tx);
brt_entry_free(bre);
ASSERT(brt->brt_nentries > 0);
brt->brt_nentries--;
}
dnode_rele(dn, FTAG);
brt_vdev_sync(brt, brtvd, tx);
if (brtvd->bv_totalcount == 0)
@ -1729,9 +1664,10 @@ brt_unload(spa_t *spa)
}
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
"Enable prefetching of BRT entries");
#ifdef ZFS_BRT_DEBUG
ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
#endif
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW,
"Enable prefetching of BRT ZAP entries");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW,
"BRT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW,
"BRT ZAP indirect blockshift");
/* END CSTYLED */

View file

@ -1217,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
ASSERT0(bp->blk_phys_birth);
ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
}
}
}
@ -1457,7 +1457,7 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
dn->dn_datablksz : BP_GET_LSIZE(dbbp));
BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
}
}
@ -1486,7 +1486,7 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
memset(db->db.db_data, 0, db->db.db_size);
if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
bp->blk_birth != 0) {
BP_GET_LOGICAL_BIRTH(bp) != 0) {
dbuf_handle_indirect_hole(db, dn, bp);
}
db->db_state = DB_CACHED;
@ -1633,7 +1633,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
* If this is not true it indicates tampering and we report an error.
*/
if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth);
spa_log_error(db->db_objset->os_spa, &zb,
BP_GET_LOGICAL_BIRTH(bpp));
err = SET_ERROR(EIO);
goto early_unlock;
}
@ -2832,7 +2833,7 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
dl = &dr->dt.dl;
dl->dr_overridden_by = *bp;
dl->dr_override_state = DR_OVERRIDDEN;
dl->dr_overridden_by.blk_birth = dr->dr_txg;
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
boolean_t
@ -2909,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
dl->dr_overridden_by.blk_birth = dr->dr_txg;
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
void
@ -4174,21 +4175,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
return (dbi->db_objset);
}
dnode_t *
dmu_buf_dnode_enter(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
DB_DNODE_ENTER(dbi);
return (DB_DNODE(dbi));
}
void
dmu_buf_dnode_exit(dmu_buf_t *db)
{
dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
DB_DNODE_EXIT(dbi);
}
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@ -4727,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
if (bp->blk_birth != 0) {
if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
@ -5014,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
drica.drica_blk_birth = bp->blk_birth;
drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
@ -5029,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg) {
BP_GET_LOGICAL_BIRTH(bp) >
ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@ -5151,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
ASSERT(db->db_level == 0 || data == db->db_buf);
ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?

View file

@ -437,7 +437,7 @@ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
for (int d = 0; d < SPA_DVAS_PER_BP; d++)
ddp->ddp_dva[d] = bp->blk_dva[d];
ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
}
void
@ -485,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
BP_GET_BIRTH(bp) == ddp->ddp_phys_birth)
return (ddp);
}
return (NULL);

View file

@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
int64_t level2 = level;
uint64_t start, end, start2, end2;
if (dmu_prefetch_max == 0 || len == 0) {
dmu_prefetch_dnode(os, object, pri);
@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
if (dnode_hold(os, object, FTAG, &dn) != 0)
return;
dmu_prefetch_by_dnode(dn, level, offset, len, pri);
dnode_rele(dn, FTAG);
}
void
dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
int64_t level2 = level;
uint64_t start, end, start2, end2;
/*
* Depending on len we may do two prefetches: blocks [start, end) at
* level, and following blocks [start2, end2) at higher level2.
@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
for (uint64_t i = start2; i < end2; i++)
dbuf_prefetch(dn, level2, i, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
}
/*
@ -1619,7 +1627,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
* it's an old style hole.
*/
if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
dr->dt.dl.dr_overridden_by.blk_birth == 0)
BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
BP_ZERO(&dr->dt.dl.dr_overridden_by);
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@ -1650,7 +1658,7 @@ dmu_sync_late_arrival_done(zio_t *zio)
blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
@ -2257,11 +2265,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
if (bp == NULL) {
/*
* The block was created in this transaction group,
* so it has no BP yet.
* The file size was increased, but the block was never
* written, otherwise we would either have the block
* pointer or the dirty record and would not get here.
* It is effectively a hole, so report it as such.
*/
error = SET_ERROR(EAGAIN);
goto out;
BP_ZERO(&bps[i]);
continue;
}
/*
* Make sure we clone only data blocks.
@ -2277,11 +2287,11 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
* operation into ZIL, or it may be impossible to replay, since
* the block may appear not yet allocated at that point.
*/
if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
error = SET_ERROR(EINVAL);
goto out;
}
if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
error = SET_ERROR(EAGAIN);
goto out;
}
@ -2353,18 +2363,17 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
dl->dr_overridden_by = *bp;
dl->dr_brtwrite = B_TRUE;
dl->dr_override_state = DR_OVERRIDDEN;
if (BP_IS_HOLE(bp)) {
dl->dr_overridden_by.blk_birth = 0;
dl->dr_overridden_by.blk_phys_birth = 0;
} else {
dl->dr_overridden_by.blk_birth = dr->dr_txg;
if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
if (!BP_IS_EMBEDDED(bp)) {
dl->dr_overridden_by.blk_phys_birth =
BP_PHYSICAL_BIRTH(bp);
BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
BP_GET_BIRTH(bp));
} else {
BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
dr->dr_txg);
}
}
dl->dr_brtwrite = B_TRUE;
dl->dr_override_state = DR_OVERRIDDEN;
mutex_exit(&db->db_mtx);
@ -2563,6 +2572,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
EXPORT_SYMBOL(dmu_buf_rele_array);
EXPORT_SYMBOL(dmu_prefetch);
EXPORT_SYMBOL(dmu_prefetch_by_dnode);
EXPORT_SYMBOL(dmu_prefetch_dnode);
EXPORT_SYMBOL(dmu_free_range);
EXPORT_SYMBOL(dmu_free_long_range);
EXPORT_SYMBOL(dmu_free_long_object);

View file

@@ -1352,8 +1352,10 @@ corrective_read_done(zio_t *zio)
{
cr_cb_data_t *data = zio->io_private;
/* Corruption corrected; update error log if needed */
if (zio->io_error == 0)
spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth);
if (zio->io_error == 0) {
spa_remove_error(data->spa, &data->zb,
BP_GET_LOGICAL_BIRTH(zio->io_bp));
}
kmem_free(data, sizeof (cr_cb_data_t));
abd_free(zio->io_abd);
}
@@ -1480,8 +1482,9 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw,
}
rrd->abd = abd;
io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd,
BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb);
io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp,
abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags,
&zb);
ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) ||
abd_get_size(abd) == BP_GET_PSIZE(bp));
@@ -2110,6 +2113,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
dmu_buf_rele(db, FTAG);
dnode_rele(dn, FTAG);
}
/*
* If the receive fails, we want the resume stream to start with the
* same record that we last successfully received. There is no way to
* request resume from the object record, but we can benefit from the
* fact that sender always sends object record before anything else,
* after which it will "resend" data at offset 0 and resume normally.
*/
save_resume_state(rwa, drro->drr_object, 0, tx);
dmu_tx_commit(tx);
return (0);
@@ -2343,7 +2356,6 @@ receive_process_write_record(struct receive_writer_arg *rwa,
if (rwa->heal) {
blkptr_t *bp;
dmu_buf_t *dbp;
dnode_t *dn;
int flags = DB_RF_CANFAIL;
if (rwa->raw)
@@ -2375,19 +2387,15 @@ receive_process_write_record(struct receive_writer_arg *rwa,
dmu_buf_rele(dbp, FTAG);
return (err);
}
dn = dmu_buf_dnode_enter(dbp);
/* Make sure the on-disk block and recv record sizes match */
if (drrw->drr_logical_size !=
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) {
if (drrw->drr_logical_size != dbp->db_size) {
err = ENOTSUP;
dmu_buf_dnode_exit(dbp);
dmu_buf_rele(dbp, FTAG);
return (err);
}
/* Get the block pointer for the corrupted block */
bp = dmu_buf_get_blkptr(dbp);
err = do_corrective_recv(rwa, drrw, rrd, bp);
dmu_buf_dnode_exit(dbp);
dmu_buf_rele(dbp, FTAG);
return (err);
}

View file

@@ -619,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
/* See comment in dump_dnode() for full details */
if (zfs_send_unmodified_spill_blocks &&
(bp->blk_birth <= dscp->dsc_fromtxg)) {
(BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) {
drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
}
@@ -804,7 +804,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
*/
if (zfs_send_unmodified_spill_blocks &&
(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
(DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
(BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) {
struct send_range record;
blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
@@ -1123,7 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
*/
if (sta->os->os_encrypted &&
!BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
spa_log_error(spa, zb, &bp->blk_birth);
spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
return (SET_ERROR(EIO));
}

View file

@@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
if (claim_txg == 0 &&
BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa))
return (-1);
SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -108,7 +109,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
return (0);
ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -192,7 +193,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
*/
if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE)
return (B_FALSE);
if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg)
return (B_FALSE);
if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
return (B_FALSE);
@@ -235,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
ASSERT(0);
}
if (bp->blk_birth == 0) {
if (BP_GET_LOGICAL_BIRTH(bp) == 0) {
/*
* Since this block has a birth time of 0 it must be one of
* two things: a hole created before the
@@ -263,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
zb->zb_object == DMU_META_DNODE_OBJECT) &&
td->td_hole_birth_enabled_txg <= td->td_min_txg)
return (0);
} else if (bp->blk_birth <= td->td_min_txg) {
} else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) {
return (0);
}
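/*
 * Editor's illustration (not part of the patch): the two filters above
 * reduce to one predicate. A bp is skipped when it is provably older
 * than the traversal's lower bound; birth-0 holes need the extra
 * hole_birth awareness handled by the surrounding code.
 */
static boolean_t
bp_older_than_min_txg_example(const blkptr_t *bp, uint64_t min_txg)
{
	return (BP_GET_LOGICAL_BIRTH(bp) != 0 &&
	    BP_GET_LOGICAL_BIRTH(bp) <= min_txg);
}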

View file

@@ -2557,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
}
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
db->db_blkptr->blk_birth <= txg ||
BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
@@ -2605,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
(hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
break;
if (inc > 0 || *offset > 0)
*offset += inc;

View file

@@ -1520,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* If the block was live (referenced) at the time of this
* bookmark, add its space to the bookmark's FBN.
*/
if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
if (BP_GET_LOGICAL_BIRTH(bp) <=
dbn->dbn_phys.zbm_creation_txg &&
(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
mutex_enter(&dbn->dbn_lock);
dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=

View file

@@ -156,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
return;
}
ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
@@ -190,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -236,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
mutex_exit(&ds->ds_remap_deadlist_lock);
BP_ZERO(&fakebp);
fakebp.blk_birth = birth;
BP_SET_LOGICAL_BIRTH(&fakebp, birth);
DVA_SET_VDEV(dva, vdev);
DVA_SET_OFFSET(dva, offset);
DVA_SET_ASIZE(dva, size);
@@ -259,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
return (0);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(bp->blk_birth <= tx->tx_txg);
ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
@@ -277,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
* they do not need to be freed.
*/
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
bp->blk_birth > ds->ds_dir->dd_origin_txg &&
BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg &&
!(BP_IS_EMBEDDED(bp))) {
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
@@ -285,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
bplist_append(&ds->ds_dir->dd_pending_frees, bp);
}
if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object);
@@ -317,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
ASSERT3U(ds->ds_prev->ds_object, ==,
dsl_dataset_phys(ds)->ds_prev_snap_obj);
ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
/* if (logical birth > prev prev snap txg) prev unique += bs */
if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object && bp->blk_birth >
ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) >
dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
mutex_enter(&ds->ds_prev->ds_lock);
dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) {
dsl_dir_transfer_space(ds->ds_dir, used,
DD_USED_HEAD, DD_USED_SNAP, tx);
}
@@ -2895,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
if (snap == NULL)
return (B_FALSE);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
birth = dsl_dataset_get_blkptr(ds)->blk_birth;
birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
objset_t *os, *os_snap;

View file

@@ -474,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
dle_tofind.dle_mintxg = bp->blk_birth;
dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp);
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
@@ -483,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
if (dle == NULL) {
zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
bp, (longlong_t)bp->blk_birth);
bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp));
dle = avl_first(&dl->dl_tree);
}
@@ -1039,8 +1039,7 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp));
ASSERT3U(BP_GET_CHECKSUM(bp), ==,
BP_GET_CHECKSUM(&found->le_bp));
ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==,
BP_PHYSICAL_BIRTH(&found->le_bp));
ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp));
}
if (bp_freed) {
if (found == NULL) {

View file

@@ -132,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
ASSERT(!BP_IS_HOLE(bp));
if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) <=
dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
if (poa->ds_prev && !poa->after_branch_point &&
bp->blk_birth >
BP_GET_LOGICAL_BIRTH(bp) >
dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
bp_get_dsize_sync(dp->dp_spa, bp);
@@ -313,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
@@ -727,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
ASSERT(zilog == NULL);
ASSERT3U(bp->blk_birth, >,
ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >,
dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
@@ -1017,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
ASSERT(ds->ds_prev == NULL ||
dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=,
tx->tx_txg);
rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

View file

@@ -1047,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
* will be wrong.
*/
rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp));
rrw_exit(&ds->ds_bp_rwlock, FTAG);
/* The origin doesn't get attached to itself */

View file

@@ -429,8 +429,8 @@ sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
memset(bp, 0, sizeof (*bp));
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth);
BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth);
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
@@ -444,8 +444,8 @@ static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp);
sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp);
sio->sio_cksum = bp->blk_cksum;
sio->sio_nr_dvas = BP_GET_NDVAS(bp);
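/*
 * Editor's sketch (not part of the patch): bp2sio()/sio2bp() above are
 * intended to be inverses for the fields a scan cares about, and the
 * new accessors keep both birth times across the round trip:
 */
static void
sio_round_trip_example(const blkptr_t *bp, int dva_i)
{
	scan_io_t sio;
	blkptr_t out;

	bp2sio(bp, &sio, dva_i);
	sio2bp(&sio, &out);
	ASSERT3U(BP_GET_LOGICAL_BIRTH(&out), ==, BP_GET_LOGICAL_BIRTH(bp));
	ASSERT3U(BP_GET_PHYSICAL_BIRTH(&out), ==, BP_GET_PHYSICAL_BIRTH(bp));
}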
@@ -1721,7 +1721,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
zbookmark_phys_t zb;
ASSERT(!BP_IS_REDACTED(bp));
if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
if (BP_IS_HOLE(bp) ||
BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1730,7 +1731,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
* (on-disk) even if it hasn't been claimed (even though for
* scrub there's nothing to do to it).
*/
if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
if (claim_txg == 0 &&
BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa))
return (0);
SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1756,7 +1758,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
ASSERT(!BP_IS_REDACTED(bp));
if (BP_IS_HOLE(bp) ||
bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -1764,7 +1766,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
* already txg sync'ed (but this log block contains
* other records that are not synced)
*/
if (claim_txg == 0 || bp->blk_birth < claim_txg)
if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg)
return (0);
ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
@@ -1903,7 +1905,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
return;
if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
if (BP_IS_HOLE(bp) ||
BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg ||
(BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET))
return;
@@ -2174,7 +2177,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
if (dnp != NULL &&
dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) {
scn->scn_phys.scn_errors++;
spa_log_error(spa, zb, &bp->blk_birth);
spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
return (SET_ERROR(EINVAL));
}
@@ -2270,7 +2273,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
* by arc_read() for the cases above.
*/
scn->scn_phys.scn_errors++;
spa_log_error(spa, zb, &bp->blk_birth);
spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp));
return (SET_ERROR(EINVAL));
}
@@ -2347,7 +2350,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
if (f != SPA_FEATURE_NONE)
ASSERT(dsl_dataset_feature_is_active(ds, f));
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) {
scn->scn_lt_min_this_txg++;
return;
}
@@ -2373,7 +2376,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb,
* Don't scan it now unless we need to because something
* under it was modified.
*/
if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
scn->scn_gt_max_this_txg++;
return;
}
@@ -4714,7 +4717,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
uint64_t phys_birth = BP_GET_BIRTH(bp);
size_t psize = BP_GET_PSIZE(bp);
boolean_t needs_io = B_FALSE;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;

View file

@@ -5495,8 +5495,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
DVA_GET_VDEV(&bp->blk_dva[0]));
vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
uint64_t physical_birth = vdev_indirect_births_physbirth(vib,
DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
BP_SET_PHYSICAL_BIRTH(bp, physical_birth);
DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
DVA_SET_OFFSET(&bp->blk_dva[0], offset);
@@ -5845,8 +5846,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
int error = 0;
ASSERT(bp->blk_birth == 0);
ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
ASSERT0(BP_GET_LOGICAL_BIRTH(bp));
ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
@@ -5900,7 +5901,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
int ndvas = BP_GET_NDVAS(bp);
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa));
/*
* If we have a checkpoint for the pool we need to make sure that
@@ -5918,7 +5919,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
* normally as they will be referenced by the checkpointed uberblock.
*/
boolean_t checkpoint = B_FALSE;
if (bp->blk_birth <= spa->spa_checkpoint_txg &&
if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg &&
spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
/*
* At this point, if the block is part of the checkpoint

View file

@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2013 Saso Kiselkov. All rights reserved.
@@ -2655,8 +2655,8 @@ spa_claim_notify(zio_t *zio)
return;
mutex_enter(&spa->spa_props_lock); /* any mutex will do */
if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
spa->spa_claim_max_txg = zio->io_bp->blk_birth;
if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
mutex_exit(&spa->spa_props_lock);
}
@@ -6266,7 +6266,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
nvlist_t *nvl;
if (props == NULL ||
nvlist_lookup_string(props, "tname", &poolname) != 0)
nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
poolname = (char *)pool;
/*
@@ -7082,7 +7083,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
* Add a device to a storage pool.
*/
int
spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
{
uint64_t txg, ndraid = 0;
int error;
@@ -7173,6 +7174,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
}
if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
if (tvd->vdev_ashift != spa->spa_max_ashift) {
return (spa_vdev_exit(spa, vd, txg,
ZFS_ERR_ASHIFT_MISMATCH));
}
}
}
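/*
 * Editor's note in code form (hypothetical helper, not in the patch):
 * the check above only fires while the pool still has one uniform
 * ashift (spa_max_ashift == spa_min_ashift); a new top-level vdev is
 * acceptable when it would not introduce a mismatch.
 */
static boolean_t
new_tvd_ashift_ok_example(const spa_t *spa, const vdev_t *tvd)
{
	return (spa->spa_max_ashift != spa->spa_min_ashift ||
	    tvd->vdev_ashift == spa->spa_max_ashift);
}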
for (int c = 0; c < vd->vdev_children; c++) {
tvd = vd->vdev_child[c];
vdev_remove_child(vd, tvd);
@@ -9801,7 +9812,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
* don't want to rely on that here).
*/
if (pass == 1 &&
spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
!dmu_objset_is_dirty(mos, txg)) {
/*
* Nothing changed on the first pass, therefore this

View file

@@ -180,7 +180,7 @@ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds)
* during spa_errlog_sync().
*/
void
spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth)
{
spa_error_entry_t search;
spa_error_entry_t *new;
@@ -223,13 +223,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
new->se_zep.zb_object = zb->zb_object;
new->se_zep.zb_level = zb->zb_level;
new->se_zep.zb_blkid = zb->zb_blkid;
/*
* birth may end up being NULL, e.g. in zio_done(). We
* will handle this in process_error_block().
*/
if (birth != NULL)
new->se_zep.zb_birth = *birth;
new->se_zep.zb_birth = birth;
}
avl_insert(tree, new, where);
@@ -258,7 +252,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
if (error == 0 && BP_IS_HOLE(&bp))
error = SET_ERROR(ENOENT);
*birth_txg = bp.blk_birth;
*birth_txg = BP_GET_LOGICAL_BIRTH(&bp);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
return (error);
@@ -535,7 +529,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
*/
zbookmark_phys_t zb;
zep_to_zb(head_ds, zep, &zb);
spa_remove_error(spa, &zb, &zep->zb_birth);
spa_remove_error(spa, &zb, zep->zb_birth);
}
return (error);
@@ -563,7 +557,7 @@ spa_get_last_errlog_size(spa_t *spa)
*/
static void
spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
const uint64_t *birth)
const uint64_t birth)
{
char name[NAME_MAX_LEN];
@@ -618,11 +612,7 @@ spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb,
healed_zep.zb_object = healed_zb->zb_object;
healed_zep.zb_level = healed_zb->zb_level;
healed_zep.zb_blkid = healed_zb->zb_blkid;
if (birth != NULL)
healed_zep.zb_birth = *birth;
else
healed_zep.zb_birth = 0;
healed_zep.zb_birth = birth;
errphys_to_name(&healed_zep, name, sizeof (name));
@@ -742,7 +732,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
* later in spa_remove_healed_errors().
*/
void
spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth)
spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth)
{
spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth);
spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth);
@@ -890,7 +880,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj,
if (error == EACCES)
error = 0;
else if (!error)
zep.zb_birth = bp.blk_birth;
zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);

View file

@@ -783,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
* request of flushing everything before we attempt to return
* immediately.
*/
if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
!dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
!spa_flush_all_logs_requested(spa))
return;

View file

@@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
}
ub->ub_checkpoint_txg = 0;
return (ub->ub_rootbp.blk_birth == txg);
return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg);
}

View file

@@ -531,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio)
uint64_t txg = zio->io_txg;
int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg);
lowest_load = INT_MAX;
mm->mm_preferred_cnt = 0;

View file

@@ -2190,12 +2190,11 @@ vdev_raidz_close(vdev_t *vd)
/*
* Return the logical width to use, given the txg in which the allocation
* happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the
* happened. Note that BP_GET_BIRTH() is usually the txg in which the
* BP was allocated. Remapped BP's (that were relocated due to device
* removal, see remap_blkptr_cb()), will have a more recent
* BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can
* ignore these because they can't be on RAIDZ (device removal doesn't
* support RAIDZ).
* removal, see remap_blkptr_cb()), will have a more recent physical birth
* which reflects when the BP was relocated, but we can ignore these because
* they can't be on RAIDZ (device removal doesn't support RAIDZ).
*/
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
@@ -2295,7 +2294,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
logical_rs.rs_start = rr->rr_offset;
logical_rs.rs_end = logical_rs.rs_start +
vdev_raidz_asize(zio->io_vd, rr->rr_size,
BP_PHYSICAL_BIRTH(zio->io_bp));
BP_GET_BIRTH(zio->io_bp));
raidz_col_t *rc = &rr->rr_col[col];
vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
@@ -2518,7 +2517,7 @@ vdev_raidz_io_start(zio_t *zio)
raidz_map_t *rm;
uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
BP_PHYSICAL_BIRTH(zio->io_bp));
BP_GET_BIRTH(zio->io_bp));
if (logical_width != vdrz->vd_physical_width) {
zfs_locked_range_t *lr = NULL;
uint64_t synced_offset = UINT64_MAX;

View file

@@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
* set up block 1 - the first leaf
*/
dmu_buf_t *db;
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db, tx);
@@ -182,7 +182,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
tbl->zt_blk << bs, tbl->zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
}
@@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
uint64_t b = tbl->zt_blks_copied;
dmu_buf_t *db_old;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
/* first half of entries in old[b] go to new[2*b+0] */
dmu_buf_t *db_new;
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
(newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
@@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off = idx & ((1<<(bs-3))-1);
dmu_buf_t *db;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
@@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
uint64_t off2 = idx2 & ((1<<(bs-3))-1);
dmu_buf_t *db2;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
DMU_READ_NO_PREFETCH);
if (err != 0) {
@@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
uint64_t blk = idx >> (bs-3);
uint64_t off = idx & ((1<<(bs-3))-1);
/*
* Note: this is equivalent to dmu_buf_hold(), but we use
* _dnode_enter / _by_dnode because it's faster because we don't
* have to hold the dnode.
*/
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
dmu_buf_t *db;
int err = dmu_buf_hold_by_dnode(dn,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
*/
blk = (idx*2) >> (bs-3);
dn = dmu_buf_dnode_enter(zap->zap_dbuf);
err = dmu_buf_hold_by_dnode(dn,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
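/*
 * Editor's sketch of the pattern adopted throughout this file (not part
 * of the patch): once the zap keeps its dnode pinned in zap_dnode, the
 * _by_dnode buffer interfaces avoid re-resolving the object number on
 * every block access:
 */
static int
hold_zap_block_example(zap_t *zap, uint64_t blk, int bs)
{
	dmu_buf_t *db;
	int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blk << bs, FTAG,
	    &db, DMU_READ_NO_PREFETCH);

	if (err != 0)
		return (err);
	/* ... read or dirty db->db_data ... */
	dmu_buf_rele(db, FTAG);
	return (0);
}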
@@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
uint64_t newblk = zap_allocate_blocks(zap, 1);
dmu_buf_t *db_new;
int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
DMU_READ_NO_PREFETCH);
if (err != 0)
@@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode,
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
DMU_READ_NO_PREFETCH));
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
@@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
return (SET_ERROR(ENOENT));
int bs = FZAP_BLOCK_SHIFT(zap);
dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
int err = dmu_buf_hold_by_dnode(dn,
int err = dmu_buf_hold_by_dnode(zap->zap_dnode,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
dmu_buf_dnode_exit(zap->zap_dbuf);
if (err != 0)
return (err);
@@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn)
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
int bs = FZAP_BLOCK_SHIFT(zap);
dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs,
ZIO_PRIORITY_SYNC_READ);
}
@@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
*/
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0,
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
ZIO_PRIORITY_ASYNC_READ);
}
@@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
} else {
dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
dmu_prefetch_by_dnode(zap->zap_dnode, 0,
zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
ZIO_PRIORITY_SYNC_READ);
@@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
dmu_buf_t *db;
int err;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
err = dmu_buf_hold_by_dnode(zap->zap_dnode,
(zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {

View file

@@ -41,7 +41,8 @@
#include <sys/zap_leaf.h>
#include <sys/arc.h>
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le,
uint16_t entry);
#define CHAIN_END 0xffff /* end of the chunk chain */
@@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
static void
zap_memset(void *a, int c, size_t n)
{
char *cp = a;
char *cpend = cp + n;
while (cp < cpend)
*cp++ = c;
}
static void
stv(int len, void *addr, uint64_t value)
{
@@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value)
*(uint64_t *)addr = value;
return;
default:
cmn_err(CE_PANIC, "bad int len %d", len);
PANIC("bad int len %d", len);
}
}
@@ -96,13 +87,13 @@ ldv(int len, const void *addr)
case 8:
return (*(uint64_t *)addr);
default:
cmn_err(CE_PANIC, "bad int len %d", len);
PANIC("bad int len %d", len);
}
return (0xFEEDFACEDEADBEEFULL);
}
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size)
{
zap_leaf_t l;
dmu_buf_t l_dbuf;
@@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
struct zap_leaf_entry *le;
@@ -160,11 +151,11 @@ void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
memset(&zap_leaf_phys(l)->l_hdr, 0,
sizeof (struct zap_leaf_header));
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
}
@@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l)
{
ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
@@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf,
{
uint16_t chunk_head;
uint16_t *chunkp = &chunk_head;
int byten = 0;
int byten = integer_size;
uint64_t value = 0;
int shift = (integer_size - 1) * 8;
int len = num_integers;
ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
if (len > 0)
value = ldv(integer_size, buf);
while (len > 0) {
uint16_t chunk = zap_leaf_chunk_alloc(l);
struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
la->la_type = ZAP_CHUNK_ARRAY;
for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
if (byten == 0)
value = ldv(integer_size, buf);
la->la_array[i] = value >> shift;
value <<= 8;
if (++byten == integer_size) {
byten = 0;
buf += integer_size;
if (--byten == 0) {
if (--len == 0)
break;
byten = integer_size;
buf += integer_size;
value = ldv(integer_size, buf);
}
}
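/*
 * Editor's illustration (not part of the patch): the reworked loop
 * above streams each native integer out big-endian, one byte per
 * chunk slot. The same idea in isolation, for a single integer:
 */
static void
emit_big_endian_example(uint64_t value, int integer_size, uint8_t *out)
{
	int shift = (integer_size - 1) * 8;

	for (int i = 0; i < integer_size; i++) {
		out[i] = value >> shift;	/* high-order byte first */
		value <<= 8;
	}
}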
@@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
*chunkp = CHAIN_END;
while (chunk != CHAIN_END) {
int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
ZAP_CHUNK_ARRAY);
zap_leaf_chunk_free(l, chunk);
@@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
static boolean_t
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
int chunk, int array_numints)
uint_t chunk, int array_numints)
{
int bseen = 0;
@@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
uint64_t valuelen = integer_size * num_integers;
int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
return (SET_ERROR(E2BIG));
@@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
/* link it into the hash chain */
/* XXX if we did the search above, we could just use that */
uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk);
zap_leaf_phys(l)->l_hdr.lh_nentries++;
@@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
*/
static uint16_t *
zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry)
{
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
struct zap_leaf_entry *le2;
uint16_t *chunkp;
@@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
&ZAP_LEAF_CHUNK(nl, nchunk).l_array;
struct zap_leaf_array *la =
&ZAP_LEAF_CHUNK(l, chunk).l_array;
int nextchunk = la->la_next;
uint_t nextchunk = la->la_next;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
@@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
}
static void
zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl)
{
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
@@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
*nle = *le; /* structure assignment */
(void) zap_leaf_rehash_entry(nl, chunk);
(void) zap_leaf_rehash_entry(nl, nle, chunk);
nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
nle->le_value_chunk =
@@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* set new prefix and prefix_len */
zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
@@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* break existing hash chains */
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
2*ZAP_LEAF_HASH_NUMENTRIES(l));
if (sort)
@@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
* but this accesses memory more sequentially, and when we're
* called, the block is usually pretty full.
*/
for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
if (le->le_type != ZAP_CHUNK_ENTRY)
continue;
@@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
if (le->le_hash & (1ULL << bit))
zap_leaf_transfer_entry(l, i, nl);
else
(void) zap_leaf_rehash_entry(l, i);
(void) zap_leaf_rehash_entry(l, le, i);
}
}
void
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
zap_leaf_phys(l)->l_hdr.lh_prefix_len;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_leafs_with_2n_pointers[n]++;
@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_n_tenths_full[n]++;
for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
int nentries = 0;
int chunk = zap_leaf_phys(l)->l_hash[i];
for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
uint_t nentries = 0;
uint_t chunk = zap_leaf_phys(l)->l_hash[i];
while (chunk != CHAIN_END) {
struct zap_leaf_entry *le =

View file

@@ -415,7 +415,7 @@ mze_destroy(zap_t *zap)
}
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
mzap_open(dmu_buf_t *db)
{
zap_t *winner;
uint64_t *zap_hdr = (uint64_t *)db->db_data;
@@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
rw_enter(&zap->zap_rwlock, RW_WRITER);
zap->zap_objset = os;
zap->zap_object = obj;
zap->zap_objset = dmu_buf_get_objset(db);
zap->zap_object = db->db_object;
zap->zap_dbuf = db;
if (zap_block_type != ZBT_MICRO) {
@@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
* have the specified tag.
*/
static int
zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
ASSERT0(db->db_offset);
@@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
*zapp = NULL;
dmu_object_info_from_db(db, &doi);
dmu_object_info_from_dnode(dn, &doi);
if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
return (SET_ERROR(EINVAL));
zap_t *zap = dmu_buf_get_user(db);
if (zap == NULL) {
zap = mzap_open(os, obj, db);
zap = mzap_open(db);
if (zap == NULL) {
/*
* mzap_open() didn't like what it saw on-disk.
@@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
}
zap->zap_objset = os;
zap->zap_dnode = dn;
if (lt == RW_WRITER)
dmu_buf_will_dirty(db, tx);
@@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
zap_t **zapp)
{
dmu_buf_t *db;
int err;
int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0)
return (err);
}
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(db, &doi);
ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
}
#endif
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
dmu_buf_rele(db, tag);
}
else
VERIFY(dnode_add_ref(dn, tag));
return (err);
}
@@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
zap_t **zapp)
{
dnode_t *dn;
dmu_buf_t *db;
int err;
int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
err = dnode_hold(os, obj, tag, &dn);
if (err != 0)
return (err);
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
dmu_object_info_from_db(db, &doi);
ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
if (err != 0) {
dnode_rele(dn, tag);
return (err);
}
#endif
err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0)
err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
if (err != 0) {
dmu_buf_rele(db, tag);
dnode_rele(dn, tag);
}
return (err);
}
@@ -645,6 +641,7 @@ void
zap_unlockdir(zap_t *zap, const void *tag)
{
rw_exit(&zap->zap_rwlock);
dnode_rele(zap->zap_dnode, tag);
dmu_buf_rele(zap->zap_dbuf, tag);
}
@@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
if (flags != 0) {
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
VERIFY(dnode_add_ref(dn, FTAG));
VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
B_FALSE, B_FALSE, &zap));
VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
zap_unlockdir(zap, FTAG);
@@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key,
return (err);
}
static int
zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx, const void *tag)
{
int err;
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap, tag);
return (err);
}
int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
@@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
zap_unlockdir(zap, FTAG);
err = zap_add_uint64_impl(zap, key, key_numints,
integer_size, num_integers, val, tx, FTAG);
/* zap_add_uint64_impl() calls zap_unlockdir() */
return (err);
}
int
zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
int key_numints, int integer_size, uint64_t num_integers,
const void *val, dmu_tx_t *tx)
{
zap_t *zap;
int err =
zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_add_uint64_impl(zap, key, key_numints,
integer_size, num_integers, val, tx, FTAG);
/* zap_add_uint64_impl() calls zap_unlockdir() */
return (err);
}
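/*
 * Editor's usage sketch (hypothetical caller, not part of the patch):
 * the new *_by_dnode variants mirror their objset counterparts, e.g.:
 */
static int
add_uint64_entry_example(dnode_t *zap_dn, dmu_tx_t *tx)
{
	uint64_t key[2] = { 123, 456 };
	uint64_t value = 789;

	return (zap_add_uint64_by_dnode(zap_dn, key, 2,
	    sizeof (uint64_t), 1, &value, tx));
}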
@@ -1396,10 +1424,30 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name,
return (err);
}
static int
zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
const void *tag)
{
int err;
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap, tag);
return (err);
}
int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
int key_numints, int integer_size, uint64_t num_integers, const void *val,
dmu_tx_t *tx)
{
zap_t *zap;
@@ -1407,16 +1455,25 @@ zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
zap_unlockdir(zap, FTAG);
err = zap_update_uint64_impl(zap, key, key_numints,
integer_size, num_integers, val, tx, FTAG);
/* zap_update_uint64_impl() calls zap_unlockdir() */
return (err);
}
int
zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
zap_t *zap;
int err =
zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_update_uint64_impl(zap, key, key_numints,
integer_size, num_integers, val, tx, FTAG);
/* zap_update_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
return (err);
}
static int
zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
dmu_tx_t *tx, const void *tag)
{
int err;
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, tag);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
zap_unlockdir(zap, tag);
return (err);
}
int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints, dmu_tx_t *tx)
@@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
zap_unlockdir(zap, FTAG);
err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
/* zap_remove_uint64_impl() calls zap_unlockdir() */
return (err);
}
int
zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
dmu_tx_t *tx)
{
zap_t *zap;
int err =
zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err != 0)
return (err);
err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
/* zap_remove_uint64_impl() calls zap_unlockdir() */
return (err);
}
@@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);

View file

@@ -27,7 +27,7 @@
* Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2011, 2024 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
@@ -1886,7 +1886,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config);
if (error == 0) {
error = spa_vdev_add(spa, config);
error = spa_vdev_add(spa, config, zc->zc_flags);
nvlist_free(config);
}
spa_close(spa, FTAG);

View file

@@ -123,7 +123,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
/* Flush any mmap()'d data to disk */
if (zn_has_cached_data(zp, 0, file_sz - 1))
zn_flush_cached_data(zp, B_FALSE);
zn_flush_cached_data(zp, B_TRUE);
lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
@@ -1187,6 +1187,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
}
}
/* Flush any mmap()'d data to disk */
if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
zn_flush_cached_data(inzp, B_TRUE);
/*
* Maintain predictable lock order.
*/

View file

@@ -557,7 +557,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* that we rewind to is invalid. Thus, we return -1 so
* zil_parse() doesn't attempt to read it.
*/
if (bp->blk_birth >= first_txg)
if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg)
return (-1);
if (zil_bp_tree_add(zilog, bp) != 0)
@@ -583,7 +583,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg ||
zil_bp_tree_add(zilog, bp) != 0)
return (0);
@@ -608,7 +608,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
* waited for all writes to be stable first), so it is semantically
* correct to declare this the end of the log.
*/
if (lr->lr_blkptr.blk_birth >= first_txg) {
if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) {
error = zil_read_log_data(zilog, lr, NULL);
if (error != 0)
return (error);
@@ -655,7 +655,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
* just in case lets be safe and just stop here now instead of
* corrupting the pool.
*/
if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
if (BP_GET_BIRTH(bp) >= first_txg)
return (SET_ERROR(ENOENT));
/*
@@ -710,8 +710,8 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
/*
* If we previously claimed it, we need to free it.
*/
if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
!BP_IS_HOLE(bp)) {
if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg &&
zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) {
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
}
@@ -1965,7 +1965,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
&slog);
}
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg);
BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
ZIO_CHECKSUM_ZILOG);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;

View file

@@ -613,7 +613,7 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
zio->io_error = SET_ERROR(EIO);
if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
spa_log_error(spa, &zio->io_bookmark,
&zio->io_bp->blk_birth);
BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
spa, NULL, &zio->io_bookmark, zio, 0);
}
@@ -1052,8 +1052,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
(long long)bp->blk_prop,
(long long)bp->blk_pad[0],
(long long)bp->blk_pad[1],
(long long)bp->blk_phys_birth,
(long long)bp->blk_birth,
(long long)BP_GET_PHYSICAL_BIRTH(bp),
(long long)BP_GET_LOGICAL_BIRTH(bp),
(long long)bp->blk_fill,
(long long)bp->blk_cksum.zc_word[0],
(long long)bp->blk_cksum.zc_word[1],
@@ -1156,10 +1156,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
/*
* Pool-specific checks.
*
* Note: it would be nice to verify that the blk_birth and
* BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
* allows the birth time of log blocks (and dmu_sync()-ed blocks
* that are in the log) to be arbitrarily large.
* Note: it would be nice to verify that the logical birth
* and physical birth are not too large. However,
* spa_freeze() allows the birth time of log blocks (and
* dmu_sync()-ed blocks that are in the log) to be arbitrarily
* large.
*/
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
const dva_t *dva = &bp->blk_dva[i];
@@ -1246,7 +1247,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
{
zio_t *zio;
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp,
data, size, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
@@ -1435,7 +1436,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
* starts allocating blocks -- so that nothing is allocated twice.
* If txg == 0 we just verify that the block is claimable.
*/
ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <,
spa_min_claim_txg(spa));
ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
@@ -1731,7 +1732,7 @@ zio_write_bp_init(zio_t *zio)
blkptr_t *bp = zio->io_bp;
zio_prop_t *zp = &zio->io_prop;
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg);
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
@@ -1819,7 +1820,7 @@ zio_write_compress(zio_t *zio)
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) {
/*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
@@ -1866,7 +1867,7 @@ zio_write_compress(zio_t *zio)
BP_SET_TYPE(bp, zio->io_prop.zp_type);
BP_SET_LEVEL(bp, zio->io_prop.zp_level);
zio_buf_free(cbuf, lsize);
bp->blk_birth = zio->io_txg;
BP_SET_LOGICAL_BIRTH(bp, zio->io_txg);
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
ASSERT(spa_feature_is_active(spa,
SPA_FEATURE_EMBEDDED_DATA));
@@ -1947,7 +1948,7 @@ zio_write_compress(zio_t *zio)
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg &&
BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
VERIFY3U(psize, !=, 0);
@@ -1961,7 +1962,7 @@ zio_write_compress(zio_t *zio)
}
if (psize == 0) {
if (zio->io_bp_orig.blk_birth != 0 &&
if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 &&
spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
BP_SET_LSIZE(bp, lsize);
BP_SET_TYPE(bp, zp->zp_type);
@@ -3539,7 +3540,7 @@ zio_ddt_write(zio_t *zio)
else
ddt_phys_addref(ddp);
} else if (zio->io_bp_override) {
ASSERT(bp->blk_birth == txg);
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg);
ASSERT(BP_EQUAL(bp, zio->io_bp_override));
ddt_phys_fill(ddp, bp);
ddt_phys_addref(ddp);
@@ -3810,11 +3811,13 @@ zio_dva_claim(zio_t *zio)
static void
zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
{
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp));
ASSERT(zio->io_bp_override == NULL);
if (!BP_IS_HOLE(bp))
metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
if (!BP_IS_HOLE(bp)) {
metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp),
B_TRUE);
}
if (gn != NULL) {
for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
@ -4555,8 +4558,8 @@ zio_ready(zio_t *zio)
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg ||
BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
zio->io_ready(zio);
@ -4852,7 +4855,7 @@ zio_done(zio_t *zio)
* error and generate a logical data ereport.
*/
spa_log_error(zio->io_spa, &zio->io_bookmark,
&zio->io_bp->blk_birth);
BP_GET_LOGICAL_BIRTH(zio->io_bp));
(void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
}

View file

@ -272,7 +272,7 @@ static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
const dva_t *dva = BP_IDENTITY(bp);
uint64_t txg = BP_PHYSICAL_BIRTH(bp);
uint64_t txg = BP_GET_BIRTH(bp);
ASSERT(BP_IS_GANG(bp));
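
For gang headers the self-checksum is salted with the block's identity, and BP_GET_BIRTH() preserves the old BP_PHYSICAL_BIRTH() semantics: the verifier must match whichever txg the header was actually written in, so a stale header left at the same vdev/offset by an earlier txg fails verification. A simplified sketch of that seeding; the four-word layout mirrors zio_cksum_t, but the helper name and signature are mine:

#include <stdint.h>

typedef struct zio_cksum {
	uint64_t zc_word[4];
} zio_cksum_t;

/* Hypothetical stand-in for the real verifier seeding. */
static void
gang_verifier_seed(zio_cksum_t *zcp, uint64_t vdev, uint64_t offset,
    uint64_t birth_txg)
{
	/*
	 * If any of vdev, offset, or birth txg disagrees with the block
	 * pointer in hand, the expected checksum changes and the read
	 * of a stale or misplaced gang header is rejected.
	 */
	zcp->zc_word[0] = vdev;
	zcp->zc_word[1] = offset;
	zcp->zc_word[2] = birth_txg;
	zcp->zc_word[3] = 0;
}

int
main(void)
{
	zio_cksum_t zc;

	gang_verifier_seed(&zc, 1, 0x400000, 205);
	return ((int)(zc.zc_word[2] != 205));
}
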

View file

@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool']
tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output']
'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output',
'zpool_add--allow-ashift-mismatch']
tags = ['functional', 'cli_root', 'zpool_add']
[tests/functional/cli_root/zpool_attach]

View file

@ -988,6 +988,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/cli_root/zpool_add/add_prop_ashift.ksh \
functional/cli_root/zpool_add/cleanup.ksh \
functional/cli_root/zpool_add/setup.ksh \
functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \
functional/cli_root/zpool_add/zpool_add_001_pos.ksh \
functional/cli_root/zpool_add/zpool_add_002_pos.ksh \
functional/cli_root/zpool_add/zpool_add_003_pos.ksh \

View file

@ -97,20 +97,19 @@ function verify_pool_prop_eq
function verify_pool_props
{
typeset -r dsize=$1
typeset -r ratio=$2
typeset -r oused=$1
typeset -r osaved=$2
typeset dsize=$3
typeset ratio=$4
if [[ $dsize -eq 0 ]]; then
verify_pool_prop_eq bcloneused 0
verify_pool_prop_eq bclonesaved 0
verify_pool_prop_eq bcloneratio 1.00
else
if [[ $ratio -eq 1 ]]; then
verify_pool_prop_eq bcloneused 0
else
verify_pool_prop_eq bcloneused $dsize
fi
verify_pool_prop_eq bclonesaved $((dsize*(ratio-1)))
ratio=1
elif [[ $ratio -eq 1 ]]; then
dsize=0
fi
verify_pool_prop_eq bcloneused $(($oused+$dsize))
verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1)))
if [[ $oused -eq 0 ]]; then
verify_pool_prop_eq bcloneratio "${ratio}.00"
fi
}
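
The rewritten helper verifies deltas rather than absolute values: starting from the saved baseline (oused, osaved), cloning dsize bytes with ratio total references should grow bcloneused by dsize and bclonesaved by dsize*(ratio-1), and bcloneratio is only checked when the baseline was empty, since clones left over from earlier runs would skew it. A worked model of that arithmetic, with illustrative values:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t oused = 0, osaved = 0;	/* baseline saved before the test */
	uint64_t dsize = 1ULL << 20;	/* 1 MiB of cloned blocks */
	uint64_t ratio = 7;		/* total references per cloned block */

	/* 1 MiB charged once, 6 MiB of copies saved, ratio 7.00 */
	printf("bcloneused  = %llu\n", (unsigned long long)(oused + dsize));
	printf("bclonesaved = %llu\n",
	    (unsigned long long)(osaved + dsize * (ratio - 1)));
	if (oused == 0)
		printf("bcloneratio = %llu.00\n", (unsigned long long)ratio);
	return (0);
}
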
@ -124,16 +123,22 @@ function bclone_test
typeset -r srcdir=$4
typeset -r dstdir=$5
typeset dsize
typeset oused
typeset osaved
typeset -r original="${srcdir}/original"
typeset -r clone="${dstdir}/clone"
log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded"
# Save current block cloning stats for later use.
sync_pool $TESTPOOL
oused=$(get_pool_prop bcloneused $TESTPOOL)
osaved=$(get_pool_prop bclonesaved $TESTPOOL)
# Create a test file with known content.
case $datatype in
random|text)
sync_pool $TESTPOOL
if [[ $datatype = "random" ]]; then
dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null
else
@ -146,13 +151,13 @@ function bclone_test
sync_pool $TESTPOOL
# It is hard to predict block sizes that will be used,
# so just do one clone and take it from bcloneused.
filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL)
dsize=$(get_pool_prop bcloneused $TESTPOOL)
dsize=$(($dsize-$oused))
if [[ $embedded = "false" ]]; then
log_must test $filesize -gt 0
log_must test $dsize -gt 0
fi
rm -f "${clone}-tmp"
sync_pool $TESTPOOL
dsize=$filesize
;;
hole)
log_must truncate_test -s $filesize -f $original
@ -217,7 +222,7 @@ function bclone_test
test_file_integrity $original_checksum "${clone}4" $filesize
test_file_integrity $original_checksum "${clone}5" $filesize
verify_pool_props $dsize 7
verify_pool_props $oused $osaved $dsize 7
# Clear cache and test after fresh import.
log_must zpool export $TESTPOOL
@ -240,7 +245,7 @@ function bclone_test
sync_pool $TESTPOOL
verify_pool_props $dsize 11
verify_pool_props $oused $osaved $dsize 11
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
@ -268,7 +273,7 @@ function bclone_test
test_file_integrity $original_checksum "${clone}8" $filesize
test_file_integrity $original_checksum "${clone}9" $filesize
verify_pool_props $dsize 6
verify_pool_props $oused $osaved $dsize 6
rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9"
@ -276,11 +281,11 @@ function bclone_test
test_file_integrity $original_checksum "${clone}6" $filesize
verify_pool_props $dsize 1
verify_pool_props $oused $osaved $dsize 1
rm -f "${clone}6"
sync_pool $TESTPOOL
verify_pool_props $dsize 1
verify_pool_props $oused $osaved $dsize 1
}

View file

@ -66,7 +66,7 @@ function bclone_corner_cases_init
export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0)
export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1)
export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2)
export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest)
export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest)
export FIRST_HALF_CHECKSUM=""
export SECOND_HALF_CHECKSUM=""
}
@ -210,6 +210,8 @@ function bclone_corner_cases_test
typeset -r dstdir=$2
typeset limit=$3
typeset -i count=0
typeset oused
typeset osaved
if [[ $srcdir != "count" ]]; then
if [[ -n "$limit" ]]; then
@ -217,6 +219,11 @@ function bclone_corner_cases_test
limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs)
fi
bclone_corner_cases_init $srcdir $dstdir
# Save current block cloning stats for later use.
sync_pool $TESTPOOL
oused=$(get_pool_prop bcloneused $TESTPOOL)
osaved=$(get_pool_prop bclonesaved $TESTPOOL)
fi
#
@ -285,21 +292,24 @@ function bclone_corner_cases_test
overwrite_clone "$second_overwrite"
if checksum_compare $read_after; then
log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after"
log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after"
else
log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after"
log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after"
fi
log_must zpool export $TESTPOOL
log_must zpool import $TESTPOOL
if checksum_compare "yes"; then
log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg"
log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg"
else
log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg"
log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg"
fi
rm -f "$CLONE"
sync_pool $TESTPOOL
verify_pool_prop_eq bcloneused $oused
verify_pool_prop_eq bclonesaved $osaved
done
done
done

View file

@ -22,7 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2
logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT)
orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT)
max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT)
opt=""
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do
#
# Need to add the --allow-ashift-mismatch option to disable the
# ashift mismatch checks in zpool add.
#
if [[ $ashift -eq $orig_ashift ]]; then
opt=""
else
opt="--allow-ashift-mismatch"
fi
log_must zpool create $TESTPOOL $disk1
log_must zpool add -o ashift=$ashift $TESTPOOL $disk2
log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2
log_must verify_ashift $disk2 $ashift
# clean things for the next run
@ -78,7 +89,7 @@ do
#
log_must zpool create $TESTPOOL $disk1
log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift
log_must zpool add $TESTPOOL $disk2
log_must zpool add $opt $TESTPOOL $disk2
exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift ))
log_must verify_ashift $disk2 $exp
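
The second loop exercises auto-detection: with no -o ashift= given, the added vdev should pick up the simulated physical ashift unless it exceeds the VDEV_MAX_AUTO_ASHIFT cap, in which case it falls back to the logical ashift, which is exactly what the exp= ternary above encodes. A one-function model of that rule (illustrative; the cap value used below is an assumption):

#include <stdint.h>
#include <stdio.h>

static uint64_t
expected_ashift(uint64_t physical, uint64_t logical, uint64_t max_auto)
{
	/* Auto-detection honors the physical ashift only up to the cap. */
	return (physical <= max_auto ? physical : logical);
}

int
main(void)
{
	/* e.g. a 64k-sector file vdev against a cap of 14: fall back to 9 */
	printf("%llu\n", (unsigned long long)expected_ashift(16, 9, 14));
	return (0);
}
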

View file

@ -22,7 +22,7 @@
#
# Copyright 2017, loli10K. All rights reserved.
# Copyright (c) 2020 by Delphix. All rights reserved.
# Copyright (c) 2020, 2024 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16
typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16")
for ashift in ${ashifts[@]}
do
if [ $ashift -eq $orig_ashift ];then
opt=""
else
opt="--allow-ashift-mismatch"
fi
log_must zpool create -o ashift=$ashift $TESTPOOL $disk1
log_must zpool add $TESTPOOL $disk2
log_must zpool add $opt $TESTPOOL $disk2
log_must verify_ashift $disk2 $ashift
# clean things for the next run
@ -82,8 +87,13 @@ for ashift in ${ashifts[@]}
do
for cmdval in ${ashifts[@]}
do
if [ $ashift -eq $cmdval ];then
opt=""
else
opt="--allow-ashift-mismatch"
fi
log_must zpool create -o ashift=$ashift $TESTPOOL $disk1
log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2
log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2
log_must verify_ashift $disk2 $cmdval
# clean things for the next run

View file

@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2
log_must zpool add -f $TESTPOOL $DISK2
log_must vdevs_in_pool $TESTPOOL $DISK2
log_must zpool destroy $TESTPOOL
create_pool $TESTPOOL mirror $DISK0 $DISK1
log_must poolexists $TESTPOOL
log_mustnot zpool add $TESTPOOL $DISK2
log_mustnot vdevs_in_pool $TESTPOOL $DISK2
log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2
log_must vdevs_in_pool $TESTPOOL $DISK2
log_pass "'zpool add -f <pool> <vdev> ...' executes successfully."

View file

@ -70,7 +70,7 @@ if is_freebsd; then
recursive=$(get_tunable VOL_RECURSIVE)
log_must set_tunable64 VOL_RECURSIVE 1
fi
log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL
log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL
log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL"

View file

@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1
unset NOINUSE_CHECK
log_mustnot zpool add -f $TESTPOOL $DISK1
log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1
log_mustnot zpool add -f $TESTPOOL $mnttab_dev
log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev
if is_linux; then
log_mustnot zpool add $TESTPOOL $vfstab_dev
else

View file

@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0
for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache"
do
log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1
log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1
log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1
log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1
done
log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \

View file

@ -138,7 +138,7 @@ function zpool_create_forced_add
while ((j < ${#add_args[@]})); do
log_must zpool create $TESTPOOL1 ${create_args[$i]}
log_mustnot zpool add $TESTPOOL1 ${add_args[$j]}
log_must zpool add -f $TESTPOOL1 ${add_args[$j]}
log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]}
log_must zpool destroy -f $TESTPOOL1
((j += 1))

View file

@ -76,7 +76,7 @@ log_onexit cleanup
SRC_FILE=src.data
DST_FILE=dst.data
SRC_SIZE=$(($RANDOM % 2048))
SRC_SIZE=$((1024 + $RANDOM % 1024))
# A smaller recordsize is used merely to speed up the test.
RECORDSIZE=4096
@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do
# Overwrite a random range of an existing file and immediately copy it.
sync_pool $TESTPOOL
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
if [[ "$mode" == "always" ]]; then
log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE
log_must ls -l $CP_TESTDIR
@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do
# Overwrite a random range of an existing file and immediately copy it.
log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \
seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc
seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc
log_must cp --reflink=$mode $SRC_FILE $DST_FILE
verify_copy $SRC_FILE $DST_FILE
log_must rm -f $SRC_FILE $DST_FILE
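
Both dd tweaks close the same hole: $RANDOM % N can evaluate to 0, leaving the source file or the overwrite empty and the reflink checks vacuous or flaky. The new forms keep both quantities nonzero, which a quick scan over $RANDOM's 0..32767 range confirms (illustrative):

#include <stdio.h>

int
main(void)
{
	int min_old = 32767, min_new = 32767;

	/* minimum of the old and new size expressions over all $RANDOM */
	for (int r = 0; r <= 32767; r++) {
		if (r % 2048 < min_old)
			min_old = r % 2048;
		if (1024 + r % 1024 < min_new)
			min_new = 1024 + r % 1024;
	}
	printf("old min = %d (empty file possible)\n", min_old); /* 0 */
	printf("new min = %d\n", min_new); /* 1024 */
	return (0);
}
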

View file

@ -51,7 +51,7 @@ const char *__asan_default_options(void) {
int
main(int argc, const char *const *argv)
{
if (argc != 2) {
if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) {
fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]);
return (1);
}
@ -72,9 +72,10 @@ main(int argc, const char *const *argv)
return (1);
}
unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS;
if (dev_part != 0)
sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part);
const char *dev_part = strrchr(dev_name, 'p');
if (dev_part != NULL) {
sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1);
}
for (size_t i = 0; i < strlen(zvol_name); ++i)
if (isblank(zvol_name[i]))
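
The helper now rejects anything that does not start with /dev/zd up front, and derives the partition suffix from the node name instead of from minor(st_rdev) % ZVOL_MINORS, presumably because the minor number stops being a reliable partition index once overflow of minor device numbers is being prevented (#16006). A standalone sketch of the new parse, run against sample names rather than real devices:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *names[] = { "/dev/zd0", "/dev/zd16p3" };

	for (int i = 0; i < 2; i++) {
		/*
		 * With the /dev/zd prefix already validated, a 'p' in the
		 * name can only introduce the partition number.
		 */
		const char *part = strrchr(names[i], 'p');
		if (part != NULL)
			printf("%s -> -part%s\n", names[i], part + 1);
		else
			printf("%s -> whole zvol\n", names[i]);
	}
	return (0);
}
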

View file

@ -664,6 +664,9 @@
/* Define to 1 if you have the `mlockall' function. */
#define HAVE_MLOCKALL 1
/* page_size() is available */
/* #undef HAVE_MM_PAGE_SIZE */
/* lookup_bdev() wants mode arg */
/* #undef HAVE_MODE_LOOKUP_BDEV */
@ -993,6 +996,9 @@
/* __set_page_dirty_nobuffers exists */
/* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */
/* splice_copy_file_range() is available */
/* #undef HAVE_VFS_SPLICE_COPY_FILE_RANGE */
/* __vmalloc page flags exists */
/* #undef HAVE_VMALLOC_PAGE_KERNEL */
@ -1155,7 +1161,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.2.99-365-FreeBSD_g8f2f6cd2a"
#define ZFS_META_ALIAS "zfs-2.2.99-398-FreeBSD_g39be46f43"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -1185,7 +1191,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "365-FreeBSD_g8f2f6cd2a"
#define ZFS_META_RELEASE "398-FreeBSD_g39be46f43"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"

View file

@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.2.99-365-g8f2f6cd2a-dirty"
#define ZFS_META_GITREV "zfs-2.2.99-398-g39be46f43"