Notable upstream pull request merges:
  #15024 Add missed DMU_PROJECTUSED_OBJECT prefetch
  #15029 Do not request data L1 buffers on scan prefetch
  #15036 FreeBSD: catch up to __FreeBSD_version 1400093
  #15039 Fix raw receive with different indirect block size
  #15047 FreeBSD: Fix build on stable/13 after 1302506
  #15049 Fix the ZFS checksum error histograms with larger record sizes
  #15052 Reduce bloat in ereport.fs.zfs.checksum events
  #15056 Avoid extra snprintf() in dsl_deadlist_merge()
  #15061 Ignore pool ashift property during vdev attachment
  #15063 Don't panic if setting vdev properties is unsupported for this vdev type
  #15067 spa_min_alloc should be GCD, not min
  #15071 Add explicit prefetches to bpobj_iterate()
  #15072 Adjust prefetch parameters
  #15076 Refactor dmu_prefetch()
  #15079 set autotrim default to 'off' everywhere
  #15080 ZIL: Fix config lock deadlock
  #15088 metaslab: tuneable to better control force ganging
  #15096 Avoid waiting in dmu_sync_late_arrival()
  #15097 BRT should return EOPNOTSUPP
  #15103 Remove zl_issuer_lock from zil_suspend()
  #15107 Remove fastwrite mechanism
  #15113 libzfs: sendrecv: send_progress_thread: handle SIGINFO/SIGUSR1
  #15122 ZIL: Second attempt to reduce scope of zl_issuer_lock
  #15129 zpool_vdev_remove() should handle EALREADY error return
  #15132 ZIL: Replay blocks without next block pointer
  #15148 zfs_clone_range should return descriptive error codes
  #15153 ZIL: Avoid dbuf_read() before dmu_sync()
  #15172 copy_file_range: fix fallback when source create on same txg
  #15180 Update outdated assertion from zio_write_compress

Obtained from:	OpenZFS
OpenZFS commit:	804414aad2
commit 315ee00fa9
Martin Matuska  2023-08-26 23:20:04 +02:00
113 changed files with 3329 additions and 913 deletions


@@ -3042,6 +3042,7 @@ _prebuild_libs= ${_kerberos5_lib_libasn1} \
lib/libpam/libpam lib/libthr \
${_lib_libradius} lib/libsbuf lib/libtacplus \
lib/libgeom \
${_lib_librt} \
${_cddl_lib_libumem} ${_cddl_lib_libnvpair} \
${_cddl_lib_libuutil} \
${_cddl_lib_libavl} \
@@ -3097,6 +3098,7 @@ lib/libpjdlog__L: lib/libutil__L
lib/libcasper__L: lib/libnv__L
lib/liblzma__L: lib/libmd__L lib/libthr__L
lib/libzstd__L: lib/libthr__L
lib/librt__L: lib/libthr__L
_generic_libs= ${_cddl_lib} gnu/lib ${_kerberos5_lib} lib ${_secure_lib}
.if ${MK_IPFILTER} != "no"
@@ -3122,6 +3124,7 @@ cddl/lib/libnvpair__L: cddl/lib/libspl__L
cddl/lib/libuutil__L: cddl/lib/libavl__L cddl/lib/libspl__L
.if ${MK_ZFS} != "no"
_lib_librt= lib/librt
_cddl_lib_libicp= cddl/lib/libicp
_cddl_lib_libicp_rescue= cddl/lib/libicp_rescue
_cddl_lib_libtpool= cddl/lib/libtpool
@@ -3136,7 +3139,7 @@ cddl/lib/libzutil__L: cddl/lib/libavl__L lib/libgeom__L lib/msun__L cddl/lib/lib
cddl/lib/libzfs_core__L: cddl/lib/libnvpair__L cddl/lib/libspl__L cddl/lib/libzutil__L
cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L
cddl/lib/libzfs__L: cddl/lib/libzfs_core__L lib/msun__L lib/libutil__L lib/librt__L
cddl/lib/libzfs__L: lib/libthr__L lib/libmd__L lib/libz__L cddl/lib/libumem__L
cddl/lib/libzfs__L: cddl/lib/libuutil__L cddl/lib/libavl__L lib/libgeom__L
cddl/lib/libzfs__L: cddl/lib/libnvpair__L cddl/lib/libzutil__L


@@ -19,6 +19,7 @@ LIBADD= \
md \
nvpair \
pthread \
rt \
umem \
util \
uutil \


@@ -15,6 +15,7 @@ DIRDEPS = \
lib/libexpat \
lib/libgeom \
lib/libmd \
lib/librt \
lib/libthr \
lib/libutil \
lib/libz \


@@ -143,7 +143,7 @@ CRUNCH_PROGS_usr.sbin+= zdb
CRUNCH_LIBS+= -l80211 -lalias -lcam -lncursesw -ldevstat -lipsec -llzma
.if ${MK_ZFS} != "no"
CRUNCH_LIBS+= -lavl -lpthread -luutil -lumem -ltpool -lspl
CRUNCH_LIBS+= -lavl -lpthread -luutil -lumem -ltpool -lspl -lrt
CRUNCH_LIBS_zfs+= ${LIBBE} \
${LIBZPOOL} \
${LIBZFS} \


@@ -39,6 +39,7 @@ DIRDEPS = \
lib/libmd \
lib/libmt \
lib/libnv \
lib/librt \
lib/libsbuf \
lib/libthr \
lib/libufs \


@@ -413,7 +413,7 @@ _DP_fifolog= z
_DP_ipf= kvm
_DP_tpool= spl
_DP_uutil= avl spl
_DP_zfs= md pthread umem util uutil m avl bsdxml crypto geom nvpair \
_DP_zfs= md pthread rt umem util uutil m avl bsdxml crypto geom nvpair \
z zfs_core zutil
_DP_zfsbootenv= zfs nvpair
_DP_zfs_core= nvpair spl zutil


@@ -1,10 +1,10 @@
Meta: 1
Name: zfs
Branch: 1.0
Version: 2.2.0
Release: rc1
Version: 2.2.99
Release: 1
Release-Tags: relext
License: CDDL
Author: OpenZFS
Linux-Maximum: 6.3
Linux-Maximum: 6.4
Linux-Minimum: 3.10


@@ -79,6 +79,7 @@
#include <sys/dsl_crypt.h>
#include <sys/dsl_scan.h>
#include <sys/btree.h>
#include <sys/brt.h>
#include <zfs_comutil.h>
#include <sys/zstd/zstd.h>
@@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = {
#define ZB_TOTAL DN_MAX_LEVELS
#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1)
typedef struct zdb_brt_entry {
dva_t zbre_dva;
uint64_t zbre_refcount;
avl_node_t zbre_node;
} zdb_brt_entry_t;
typedef struct zdb_cb {
zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1];
uint64_t zcb_removing_size;
uint64_t zcb_checkpoint_size;
uint64_t zcb_dedup_asize;
uint64_t zcb_dedup_blocks;
uint64_t zcb_clone_asize;
uint64_t zcb_clone_blocks;
uint64_t zcb_psize_count[SPA_MAX_FOR_16M];
uint64_t zcb_lsize_count[SPA_MAX_FOR_16M];
uint64_t zcb_asize_count[SPA_MAX_FOR_16M];
@@ -5368,6 +5377,8 @@ typedef struct zdb_cb {
int zcb_haderrors;
spa_t *zcb_spa;
uint32_t **zcb_vd_obsolete_counts;
avl_tree_t zcb_brt;
boolean_t zcb_brt_is_active;
} zdb_cb_t;
/* test if two DVA offsets from same vdev are within the same metaslab */
@@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp);
zcb->zcb_asize_total += BP_GET_ASIZE(bp);
if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) {
/*
* Cloned blocks are special. We need to count them, so we can
* later uncount them when reporting leaked space, and we must
* only claim them once.
*
* To do this, we keep our own in-memory BRT. For each block
* we haven't seen before, we look it up in the real BRT and
* if it's there, we note it and its refcount, then proceed as
* normal. If we see the block again, we count it as a clone
* and then give it no further consideration.
*/
zdb_brt_entry_t zbre_search, *zbre;
avl_index_t where;
zbre_search.zbre_dva = bp->blk_dva[0];
zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where);
if (zbre != NULL) {
zcb->zcb_clone_asize += BP_GET_ASIZE(bp);
zcb->zcb_clone_blocks++;
zbre->zbre_refcount--;
if (zbre->zbre_refcount == 0) {
avl_remove(&zcb->zcb_brt, zbre);
umem_free(zbre, sizeof (zdb_brt_entry_t));
}
return;
}
uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp);
if (crefcnt > 0) {
zbre = umem_zalloc(sizeof (zdb_brt_entry_t),
UMEM_NOFAIL);
zbre->zbre_dva = bp->blk_dva[0];
zbre->zbre_refcount = crefcnt;
avl_insert(&zcb->zcb_brt, zbre, where);
}
}
if (dump_opt['L'])
return;
@@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa)
iterate_deleted_livelists(spa, dump_livelist_cb, NULL);
}
static int
zdb_brt_entry_compare(const void *zcn1, const void *zcn2)
{
const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva;
const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva;
int cmp;
cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
if (cmp == 0)
cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2));
return (cmp);
}
static int
dump_block_stats(spa_t *spa)
{
@@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa)
zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL);
if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) {
avl_create(&zcb->zcb_brt, zdb_brt_entry_compare,
sizeof (zdb_brt_entry_t),
offsetof(zdb_brt_entry_t, zbre_node));
zcb->zcb_brt_is_active = B_TRUE;
}
(void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n",
(dump_opt['c'] || !dump_opt['L']) ? "to verify " : "",
(dump_opt['c'] == 1) ? "metadata " : "",
@@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa)
metaslab_class_get_alloc(spa_special_class(spa)) +
metaslab_class_get_alloc(spa_dedup_class(spa)) +
get_unflushed_alloc_space(spa);
total_found = tzb->zb_asize - zcb->zcb_dedup_asize +
total_found =
tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize +
zcb->zcb_removing_size + zcb->zcb_checkpoint_size;
if (total_found == total_alloc && !dump_opt['L']) {
@@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa)
"bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize,
(u_longlong_t)zcb->zcb_dedup_blocks,
(double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0);
(void) printf("\t%-16s %14llu count: %6llu\n",
"bp cloned:", (u_longlong_t)zcb->zcb_clone_asize,
(u_longlong_t)zcb->zcb_clone_blocks);
(void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:",
(u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space);
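
With clone accounting in place, zdb's block-statistics summary prints cloned
space next to the existing dedup line. A rough sketch of the new output; the
pool name and every figure below are invented for illustration:

    # zdb -b tank
    ...
    bp deduped:          1048576 count:    256
    bp cloned:           8388608 count:   2048
    Normal class:    107374182400 used: 41.25%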


@@ -607,8 +607,6 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
*/
if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 ||
strcmp(dp->dd_compare, path) != 0) {
zed_log_msg(LOG_INFO, " %s: no match (%s != vdev %s)",
__func__, dp->dd_compare, path);
return;
}
if (dp->dd_new_vdev_guid != 0 && dp->dd_new_vdev_guid != guid) {


@@ -416,6 +416,11 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0)
return;
if (vdev_guid == 0) {
fmd_hdl_debug(hdl, "Got a zero GUID");
return;
}
if (spare) {
int nspares = find_and_remove_spares(zhdl, vdev_guid);
fmd_hdl_debug(hdl, "%d spares removed", nspares);


@@ -0,0 +1,61 @@
#!/bin/sh
#
# Turn off disk's enclosure slot if it becomes FAULTED.
#
# Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos
# as they flip between FAULTED and ONLINE. If
# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets
# FAULTED, then power down the slot via sysfs:
#
# /sys/class/enclosure/<enclosure>/<slot>/power_status
#
# We assume the user will be responsible for turning the slot back on again.
#
# Note that this script requires that your enclosure be supported by the
# Linux SCSI Enclosure services (SES) driver. The script will do nothing
# if you have no enclosure, or if your enclosure isn't supported.
#
# Exit codes:
# 0: slot successfully powered off
# 1: enclosure not available
# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled
# 3: vdev was not FAULTED
# 4: The enclosure sysfs path passed from ZFS does not exist
# 5: Enclosure slot didn't actually turn off after we told it to
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"
if [ ! -d /sys/class/enclosure ] ; then
# No JBOD enclosure or NVMe slots
exit 1
fi
if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then
exit 2
fi
if [ "$ZEVENT_VDEV_STATE_STR" != "FAULTED" ] ; then
exit 3
fi
if [ ! -f "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status" ] ; then
exit 4
fi
echo "off" | tee "$ZEVENT_VDEV_ENC_SYSFS_PATH/power_status"
# Wait for sysfs to report that the slot is off. It can take ~400ms on some
# enclosures.
for i in $(seq 1 20) ; do
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" == "off" ] ; then
break
fi
sleep 0.1
done
if [ "$(cat $ZEVENT_VDEV_ENC_SYSFS_PATH/power_status)" != "off" ] ; then
exit 5
fi
zed_log_msg "powered down slot $ZEVENT_VDEV_ENC_SYSFS_PATH for $ZEVENT_VDEV_PATH"


@@ -142,3 +142,8 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event"
# Disabled by default, 1 to enable and 0 to disable.
#ZED_SYSLOG_DISPLAY_GUIDS=1
##
# Power off the drive's slot in the enclosure if it becomes FAULTED. This can
# help silence misbehaving drives. This assumes your drive enclosure fully
# supports slot power control via sysfs.
#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1
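
A minimal usage sketch for the new zedlet; the zed.rc location and the
enclosure/slot sysfs path are illustrative assumptions, not taken from this
commit:

    # Enable the behavior in ZED's configuration:
    echo 'ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1' >> /etc/zfs/zed.d/zed.rc
    # The zedlet leaves a FAULTED drive's slot powered off; after servicing
    # the drive, restore power by hand:
    echo on > /sys/class/enclosure/0:0:0:0/Slot01/power_status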


@@ -2412,7 +2412,6 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
ztest_object_lock(zd, object, RL_READER);
@@ -2446,6 +2445,7 @@ ztest_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
DMU_READ_NO_PREFETCH);
ASSERT0(error);
} else {
ASSERT3P(zio, !=, NULL);
size = doi.doi_data_block_size;
if (ISP2(size)) {
offset = P2ALIGN(offset, size);


@@ -4,6 +4,7 @@
# Not following: a was not specified as input (see shellcheck -x). [SC1091]
# Prefer putting braces around variable references even when not strictly required. [SC2250]
# Consider invoking this command separately to avoid masking its return value (or use '|| true' to ignore). [SC2312]
# Command appears to be unreachable. Check usage (or ignore if invoked indirectly). [SC2317]
# In POSIX sh, 'local' is undefined. [SC2039] # older ShellCheck versions
# In POSIX sh, 'local' is undefined. [SC3043] # newer ShellCheck versions
@@ -18,7 +19,7 @@ PHONY += shellcheck
_STGT = $(subst ^,/,$(subst shellcheck-here-,,$@))
shellcheck-here-%:
if HAVE_SHELLCHECK
shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
shellcheck --format=gcc --enable=all --exclude=SC1090,SC1091,SC2039,SC2250,SC2312,SC2317,SC3043 $$([ -n "$(SHELLCHECK_SHELL)" ] && echo "--shell=$(SHELLCHECK_SHELL)") "$$([ -e "$(_STGT)" ] || echo "$(srcdir)/")$(_STGT)"
else
@echo "skipping shellcheck of" $(_STGT) "because shellcheck is not installed"
endif


@@ -16,12 +16,63 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [
])
])
dnl #
dnl # 6.5.x API change,
dnl # blkdev_get_by_path() takes 4 args
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [
ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [
#include <linux/fs.h>
#include <linux/blkdev.h>
], [
struct block_device *bdev __attribute__ ((unused)) = NULL;
const char *path = "path";
fmode_t mode = 0;
void *holder = NULL;
struct blk_holder_ops h;
bdev = blkdev_get_by_path(path, mode, holder, &h);
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [
AC_MSG_CHECKING([whether blkdev_get_by_path() exists])
AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args])
ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [
AC_MSG_RESULT(yes)
], [
ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args])
ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [
AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1,
[blkdev_get_by_path() exists and takes 4 args])
AC_MSG_RESULT(yes)
], [
ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()])
])
])
])
dnl #
dnl # 6.5.x API change
dnl # blk_mode_t was added as a type to supersede some places where fmode_t
dnl # is used
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [
ZFS_LINUX_TEST_SRC([blk_mode_t], [
#include <linux/fs.h>
#include <linux/blkdev.h>
], [
blk_mode_t m __attribute((unused)) = (blk_mode_t)0;
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [
AC_MSG_CHECKING([whether blk_mode_t is defined])
ZFS_LINUX_TEST_RESULT([blk_mode_t], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined])
], [
AC_MSG_RESULT(no)
])
])
@@ -41,12 +92,35 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [
])
])
dnl #
dnl # 6.5.x API change.
dnl # blkdev_put() takes (void* holder) as arg 2
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [
ZFS_LINUX_TEST_SRC([blkdev_put_holder], [
#include <linux/fs.h>
#include <linux/blkdev.h>
], [
struct block_device *bdev = NULL;
void *holder = NULL;
blkdev_put(bdev, holder);
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [
AC_MSG_CHECKING([whether blkdev_put() exists])
ZFS_LINUX_TEST_RESULT([blkdev_put], [
AC_MSG_RESULT(yes)
], [
ZFS_LINUX_TEST_ERROR([blkdev_put()])
AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2])
ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1,
[blkdev_put() accepts void* as arg 2])
], [
ZFS_LINUX_TEST_ERROR([blkdev_put()])
])
])
])
@@ -103,6 +177,33 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [
])
])
dnl #
dnl # 6.5.x API change
dnl # disk_check_media_change() was added
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
ZFS_LINUX_TEST_SRC([disk_check_media_change], [
#include <linux/fs.h>
#include <linux/blkdev.h>
], [
struct block_device *bdev = NULL;
bool error;
error = disk_check_media_change(bdev->bd_disk);
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [
AC_MSG_CHECKING([whether disk_check_media_change() exists])
ZFS_LINUX_TEST_RESULT([disk_check_media_change], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1,
[disk_check_media_change() exists])
], [
AC_MSG_RESULT(no)
])
])
dnl #
dnl # bdev_kobj() is introduced from 5.12
dnl #
@@ -443,9 +544,34 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
])
])
dnl #
dnl # 6.5.x API change
dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [
ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [
#include <linux/blkdev.h>
],[
blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT;
])
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [
AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined])
ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [
AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined])
AC_MSG_RESULT(yes)
], [
AC_MSG_RESULT(no)
])
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG
ZFS_AC_KERNEL_SRC_BLKDEV_PUT
ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER
ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART
ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV
ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV
@@ -458,6 +584,9 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T
])
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
@@ -476,4 +605,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE
ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ
ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV
ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE
ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT
ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T
])


@@ -49,12 +49,42 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
], [], [])
])
dnl #
dnl # 5.9.x API change
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [
ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [
#include <linux/blkdev.h>
void blk_release(struct gendisk *g) {
(void) g;
return;
}
static const struct block_device_operations
bops __attribute__ ((unused)) = {
.open = NULL,
.release = blk_release,
.ioctl = NULL,
.compat_ioctl = NULL,
};
], [], [])
])
AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [
AC_MSG_CHECKING([whether bops->release() is void])
AC_MSG_CHECKING([whether bops->release() is void and takes 2 args])
ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [
AC_MSG_RESULT(yes)
],[
ZFS_LINUX_TEST_ERROR([bops->release()])
AC_MSG_RESULT(no)
AC_MSG_CHECKING([whether bops->release() is void and takes 1 arg])
ZFS_LINUX_TEST_RESULT([block_device_operations_release_void_1arg], [
AC_MSG_RESULT(yes)
AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [1],
[Define if release() in block_device_operations takes 1 arg])
],[
ZFS_LINUX_TEST_ERROR([bops->release()])
])
])
])
@@ -92,6 +122,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [
ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS
ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID
ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
])


@@ -0,0 +1,25 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ], [
dnl #
dnl # Kernel 6.5 - generic_file_splice_read was removed in favor
dnl # of copy_splice_read for the .splice_read member of the
dnl # file_operations struct.
dnl #
ZFS_LINUX_TEST_SRC([has_copy_splice_read], [
#include <linux/fs.h>
struct file_operations fops __attribute__((unused)) = {
.splice_read = copy_splice_read,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_COPY_SPLICE_READ], [
AC_MSG_CHECKING([whether copy_splice_read() exists])
ZFS_LINUX_TEST_RESULT([has_copy_splice_read], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_COPY_SPLICE_READ, 1,
[copy_splice_read exists])
],[
AC_MSG_RESULT(no)
])
])


@@ -0,0 +1,27 @@
dnl #
dnl # Linux 6.5 removes register_sysctl_table
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [
ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [
#include <linux/sysctl.h>
static struct ctl_table dummy_table[] = {
{}
};
],[
struct ctl_table_header *h
__attribute((unused)) = register_sysctl_table(dummy_table);
])
])
AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
AC_MSG_CHECKING([whether register_sysctl_table exists])
ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1,
[register_sysctl_table exists])
],[
AC_MSG_RESULT([no])
])
])


@@ -0,0 +1,50 @@
dnl #
dnl # EL7 have backported copy_file_range and clone_file_range and
dnl # added them to an "extended" file_operations struct.
dnl #
dnl # We're testing for both functions in one here, because they will only
dnl # ever appear together and we don't want to match a similar method in
dnl # some future vendor kernel.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND], [
ZFS_LINUX_TEST_SRC([vfs_file_operations_extend], [
#include <linux/fs.h>
static ssize_t test_copy_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len; (void) flags;
return (0);
}
static int test_clone_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
u64 len) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len;
return (0);
}
static const struct file_operations_extend
fops __attribute__ ((unused)) = {
.kabi_fops = {},
.copy_file_range = test_copy_file_range,
.clone_file_range = test_clone_file_range,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND], [
AC_MSG_CHECKING([whether file_operations_extend takes \
.copy_file_range() and .clone_file_range()])
ZFS_LINUX_TEST_RESULT([vfs_file_operations_extend], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_VFS_FILE_OPERATIONS_EXTEND, 1,
[file_operations_extend takes .copy_file_range()
and .clone_file_range()])
],[
AC_MSG_RESULT([no])
])
])


@@ -0,0 +1,164 @@
dnl #
dnl # The *_file_range APIs have a long history:
dnl #
dnl # 2.6.29: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctl introduced
dnl # 3.12: BTRFS_IOC_FILE_EXTENT_SAME ioctl introduced
dnl #
dnl # 4.5: copy_file_range() syscall introduced, added to VFS
dnl # 4.5: BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE renamed to FICLONE and
dnl # FICLONERANGE, added to VFS as clone_file_range()
dnl # 4.5: BTRFS_IOC_FILE_EXTENT_SAME renamed to FIDEDUPERANGE, added to VFS
dnl # as dedupe_file_range()
dnl #
dnl # 4.20: VFS clone_file_range() and dedupe_file_range() replaced by
dnl # remap_file_range()
dnl #
dnl # 5.3: VFS copy_file_range() expected to do its own fallback,
dnl # generic_copy_file_range() added to support it
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [
#include <linux/fs.h>
static ssize_t test_copy_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len; (void) flags;
return (0);
}
static const struct file_operations
fops __attribute__ ((unused)) = {
.copy_file_range = test_copy_file_range,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE], [
AC_MSG_CHECKING([whether fops->copy_file_range() is available])
ZFS_LINUX_TEST_RESULT([vfs_copy_file_range], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_VFS_COPY_FILE_RANGE, 1,
[fops->copy_file_range() is available])
],[
AC_MSG_RESULT([no])
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([generic_copy_file_range], [
#include <linux/fs.h>
], [
struct file *src_file __attribute__ ((unused)) = NULL;
loff_t src_off __attribute__ ((unused)) = 0;
struct file *dst_file __attribute__ ((unused)) = NULL;
loff_t dst_off __attribute__ ((unused)) = 0;
size_t len __attribute__ ((unused)) = 0;
unsigned int flags __attribute__ ((unused)) = 0;
generic_copy_file_range(src_file, src_off, dst_file, dst_off,
len, flags);
])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [
AC_MSG_CHECKING([whether generic_copy_file_range() is available])
ZFS_LINUX_TEST_RESULT_SYMBOL([generic_copy_file_range],
[generic_copy_file_range], [fs/read_write.c], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_VFS_GENERIC_COPY_FILE_RANGE, 1,
[generic_copy_file_range() is available])
],[
AC_MSG_RESULT(no)
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [
#include <linux/fs.h>
static int test_clone_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
u64 len) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len;
return (0);
}
static const struct file_operations
fops __attribute__ ((unused)) = {
.clone_file_range = test_clone_file_range,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE], [
AC_MSG_CHECKING([whether fops->clone_file_range() is available])
ZFS_LINUX_TEST_RESULT([vfs_clone_file_range], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_VFS_CLONE_FILE_RANGE, 1,
[fops->clone_file_range() is available])
],[
AC_MSG_RESULT([no])
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_dedupe_file_range], [
#include <linux/fs.h>
static int test_dedupe_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
u64 len) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len;
return (0);
}
static const struct file_operations
fops __attribute__ ((unused)) = {
.dedupe_file_range = test_dedupe_file_range,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE], [
AC_MSG_CHECKING([whether fops->dedupe_file_range() is available])
ZFS_LINUX_TEST_RESULT([vfs_dedupe_file_range], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_VFS_DEDUPE_FILE_RANGE, 1,
[fops->dedupe_file_range() is available])
],[
AC_MSG_RESULT([no])
])
])
AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE], [
ZFS_LINUX_TEST_SRC([vfs_remap_file_range], [
#include <linux/fs.h>
static loff_t test_remap_file_range(struct file *src_file,
loff_t src_off, struct file *dst_file, loff_t dst_off,
loff_t len, unsigned int flags) {
(void) src_file; (void) src_off;
(void) dst_file; (void) dst_off;
(void) len; (void) flags;
return (0);
}
static const struct file_operations
fops __attribute__ ((unused)) = {
.remap_file_range = test_remap_file_range,
};
],[])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE], [
AC_MSG_CHECKING([whether fops->remap_file_range() is available])
ZFS_LINUX_TEST_RESULT([vfs_remap_file_range], [
AC_MSG_RESULT([yes])
AC_DEFINE(HAVE_VFS_REMAP_FILE_RANGE, 1,
[fops->remap_file_range() is available])
],[
AC_MSG_RESULT([no])
])
])


@@ -6,8 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
#include <linux/fs.h>
#include <linux/uio.h>
],[
int type __attribute__ ((unused)) =
ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE;
int type __attribute__ ((unused)) = ITER_KVEC;
])
ZFS_LINUX_TEST_SRC([iov_iter_advance], [
@@ -93,6 +92,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [
struct iov_iter iter = { 0 };
__attribute__((unused)) enum iter_type i = iov_iter_type(&iter);
])
ZFS_LINUX_TEST_SRC([iter_iov], [
#include <linux/fs.h>
#include <linux/uio.h>
],[
struct iov_iter iter = { 0 };
__attribute__((unused)) const struct iovec *iov = iter_iov(&iter);
])
])
AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
@@ -201,4 +208,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [
AC_DEFINE(HAVE_VFS_IOV_ITER, 1,
[All required iov_iter interfaces are available])
])
dnl #
dnl # Kernel 6.5 introduces the iter_iov() function that returns the
dnl # __iov member of an iov_iter*. The iov member was renamed to
dnl # __iov and is now intended to be accessed via the iter_iov()
dnl # helper.
dnl #
AC_MSG_CHECKING([whether iter_iov() is available])
ZFS_LINUX_TEST_RESULT([iter_iov], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_ITER_IOV, 1,
[iter_iov() is available])
],[
AC_MSG_RESULT(no)
])
])


@@ -116,6 +116,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND
ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS
ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE
ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN
@@ -154,6 +160,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_FILEMAP
ZFS_AC_KERNEL_SRC_WRITEPAGE_T
ZFS_AC_KERNEL_SRC_RECLAIMED
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
@@ -249,6 +257,12 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_VFS_RW_ITERATE
ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
ZFS_AC_KERNEL_VFS_IOV_ITER
ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND
ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
ZFS_AC_KERNEL_MAKE_REQUEST_FN
@@ -287,6 +301,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_FILEMAP
ZFS_AC_KERNEL_WRITEPAGE_T
ZFS_AC_KERNEL_RECLAIMED
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_COPY_SPLICE_READ
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE


@@ -1,3 +1,9 @@
openzfs-linux (2.2.99-1) unstable; urgency=low
* OpenZFS 2.2 is tagged.
-- Umer Saleem <usaleem@ixsystems.com> Wed, 12 Jul 2022 15:00:00 -0400
openzfs-linux (2.1.99-1) unstable; urgency=low
* Integrate minimally modified Debian packaging from ZFS on Linux


@@ -1,10 +1,8 @@
sbin/zinject
sbin/ztest
usr/bin/raidz_test
usr/share/man/man1/raidz_test.1
usr/share/man/man1/test-runner.1
usr/share/man/man1/ztest.1
usr/share/man/man8/zinject.8
usr/share/zfs/common.sh
usr/share/zfs/runfiles/
usr/share/zfs/test-runner


@@ -27,6 +27,7 @@ sbin/zfs
sbin/zfs_ids_to_path
sbin/zgenhostid
sbin/zhack
sbin/zinject
sbin/zpool
sbin/zstream
sbin/zstreamdump
@@ -92,6 +93,7 @@ usr/share/man/man8/zfs_ids_to_path.8
usr/share/man/man7/zfsconcepts.7
usr/share/man/man7/zfsprops.7
usr/share/man/man8/zgenhostid.8
usr/share/man/man8/zinject.8
usr/share/man/man8/zpool-add.8
usr/share/man/man8/zpool-attach.8
usr/share/man/man8/zpool-checkpoint.8


@@ -12,11 +12,12 @@ ExecStart=/bin/sh -c '
decode_root_args || exit 0; \
[ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \
rootflags="$(getarg rootflags=)"; \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
[ "$(@sbindir@/zfs get -H -o value mountpoint "$root")" = legacy ] || \
case ",$rootflags," in \
*,zfsutil,*) ;; \
,,) rootflags=zfsutil ;; \
*) rootflags="zfsutil,$rootflags" ;; \
esac; \
exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"'
[Install]
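
A sketch of what the normalization above yields for a non-legacy bootfs; the
rootflags values are invented for illustration:

    # rootflags=             ->  BOOTFSFLAGS=zfsutil
    # rootflags=noatime      ->  BOOTFSFLAGS=zfsutil,noatime
    # rootflags=ro,zfsutil   ->  BOOTFSFLAGS=ro,zfsutil  (already present)
    # With this change, a bootfs whose mountpoint property is "legacy" skips
    # the rewrite and its rootflags pass through unmodified.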


@@ -2,7 +2,7 @@
Description=Rollback bootfs just before it is mounted
Requisite=zfs-import.target
After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service
Before=dracut-mount.service
Before=dracut-mount.service sysroot.mount
DefaultDependencies=no
ConditionKernelCommandLine=bootfs.rollback
ConditionEnvironment=BOOTFS


@@ -36,7 +36,11 @@ struct xucred;
typedef struct flock flock64_t;
typedef struct vnode vnode_t;
typedef struct vattr vattr_t;
#define vtype_t __enum_uint8(vtype)
#if __FreeBSD_version < 1400093
typedef enum vtype vtype_t;
#else
#define vtype_t __enum_uint8(vtype)
#endif
#include <sys/types.h>
#include <sys/queue.h>


@@ -181,7 +181,11 @@ bi_status_to_errno(blk_status_t status)
return (ENOLINK);
case BLK_STS_TARGET:
return (EREMOTEIO);
#ifdef HAVE_BLK_STS_RESV_CONFLICT
case BLK_STS_RESV_CONFLICT:
#else
case BLK_STS_NEXUS:
#endif
return (EBADE);
case BLK_STS_MEDIUM:
return (ENODATA);
@@ -215,7 +219,11 @@ errno_to_bi_status(int error)
case EREMOTEIO:
return (BLK_STS_TARGET);
case EBADE:
#ifdef HAVE_BLK_STS_RESV_CONFLICT
return (BLK_STS_RESV_CONFLICT);
#else
return (BLK_STS_NEXUS);
#endif
case ENODATA:
return (BLK_STS_MEDIUM);
case EILSEQ:
@@ -337,6 +345,9 @@ zfs_check_media_change(struct block_device *bdev)
return (0);
}
#define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev)
#elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
#define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk)
#define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk)
#else
/*
* This is encountered if check_disk_change() and bdev_check_media_change()
@@ -387,6 +398,12 @@ vdev_lookup_bdev(const char *path, dev_t *dev)
#endif
}
#if defined(HAVE_BLK_MODE_T)
#define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE)
#else
#define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE)
#endif
/*
* Kernels without bio_set_op_attrs use bi_rw for the bio flags.
*/


@@ -198,6 +198,14 @@ extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache);
spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
/*
* This is necessary to be compatible with other kernel modules
* or in-tree filesystems that may define kmem_cache_alloc,
* as bcachefs now does.
*/
#ifdef kmem_cache_alloc
#undef kmem_cache_alloc
#endif
#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc)


@@ -38,7 +38,7 @@ typedef unsigned long ulong_t;
typedef unsigned long long u_longlong_t;
typedef long long longlong_t;
typedef unsigned long intptr_t;
typedef long intptr_t;
typedef unsigned long long rlim64_t;
typedef struct task_struct kthread_t;


@@ -173,4 +173,16 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset,
}
#endif
#if defined(HAVE_ITER_IOV)
#define zfs_uio_iter_iov(iter) iter_iov((iter))
#else
#define zfs_uio_iter_iov(iter) (iter)->iov
#endif
#if defined(HAVE_IOV_ITER_TYPE)
#define zfs_uio_iov_iter_type(iter) iov_iter_type((iter))
#else
#define zfs_uio_iov_iter_type(iter) (iter)->type
#endif
#endif /* SPL_UIO_H */


@@ -52,7 +52,11 @@ extern const struct inode_operations zpl_special_inode_operations;
/* zpl_file.c */
extern const struct address_space_operations zpl_address_space_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
extern const struct file_operations_extend zpl_file_operations;
#else
extern const struct file_operations zpl_file_operations;
#endif
extern const struct file_operations zpl_dir_file_operations;
/* zpl_super.c */
@@ -180,6 +184,55 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx)
}
#endif /* HAVE_VFS_ITERATE */
/* zpl_file_range.c */
/* handlers for file_operations of the same name */
extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags);
extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags);
extern int zpl_clone_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, uint64_t len);
extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, uint64_t len);
/* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */
typedef struct {
int64_t fcr_src_fd;
uint64_t fcr_src_offset;
uint64_t fcr_src_length;
uint64_t fcr_dest_offset;
} zfs_ioc_compat_file_clone_range_t;
typedef struct {
int64_t fdri_dest_fd;
uint64_t fdri_dest_offset;
uint64_t fdri_bytes_deduped;
int32_t fdri_status;
uint32_t fdri_reserved;
} zfs_ioc_compat_dedupe_range_info_t;
typedef struct {
uint64_t fdr_src_offset;
uint64_t fdr_src_length;
uint16_t fdr_dest_count;
uint16_t fdr_reserved1;
uint32_t fdr_reserved2;
zfs_ioc_compat_dedupe_range_info_t fdr_info[];
} zfs_ioc_compat_dedupe_range_t;
#define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int)
#define ZFS_IOC_COMPAT_FICLONERANGE \
_IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t)
#define ZFS_IOC_COMPAT_FIDEDUPERANGE \
_IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t)
extern long zpl_ioctl_ficlone(struct file *filp, void *arg);
extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg);
extern long zpl_ioctl_fideduperange(struct file *filp, void *arg);
#if defined(HAVE_INODE_TIMESTAMP_TRUNCATE)
#define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip)
#elif defined(HAVE_INODE_TIMESPEC64_TIMES)


@@ -60,7 +60,7 @@ typedef struct bpobj {
kmutex_t bpo_lock;
objset_t *bpo_os;
uint64_t bpo_object;
int bpo_epb;
uint32_t bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
uint8_t bpo_havefreed;


@@ -36,6 +36,7 @@ extern "C" {
#endif
extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp);
extern uint64_t brt_get_dspace(spa_t *spa);
extern uint64_t brt_get_used(spa_t *spa);


@@ -572,11 +572,15 @@ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp);
int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp, int flags);
int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, boolean_t read, const void *tag, int *numbufsp,
dmu_buf_t ***dbpp, uint32_t flags);
int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
dmu_buf_t **dbp);
/*
* Add a reference to a dmu buffer that has already been held via
* dmu_buf_hold() in the current context.
@@ -885,6 +889,7 @@ extern uint_t zfs_max_recordsize;
*/
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, enum zio_priority pri);
void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */


@@ -247,8 +247,6 @@ typedef struct dmu_sendstatus {
void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
const void *, dmu_buf_t **);
#ifdef __cplusplus
}


@@ -36,8 +36,6 @@
extern "C" {
#endif
extern uint64_t zfetch_array_rd_sz;
struct dnode; /* so we can reference dnode */
typedef struct zfetch {


@@ -102,8 +102,6 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp"
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta"
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm"
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap"
#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges"
@@ -112,8 +110,6 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears"
#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits"
#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits"
#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram"
#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram"
#define FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME "snapshot_name"
#define FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME "device_name"
#define FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME "raw_name"


@@ -80,7 +80,6 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10
#define METASLAB_MUST_RESERVE 0x20
#define METASLAB_FASTWRITE 0x40
#define METASLAB_ZIL 0x80
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
@@ -96,8 +95,6 @@ void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
void metaslab_stat_init(void);
void metaslab_stat_fini(void);


@@ -313,7 +313,7 @@ struct metaslab_group {
* Each metaslab maintains a set of in-core trees to track metaslab
* operations. The in-core free tree (ms_allocatable) contains the list of
* free segments which are eligible for allocation. As blocks are
* allocated, the allocated segment are removed from the ms_allocatable and
* allocated, the allocated segments are removed from the ms_allocatable and
* added to a per txg allocation tree (ms_allocating). As blocks are
* freed, they are added to the free tree (ms_freeing). These trees
* allow us to process all allocations and frees in syncing context
@@ -366,9 +366,9 @@ struct metaslab_group {
struct metaslab {
/*
* This is the main lock of the metaslab and its purpose is to
* coordinate our allocations and frees [e.g metaslab_block_alloc(),
* coordinate our allocations and frees [e.g., metaslab_block_alloc(),
* metaslab_free_concrete(), ..etc] with our various syncing
* procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
* procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
*
* The lock is also used during some miscellaneous operations like
* using the metaslab's histogram for the metaslab group's histogram


@@ -250,6 +250,7 @@ struct spa {
uint64_t spa_min_ashift; /* of vdevs in normal class */
uint64_t spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_min_alloc; /* of vdevs in normal class */
uint64_t spa_gcd_alloc; /* of vdevs in normal class */
uint64_t spa_config_guid; /* config pool guid */
uint64_t spa_load_guid; /* spa_load initialized guid */
uint64_t spa_last_synced_guid; /* last synced guid */


@@ -266,7 +266,6 @@ struct vdev {
metaslab_group_t *vdev_mg; /* metaslab group */
metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */
metaslab_t **vdev_ms; /* metaslab array */
uint64_t vdev_pending_fastwrite; /* allocated fastwrites */
txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
@@ -420,6 +419,7 @@ struct vdev {
boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */
boolean_t vdev_resilver_deferred; /* resilver deferred */
boolean_t vdev_kobj_flag; /* kobj event record */
boolean_t vdev_attaching; /* vdev attach ashift handling */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */


@@ -38,14 +38,22 @@ extern "C" {
/*
* Possible states for a given lwb structure.
*
* An lwb will start out in the "closed" state, and then transition to
* the "opened" state via a call to zil_lwb_write_open(). When
* transitioning from "closed" to "opened" the zilog's "zl_issuer_lock"
* must be held.
* An lwb will start out in the "new" state, and transition to the "opened"
* state via a call to zil_lwb_write_open() on first itx assignment. When
* transitioning from "new" to "opened" the zilog's "zl_issuer_lock" must be
* held.
*
* After the lwb is "opened", it can transition into the "issued" state
* via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
* After the lwb is "opened", it can be assigned number of itxs and transition
* into the "closed" state via zil_lwb_write_close() when full or on timeout.
* When transitioning from "opened" to "closed" the zilog's "zl_issuer_lock"
* must be held. New lwb allocation also takes "zl_lock" to protect the list.
*
* After the lwb is "closed", it can transition into the "ready" state via
* zil_lwb_write_issue(). "zl_lock" must be held when making this transition.
* Since it is done by the same thread, "zl_issuer_lock" is not needed.
*
* When lwb in "ready" state receives its block pointer, it can transition to
* "issued". "zl_lock" must be held when making this transition.
*
* After the lwb's write zio completes, it transitions into the "write
* done" state via zil_lwb_write_done(); and then into the "flush done"
@@ -62,17 +70,20 @@ extern "C" {
*
* Additionally, correctness when reading an lwb's state is often
* achieved by exploiting the fact that these state transitions occur in
* this specific order; i.e. "closed" to "opened" to "issued" to "done".
* this specific order; i.e. "new" to "opened" to "closed" to "ready" to
* "issued" to "write_done" and finally "flush_done".
*
* Thus, if an lwb is in the "closed" or "opened" state, holding the
* Thus, if an lwb is in the "new" or "opened" state, holding the
* "zl_issuer_lock" will prevent a concurrent thread from transitioning
* that lwb to the "issued" state. Likewise, if an lwb is already in the
* "issued" state, holding the "zl_lock" will prevent a concurrent
* thread from transitioning that lwb to the "write done" state.
* that lwb to the "closed" state. Likewise, if an lwb is already in the
* "ready" state, holding the "zl_lock" will prevent a concurrent thread
* from transitioning that lwb to the "issued" state.
*/
typedef enum {
LWB_STATE_CLOSED,
LWB_STATE_NEW,
LWB_STATE_OPENED,
LWB_STATE_CLOSED,
LWB_STATE_READY,
LWB_STATE_ISSUED,
LWB_STATE_WRITE_DONE,
LWB_STATE_FLUSH_DONE,
@@ -91,18 +102,21 @@ typedef enum {
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slim; /* log block has slim format */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */
int lwb_error; /* log block allocation error */
int lwb_nmax; /* max bytes in the buffer */
int lwb_nused; /* # used bytes in buffer */
int lwb_nfilled; /* # filled bytes in buffer */
int lwb_sz; /* size of block and buffer */
lwb_state_t lwb_state; /* the state of this lwb */
char *lwb_buf; /* log write buffer */
zio_t *lwb_child_zio; /* parent zio for children */
zio_t *lwb_write_zio; /* zio for the lwb buffer */
zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
uint64_t lwb_issued_txg; /* the txg when the write is issued */
uint64_t lwb_alloc_txg; /* the txg when lwb_blk is allocated */
uint64_t lwb_max_txg; /* highest txg in this lwb */
list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */


@@ -222,7 +222,6 @@ typedef uint64_t zio_flag_t;
#define ZIO_FLAG_NOPWRITE (1ULL << 28)
#define ZIO_FLAG_REEXECUTED (1ULL << 29)
#define ZIO_FLAG_DELEGATED (1ULL << 30)
#define ZIO_FLAG_FASTWRITE (1ULL << 31)
#define ZIO_FLAG_MUSTSUCCEED 0
#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)


@@ -94,8 +94,6 @@ typedef const struct zio_checksum_info {
} zio_checksum_info_t;
typedef struct zio_bad_cksum {
zio_cksum_t zbc_expected;
zio_cksum_t zbc_actual;
const char *zbc_checksum_name;
uint8_t zbc_byteswapped;
uint8_t zbc_injected;


@@ -57,7 +57,7 @@ libzfs_la_LIBADD = \
libzutil.la \
libuutil.la
libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LIBADD += -lrt -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LIBFETCH_LIBS) $(LTLIBINTL)
libzfs_la_LDFLAGS = -pthread


@@ -3926,6 +3926,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
switch (errno) {
case EALREADY:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"removal for this vdev is already in progress."));
(void) zfs_error(hdl, EZFS_BUSY, errbuf);
break;
case EINVAL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"invalid config; all top-level vdevs must "


@@ -928,6 +928,39 @@ zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written,
return (0);
}
static volatile boolean_t send_progress_thread_signal_duetotimer;
static void
send_progress_thread_act(int sig, siginfo_t *info, void *ucontext)
{
(void) sig, (void) ucontext;
send_progress_thread_signal_duetotimer = info->si_code == SI_TIMER;
}
struct timer_desirability {
timer_t timer;
boolean_t desired;
};
static void
timer_delete_cleanup(void *timer)
{
struct timer_desirability *td = timer;
if (td->desired)
timer_delete(td->timer);
}
#ifdef SIGINFO
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO sigaddset(&new, SIGINFO)
#else
#define SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO
#endif
#define SEND_PROGRESS_THREAD_PARENT_BLOCK(old) { \
sigset_t new; \
sigemptyset(&new); \
sigaddset(&new, SIGUSR1); \
SEND_PROGRESS_THREAD_PARENT_BLOCK_SIGINFO; \
pthread_sigmask(SIG_BLOCK, &new, old); \
}
static void *
send_progress_thread(void *arg)
{
@@ -941,6 +974,26 @@ send_progress_thread(void *arg)
struct tm tm;
int err;
const struct sigaction signal_action =
{.sa_sigaction = send_progress_thread_act, .sa_flags = SA_SIGINFO};
struct sigevent timer_cfg =
{.sigev_notify = SIGEV_SIGNAL, .sigev_signo = SIGUSR1};
const struct itimerspec timer_time =
{.it_value = {.tv_sec = 1}, .it_interval = {.tv_sec = 1}};
struct timer_desirability timer = {};
sigaction(SIGUSR1, &signal_action, NULL);
#ifdef SIGINFO
sigaction(SIGINFO, &signal_action, NULL);
#endif
if ((timer.desired = pa->pa_progress || pa->pa_astitle)) {
if (timer_create(CLOCK_MONOTONIC, &timer_cfg, &timer.timer))
return ((void *)(uintptr_t)errno);
(void) timer_settime(timer.timer, 0, &timer_time, NULL);
}
pthread_cleanup_push(timer_delete_cleanup, &timer);
if (!pa->pa_parsable && pa->pa_progress) {
(void) fprintf(stderr,
"TIME %s %sSNAPSHOT %s\n",
@@ -953,12 +1006,12 @@ send_progress_thread(void *arg)
* Print the progress from ZFS_IOC_SEND_PROGRESS every second.
*/
for (;;) {
(void) sleep(1);
pause();
if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes,
&blocks)) != 0) {
if (err == EINTR || err == ENOENT)
return ((void *)0);
return ((void *)(uintptr_t)err);
err = 0;
pthread_exit(((void *)(uintptr_t)err));
}
(void) time(&t);
@@ -991,21 +1044,25 @@ send_progress_thread(void *arg)
(void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
(u_longlong_t)bytes, zhp->zfs_name);
} else if (pa->pa_progress) {
} else if (pa->pa_progress ||
!send_progress_thread_signal_duetotimer) {
zfs_nicebytes(bytes, buf, sizeof (buf));
(void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n",
tm.tm_hour, tm.tm_min, tm.tm_sec,
buf, zhp->zfs_name);
}
}
pthread_cleanup_pop(B_TRUE);
}
static boolean_t
send_progress_thread_exit(libzfs_handle_t *hdl, pthread_t ptid)
send_progress_thread_exit(
libzfs_handle_t *hdl, pthread_t ptid, sigset_t *oldmask)
{
void *status = NULL;
(void) pthread_cancel(ptid);
(void) pthread_join(ptid, &status);
pthread_sigmask(SIG_SETMASK, oldmask, NULL);
int error = (int)(uintptr_t)status;
if (error != 0 && status != PTHREAD_CANCELED)
return (zfs_standard_error(hdl, error,
@@ -1199,7 +1256,8 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (sdd->progress || sdd->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = sdd->outfd;
pa.pa_parsable = sdd->parsable;
@@ -1214,13 +1272,13 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
zfs_close(zhp);
return (err);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj,
fromorigin, sdd->outfd, flags, sdd->debugnv);
if ((sdd->progress || sdd->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, tid))
if (send_progress_thread_exit(zhp->zfs_hdl, tid, &oldmask))
return (-1);
}
@@ -1562,8 +1620,9 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
progress_arg_t pa = { 0 };
int err = 0;
pthread_t ptid;
sigset_t oldmask;
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@@ -1577,6 +1636,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_space_resume_redacted(zhp->zfs_name, from,
@@ -1584,8 +1644,7 @@ estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags,
redactbook, fd, &size);
*sizep = size;
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(zhp->zfs_hdl, ptid))
if (send_progress_thread_exit(zhp->zfs_hdl, ptid, &oldmask))
return (-1);
if (!flags->progress && !flags->parsable)
@@ -1876,11 +1935,12 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (!flags->dryrun) {
progress_arg_t pa = { 0 };
pthread_t tid;
sigset_t oldmask;
/*
* If progress reporting is requested, spawn a new thread to
* poll ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
{
pa.pa_zhp = zhp;
pa.pa_fd = outfd;
pa.pa_parsable = flags->parsable;
@@ -1898,6 +1958,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
zfs_close(zhp);
return (error);
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd,
@@ -1905,8 +1966,7 @@ zfs_send_resume_impl_cb_impl(libzfs_handle_t *hdl, sendflags_t *flags,
if (redact_book != NULL)
free(redact_book);
if ((flags->progressastitle || flags->progress) &&
send_progress_thread_exit(hdl, tid)) {
if (send_progress_thread_exit(hdl, tid, &oldmask)) {
zfs_close(zhp);
return (-1);
}
@@ -2691,7 +2751,8 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
* If progress reporting is requested, spawn a new thread to poll
* ZFS_IOC_SEND_PROGRESS at a regular interval.
*/
if (flags->progress || flags->progressastitle) {
sigset_t oldmask;
{
pa.pa_zhp = zhp;
pa.pa_fd = fd;
pa.pa_parsable = flags->parsable;
@@ -2708,13 +2769,13 @@ zfs_send_one_cb_impl(zfs_handle_t *zhp, const char *from, int fd,
return (zfs_error(zhp->zfs_hdl,
EZFS_THREADCREATEFAILED, errbuf));
}
SEND_PROGRESS_THREAD_PARENT_BLOCK(&oldmask);
}
err = lzc_send_redacted(name, from, fd,
lzc_flags_from_sendflags(flags), redactbook);
if ((flags->progress || flags->progressastitle) &&
send_progress_thread_exit(hdl, ptid))
if (send_progress_thread_exit(hdl, ptid, &oldmask))
return (-1);
if (err == 0 && (flags->props || flags->holds || flags->backup)) {


@@ -15,7 +15,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd January 10, 2023
.Dd July 21, 2023
.Dt ZFS 4
.Os
.
@@ -239,6 +239,11 @@ relative to the pool.
Make some blocks above a certain size be gang blocks.
This option is used by the test suite to facilitate testing.
.
.It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint
For blocks that could be forced to be a gang block (due to
.Sy metaslab_force_ganging ) ,
force this many of them to be gang blocks.
.
.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP data block size as a power of 2. Note that changing this after
creating a DDT on the pool will not affect existing DDTs, only newly created
@@ -519,9 +524,6 @@ However, this is limited by
Maximum micro ZAP size.
A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
.
.It Sy zfetch_array_rd_sz Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
If prefetching is enabled, disable prefetching for reads larger than this size.
.
.It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
Min bytes to prefetch per stream.
Prefetch distance starts from the demand access size and quickly grows to


@@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
.Dd January 12, 2023
.Dd July 27, 2023
.Dt ZFS-SEND 8
.Os
.
@@ -297,6 +297,12 @@ This flag can only be used in conjunction with
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.Pp
The format of the stream is committed.
You will be able to receive your streams on future versions of ZFS.
@ -433,6 +439,12 @@ and the verbose output goes to standard error
.It Fl v , -verbose
Print verbose information about the stream package generated.
This information includes a per-second report of how much data has been sent.
The same report can be requested by sending
.Dv SIGINFO
or
.Dv SIGUSR1 ,
regardless of
.Fl v .
.El
.It Xo
.Nm zfs
@ -669,6 +681,10 @@ ones on the source, and are ready to be used, while the parent snapshot on the
target contains none of the username and password data present on the source,
because it was removed by the redacted send operation.
.
.Sh SIGNALS
See
.Fl v .
.
.Sh EXAMPLES
.\" These are, respectively, examples 12, 13 from zfs.8
.\" Make sure to update them bidirectionally


@ -26,7 +26,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd May 27, 2021
.Dd July 11, 2023
.Dt ZPOOL-EVENTS 8
.Os
.
@ -305,10 +305,6 @@ The time when a given I/O request was submitted.
The time required to service a given I/O request.
.It Sy prev_state
The previous state of the vdev.
.It Sy cksum_expected
The expected checksum value for the block.
.It Sy cksum_actual
The actual checksum value for an errant block.
.It Sy cksum_algorithm
Checksum algorithm used.
See
@ -362,23 +358,6 @@ Like
but contains
.Pq Ar good data No & ~( Ns Ar bad data ) ;
that is, the bits set in the good data which are cleared in the bad data.
.It Sy bad_set_histogram
If this field exists, it is an array of counters.
Each entry counts bits set in a particular bit of a big-endian uint64 type.
The first entry counts bits
set in the high-order bit of the first byte, the 9th byte, etc, and the last
entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc.
This information is useful for observing a stuck bit in a parallel data path,
such as IDE or parallel SCSI.
.It Sy bad_cleared_histogram
If this field exists, it is an array of counters.
Each entry counts bit clears in a particular bit of a big-endian uint64 type.
The first entry counts bits
clears of the high-order bit of the first byte, the 9th byte, etc, and the
last entry counts clears of the low-order bit of the 8th byte, the 16th byte,
etc.
This information is useful for observing a stuck bit in a parallel data
path, such as IDE or parallel SCSI.
.El
.
.Sh I/O STAGES


@ -461,6 +461,7 @@ ZFS_OBJS_OS := \
zpl_ctldir.o \
zpl_export.o \
zpl_file.o \
zpl_file_range.o \
zpl_inode.o \
zpl_super.o \
zpl_xattr.o \


@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
ASSERT3S(outcount, <=, bufsize);
/* Prefetch znode */
if (prefetch)
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
/*
* Move to the next entry, fill in the previous offset.
@ -6268,7 +6266,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
goto bad_write_fallback;
}
} else {
#if __FreeBSD_version >= 1400086
#if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
__FreeBSD_version >= 1400086
vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
LK_EXCLUSIVE);
#else
@ -6294,7 +6293,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
ap->a_outoffp, &len, ap->a_outcred);
if (error == EXDEV)
if (error == EXDEV || error == EAGAIN || error == EINVAL ||
error == EOPNOTSUPP)
goto bad_locked_fallback;
*ap->a_lenp = (size_t)len;
out_locked:


@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = {
.mode = 0644,
.proc_handler = &proc_dohostid,
},
#ifdef HAVE_REGISTER_SYSCTL_TABLE
{
.procname = "kmem",
.mode = 0555,
@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = {
.mode = 0555,
.child = spl_kstat_table,
},
#endif
{},
};
#ifdef HAVE_REGISTER_SYSCTL_TABLE
static struct ctl_table spl_dir[] = {
{
.procname = "spl",
@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = {
static struct ctl_table spl_root[] = {
{
.procname = "kernel",
.mode = 0555,
.child = spl_dir,
.procname = "kernel",
.mode = 0555,
.child = spl_dir,
},
{}
};
#endif
int
spl_proc_init(void)
{
int rc = 0;
#ifdef HAVE_REGISTER_SYSCTL_TABLE
spl_header = register_sysctl_table(spl_root);
if (spl_header == NULL)
return (-EUNATCH);
#else
spl_header = register_sysctl("kernel/spl", spl_table);
if (spl_header == NULL)
return (-EUNATCH);
if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) {
rc = -EUNATCH;
goto out;
}
if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) {
rc = -EUNATCH;
goto out;
}
#endif
proc_spl = proc_mkdir("spl", NULL);
if (proc_spl == NULL) {


@ -80,9 +80,22 @@ typedef struct dio_request {
static unsigned int zfs_vdev_failfast_mask = 1;
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode)
{
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = 0;
if (spa_mode & SPA_MODE_READ)
mode |= BLK_OPEN_READ;
if (spa_mode & SPA_MODE_WRITE)
mode |= BLK_OPEN_WRITE;
#else
fmode_t mode = 0;
if (spa_mode & SPA_MODE_READ)
@ -90,6 +103,7 @@ vdev_bdev_mode(spa_mode_t spa_mode)
if (spa_mode & SPA_MODE_WRITE)
mode |= FMODE_WRITE;
#endif
return (mode);
}
@ -197,12 +211,47 @@ vdev_disk_kobj_evt_post(vdev_t *v)
}
}
#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG)
/*
* Define a dummy struct blk_holder_ops for kernel versions
* prior to 6.5.
*/
struct blk_holder_ops {};
#endif
static struct block_device *
vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder,
const struct blk_holder_ops *hops)
{
#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops));
#else
return (blkdev_get_by_path(path,
vdev_bdev_mode(mode) | FMODE_EXCL, holder));
#endif
}
static void
vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder)
{
#ifdef HAVE_BLKDEV_PUT_HOLDER
return (blkdev_put(bdev, holder));
#else
return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL));
#endif
}
static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct block_device *bdev;
#ifdef HAVE_BLK_MODE_T
blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
#else
fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
#endif
hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
vdev_disk_t *vd;
@ -252,15 +301,15 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
reread_part = B_TRUE;
}
blkdev_put(bdev, mode | FMODE_EXCL);
vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
}
if (reread_part) {
bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
zfs_vdev_holder);
bdev = vdev_blkdev_get_by_path(disk_name, mode,
zfs_vdev_holder, NULL);
if (!IS_ERR(bdev)) {
int error = vdev_bdev_reread_part(bdev);
blkdev_put(bdev, mode | FMODE_EXCL);
vdev_blkdev_put(bdev, mode, zfs_vdev_holder);
if (error == 0) {
timeout = MSEC2NSEC(
zfs_vdev_open_timeout_ms * 2);
@ -305,8 +354,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
hrtime_t start = gethrtime();
bdev = ERR_PTR(-ENXIO);
while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
zfs_vdev_holder);
bdev = vdev_blkdev_get_by_path(v->vdev_path, mode,
zfs_vdev_holder, NULL);
if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
/*
* There is no point of waiting since device is removed
@ -382,8 +431,8 @@ vdev_disk_close(vdev_t *v)
return;
if (vd->vd_bdev != NULL) {
blkdev_put(vd->vd_bdev,
vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa),
zfs_vdev_holder);
}
rw_destroy(&vd->vd_lock);


@ -478,17 +478,19 @@ zfsctl_is_snapdir(struct inode *ip)
*/
static struct inode *
zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
const struct file_operations *fops, const struct inode_operations *ops,
uint64_t creation)
{
inode_timespec_t now;
struct inode *ip;
znode_t *zp;
inode_timespec_t now = {.tv_sec = creation};
ip = new_inode(zfsvfs->z_sb);
if (ip == NULL)
return (NULL);
now = current_time(ip);
if (!creation)
now = current_time(ip);
zp = ITOZ(ip);
ASSERT3P(zp->z_dirlocks, ==, NULL);
ASSERT3P(zp->z_acl_cached, ==, NULL);
@ -552,14 +554,28 @@ zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
const struct file_operations *fops, const struct inode_operations *ops)
{
struct inode *ip = NULL;
uint64_t creation = 0;
dsl_dataset_t *snap_ds;
dsl_pool_t *pool;
while (ip == NULL) {
ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
if (ip)
break;
if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
pool = dmu_objset_pool(zfsvfs->z_os);
dsl_pool_config_enter(pool, FTAG);
if (!dsl_dataset_hold_obj(pool,
ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
creation = dsl_get_creation(snap_ds);
dsl_dataset_rele(snap_ds, FTAG);
}
dsl_pool_config_exit(pool, FTAG);
}
/* May fail due to concurrent zfsctl_inode_alloc() */
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
}
return (ip);
@ -581,7 +597,7 @@ zfsctl_create(zfsvfs_t *zfsvfs)
ASSERT(zfsvfs->z_ctldir == NULL);
zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
&zpl_fops_root, &zpl_ops_root);
&zpl_fops_root, &zpl_ops_root, 0);
if (zfsvfs->z_ctldir == NULL)
return (SET_ERROR(ENOENT));


@ -1662,6 +1662,7 @@ zfs_umount(struct super_block *sb)
}
zfsvfs_free(zfsvfs);
sb->s_fs_info = NULL;
return (0);
}
@ -2091,6 +2092,9 @@ zfs_init(void)
zfs_znode_init();
dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
register_filesystem(&zpl_fs_type);
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
register_fo_extend(&zpl_file_operations);
#endif
}
void
@ -2101,6 +2105,9 @@ zfs_fini(void)
*/
taskq_wait(system_delay_taskq);
taskq_wait(system_taskq);
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
unregister_fo_extend(&zpl_file_operations);
#endif
unregister_filesystem(&zpl_fs_type);
zfs_znode_fini();
zfsctl_fini();


@ -186,7 +186,7 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
return (error);
/* Honor ZFS_APPENDONLY file attribute */
if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
((flag & O_APPEND) == 0)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EPERM));
@ -1610,11 +1610,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
if (done)
break;
/* Prefetch znode */
if (prefetch) {
dmu_prefetch(os, objnum, 0, 0, 0,
ZIO_PRIORITY_SYNC_READ);
}
if (prefetch)
dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
/*
* Move to the next entry, fill in the previous offset.


@ -415,7 +415,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
switch (ip->i_mode & S_IFMT) {
case S_IFREG:
ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
ip->i_fop = &zpl_file_operations.kabi_fops;
#else
ip->i_fop = &zpl_file_operations;
#endif
ip->i_mapping->a_ops = &zpl_address_space_operations;
break;
@ -455,7 +459,11 @@ zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
/* Assume the inode is a file and attempt to continue */
ip->i_mode = S_IFREG | 0644;
ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
ip->i_fop = &zpl_file_operations.kabi_fops;
#else
ip->i_fop = &zpl_file_operations;
#endif
ip->i_mapping->a_ops = &zpl_address_space_operations;
break;
}


@ -42,7 +42,7 @@
static int
zpl_common_open(struct inode *ip, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE)
if (blk_mode_is_open_write(filp->f_mode))
return (-EACCES);
return (generic_file_open(ip, filp));


@ -301,15 +301,10 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
#if defined(HAVE_VFS_IOV_ITER)
zfs_uio_iov_iter_init(uio, to, pos, count, skip);
#else
#ifdef HAVE_IOV_ITER_TYPE
zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos,
zfs_uio_iov_iter_type(to) & ITER_KVEC ?
UIO_SYSSPACE : UIO_USERSPACE,
count, skip);
#else
zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
count, skip);
#endif
#endif
}
@ -1257,6 +1252,12 @@ zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return (zpl_ioctl_getdosflags(filp, (void *)arg));
case ZFS_IOC_SETDOSFLAGS:
return (zpl_ioctl_setdosflags(filp, (void *)arg));
case ZFS_IOC_COMPAT_FICLONE:
return (zpl_ioctl_ficlone(filp, (void *)arg));
case ZFS_IOC_COMPAT_FICLONERANGE:
return (zpl_ioctl_ficlonerange(filp, (void *)arg));
case ZFS_IOC_COMPAT_FIDEDUPERANGE:
return (zpl_ioctl_fideduperange(filp, (void *)arg));
default:
return (-ENOTTY);
}
@ -1283,7 +1284,6 @@ zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
#endif /* CONFIG_COMPAT */
const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
.readpages = zpl_readpages,
@ -1306,7 +1306,12 @@ const struct address_space_operations zpl_address_space_operations = {
#endif
};
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
const struct file_operations_extend zpl_file_operations = {
.kabi_fops = {
#else
const struct file_operations zpl_file_operations = {
#endif
.open = zpl_open,
.release = zpl_release,
.llseek = zpl_llseek,
@ -1318,7 +1323,11 @@ const struct file_operations zpl_file_operations = {
.read_iter = zpl_iter_read,
.write_iter = zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
#ifdef HAVE_COPY_SPLICE_READ
.splice_read = copy_splice_read,
#else
.splice_read = generic_file_splice_read,
#endif
.splice_write = iter_file_splice_write,
#endif
#else
@ -1333,6 +1342,18 @@ const struct file_operations zpl_file_operations = {
.aio_fsync = zpl_aio_fsync,
#endif
.fallocate = zpl_fallocate,
#ifdef HAVE_VFS_COPY_FILE_RANGE
.copy_file_range = zpl_copy_file_range,
#endif
#ifdef HAVE_VFS_CLONE_FILE_RANGE
.clone_file_range = zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
.remap_file_range = zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
.dedupe_file_range = zpl_dedupe_file_range,
#endif
#ifdef HAVE_FILE_FADVISE
.fadvise = zpl_fadvise,
#endif
@ -1340,6 +1361,11 @@ const struct file_operations zpl_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = zpl_compat_ioctl,
#endif
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
}, /* kabi_fops */
.copy_file_range = zpl_copy_file_range,
.clone_file_range = zpl_clone_file_range,
#endif
};
const struct file_operations zpl_dir_file_operations = {


@ -0,0 +1,272 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2023, Klara Inc.
*/
#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <linux/fs.h>
#include <sys/file.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
#include <sys/zfeature.h>
/*
* Clone part of a file via block cloning.
*
* Note that we are not required to update file offsets; the kernel will take
* care of that depending on how it was called.
*/
static ssize_t
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len)
{
struct inode *src_i = file_inode(src_file);
struct inode *dst_i = file_inode(dst_file);
uint64_t src_off_o = (uint64_t)src_off;
uint64_t dst_off_o = (uint64_t)dst_off;
uint64_t len_o = (uint64_t)len;
cred_t *cr = CRED();
fstrans_cookie_t cookie;
int err;
if (!spa_feature_is_enabled(
dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING))
return (-EOPNOTSUPP);
if (src_i != dst_i)
spl_inode_lock_shared(src_i);
spl_inode_lock(dst_i);
crhold(cr);
cookie = spl_fstrans_mark();
err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
&dst_off_o, &len_o, cr);
spl_fstrans_unmark(cookie);
crfree(cr);
spl_inode_unlock(dst_i);
if (src_i != dst_i)
spl_inode_unlock_shared(src_i);
if (err < 0)
return (err);
return ((ssize_t)len_o);
}
#if defined(HAVE_VFS_COPY_FILE_RANGE) || \
defined(HAVE_VFS_FILE_OPERATIONS_EXTEND)
/*
* Entry point for copy_file_range(). Copy len bytes from src_off in src_file
* to dst_off in dst_file. We are permitted to do this however we like, so we
* try to just clone the blocks, and if we can't support it, fall back to the
* kernel's generic byte copy function.
*/
ssize_t
zpl_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags)
{
ssize_t ret;
if (flags != 0)
return (-EINVAL);
/* Try to do it via zfs_clone_range() */
ret = __zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len);
#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
/*
* Since Linux 5.3 the filesystem driver is responsible for executing
* an appropriate fallback, and a generic fallback function is provided.
*/
if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV ||
ret == -EAGAIN)
ret = generic_copy_file_range(src_file, src_off, dst_file,
dst_off, len, flags);
#else
/*
* Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal
* to the kernel that it should fall back to a content copy.
*/
if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN)
ret = -EOPNOTSUPP;
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */
return (ret);
}
#endif /* HAVE_VFS_COPY_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
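For the caller's side of this path, a minimal userspace sketch, assuming Linux with glibc 2.27 or newer; the loop structure and error handling are illustrative only and not part of this change. On ZFS the copy below is satisfied by block cloning when possible, with the fallbacks described above otherwise.

/*
 * Hypothetical demo: copy (ideally block-clone) src into dst via
 * copy_file_range(2), which reaches zpl_copy_file_range() on ZFS.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s src dst\n", argv[0]);
		return (1);
	}
	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;
	if (in < 0 || out < 0 || fstat(in, &st) < 0) {
		perror("open/fstat");
		return (1);
	}
	for (off_t left = st.st_size; left > 0; ) {
		/* NULL offsets: the kernel advances both file offsets. */
		ssize_t n = copy_file_range(in, NULL, out, NULL, left, 0);
		if (n <= 0) {
			/* Error or short file; a real tool would fall back to read/write. */
			perror("copy_file_range");
			return (1);
		}
		left -= n;
	}
	return (0);
}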
#ifdef HAVE_VFS_REMAP_FILE_RANGE
/*
* Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE.
*
* FICLONE and FICLONERANGE are basically the same as copy_file_range(), except
* that they must clone - they cannot fall back to copying. FICLONE is exactly
* FICLONERANGE, for the entire file. We don't need to try to tell them apart;
* the kernel will sort that out for us.
*
* FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
* range in both files and if they're the same, arrange for them to be backed
* by the same storage.
*/
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags)
{
if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
return (-EINVAL);
/*
* REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
* range if we want. It's designed for filesystems that make data past
* EOF available and don't want it to be visible in both files. ZFS
* doesn't do that, so we just turn the flag off.
*/
flags &= ~REMAP_FILE_CAN_SHORTEN;
if (flags & REMAP_FILE_DEDUP)
/* No support for dedup yet */
return (-EOPNOTSUPP);
/* Zero length means to clone everything to the end of the file */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;
return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */
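The ioctl flavor can be exercised the same way. A hypothetical sketch (again not part of this change) issuing FICLONE, which 4.20+ kernels route to zpl_remap_file_range(), older kernels to zpl_clone_file_range(), and pre-4.5 kernels to the compat zpl_ioctl_ficlone() below.

/*
 * Hypothetical demo: make dst share blocks with src via FICLONE.
 * Unlike copy_file_range(2), FICLONE must clone; it cannot copy.
 */
#include <fcntl.h>
#include <linux/fs.h>	/* FICLONE */
#include <stdio.h>
#include <sys/ioctl.h>

int
main(int argc, char **argv)
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s src dst\n", argv[0]);
		return (1);
	}
	int src = open(argv[1], O_RDONLY);
	int dst = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (src < 0 || dst < 0) {
		perror("open");
		return (1);
	}
	if (ioctl(dst, FICLONE, src) < 0) {	/* clones the whole file */
		perror("ioctl(FICLONE)");
		return (1);
	}
	return (0);
}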
#if defined(HAVE_VFS_CLONE_FILE_RANGE) || \
defined(HAVE_VFS_FILE_OPERATIONS_EXTEND)
/*
* Entry point for FICLONE and FICLONERANGE, before Linux 4.20.
*/
int
zpl_clone_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, uint64_t len)
{
/* Zero length means to clone everything to the end of the file */
if (len == 0)
len = i_size_read(file_inode(src_file)) - src_off;
return (__zpl_clone_file_range(src_file, src_off,
dst_file, dst_off, len));
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
/*
* Entry point for FIDEDUPERANGE, before Linux 4.20.
*/
int
zpl_dedupe_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, uint64_t len)
{
/* No support for dedup yet */
return (-EOPNOTSUPP);
}
#endif /* HAVE_VFS_DEDUPE_FILE_RANGE */
/* Entry point for FICLONE, before Linux 4.5. */
long
zpl_ioctl_ficlone(struct file *dst_file, void *arg)
{
unsigned long sfd = (unsigned long)arg;
struct file *src_file = fget(sfd);
if (src_file == NULL)
return (-EBADF);
if (dst_file->f_op != src_file->f_op)
return (-EXDEV);
size_t len = i_size_read(file_inode(src_file));
ssize_t ret =
__zpl_clone_file_range(src_file, 0, dst_file, 0, len);
fput(src_file);
if (ret < 0) {
if (ret == -EOPNOTSUPP)
return (-ENOTTY);
return (ret);
}
if (ret != len)
return (-EINVAL);
return (0);
}
/* Entry point for FICLONERANGE, before Linux 4.5. */
long
zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg)
{
zfs_ioc_compat_file_clone_range_t fcr;
if (copy_from_user(&fcr, arg, sizeof (fcr)))
return (-EFAULT);
struct file *src_file = fget(fcr.fcr_src_fd);
if (src_file == NULL)
return (-EBADF);
if (dst_file->f_op != src_file->f_op)
return (-EXDEV);
size_t len = fcr.fcr_src_length;
if (len == 0)
len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset;
ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset,
dst_file, fcr.fcr_dest_offset, len);
fput(src_file);
if (ret < 0) {
if (ret == -EOPNOTSUPP)
return (-ENOTTY);
return (ret);
}
if (ret != len)
return (-EINVAL);
return (0);
}
/* Entry point for FIDEDUPERANGE, before Linux 4.5. */
long
zpl_ioctl_fideduperange(struct file *filp, void *arg)
{
(void) arg;
/* No support for dedup yet */
return (-ENOTTY);
}


@ -277,8 +277,6 @@ zpl_test_super(struct super_block *s, void *data)
{
zfsvfs_t *zfsvfs = s->s_fs_info;
objset_t *os = data;
int match;
/*
* If the os doesn't match the z_os in the super_block, assume it is
* not a match. Matching would imply a multimount of a dataset. It is
@ -286,19 +284,7 @@ zpl_test_super(struct super_block *s, void *data)
* that changes the z_os, e.g., rollback, where the match will be
* missed, but in that case the user will get an EBUSY.
*/
if (zfsvfs == NULL || os != zfsvfs->z_os)
return (0);
/*
* If they do match, recheck with the lock held to prevent mounting the
* wrong dataset since z_os can be stale when the teardown lock is held.
*/
if (zpl_enter(zfsvfs, FTAG) != 0)
return (0);
match = (os == zfsvfs->z_os);
zpl_exit(zfsvfs, FTAG);
return (match);
return (zfsvfs != NULL && os == zfsvfs->z_os);
}
static struct super_block *
@ -324,12 +310,35 @@ zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
/*
* Recheck with the lock held to prevent mounting the wrong dataset
* since z_os can be stale when the teardown lock is held.
*
* We can't do this in zpl_test_super since it's under a spinlock, and
* the s_umount lock is not held there either, so it would race with
* zfs_umount and zfsvfs could be freed.
*/
if (!IS_ERR(s) && s->s_fs_info != NULL) {
zfsvfs_t *zfsvfs = s->s_fs_info;
if (zpl_enter(zfsvfs, FTAG) == 0) {
if (os != zfsvfs->z_os)
err = -SET_ERROR(EBUSY);
zpl_exit(zfsvfs, FTAG);
} else {
err = -SET_ERROR(EBUSY);
}
}
dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
dsl_dataset_rele(dmu_objset_ds(os), FTAG);
if (IS_ERR(s))
return (ERR_CAST(s));
if (err) {
deactivate_locked_super(s);
return (ERR_PTR(err));
}
if (s->s_root == NULL) {
err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
if (err) {


@ -671,7 +671,11 @@ zvol_request(struct request_queue *q, struct bio *bio)
}
static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
zvol_state_t *zv;
int error = 0;
@ -686,10 +690,14 @@ zvol_open(struct block_device *bdev, fmode_t flag)
/*
* Obtain a copy of private_data under the zvol_state_lock to make
* sure that either the result of zvol free code path setting
* bdev->bd_disk->private_data to NULL is observed, or zvol_os_free()
* disk->private_data to NULL is observed, or zvol_os_free()
* is not called on this zv because of the positive zv_open_count.
*/
#ifdef HAVE_BLK_MODE_T
zv = disk->private_data;
#else
zv = bdev->bd_disk->private_data;
#endif
if (zv == NULL) {
rw_exit(&zvol_state_lock);
return (SET_ERROR(-ENXIO));
@ -769,14 +777,15 @@ zvol_open(struct block_device *bdev, fmode_t flag)
}
}
error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
if (drop_namespace)
mutex_exit(&spa_namespace_lock);
}
if (error == 0) {
if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
if ((blk_mode_is_open_write(flag)) &&
(zv->zv_flags & ZVOL_RDONLY)) {
if (zv->zv_open_count == 0)
zvol_last_close(zv);
@ -791,14 +800,25 @@ zvol_open(struct block_device *bdev, fmode_t flag)
rw_exit(&zv->zv_suspend_lock);
if (error == 0)
#ifdef HAVE_BLK_MODE_T
disk_check_media_change(disk);
#else
zfs_check_media_change(bdev);
#endif
return (error);
}
static void
zvol_release(struct gendisk *disk, fmode_t mode)
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
(void) unused;
#endif
zvol_state_t *zv;
boolean_t drop_suspend = B_TRUE;


@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
dmu_buf_t *dbuf = NULL;
bpobj_t *bpo = bpi->bpi_bpo;
for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
sizeof (blkptr_t);
uint64_t ps = start * sizeof (blkptr_t);
uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
ps);
if (pe > pb) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
ZIO_PRIORITY_ASYNC_READ);
}
for (; i >= start; i--) {
uint64_t offset = i * sizeof (blkptr_t);
uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
if (dbuf)
dmu_buf_rele(dbuf, FTAG);
err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
offset, FTAG, &dbuf, 0);
offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
if (err)
break;
pe = pb;
pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
dbuf->db_offset - dmu_prefetch_max : 0, ps);
if (pe > pb) {
dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
}
}
ASSERT3U(offset, >=, dbuf->db_offset);
@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
int64_t i = bpi->bpi_unprocessed_subobjs - 1;
uint64_t offset = i * sizeof (uint64_t);
uint64_t obj_from_sublist;
uint64_t subobj;
err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
offset, sizeof (uint64_t), &obj_from_sublist,
DMU_READ_PREFETCH);
offset, sizeof (uint64_t), &subobj,
DMU_READ_NO_PREFETCH);
if (err)
break;
bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
KM_SLEEP);
err = bpobj_open(sublist, bpo->bpo_os,
obj_from_sublist);
if (err)
err = bpobj_open(subbpo, bpo->bpo_os, subobj);
if (err) {
kmem_free(subbpo, sizeof (bpobj_t));
break;
}
list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
mutex_enter(&sublist->bpo_lock);
if (subbpo->bpo_havesubobj &&
subbpo->bpo_phys->bpo_subobjs != 0) {
dmu_prefetch(subbpo->bpo_os,
subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
ZIO_PRIORITY_ASYNC_READ);
}
list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
mutex_enter(&subbpo->bpo_lock);
bpi->bpi_unprocessed_subobjs--;
}
}


@ -174,7 +174,7 @@
* size_t len, unsigned int flags);
*
* Even though offsets and length represent bytes, they have to be
* block-aligned or we will return the EXDEV error so the upper layer can
* block-aligned or we will return an error so the upper layer can
* fall back to the generic mechanism that will just copy the data.
* Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
* This function was implemented based on zfs_write(), but instead of writing
@ -192,9 +192,9 @@
* Some special cases to consider and how we address them:
* - The block we want to clone may have been created within the same
* transaction group that we are trying to clone. Such block has no BP
* allocated yet, so cannot be immediately cloned. We return EXDEV.
* allocated yet, so cannot be immediately cloned. We return EAGAIN.
* - The block we want to clone may have been modified within the same
* transaction group. We return EXDEV.
* transaction group. We return EAGAIN.
* - A block may be cloned multiple times during one transaction group (that's
* why pending list is actually a tree and not an append-only list - this
* way we can figure out faster if this block is cloned for the first time
@ -680,7 +680,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
@ -709,7 +709,7 @@ brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
BT_SIZEOFMAP(brtvd->bv_nblocks)));
kmem_free(brtvd->bv_entcount,
vmem_free(brtvd->bv_entcount,
sizeof (entcount[0]) * brtvd->bv_size);
kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
}
@ -792,7 +792,7 @@ brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
ASSERT(RW_WRITE_HELD(&brt->brt_lock));
ASSERT(brtvd->bv_initiated);
kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
brtvd->bv_entcount = NULL;
kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
brtvd->bv_bitmap = NULL;
@ -1544,6 +1544,37 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp)
return (B_FALSE);
}
uint64_t
brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp)
{
brt_t *brt = spa->spa_brt;
brt_vdev_t *brtvd;
brt_entry_t bre_search, *bre;
uint64_t vdevid, refcnt;
int error;
brt_entry_fill(bp, &bre_search, &vdevid);
brt_rlock(brt);
brtvd = brt_vdev(brt, vdevid);
ASSERT(brtvd != NULL);
bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
if (bre == NULL) {
error = brt_entry_lookup(brt, brtvd, &bre_search);
ASSERT(error == 0 || error == ENOENT);
if (error == ENOENT)
refcnt = 0;
else
refcnt = bre_search.bre_refcount;
} else
refcnt = bre->bre_refcount;
brt_unlock(brt);
return (refcnt);
}
static void
brt_prefetch(brt_t *brt, const blkptr_t *bp)
{


@ -2701,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
*/
mutex_enter(&db->db_mtx);
VERIFY(!dbuf_undirty(db, tx));
ASSERT(list_head(&db->db_dirty_records) == NULL);
ASSERT0(dbuf_find_dirty_eq(db, tx->tx_txg));
if (db->db_buf != NULL) {
arc_buf_destroy(db->db_buf, db);
db->db_buf = NULL;
@ -4457,6 +4457,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
} else if (db->db_state == DB_READ) {
/*
* This buffer has a clone we need to write, and an in-flight
* read on the BP we're about to clone. It's safe to issue the
* write here because the read has already been issued and the
* contents won't change.
*/
ASSERT(dr->dt.dl.dr_brtwrite &&
dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}


@ -89,7 +89,11 @@ static int zfs_dmu_offset_next_sync = 1;
* helps to limit the amount of memory that can be used by prefetching.
* Larger objects should be prefetched a bit at a time.
*/
#ifdef _ILP32
uint_t dmu_prefetch_max = 8 * 1024 * 1024;
#else
uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
#endif
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
@ -161,7 +165,7 @@ dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
{ zfs_acl_byteswap, "acl" }
};
static int
int
dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
{
@ -181,6 +185,7 @@ dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
*dbp = &db->db;
return (0);
}
int
dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
const void *tag, dmu_buf_t **dbp)
@ -552,8 +557,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, 0, offset);
if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
length <= zfetch_array_rd_sz) {
if ((flags & DMU_READ_NO_PREFETCH) == 0) {
/*
* Prepare the zfetch before initiating the demand reads, so
* that if multiple threads block on same indirect block, we
@ -691,74 +695,93 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
}
/*
* Issue prefetch i/os for the given blocks. If level is greater than 0, the
* Issue prefetch I/Os for the given blocks. If level is greater than 0, the
* indirect blocks prefetched will be those that point to the blocks containing
* the data starting at offset, and continuing to offset + len.
* the data starting at offset, and continuing to offset + len. If the range
* is too long, prefetch the first dmu_prefetch_max bytes as requested,
* and for the rest prefetch only a higher level, also fitting within
* dmu_prefetch_max. This should primarily help random reads, since for
* long sequential reads there is
* a speculative prefetcher.
*
* Note that if the indirect blocks above the blocks being prefetched are not
* in cache, they will be asynchronously read in.
* in cache, they will be asynchronously read in. Dnode read by dnode_hold()
* is currently synchronous.
*/
void
dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
int nblks, err;
int64_t level2 = level;
uint64_t start, end, start2, end2;
if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os);
if (object == 0 || object >= DN_MAX_OBJECT)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
blkid = dbuf_whichblock(dn, level,
object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
if (dmu_prefetch_max == 0 || len == 0) {
dmu_prefetch_dnode(os, object, pri);
return;
}
/*
* See comment before the definition of dmu_prefetch_max.
*/
len = MIN(len, dmu_prefetch_max);
/*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
if (dnode_hold(os, object, FTAG, &dn) != 0)
return;
/*
* offset + len - 1 is the last byte we want to prefetch for, and offset
* is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the
* last block we want to prefetch, and dbuf_whichblock(dn, level,
* offset) is the first. Then the number we need to prefetch is the
* last - first + 1.
* Depending on len we may do two prefetches: blocks [start, end) at
* level, and following blocks [start2, end2) at higher level2.
*/
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (level > 0 || dn->dn_datablkshift != 0) {
nblks = dbuf_whichblock(dn, level, offset + len - 1) -
dbuf_whichblock(dn, level, offset) + 1;
if (dn->dn_datablkshift != 0) {
/*
* The object has multiple blocks. Calculate the full range
* of blocks [start, end2) and then split it into two parts,
* so that the first [start, end) fits into dmu_prefetch_max.
*/
start = dbuf_whichblock(dn, level, offset);
end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
uint8_t ibs = dn->dn_indblkshift;
uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
start2 = end = MIN(end2, start + limit);
/*
* Find level2 where [start2, end2) fits into dmu_prefetch_max.
*/
uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
do {
level2++;
start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
} while (end2 - start2 > limit);
} else {
nblks = (offset < dn->dn_datablksz);
/* There is only one block. Prefetch it or nothing. */
start = start2 = end2 = 0;
end = start + (level == 0 && offset < dn->dn_datablksz);
}
if (nblks != 0) {
blkid = dbuf_whichblock(dn, level, offset);
for (int i = 0; i < nblks; i++)
dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
for (uint64_t i = start; i < end; i++)
dbuf_prefetch(dn, level, i, pri, 0);
for (uint64_t i = start2; i < end2; i++)
dbuf_prefetch(dn, level2, i, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
}
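To make the split above concrete, a standalone arithmetic sketch with assumed geometry: 128 KiB data and indirect blocks (so 1024 block pointers per indirect, blkptr_t being 128 bytes) and the default dmu_prefetch_max of 8 * SPA_MAXBLOCKSIZE (16 MiB). A 1 GiB request then prefetches L0 blocks [0, 1024) directly and covers the remainder with L1 blocks [1, 8).

/*
 * Hypothetical standalone sketch of the [start, end) / [start2, end2)
 * computation in dmu_prefetch() above, with assumed block sizes.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Round x up to a multiple of the power-of-2 'align'. */
#define	P2ROUNDUP(x, align)	(-(-(uint64_t)(x) & -(uint64_t)(align)))

int
main(void)
{
	uint64_t prefetch_max = 128ULL << 20;	/* 8 * 16 MiB */
	unsigned bs = 17, ibs = 17;		/* 128 KiB blocks */
	unsigned blkptrshift = 7;		/* 128-byte blkptrs */
	uint64_t offset = 0, len = 1ULL << 30;	/* 1 GiB request */
	int level = 0, level2 = level;

	uint64_t start = offset >> bs;
	uint64_t end2 = ((offset + len - 1) >> bs) + 1;
	uint64_t limit = P2ROUNDUP(prefetch_max, 1ULL << bs) >> bs;
	uint64_t end = end2 < start + limit ? end2 : start + limit;
	uint64_t start2 = end;

	/* Climb levels until [start2, end2) fits within prefetch_max. */
	unsigned ibps = ibs - blkptrshift;	/* 2^10 blkptrs/indirect */
	limit = P2ROUNDUP(prefetch_max, 1ULL << ibs) >> ibs;
	do {
		level2++;
		start2 = P2ROUNDUP(start2, 1ULL << ibps) >> ibps;
		end2 = P2ROUNDUP(end2, 1ULL << ibps) >> ibps;
	} while (end2 - start2 > limit);

	/* Prints L0 blocks [0, 1024) and L1 blocks [1, 8). */
	printf("L%d blocks [%" PRIu64 ", %" PRIu64 ")\n", level, start, end);
	printf("L%d blocks [%" PRIu64 ", %" PRIu64 ")\n", level2, start2, end2);
	return (0);
}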
/*
* Issue prefetch I/Os for the given object's dnode.
*/
void
dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
{
if (object == 0 || object >= DN_MAX_OBJECT)
return;
dnode_t *dn = DMU_META_DNODE(os);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t));
dbuf_prefetch(dn, 0, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
}
/*
* Get the next "chunk" of file data to free. We traverse the file from
* the end so that the file gets shorter over time (if we crash in the
@ -1650,10 +1673,22 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
int error;
error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
DB_RF_CANFAIL | DB_RF_NOPREFETCH);
if (error != 0)
return (error);
tx = dmu_tx_create(os);
dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
/*
* This transaction does not produce any dirty data or log blocks, so
* it should not be throttled. All other cases wait for TXG sync, by
* which time the log block we are writing will be obsolete, so we can
* skip waiting and just return error here instead.
*/
if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) {
dmu_tx_abort(tx);
/* Make zl_get_data do txg_waited_synced() */
return (SET_ERROR(EIO));


@ -1795,17 +1795,19 @@ receive_handle_existing_object(const struct receive_writer_arg *rwa,
}
/*
* The dmu does not currently support decreasing nlevels
* or changing the number of dnode slots on an object. For
* non-raw sends, this does not matter and the new object
* can just use the previous one's nlevels. For raw sends,
* however, the structure of the received dnode (including
* nlevels and dnode slots) must match that of the send
* side. Therefore, instead of using dmu_object_reclaim(),
* we must free the object completely and call
* dmu_object_claim_dnsize() instead.
* The dmu does not currently support decreasing nlevels or changing
* indirect block size if there is already one, same as changing the
* number of dnode slots on an object. For non-raw sends this
* does not matter and the new object can just use the previous one's
* parameters. For raw sends, however, the structure of the received
* dnode (including indirects and dnode slots) must match that of the
* send side. Therefore, instead of using dmu_object_reclaim(), we
* must free the object completely and call dmu_object_claim_dnsize()
* instead.
*/
if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
if ((rwa->raw && ((doi->doi_indirection > 1 &&
indblksz != doi->doi_metadata_block_size) ||
drro->drr_nlevels < doi->doi_indirection)) ||
dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
err = dmu_free_long_object(rwa->os, drro->drr_object);
if (err != 0)


@ -52,14 +52,19 @@ static unsigned int zfetch_max_streams = 8;
static unsigned int zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int zfetch_max_sec_reap = 2;
#ifdef _ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int zfetch_max_distance = 64 * 1024 * 1024;
#endif
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
/* max number of bytes in an array_read in which we allow prefetching (1MB) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
@ -580,6 +585,3 @@ ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
"Max bytes to prefetch indirects for per stream");
ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW,
"Number of bytes in a array_read");


@ -1882,7 +1882,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs == dn->dn_indblkshift)
ibs = 0;
if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
if (size == dn->dn_datablksz && ibs == 0)
return (0);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
@ -1905,24 +1905,25 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
if (ibs && dn->dn_nlevels != 1)
goto fail;
/* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0) {
dbuf_new_size(db, size, tx);
} else if (err != ENOENT) {
goto fail;
}
dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
if (size != dn->dn_datablksz) {
/* resize the old block */
err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0) {
dbuf_new_size(db, size, tx);
} else if (err != ENOENT) {
goto fail;
}
dnode_setdblksz(dn, size);
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
if (db)
dbuf_rele(db, FTAG);
}
if (ibs) {
dn->dn_indblkshift = ibs;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
}
/* release after we have fixed the blocksize in the dnode */
if (db)
dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
return (0);


@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dle->dle_bpobj.bpo_object = za.za_first_integer;
dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object,
ZIO_PRIORITY_SYNC_READ);
avl_add(&dl->dl_tree, dle);
}
@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl)
* in parallel. Then open them all in a second pass.
*/
dlce->dlce_bpobj = za.za_first_integer;
dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj,
ZIO_PRIORITY_SYNC_READ);
avl_add(&dl->dl_cache, dlce);
}
VERIFY3U(error, ==, ENOENT);
@ -892,9 +892,9 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
for (zap_cursor_init(&zc, dl->dl_os, obj);
(error = zap_cursor_retrieve(&zc, za)) == 0;
zap_cursor_advance(&zc)) {
uint64_t mintxg = zfs_strtonum(za->za_name, NULL);
dsl_deadlist_insert_bpobj(dl, za->za_first_integer, mintxg, tx);
VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
dsl_deadlist_insert_bpobj(dl, za->za_first_integer,
zfs_strtonum(za->za_name, NULL), tx);
VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx));
if (perror == 0) {
dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer,
zfs_strtonum(pza->za_name, NULL));


@ -2015,6 +2015,11 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
zb->zb_objset, DMU_META_DNODE_OBJECT);
if (OBJSET_BUF_HAS_USERUSED(buf)) {
if (OBJSET_BUF_HAS_PROJECTUSED(buf)) {
dsl_scan_prefetch_dnode(scn,
&osp->os_projectused_dnode, zb->zb_objset,
DMU_PROJECTUSED_OBJECT);
}
dsl_scan_prefetch_dnode(scn,
&osp->os_groupused_dnode, zb->zb_objset,
DMU_GROUPUSED_OBJECT);
@ -2075,10 +2080,16 @@ dsl_scan_prefetch_thread(void *arg)
zio_flags |= ZIO_FLAG_RAW;
}
/* We don't need a data L1 buffer since we do not prefetch L0. */
blkptr_t *bp = &spic->spic_bp;
if (BP_GET_LEVEL(bp) == 1 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
BP_GET_TYPE(bp) != DMU_OT_OBJSET)
flags |= ARC_FLAG_NO_BUF;
/* issue the prefetch asynchronously */
(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
&spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
(void) arc_read(scn->scn_zio_root, spa, bp,
dsl_scan_prefetch_cb, spic->spic_spc, ZIO_PRIORITY_SCRUB,
zio_flags, &flags, &spic->spic_zb);
kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
}


@ -58,6 +58,11 @@ static uint64_t metaslab_aliquot = 1024 * 1024;
*/
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
/*
* Of blocks of size >= metaslab_force_ganging, actually gang them this often.
*/
uint_t metaslab_force_ganging_pct = 3;
/*
* In pools where the log space map feature is not enabled we touch
* multiple metaslabs (and their respective space maps) with each
@ -1287,7 +1292,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
/*
* If this metaslab group is below its qmax or it's
* the only allocatable metasable group, then attempt
* the only allocatable metaslab group, then attempt
* to allocate from it.
*/
if (qdepth < qmax || mc->mc_alloc_groups == 1)
@ -5096,7 +5101,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
zio_alloc_list_t *zal, int allocator)
{
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
metaslab_group_t *mg, *fast_mg, *rotor;
metaslab_group_t *mg, *rotor;
vdev_t *vd;
boolean_t try_hard = B_FALSE;
@ -5109,7 +5114,9 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
* damage can result in extremely long reconstruction times. This
* will also test spilling from special to normal.
*/
if (psize >= metaslab_force_ganging && (random_in_range(100) < 3)) {
if (psize >= metaslab_force_ganging &&
metaslab_force_ganging_pct > 0 &&
(random_in_range(100) < MIN(metaslab_force_ganging_pct, 100))) {
metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
allocator);
return (SET_ERROR(ENOSPC));
@ -5157,15 +5164,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} else if (d != 0) {
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
mg = vd->vdev_mg->mg_next;
} else if (flags & METASLAB_FASTWRITE) {
mg = fast_mg = mca->mca_rotor;
do {
if (fast_mg->mg_vd->vdev_pending_fastwrite <
mg->mg_vd->vdev_pending_fastwrite)
mg = fast_mg;
} while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
} else {
ASSERT(mca->mca_rotor != NULL);
mg = mca->mca_rotor;
@ -5290,7 +5288,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
mg->mg_bias = 0;
}
if ((flags & METASLAB_FASTWRITE) ||
if ((flags & METASLAB_ZIL) ||
atomic_add_64_nv(&mca->mca_aliquot, asize) >=
mg->mg_aliquot + mg->mg_bias) {
mca->mca_rotor = mg->mg_next;
@ -5303,11 +5301,6 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
((flags & METASLAB_GANG_HEADER) ? 1 : 0));
DVA_SET_ASIZE(&dva[d], asize);
if (flags & METASLAB_FASTWRITE) {
atomic_add_64(&vd->vdev_pending_fastwrite,
psize);
}
return (0);
}
next:
@ -5943,55 +5936,6 @@ metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
return (error);
}
void
metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
atomic_add_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
void
metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
{
const dva_t *dva = bp->blk_dva;
int ndvas = BP_GET_NDVAS(bp);
uint64_t psize = BP_GET_PSIZE(bp);
int d;
vdev_t *vd;
ASSERT(!BP_IS_HOLE(bp));
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(psize > 0);
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (d = 0; d < ndvas; d++) {
if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
continue;
ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
static void
metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
uint64_t size, void *arg)
@ -6266,7 +6210,10 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
"Segment-based metaslab selection maximum buckets before switching");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, U64, ZMOD_RW,
"Blocks larger than this size are forced to be gang blocks");
"Blocks larger than this size are sometimes forced to be gang blocks");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging_pct, UINT, ZMOD_RW,
"Percentage of large blocks that will be forced to be gang blocks");
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, UINT, ZMOD_RW,
"Max distance (bytes) to search forward before using size tree");


@ -1147,8 +1147,8 @@ spa_ld_log_sm_data(spa_t *spa)
/* Prefetch log spacemaps dnodes. */
for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls;
sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj,
ZIO_PRIORITY_SYNC_READ);
}
uint_t pn = 0;


@ -772,6 +772,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_min_ashift = INT_MAX;
spa->spa_max_ashift = 0;
spa->spa_min_alloc = INT_MAX;
spa->spa_gcd_alloc = INT_MAX;
/* Reset cached value */
spa->spa_dedup_dspace = ~0ULL;


@ -889,9 +889,15 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
&vd->vdev_not_present);
/*
* Get the alignment requirement.
* Get the alignment requirement. Ignore pool ashift for vdev
* attach case.
*/
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
if (alloctype != VDEV_ALLOC_ATTACH) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
&vd->vdev_ashift);
} else {
vd->vdev_attaching = B_TRUE;
}
/*
* Retrieve the vdev creation time.
@ -1186,7 +1192,6 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT(tvd == tvd->vdev_top);
tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
tvd->vdev_ms_array = svd->vdev_ms_array;
tvd->vdev_ms_shift = svd->vdev_ms_shift;
tvd->vdev_ms_count = svd->vdev_ms_count;
@ -1393,6 +1398,36 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
/*
* Choose GCD for spa_gcd_alloc.
*/
static uint64_t
vdev_gcd(uint64_t a, uint64_t b)
{
while (b != 0) {
uint64_t t = b;
b = a % b;
a = t;
}
return (a);
}
/*
* Set spa_min_alloc and spa_gcd_alloc.
*/
static void
vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
{
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
if (spa->spa_gcd_alloc == INT_MAX) {
spa->spa_gcd_alloc = min_alloc;
} else {
spa->spa_gcd_alloc = vdev_gcd(min_alloc,
spa->spa_gcd_alloc);
}
}
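A quick worked example of the fold above, with assumed per-vdev minimum allocation sizes (6144 standing in for an odd RAIDZ layout); the GCD can drop below every individual minimum, which is why spa_gcd_alloc is tracked separately from spa_min_alloc.

/*
 * Hypothetical illustration of vdev_spa_set_alloc() folding per-vdev
 * minimum allocation sizes into a pool-wide GCD. The sizes below are
 * assumptions for the example, not values taken from a real pool.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
vdev_gcd(uint64_t a, uint64_t b)
{
	while (b != 0) {
		uint64_t t = b;
		b = a % b;
		a = t;
	}
	return (a);
}

int
main(void)
{
	uint64_t min_allocs[] = { 4096, 6144, 4096 };
	uint64_t min = min_allocs[0], gcd = min_allocs[0];

	for (int i = 1; i < 3; i++) {
		if (min_allocs[i] < min)
			min = min_allocs[i];
		gcd = vdev_gcd(gcd, min_allocs[i]);
	}
	/* Prints: spa_min_alloc = 4096, spa_gcd_alloc = 2048 */
	printf("spa_min_alloc = %ju, spa_gcd_alloc = %ju\n",
	    (uintmax_t)min, (uintmax_t)gcd);
	return (0);
}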
void
vdev_metaslab_group_create(vdev_t *vd)
{
@ -1445,8 +1480,7 @@ vdev_metaslab_group_create(vdev_t *vd)
spa->spa_min_ashift = vd->vdev_ashift;
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
vdev_spa_set_alloc(spa, min_alloc);
}
}
}
@ -1620,7 +1654,6 @@ vdev_metaslab_fini(vdev_t *vd)
}
}
ASSERT0(vd->vdev_ms_count);
ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
}
typedef struct vdev_probe_stats {
@ -2144,9 +2177,9 @@ vdev_open(vdev_t *vd)
return (SET_ERROR(EDOM));
}
if (vd->vdev_top == vd) {
if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
vdev_ashift_optimize(vd);
}
vd->vdev_attaching = B_FALSE;
}
if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
vd->vdev_ashift > ASHIFT_MAX)) {
@ -2207,8 +2240,7 @@ vdev_open(vdev_t *vd)
if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
uint64_t min_alloc = vdev_get_min_alloc(vd);
if (min_alloc < spa->spa_min_alloc)
spa->spa_min_alloc = min_alloc;
vdev_spa_set_alloc(spa, min_alloc);
}
/*
@ -5688,6 +5720,7 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
objset_t *mos = spa->spa_meta_objset;
nvpair_t *elem = NULL;
uint64_t vdev_guid;
uint64_t objid;
nvlist_t *nvprops;
vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
@ -5698,31 +5731,28 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx)
if (vd == NULL)
return;
/*
* Set vdev property values in the vdev props mos object.
*/
if (vd->vdev_root_zap != 0) {
objid = vd->vdev_root_zap;
} else if (vd->vdev_top_zap != 0) {
objid = vd->vdev_top_zap;
} else if (vd->vdev_leaf_zap != 0) {
objid = vd->vdev_leaf_zap;
} else {
panic("unexpected vdev type");
}
mutex_enter(&spa->spa_props_lock);
while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
uint64_t intval, objid = 0;
uint64_t intval;
const char *strval;
vdev_prop_t prop;
const char *propname = nvpair_name(elem);
zprop_type_t proptype;
/*
* Set vdev property values in the vdev props mos object.
*/
if (vd->vdev_root_zap != 0) {
objid = vd->vdev_root_zap;
} else if (vd->vdev_top_zap != 0) {
objid = vd->vdev_top_zap;
} else if (vd->vdev_leaf_zap != 0) {
objid = vd->vdev_leaf_zap;
} else {
/*
* XXX: implement vdev_props_set_check()
*/
panic("vdev not root/top/leaf");
}
switch (prop = vdev_name_to_prop(propname)) {
case VDEV_PROP_USERPROP:
if (vdev_prop_user(propname)) {
@ -5791,6 +5821,12 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
ASSERT(vd != NULL);
/* Check that vdev has a zap we can use */
if (vd->vdev_root_zap == 0 &&
vd->vdev_top_zap == 0 &&
vd->vdev_leaf_zap == 0)
return (SET_ERROR(EINVAL));
if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
&vdev_guid) != 0)
return (SET_ERROR(EINVAL));


@ -1398,7 +1398,7 @@ vdev_indirect_checksum_error(zio_t *zio,
vd->vdev_stat.vs_checksum_errors++;
mutex_exit(&vd->vdev_stat_lock);
zio_bad_cksum_t zbc = {{{ 0 }}};
zio_bad_cksum_t zbc = { 0 };
abd_t *bad_abd = ic->ic_data;
abd_t *good_abd = is->is_good_child->ic_data;
(void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,


@ -1785,7 +1785,7 @@ vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
static int
raidz_checksum_verify(zio_t *zio)
{
zio_bad_cksum_t zbc = {{{0}}};
zio_bad_cksum_t zbc = {0};
raidz_map_t *rm = zio->io_vsd;
int ret = zio_checksum_error(zio, &zbc);


@ -754,10 +754,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
#define MAX_RANGES 16
typedef struct zfs_ecksum_info {
/* histograms of set and cleared bits by bit number in a 64-bit word */
uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY];
uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
/* inline arrays of bits set and cleared. */
uint64_t zei_bits_set[ZFM_MAX_INLINE];
uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
@ -781,7 +777,7 @@ typedef struct zfs_ecksum_info {
} zfs_ecksum_info_t;
static void
update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
update_bad_bits(uint64_t value_arg, uint32_t *count)
{
size_t i;
size_t bits = 0;
@ -789,10 +785,8 @@ update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count)
/* We store the bits in big-endian (largest-first) order */
for (i = 0; i < 64; i++) {
if (value & (1ull << i)) {
hist[63 - i]++;
if (value & (1ull << i))
++bits;
}
}
/* update the count of bits changed */
*count += bits;
@ -920,14 +914,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
if (info != NULL && info->zbc_has_cksum) {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
DATA_TYPE_UINT64_ARRAY,
sizeof (info->zbc_expected) / sizeof (uint64_t),
(uint64_t *)&info->zbc_expected,
FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
DATA_TYPE_UINT64_ARRAY,
sizeof (info->zbc_actual) / sizeof (uint64_t),
(uint64_t *)&info->zbc_actual,
FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
DATA_TYPE_STRING,
info->zbc_checksum_name,
@ -1010,10 +996,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
offset++;
}
update_histogram(set, eip->zei_histogram_set,
&eip->zei_range_sets[range]);
update_histogram(cleared, eip->zei_histogram_cleared,
&eip->zei_range_clears[range]);
update_bad_bits(set, &eip->zei_range_sets[range]);
update_bad_bits(cleared, &eip->zei_range_clears[range]);
}
/* convert to byte offsets */
@ -1049,15 +1033,6 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
DATA_TYPE_UINT8_ARRAY,
inline_size, (uint8_t *)eip->zei_bits_cleared,
NULL);
} else {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
DATA_TYPE_UINT8_ARRAY,
NBBY * sizeof (uint64_t), eip->zei_histogram_set,
FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
DATA_TYPE_UINT8_ARRAY,
NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
NULL);
}
return (eip);
}

View File

@ -839,7 +839,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
uint64_t zp_gen;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
/*
@ -889,6 +888,7 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
ASSERT(error == 0 || error == ENOENT);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's
* written out and its checksum is being calculated
@ -917,8 +917,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
}
#endif
if (error == 0)
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread(os, object, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@ -1028,6 +1028,10 @@ zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
*
* On success, the function returns the number of bytes copied in *lenp.
* Note, it doesn't return how many bytes are left to be copied.
* On errors caused by any file system or BRT limitation, `EINVAL` is
* returned. In most such cases the user requested bad parameters: it may
* be possible to clone the file, but some parameters don't match the
* requirements.
*/
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
@ -1078,6 +1082,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
return (SET_ERROR(EXDEV));
}
/*
* outos and inos belong to the same storage pool, as verified a few
* lines above, so a single feature check suffices.
*/
if (!spa_feature_is_enabled(dmu_objset_spa(outos),
SPA_FEATURE_BLOCK_CLONING)) {
zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
return (SET_ERROR(EOPNOTSUPP));
}
ASSERT(!outzfsvfs->z_replay);
error = zfs_verify_zp(inzp);
@ -1088,12 +1102,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
return (error);
}
if (!spa_feature_is_enabled(dmu_objset_spa(outos),
SPA_FEATURE_BLOCK_CLONING)) {
zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
return (SET_ERROR(EXDEV));
}
/*
* We don't copy the source file's flags, which is why we don't allow
* cloning files that are in quarantine.
@ -1167,7 +1175,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* We cannot clone into files with different block size.
*/
if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1175,7 +1183,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
* Offsets and len must be at block boundaries.
*/
if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
/*
@ -1183,7 +1191,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
*/
if ((len % inblksz) != 0 &&
(len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
error = SET_ERROR(EXDEV);
error = SET_ERROR(EINVAL);
goto unlock;
}
@ -1212,7 +1220,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
gid = KGID_TO_SGID(ZTOGID(outzp));
projid = outzp->z_projid;
bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
/*
* Clone the file in reasonable size chunks. Each chunk is cloned
@ -1238,13 +1246,11 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
&nbps);
if (error != 0) {
/*
* If we are tyring to clone a block that was created
* in the current transaction group. Return an error,
* so the caller can fallback to just copying the data.
* If we are trying to clone a block that was created
* in the current transaction group, the error will be
* EAGAIN here, which we can just return to the caller
* so it can fall back if it likes.
*/
if (error == EAGAIN) {
error = SET_ERROR(EXDEV);
}
break;
}
/*
@ -1330,7 +1336,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
done += size;
}
kmem_free(bps, sizeof (bps[0]) * maxblocks);
vmem_free(bps, sizeof (bps[0]) * maxblocks);
zfs_znode_update_vfs(outzp);
unlock:
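
Taken together, these changes give zfs_clone_range() a descriptive error contract: EINVAL for requests that can never be cloned (mismatched block sizes, misaligned offsets or lengths), EXDEV for source and destination in different pools, EOPNOTSUPP when the block_cloning feature is not enabled, and EAGAIN for source blocks still dirty in the current transaction group. A minimal caller-side sketch under those assumptions; clone_or_copy() and fallback_copy() are hypothetical helpers, not part of this commit:

static int
clone_or_copy(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	int error = zfs_clone_range(inzp, inoffp, outzp, outoffp, lenp, cr);

	/* Retryable or unsupported cases: fall back to an ordinary copy. */
	if (error == EAGAIN || error == EXDEV || error == EOPNOTSUPP)
		error = fallback_copy(inzp, inoffp, outzp, outoffp, lenp, cr);
	return (error);
}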

File diff suppressed because it is too large

View File

@ -1596,6 +1596,19 @@ zio_shrink(zio_t *zio, uint64_t size)
}
}
/*
* Round the provided allocation size up to a value that can be allocated
* by at least some vdev(s) in the pool with minimal or no additional
* padding and without extra space usage on others.
*/
static uint64_t
zio_roundup_alloc_size(spa_t *spa, uint64_t size)
{
if (size > spa->spa_min_alloc)
return (roundup(size, spa->spa_gcd_alloc));
return (spa->spa_min_alloc);
}
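As a worked example of the new rounding, assume a hypothetical pool mixing an ashift=9 vdev (512-byte minimum) with an ashift=12 vdev (4 KiB minimum): spa_min_alloc = 512 and spa_gcd_alloc = GCD(512, 4096) = 512, so a 3000-byte psize rounds up to roundup(3000, 512) = 3072. In a pool with only ashift=12 vdevs, both fields are 4096: the same psize becomes 4096, and any size at or below spa_min_alloc is simply raised to it.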
/*
* ==========================================================================
* Prepare to read and write logical blocks
@ -1762,8 +1775,9 @@ zio_write_compress(zio_t *zio)
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
ASSERT(BP_IS_EMBEDDED(bp) || BP_IS_GANG(bp) ||
MIN(zp->zp_copies, spa_max_replication(spa))
== BP_GET_NDVAS(bp));
}
/* If it's a compressed write that is not raw, compress the buffer. */
@ -1802,9 +1816,8 @@ zio_write_compress(zio_t *zio)
* in that we charge for the padding used to fill out
* the last sector.
*/
ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
size_t rounded = (size_t)roundup(psize,
spa->spa_min_alloc);
size_t rounded = (size_t)zio_roundup_alloc_size(spa,
psize);
if (rounded >= lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
@ -1847,8 +1860,8 @@ zio_write_compress(zio_t *zio)
* take this codepath because it will change the on-disk block
* and decryption will fail.
*/
size_t rounded = MIN((size_t)roundup(psize,
spa->spa_min_alloc), lsize);
size_t rounded = MIN((size_t)zio_roundup_alloc_size(spa, psize),
lsize);
if (rounded != psize) {
abd_t *cdata = abd_alloc_linear(rounded, B_TRUE);
@ -3012,11 +3025,6 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
*/
pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
/*
* We didn't allocate this bp, so make sure it doesn't get unmarked.
*/
pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
zio_nowait(zio);
return (pio);
@ -3604,7 +3612,6 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
if (zio->io_flags & ZIO_FLAG_NODATA)
flags |= METASLAB_DONT_THROTTLE;
if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
@ -3764,7 +3771,7 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* of, so we just hash the objset ID to pick the allocator to get
* some parallelism.
*/
int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int flags = METASLAB_ZIL;
int allocator = (uint_t)cityhash4(0, 0, 0,
os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
@ -4460,8 +4467,8 @@ zio_ready(zio_t *zio)
zio_t *pio, *pio_next;
zio_link_t *zl = NULL;
if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
ZIO_WAIT_READY)) {
if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, ZIO_WAIT_READY)) {
return (NULL);
}
@ -4919,12 +4926,6 @@ zio_done(zio_t *zio)
zfs_ereport_free_checksum(zcr);
}
if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
!BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
!(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
}
/*
* It is the responsibility of the done callback to ensure that this
* particular zio is no longer discoverable for adoption, and as

View File

@ -515,8 +515,6 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
}
if (info != NULL) {
info->zbc_expected = expected_cksum;
info->zbc_actual = actual_cksum;
info->zbc_checksum_name = ci->ci_name;
info->zbc_byteswapped = byteswap;
info->zbc_injected = 0;

View File

@ -698,7 +698,6 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
int error;
ASSERT3P(lwb, !=, NULL);
ASSERT3P(zio, !=, NULL);
ASSERT3U(size, !=, 0);
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
@ -717,6 +716,7 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
DMU_READ_NO_PREFETCH);
} else { /* indirect write */
ASSERT3P(zio, !=, NULL);
/*
* Have to lock the whole block to ensure when it's written out
* and its checksum is being calculated that no one can change
@ -727,8 +727,8 @@ zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
offset = P2ALIGN_TYPED(offset, size, uint64_t);
zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
size, RL_READER);
error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd,
&db);
if (error == 0) {
blkptr_t *bp = &lr->lr_blkptr;
@ -981,7 +981,7 @@ zvol_prefetch_minors_impl(void *arg)
job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
FTAG, &os);
if (job->error == 0) {
dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ);
dmu_objset_disown(os, B_TRUE, FTAG);
}
}

View File

@ -34,6 +34,17 @@ tags = ['functional', 'acl', 'posix-sa']
tests = ['atime_003_pos', 'root_relatime_on']
tags = ['functional', 'atime']
[tests/functional/block_cloning:Linux]
tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial',
'block_cloning_copyfilerange_fallback',
'block_cloning_ficlone', 'block_cloning_ficlonerange',
'block_cloning_ficlonerange_partial',
'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone',
'block_cloning_disabled_ficlonerange',
'block_cloning_copyfilerange_cross_dataset',
'block_cloning_copyfilerange_fallback_same_txg']
tags = ['functional', 'block_cloning']
[tests/functional/chattr:Linux]
tests = ['chattr_001_pos', 'chattr_002_neg']
tags = ['functional', 'chattr']

View File

@ -134,6 +134,12 @@ ci_reason = 'CI runner doesn\'t have all requirements'
#
idmap_reason = 'Idmapped mount needs kernel 5.12+'
#
# copy_file_range() is not supported by all kernels
#
cfr_reason = 'Kernel copy_file_range support required'
cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
#
# These tests are known to fail, thus we use this list to prevent these
# failures from failing the job as a whole; only unexpected failures
@ -288,6 +294,18 @@ elif sys.platform.startswith('linux'):
'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason],
'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason],
'block_cloning/block_cloning_disabled_copyfilerange':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_partial':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_fallback':
['SKIP', cfr_reason],
'block_cloning/block_cloning_copyfilerange_cross_dataset':
['SKIP', cfr_cross_reason],
'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
['SKIP', cfr_cross_reason],
})

View File

@ -1,6 +1,7 @@
/badsend
/btree_test
/chg_usr_exec
/clonefile
/devname2devid
/dir_rd_update
/draid

View File

@ -119,6 +119,7 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2
scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest
scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet
scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util
scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile
%C%_idmap_util_LDADD = libspl.la

View File

@ -0,0 +1,333 @@
/*
* SPDX-License-Identifier: MIT
*
* Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
* This program tests the availability and behaviour of copy_file_range,
* FICLONE, FICLONERANGE and FIDEDUPERANGE in the Linux kernel. It should
* compile and run even if these features aren't exposed through the libc.
*/
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#ifndef __NR_copy_file_range
#if defined(__x86_64__)
#define __NR_copy_file_range (326)
#elif defined(__i386__)
#define __NR_copy_file_range (377)
#elif defined(__s390__)
#define __NR_copy_file_range (375)
#elif defined(__arm__)
#define __NR_copy_file_range (391)
#elif defined(__aarch64__)
#define __NR_copy_file_range (285)
#elif defined(__powerpc__)
#define __NR_copy_file_range (379)
#else
#error "no definition of __NR_copy_file_range for this platform"
#endif
#endif /* __NR_copy_file_range */
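/*
 * Note: copy_file_range() is declared below as a weak symbol, so the
 * program links even against a libc that lacks it.  If the libc does
 * provide the function, the non-NULL weak reference is used directly;
 * otherwise cf_copy_file_range() invokes the raw syscall number
 * defined above.
 */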
ssize_t
copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int)
__attribute__((weak));
static inline ssize_t
cf_copy_file_range(int sfd, loff_t *soff, int dfd, loff_t *doff,
size_t len, unsigned int flags)
{
if (copy_file_range)
return (copy_file_range(sfd, soff, dfd, doff, len, flags));
return (
syscall(__NR_copy_file_range, sfd, soff, dfd, doff, len, flags));
}
/* Define missing FICLONE */
#ifdef FICLONE
#define CF_FICLONE FICLONE
#else
#define CF_FICLONE _IOW(0x94, 9, int)
#endif
/* Define missing FICLONERANGE and support structs */
#ifdef FICLONERANGE
#define CF_FICLONERANGE FICLONERANGE
typedef struct file_clone_range cf_file_clone_range_t;
#else
typedef struct {
int64_t src_fd;
uint64_t src_offset;
uint64_t src_length;
uint64_t dest_offset;
} cf_file_clone_range_t;
#define CF_FICLONERANGE _IOW(0x94, 13, cf_file_clone_range_t)
#endif
/* Define missing FIDEDUPERANGE and support structs */
#ifdef FIDEDUPERANGE
#define CF_FIDEDUPERANGE FIDEDUPERANGE
#define CF_FILE_DEDUPE_RANGE_SAME FILE_DEDUPE_RANGE_SAME
#define CF_FILE_DEDUPE_RANGE_DIFFERS FILE_DEDUPE_RANGE_DIFFERS
typedef struct file_dedupe_range_info cf_file_dedupe_range_info_t;
typedef struct file_dedupe_range cf_file_dedupe_range_t;
#else
typedef struct {
int64_t dest_fd;
uint64_t dest_offset;
uint64_t bytes_deduped;
int32_t status;
uint32_t reserved;
} cf_file_dedupe_range_info_t;
typedef struct {
uint64_t src_offset;
uint64_t src_length;
uint16_t dest_count;
uint16_t reserved1;
uint32_t reserved2;
cf_file_dedupe_range_info_t info[0];
} cf_file_dedupe_range_t;
#define CF_FIDEDUPERANGE _IOWR(0x94, 54, cf_file_dedupe_range_t)
#define CF_FILE_DEDUPE_RANGE_SAME (0)
#define CF_FILE_DEDUPE_RANGE_DIFFERS (1)
#endif
typedef enum {
CF_MODE_NONE,
CF_MODE_CLONE,
CF_MODE_CLONERANGE,
CF_MODE_COPYFILERANGE,
CF_MODE_DEDUPERANGE,
} cf_mode_t;
static int
usage(void)
{
printf(
"usage:\n"
" FICLONE:\n"
" clonefile -c <src> <dst>\n"
" FICLONERANGE:\n"
" clonefile -r <src> <dst> <soff> <doff> <len>\n"
" copy_file_range:\n"
" clonefile -f <src> <dst> <soff> <doff> <len>\n"
" FIDEDUPERANGE:\n"
" clonefile -d <src> <dst> <soff> <doff> <len>\n");
return (1);
}
int do_clone(int sfd, int dfd);
int do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len);
int quiet = 0;
int
main(int argc, char **argv)
{
cf_mode_t mode = CF_MODE_NONE;
int c;	/* getopt() returns int; char may be unsigned and never match -1 */
while ((c = getopt(argc, argv, "crfdq")) != -1) {
switch (c) {
case 'c':
mode = CF_MODE_CLONE;
break;
case 'r':
mode = CF_MODE_CLONERANGE;
break;
case 'f':
mode = CF_MODE_COPYFILERANGE;
break;
case 'd':
mode = CF_MODE_DEDUPERANGE;
break;
case 'q':
quiet = 1;
break;
}
}
if (mode == CF_MODE_NONE || (argc-optind) < 2 ||
(mode != CF_MODE_CLONE && (argc-optind) < 5))
return (usage());
loff_t soff = 0, doff = 0;
size_t len = 0;
if (mode != CF_MODE_CLONE) {
soff = strtoull(argv[optind+2], NULL, 10);
if (soff == ULLONG_MAX) {
fprintf(stderr, "invalid source offset");
return (1);
}
doff = strtoull(argv[optind+3], NULL, 10);
if (doff == ULLONG_MAX) {
fprintf(stderr, "invalid dest offset");
return (1);
}
len = strtoull(argv[optind+4], NULL, 10);
if (len == ULLONG_MAX) {
fprintf(stderr, "invalid length");
return (1);
}
}
int sfd = open(argv[optind], O_RDONLY);
if (sfd < 0) {
fprintf(stderr, "open: %s: %s\n",
argv[optind], strerror(errno));
return (1);
}
int dfd = open(argv[optind+1], O_WRONLY|O_CREAT,
S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (dfd < 0) {
fprintf(stderr, "open: %s: %s\n",
argv[optind+1], strerror(errno));
close(sfd);
return (1);
}
int err;
switch (mode) {
case CF_MODE_CLONE:
err = do_clone(sfd, dfd);
break;
case CF_MODE_CLONERANGE:
err = do_clonerange(sfd, dfd, soff, doff, len);
break;
case CF_MODE_COPYFILERANGE:
err = do_copyfilerange(sfd, dfd, soff, doff, len);
break;
case CF_MODE_DEDUPERANGE:
err = do_deduperange(sfd, dfd, soff, doff, len);
break;
default:
abort();
}
off_t spos = lseek(sfd, 0, SEEK_CUR);
off_t slen = lseek(sfd, 0, SEEK_END);
off_t dpos = lseek(dfd, 0, SEEK_CUR);
off_t dlen = lseek(dfd, 0, SEEK_END);
fprintf(stderr, "file offsets: src=%lu/%lu; dst=%lu/%lu\n", spos, slen,
dpos, dlen);
close(dfd);
close(sfd);
return (err == 0 ? 0 : 1);
}
int
do_clone(int sfd, int dfd)
{
fprintf(stderr, "using FICLONE\n");
int err = ioctl(dfd, CF_FICLONE, sfd);
if (err < 0) {
fprintf(stderr, "ioctl(FICLONE): %s\n", strerror(errno));
return (err);
}
return (0);
}
int
do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
{
fprintf(stderr, "using FICLONERANGE\n");
cf_file_clone_range_t fcr = {
.src_fd = sfd,
.src_offset = soff,
.src_length = len,
.dest_offset = doff,
};
int err = ioctl(dfd, CF_FICLONERANGE, &fcr);
if (err < 0) {
fprintf(stderr, "ioctl(FICLONERANGE): %s\n", strerror(errno));
return (err);
}
return (0);
}
int
do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
{
fprintf(stderr, "using copy_file_range\n");
ssize_t copied = cf_copy_file_range(sfd, &soff, dfd, &doff, len, 0);
if (copied < 0) {
fprintf(stderr, "copy_file_range: %s\n", strerror(errno));
return (1);
}
if (copied != (ssize_t)len) {
fprintf(stderr, "copy_file_range: copied less than requested: "
"requested=%zu; copied=%zd\n", len, copied);
return (1);
}
return (0);
}
int
do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len)
{
fprintf(stderr, "using FIDEDUPERANGE\n");
char buf[sizeof (cf_file_dedupe_range_t)+
sizeof (cf_file_dedupe_range_info_t)] = {0};
cf_file_dedupe_range_t *fdr = (cf_file_dedupe_range_t *)&buf[0];
cf_file_dedupe_range_info_t *fdri =
(cf_file_dedupe_range_info_t *)
&buf[sizeof (cf_file_dedupe_range_t)];
fdr->src_offset = soff;
fdr->src_length = len;
fdr->dest_count = 1;
fdri->dest_fd = dfd;
fdri->dest_offset = doff;
int err = ioctl(sfd, CF_FIDEDUPERANGE, fdr);
if (err != 0)
fprintf(stderr, "ioctl(FIDEDUPERANGE): %s\n", strerror(errno));
if (fdri->status < 0) {
fprintf(stderr, "dedup failed: %s\n", strerror(-fdri->status));
err = -1;
} else if (fdri->status == CF_FILE_DEDUPE_RANGE_DIFFERS) {
fprintf(stderr, "dedup failed: range differs\n");
err = -1;
}
return (err);
}

View File

@ -44,6 +44,7 @@
#include <fcntl.h>
#include <errno.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <time.h>
int

View File

@ -182,6 +182,7 @@ export ZFS_FILES='zdb
export ZFSTEST_FILES='badsend
btree_test
chg_usr_exec
clonefile
devname2devid
dir_rd_update
draid

View File

@ -90,6 +90,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \
functional/alloc_class/alloc_class.kshlib \
functional/atime/atime.cfg \
functional/atime/atime_common.kshlib \
functional/block_cloning/block_cloning.kshlib \
functional/cache/cache.cfg \
functional/cache/cache.kshlib \
functional/cachefile/cachefile.cfg \
@ -437,6 +438,19 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/atime/root_atime_on.ksh \
functional/atime/root_relatime_on.ksh \
functional/atime/setup.ksh \
functional/block_cloning/cleanup.ksh \
functional/block_cloning/setup.ksh \
functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
functional/block_cloning/block_cloning_copyfilerange.ksh \
functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \
functional/block_cloning/block_cloning_disabled_ficlone.ksh \
functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \
functional/block_cloning/block_cloning_ficlone.ksh \
functional/block_cloning/block_cloning_ficlonerange.ksh \
functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
functional/bootfs/bootfs_001_pos.ksh \
functional/bootfs/bootfs_002_neg.ksh \
functional/bootfs/bootfs_003_pos.ksh \

View File

@ -0,0 +1,54 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
function have_same_content
{
typeset hash1=$(cat $1 | md5sum)
typeset hash2=$(cat $2 | md5sum)
log_must [ "$hash1" = "$hash2" ]
}
#
# get_same_blocks dataset1 path/to/file1 dataset2 path/to/file2
#
# Returns a space-separated list of the indexes (starting at 0) of the L0
# blocks that are shared between both files (by first DVA and checksum).
# Assumes that the two files have the same content; use have_same_content
# to confirm that.
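# (In the zdb L0 lines matched below, awk field $3 is the first DVA and
# field $7 the checksum, so two files share a block when both match.)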
#
function get_same_blocks
{
typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$
zdb -vvvvv $1 -O $2 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a
zdb -vvvvv $3 -O $4 | \
awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b
echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ')
}

View File

@ -0,0 +1,60 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="The copy_file_range syscall can clone whole files."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4
log_must sync_pool $TESTPOOL
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2
typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2)
log_must [ "$blocks" = "0 1 2 3" ]
log_pass $claim

View File

@ -0,0 +1,65 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then
log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3"
fi
claim="The copy_file_range syscall can clone across datasets."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must zfs create $TESTPOOL/$TESTFS1
log_must zfs create $TESTPOOL/$TESTFS2
log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS1/file1 bs=128K count=4
log_must sync_pool $TESTPOOL
log_must \
clonefile -f /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/$TESTFS1/file1 /$TESTPOOL/$TESTFS2/file2
typeset blocks=$(get_same_blocks \
$TESTPOOL/$TESTFS1 file1 $TESTPOOL/$TESTFS2 file2)
log_must [ "$blocks" = "0 1 2 3" ]
log_pass $claim

View File

@ -0,0 +1,86 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning not possible."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must sync_pool $TESTPOOL
log_note "Copying entire file with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "0 1 2 3" ]
log_note "Copying within a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "1 2 3" ]
log_note "Copying across a block with copy_file_range"
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "1" ]
log_pass $claim

View File

@ -0,0 +1,66 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
# Copyright (c) 2023, Rob Norris <robn@despairlabs.com>
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="copy_file_range will fall back to copy when cloning on same txg"
log_assert $claim
typeset timeout=$(get_tunable TXG_TIMEOUT)
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
set_tunable64 TXG_TIMEOUT $timeout
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must set_tunable64 TXG_TIMEOUT 5000
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4
log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone
typeset blocks=$(get_same_blocks $TESTPOOL file $TESTPOOL clone)
log_must [ "$blocks" = "" ]
log_pass $claim

View File

@ -0,0 +1,68 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
verify_runnable "global"
if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then
log_unsupported "copy_file_range not available before Linux 4.5"
fi
claim="The copy_file_range syscall can clone parts of a file."
log_assert $claim
function cleanup
{
datasetexists $TESTPOOL && destroy_pool $TESTPOOL
}
log_onexit cleanup
log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS
log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=128K count=4
log_must sync_pool $TESTPOOL
log_must dd if=/$TESTPOOL/file1 of=/$TESTPOOL/file2 bs=128K count=4
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2
typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2)
log_must [ "$blocks" = "" ]
log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 131072 131072 262144
log_must sync_pool $TESTPOOL
log_must have_same_content /$TESTPOOL/file1 /$TESTPOOL/file2
typeset blocks=$(get_same_blocks $TESTPOOL file1 $TESTPOOL file2)
log_must [ "$blocks" = "1 2" ]
log_pass $claim

Some files were not shown because too many files have changed in this diff