Notable upstream pull request merges:
 #15469 cbe882298 Add slow disk diagnosis to ZED
 #15857 d0d273320 Update zfs-snapshot.8
 #15864 a5a725440 zfs list: add '-t fs' and '-t vol' options
 #15874 6cc93ccde BRT: Fix slop space calculation with block cloning
 #15882 a0635ae73 zdb: Fix false leak report for BRT objects

Obtained from:	OpenZFS
OpenZFS commit:	e0bd8118d0
Martin Matuska 2024-02-15 10:21:13 +01:00
commit e2257b3168
40 changed files with 747 additions and 158 deletions


@ -7952,6 +7952,17 @@ dump_mos_leaks(spa_t *spa)
}
}
if (spa->spa_brt != NULL) {
brt_t *brt = spa->spa_brt;
for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid];
if (brtvd != NULL && brtvd->bv_initiated) {
mos_obj_refd(brtvd->bv_mos_brtvdev);
mos_obj_refd(brtvd->bv_mos_entries);
}
}
}
/*
* Visit all allocated objects and make sure they are referenced.
*/


@ -22,6 +22,7 @@
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
*
* Copyright (c) 2016, Intel Corporation.
* Copyright (c) 2023, Klara Inc.
*/
/*
@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name)
if (strcmp(name, "spare_on_remove") == 0)
return (1);
if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0)
return (10); /* N = 10 events */
return (0);
}
int64_t
fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name)
{
(void) hdl;
/*
* These can be looked up in mp->modinfo->fmdi_props
* For now we just hard code for phase 2. In the
* future, there can be a ZED based override.
*/
if (strcmp(name, "remove_timeout") == 0)
return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */
if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0)
return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */
return (0);
}
@ -535,6 +514,19 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name)
return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL);
}
int
fmd_serd_active(fmd_hdl_t *hdl, const char *name)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
return (0);
}
return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp));
}
void
fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
{
@ -543,12 +535,10 @@ fmd_serd_reset(fmd_hdl_t *hdl, const char *name)
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name);
return;
} else {
fmd_serd_eng_reset(sgp);
fmd_hdl_debug(hdl, "serd_reset %s", name);
}
fmd_serd_eng_reset(sgp);
fmd_hdl_debug(hdl, "serd_reset %s", name);
}
int
@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_eng_t *sgp;
int err;
if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) {
zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'",
name);
return (0);
}
err = fmd_serd_eng_record(sgp, ep->ev_hrt);
return (fmd_serd_eng_record(sgp, ep->ev_hrt));
}
return (err);
void
fmd_serd_gc(fmd_hdl_t *hdl)
{
fmd_module_t *mp = (fmd_module_t *)hdl;
fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL);
}
/* FMD Timers */
@ -579,7 +574,7 @@ _timer_notify(union sigval sv)
const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
struct itimerspec its;
fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid);
fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid);
/* disarm the timer */
memset(&its, 0, sizeof (struct itimerspec));


@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list);
extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...);
extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *);
extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *);
#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */
#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */
@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *);
extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t);
extern void fmd_serd_destroy(fmd_hdl_t *, const char *);
extern int fmd_serd_exists(fmd_hdl_t *, const char *);
extern int fmd_serd_active(fmd_hdl_t *, const char *);
extern void fmd_serd_reset(fmd_hdl_t *, const char *);
extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *);
extern int fmd_serd_fired(fmd_hdl_t *, const char *);
extern int fmd_serd_empty(fmd_hdl_t *, const char *);
extern void fmd_serd_gc(fmd_hdl_t *);
extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t);
extern void fmd_timer_remove(fmd_hdl_t *, id_t);


@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp)
}
void
fmd_serd_eng_gc(fmd_serd_eng_t *sgp)
fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg)
{
(void) arg;
fmd_serd_elem_t *sep, *nep;
hrtime_t hrt;


@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *);
extern int fmd_serd_eng_empty(fmd_serd_eng_t *);
extern void fmd_serd_eng_reset(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *);
extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *);
#ifdef __cplusplus
}


@ -23,6 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, Intel Corporation.
* Copyright (c) 2023, Klara Inc.
*/
#include <stddef.h>
@ -47,11 +48,16 @@
#define DEFAULT_CHECKSUM_T 600 /* seconds */
#define DEFAULT_IO_N 10 /* events */
#define DEFAULT_IO_T 600 /* seconds */
#define DEFAULT_SLOW_IO_N 10 /* events */
#define DEFAULT_SLOW_IO_T 30 /* seconds */
#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */
/*
* Our serd engines are named 'zfs_<pool_guid>_<vdev_guid>_{checksum,io}'. This
* #define reserves enough space for two 64-bit hex values plus the length of
* the longest string.
* Our serd engines are named in the following format:
* 'zfs_<pool_guid>_<vdev_guid>_{checksum,io,slow_io}'
* This #define reserves enough space for two 64-bit hex values plus the
* length of the longest string.
*/
#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum"))
@ -68,6 +74,7 @@ typedef struct zfs_case_data {
int zc_pool_state;
char zc_serd_checksum[MAX_SERDLEN];
char zc_serd_io[MAX_SERDLEN];
char zc_serd_slow_io[MAX_SERDLEN];
int zc_has_remove_timer;
} zfs_case_data_t;
@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = {
{ "resource_drops", FMD_TYPE_UINT64, "resource related ereports" }
};
static hrtime_t zfs_remove_timeout;
/* wait 15 seconds after a removal */
static hrtime_t zfs_remove_timeout = SEC2NSEC(15);
uu_list_pool_t *zfs_case_pool;
uu_list_t *zfs_cases;
@ -124,6 +132,8 @@ uu_list_t *zfs_cases;
#define ZFS_MAKE_EREPORT(type) \
FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type
static void zfs_purge_cases(fmd_hdl_t *hdl);
/*
* Write out the persistent representation of an active case.
*/
@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp)
return (zcp);
}
/*
* count other unique slow-io cases in a pool
*/
static uint_t
zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case)
{
zfs_case_t *zcp;
uint_t cases = 0;
static hrtime_t next_check = 0;
/*
* Note that plumbing in some external GC would require adding locking,
* since most of this module code is not thread safe and assumes there
* is only one thread running against the module. So we perform GC here
* inline periodically so that future delay induced faults will be
* possible once the issue causing multiple vdev delays is resolved.
*/
if (gethrestime_sec() > next_check) {
/* Periodically purge old SERD entries and stale cases */
fmd_serd_gc(hdl);
zfs_purge_cases(hdl);
next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS;
}
for (zcp = uu_list_first(zfs_cases); zcp != NULL;
zcp = uu_list_next(zfs_cases, zcp)) {
if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid &&
zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid &&
zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) {
cases++;
}
}
return (cases);
}
/*
* Iterate over any active cases. If any cases are associated with a pool or
* vdev which is no longer present on the system, close the associated case.
@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid,
(long long unsigned int)vdev_guid, type);
}
static void
zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp)
{
fmd_hdl_debug(hdl, "retiring case");
fmd_case_close(hdl, zcp->zc_case);
}
/*
* Solve a given ZFS case. This first checks to make sure the diagnosis is
* still valid, as well as cleaning up any pending timer associated with the
@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (strcmp(class,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 ||
strcmp(class,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 ||
strcmp(class,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) {
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) {
zfs_stats.resource_drops.fmds_value.ui64++;
return;
}
@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (zcp->zc_data.zc_serd_checksum[0] != '\0')
fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
fmd_serd_reset(hdl,
zcp->zc_data.zc_serd_slow_io);
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) {
uint64_t state = 0;
@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
if (fmd_case_solved(hdl, zcp->zc_case))
return;
fmd_hdl_debug(hdl, "error event '%s'", class);
if (vdev_guid)
fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class,
vdev_guid);
else
fmd_hdl_debug(hdl, "error event '%s'", class);
/*
* Determine if we should solve the case and generate a fault. We solve
@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) ||
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) ||
fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) {
const char *failmode = NULL;
boolean_t checkremove = B_FALSE;
@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
}
if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep))
checkremove = B_TRUE;
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) {
uint64_t slow_io_n, slow_io_t;
/*
* Create a slow io SERD engine when the VDEV has the
* 'vdev_slow_io_n' and 'vdev_slow_io_t' properties.
*/
if (zcp->zc_data.zc_serd_slow_io[0] == '\0' &&
nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
&slow_io_n) == 0 &&
nvlist_lookup_uint64(nvl,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
&slow_io_t) == 0) {
zfs_serd_name(zcp->zc_data.zc_serd_slow_io,
pool_guid, vdev_guid, "slow_io");
fmd_serd_create(hdl,
zcp->zc_data.zc_serd_slow_io,
slow_io_n,
SEC2NSEC(slow_io_t));
zfs_case_serialize(zcp);
}
/* Pass event to SERD engine and see if this triggers */
if (zcp->zc_data.zc_serd_slow_io[0] != '\0' &&
fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io,
ep)) {
/*
* Ignore a slow io diagnosis when other
* VDEVs in the pool show signs of being slow.
*/
if (zfs_other_slow_cases(hdl, &zcp->zc_data)) {
zfs_case_retire(hdl, zcp);
fmd_hdl_debug(hdl, "pool %llu has "
"multiple slow io cases -- skip "
"degrading vdev %llu",
(u_longlong_t)
zcp->zc_data.zc_pool_guid,
(u_longlong_t)
zcp->zc_data.zc_vdev_guid);
} else {
zfs_case_solve(hdl, zcp,
"fault.fs.zfs.vdev.slow_io");
}
}
} else if (fmd_nvl_class_match(hdl, nvl,
ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) {
/*
@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum);
if (zcp->zc_data.zc_serd_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io);
if (zcp->zc_data.zc_serd_slow_io[0] != '\0')
fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io);
if (zcp->zc_data.zc_has_remove_timer)
fmd_timer_remove(hdl, zcp->zc_remove_timer);
@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs)
fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t));
}
/*
* We use the fmd gc entry point to look for old cases that no longer apply.
* This allows us to keep our set of case data small in a long running system.
*/
static void
zfs_fm_gc(fmd_hdl_t *hdl)
{
zfs_purge_cases(hdl);
}
static const fmd_hdl_ops_t fmd_ops = {
zfs_fm_recv, /* fmdo_recv */
zfs_fm_timeout, /* fmdo_timeout */
zfs_fm_close, /* fmdo_close */
NULL, /* fmdo_stats */
zfs_fm_gc, /* fmdo_gc */
NULL, /* fmdo_gc */
};
static const fmd_prop_t fmd_props[] = {
{ "checksum_N", FMD_TYPE_UINT32, "10" },
{ "checksum_T", FMD_TYPE_TIME, "10min" },
{ "io_N", FMD_TYPE_UINT32, "10" },
{ "io_T", FMD_TYPE_TIME, "10min" },
{ "remove_timeout", FMD_TYPE_TIME, "15sec" },
{ NULL, 0, NULL }
};
@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl)
(void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) /
sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats);
zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout");
}
void


@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
} else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.vdev.checksum")) {
degrade_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.vdev.slow_io")) {
degrade_device = B_TRUE;
} else if (fmd_nvl_class_match(hdl, fault,
"fault.fs.zfs.device")) {
fault_device = B_FALSE;


@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv)
for (char *tok; (tok = strsep(&optarg, ",")); ) {
static const char *const type_subopts[] = {
"filesystem", "volume",
"snapshot", "snap",
"filesystem",
"fs",
"volume",
"vol",
"snapshot",
"snap",
"bookmark",
"all" };
"all"
};
static const int type_types[] = {
ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME,
ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT,
ZFS_TYPE_FILESYSTEM,
ZFS_TYPE_FILESYSTEM,
ZFS_TYPE_VOLUME,
ZFS_TYPE_VOLUME,
ZFS_TYPE_SNAPSHOT,
ZFS_TYPE_SNAPSHOT,
ZFS_TYPE_BOOKMARK,
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK };
ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK
};
for (c = 0; c < ARRAY_SIZE(type_subopts); ++c)
if (strcmp(tok, type_subopts[c]) == 0) {


@ -1083,6 +1083,22 @@ main(int argc, char **argv)
libzfs_fini(g_zfs);
return (1);
}
if (record.zi_nlanes) {
switch (io_type) {
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
case ZIO_TYPES:
break;
default:
(void) fprintf(stderr, "I/O type for a delay "
"must be 'read' or 'write'\n");
usage();
libzfs_fini(g_zfs);
return (1);
}
}
if (!error)
error = ENXIO;


@ -2569,7 +2569,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
break;
case VDEV_AUX_ERR_EXCEEDED:
(void) printf(gettext("too many errors"));
if (vs->vs_read_errors + vs->vs_write_errors +
vs->vs_checksum_errors == 0 && children == 0 &&
vs->vs_slow_ios > 0) {
(void) printf(gettext("too many slow I/Os"));
} else {
(void) printf(gettext("too many errors"));
}
break;
case VDEV_AUX_IO_FAILURE:


@ -104,7 +104,7 @@ typedef struct taskq {
/* list node for the cpu hotplug callback */
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
unsigned long lastshouldstop; /* when to purge dynamic */
unsigned long lastspawnstop; /* when to purge dynamic */
} taskq_t;
typedef struct taskq_ent {


@ -82,6 +82,8 @@ extern "C" {
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t"
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"


@ -366,6 +366,8 @@ typedef enum {
VDEV_PROP_IO_N,
VDEV_PROP_IO_T,
VDEV_PROP_RAIDZ_EXPANDING,
VDEV_PROP_SLOW_IO_N,
VDEV_PROP_SLOW_IO_T,
VDEV_NUM_PROPS
} vdev_prop_t;


@ -22,6 +22,7 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2020 by Delphix. All rights reserved.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2023, Klara Inc.
*/
#ifndef _SYS_VDEV_IMPL_H
@ -454,12 +455,14 @@ struct vdev {
zfs_ratelimit_t vdev_checksum_rl;
/*
* Checksum and IO thresholds for tuning ZED
* Vdev properties for tuning ZED
*/
uint64_t vdev_checksum_n;
uint64_t vdev_checksum_t;
uint64_t vdev_io_n;
uint64_t vdev_io_t;
uint64_t vdev_slow_io_n;
uint64_t vdev_slow_io_t;
};
#define VDEV_PAD_SIZE (8 << 10)


@ -5626,7 +5626,9 @@
<enumerator name='VDEV_PROP_IO_N' value='44'/>
<enumerator name='VDEV_PROP_IO_T' value='45'/>
<enumerator name='VDEV_PROP_RAIDZ_EXPANDING' value='46'/>
<enumerator name='VDEV_NUM_PROPS' value='47'/>
<enumerator name='VDEV_PROP_SLOW_IO_N' value='47'/>
<enumerator name='VDEV_PROP_SLOW_IO_T' value='48'/>
<enumerator name='VDEV_NUM_PROPS' value='49'/>
</enum-decl>
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>


@ -5264,6 +5264,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name,
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {


@ -1704,7 +1704,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop,
(prop == VDEV_PROP_CHECKSUM_N ||
prop == VDEV_PROP_CHECKSUM_T ||
prop == VDEV_PROP_IO_N ||
prop == VDEV_PROP_IO_T)) {
prop == VDEV_PROP_IO_T ||
prop == VDEV_PROP_SLOW_IO_N ||
prop == VDEV_PROP_SLOW_IO_T)) {
*ivalp = UINT64_MAX;
}


@ -186,18 +186,8 @@ reading it could cause a lock-up if the list grows too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
(Linux-only)
How long a taskq has to have had no work before we tear it down.
Previously, we would tear down a dynamic taskq worker as soon
as we noticed it had no work, but it was observed that this led
to a lot of churn in tearing down things we then immediately
spawned anew.
In practice, it seems any nonzero value will remove the vast
majority of this churn, while the nontrivially larger value
was chosen to help filter out the little remaining churn on
a mostly idle system.
Setting this value to
.Sy 0
will revert to the previous behavior.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint
Minimum idle threads exit interval for dynamic taskqs.
Smaller values allow idle threads to exit more often and potentially be
respawned again on demand, causing more churn.
.El
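
As a rough illustration of how this tunable is exercised at runtime (a sketch only; it assumes a Linux system where the spl module exposes its parameters under /sys/module/spl/parameters/, a path not shown in this diff):

    # read the current idle-thread exit interval, in milliseconds
    cat /sys/module/spl/parameters/spl_taskq_thread_timeout_ms

    # let idle dynamic-taskq threads linger for 10 seconds before exiting
    echo 10000 > /sys/module/spl/parameters/spl_taskq_thread_timeout_ms

The same value can also be set as a module option (e.g. "options spl spl_taskq_thread_timeout_ms=10000" in a modprobe configuration file) so it survives reboots.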


@ -44,7 +44,7 @@ section, below.
Every vdev has a set of properties that export statistics about the vdev
as well as control various behaviors.
Properties are not inherited from top-level vdevs, with the exception of
checksum_n, checksum_t, io_n, and io_t.
checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t.
.Pp
The values of numeric properties can be specified using human-readable suffixes
.Po for example,
@ -117,7 +117,7 @@ If this device is currently being removed from the pool
.Pp
The following native properties can be used to change the behavior of a vdev.
.Bl -tag -width "allocating"
.It Sy checksum_n , checksum_t , io_n , io_t
.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t
Tune the fault management daemon by specifying checksum/io thresholds of <N>
errors in <T> seconds, respectively.
These properties can be set on leaf and top-level vdevs.
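
For illustration, the new slow-I/O thresholds are used like the existing checksum_n/checksum_t and io_n/io_t vdev properties; the pool and device names below are hypothetical:

    # degrade a leaf vdev after 5 slow I/Os observed within 60 seconds
    zpool set slow_io_n=5 tank da0
    zpool set slow_io_t=60 tank da0

    # inspect the current values
    zpool get slow_io_n,slow_io_t tank da0

When the properties are left at their defaults they report "-", matching the UINT64_MAX handling added to zpool_get_vdev_prop_value() above.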


@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning.
The underlying conditions are as follows:
.Bl -bullet -compact
.It
The number of checksum errors exceeds acceptable levels and the device is
degraded as an indication that something may be wrong.
The number of checksum errors or slow I/Os exceeds acceptable levels and the
device is degraded as an indication that something may be wrong.
ZFS continues to use the device as necessary.
.It
The number of I/O errors exceeds acceptable levels.


@ -29,7 +29,7 @@
.\" Copyright 2018 Nexenta Systems, Inc.
.\" Copyright 2019 Joyent, Inc.
.\"
.Dd March 16, 2022
.Dd February 8, 2024
.Dt ZFS-LIST 8
.Os
.
@ -155,6 +155,15 @@ or
For example, specifying
.Fl t Sy snapshot
displays only snapshots.
.Sy fs ,
.Sy snap ,
or
.Sy vol
can be used as aliases for
.Sy filesystem ,
.Sy snapshot ,
or
.Sy volume .
.El
.
.Sh EXAMPLES
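
A brief usage sketch of the type aliases documented above (dataset names hypothetical):

    # equivalent ways to list only filesystems and volumes
    zfs list -t filesystem,volume
    zfs list -t fs,vol

    # 'snap' remains an alias for 'snapshot'
    zfs list -t snap tank/home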


@ -44,13 +44,21 @@
.Ar dataset Ns @ Ns Ar snapname Ns
.
.Sh DESCRIPTION
All previous modifications by successful system calls to the file system are
part of the snapshots.
Snapshots are taken atomically, so that all snapshots correspond to the same
moment in time.
Creates a snapshot of a dataset or multiple snapshots of different
datasets.
.Pp
Snapshots are created atomically.
That is, a snapshot is a consistent image of a dataset at a specific
point in time; it includes all modifications to the dataset made by
system calls that have successfully completed before that point in time.
Recursive snapshots created through the
.Fl r
option are all created at the same time.
.Pp
.Nm zfs Cm snap
can be used as an alias for
.Nm zfs Cm snapshot .
.Pp
See the
.Sx Snapshots
section of
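
For example (dataset and snapshot names hypothetical), the alias and the recursive form described above can be combined:

    # snapshot a dataset and all of its descendants atomically
    zfs snap -r tank/home@before-upgrade

    # long form, equivalent
    zfs snapshot -r tank/home@before-upgrade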


@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state.
.Nm zinject
.Fl d Ar vdev
.Fl D Ar latency : Ns Ar lanes
.Op Fl T Ar read|write
.Ar pool
.Xc
Add an artificial delay to I/O requests on a particular
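
A short sketch of the extended delay-injection syntax (pool and device names hypothetical):

    # add a 10 ms delay with one lane, applied only to read I/O on vdev da0
    zinject -d da0 -D 10:1 -T read tank

    # clear all injection handlers when finished
    zinject -c all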


@ -36,12 +36,12 @@ static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
static uint_t spl_taskq_thread_timeout_ms = 10000;
static uint_t spl_taskq_thread_timeout_ms = 5000;
/* BEGIN CSTYLED */
module_param(spl_taskq_thread_timeout_ms, uint, 0644);
/* END CSTYLED */
MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
"Time to require a dynamic thread be idle before it gets cleaned up");
"Minimum idle threads exit interval for dynamic taskqs");
static int spl_taskq_thread_dynamic = 1;
module_param(spl_taskq_thread_dynamic, int, 0444);
@ -594,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
ASSERT(tq->tq_nactive <= tq->tq_nthreads);
if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
/* Dynamic taskq may be able to spawn another thread */
if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
taskq_thread_spawn(tq) == 0)
if (taskq_thread_spawn(tq) == 0)
goto out;
}
@ -629,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
spin_unlock(&t->tqent_lock);
wake_up(&tq->tq_work_waitq);
out:
/* Spawn additional taskq threads if required. */
if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
return (rc);
}
@ -676,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
spin_unlock(&t->tqent_lock);
out:
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
return (rc);
}
@ -704,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
/* Dynamic taskq may be able to spawn another thread */
if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
taskq_thread_spawn(tq) == 0)
goto out2;
if (taskq_thread_spawn(tq) == 0)
goto out;
flags |= TQ_FRONT;
}
@ -742,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
spin_unlock(&t->tqent_lock);
wake_up(&tq->tq_work_waitq);
out:
/* Spawn additional taskq threads if required. */
if (tq->tq_nactive == tq->tq_nthreads)
(void) taskq_thread_spawn(tq);
out2:
out:
spin_unlock_irqrestore(&tq->tq_lock, irqflags);
}
EXPORT_SYMBOL(taskq_dispatch_ent);
@ -825,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq)
if (!(tq->tq_flags & TASKQ_DYNAMIC))
return (0);
tq->lastspawnstop = jiffies;
if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
(tq->tq_flags & TASKQ_ACTIVE)) {
spawning = (++tq->tq_nspawn);
@ -836,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq)
}
/*
* Threads in a dynamic taskq should only exit once it has been completely
* drained and no other threads are actively servicing tasks. This prevents
* threads from being created and destroyed more than is required.
* Threads in a dynamic taskq may exit once there is no more work to do.
* To prevent threads from being created and destroyed too often, limit
* the exit rate to one per spl_taskq_thread_timeout_ms.
*
* The first thread in the thread list is treated as the primary thread.
* There is nothing special about the primary thread but in order to avoid
@ -847,44 +847,22 @@ taskq_thread_spawn(taskq_t *tq)
static int
taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
{
if (!(tq->tq_flags & TASKQ_DYNAMIC))
ASSERT(!taskq_next_ent(tq));
if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic)
return (0);
if (!(tq->tq_flags & TASKQ_ACTIVE))
return (1);
if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
tqt_thread_list) == tqt)
return (0);
int no_work =
((tq->tq_nspawn == 0) && /* No threads are being spawned */
(tq->tq_nactive == 0) && /* No threads are handling tasks */
(tq->tq_nthreads > 1) && /* More than 1 thread is running */
(!taskq_next_ent(tq)) && /* There are no pending tasks */
(spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
/*
* If we would have said stop before, let's instead wait a bit, maybe
* we'll see more work come our way soon...
*/
if (no_work) {
/* if it's 0, we want the old behavior. */
/* if the taskq is being torn down, we also want to go away. */
if (spl_taskq_thread_timeout_ms == 0 ||
!(tq->tq_flags & TASKQ_ACTIVE))
return (1);
unsigned long lasttime = tq->lastshouldstop;
if (lasttime > 0) {
if (time_after(jiffies, lasttime +
msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
return (1);
else
return (0);
} else {
tq->lastshouldstop = jiffies;
}
} else {
tq->lastshouldstop = 0;
}
return (0);
ASSERT3U(tq->tq_nthreads, >, 1);
if (tq->tq_nspawn != 0)
return (0);
if (time_before(jiffies, tq->lastspawnstop +
msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
return (0);
tq->lastspawnstop = jiffies;
return (1);
}
static int
@ -935,10 +913,8 @@ taskq_thread(void *args)
if (list_empty(&tq->tq_pend_list) &&
list_empty(&tq->tq_prio_list)) {
if (taskq_thread_should_stop(tq, tqt)) {
wake_up_all(&tq->tq_wait_waitq);
if (taskq_thread_should_stop(tq, tqt))
break;
}
add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
spin_unlock_irqrestore(&tq->tq_lock, flags);
@ -1013,9 +989,6 @@ taskq_thread(void *args)
tqt->tqt_id = TASKQID_INVALID;
tqt->tqt_flags = 0;
wake_up_all(&tq->tq_wait_waitq);
} else {
if (taskq_thread_should_stop(tq, tqt))
break;
}
set_current_state(TASK_INTERRUPTIBLE);
@ -1122,7 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
tq->tq_flags = (flags | TASKQ_ACTIVE);
tq->tq_next_id = TASKQID_INITIAL;
tq->tq_lowest_id = TASKQID_INITIAL;
tq->lastshouldstop = 0;
tq->lastspawnstop = jiffies;
INIT_LIST_HEAD(&tq->tq_free_list);
INIT_LIST_HEAD(&tq->tq_pend_list);
INIT_LIST_HEAD(&tq->tq_prio_list);


@ -431,6 +431,12 @@ vdev_prop_init(void)
zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "IO_T", B_FALSE,
sfeatures);
zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<events>", "SLOW_IO_N", B_FALSE,
sfeatures);
zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX,
PROP_DEFAULT, ZFS_TYPE_VDEV, "<seconds>", "SLOW_IO_T", B_FALSE,
sfeatures);
/* default index (boolean) properties */
zprop_register_index(VDEV_PROP_REMOVING, "removing", 0,


@ -1837,7 +1837,8 @@ spa_get_slop_space(spa_t *spa)
* deduplicated data, so since it's not useful to reserve more
* space with more deduplicated data, we subtract that out here.
*/
space = spa_get_dspace(spa) - spa->spa_dedup_dspace;
space =
spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa);
slop = MIN(space >> spa_slop_shift, spa_max_slop);
/*


@ -677,6 +677,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
list_link_init(&vd->vdev_config_dirty_node);
list_link_init(&vd->vdev_state_dirty_node);
@ -3755,6 +3757,18 @@ vdev_load(vdev_t *vd)
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
&vd->vdev_slow_io_n);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
&vd->vdev_slow_io_t);
if (error && error != ENOENT)
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
"failed [error=%d]", (u_longlong_t)zapobj, error);
}
/*
@ -5970,6 +5984,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
}
vd->vdev_io_t = intval;
break;
case VDEV_PROP_SLOW_IO_N:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_slow_io_n = intval;
break;
case VDEV_PROP_SLOW_IO_T:
if (nvpair_value_uint64(elem, &intval) != 0) {
error = EINVAL;
break;
}
vd->vdev_slow_io_t = intval;
break;
default:
/* Most processing is done in vdev_props_set_sync */
break;
@ -6313,6 +6341,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
case VDEV_PROP_CHECKSUM_T:
case VDEV_PROP_IO_N:
case VDEV_PROP_IO_T:
case VDEV_PROP_SLOW_IO_N:
case VDEV_PROP_SLOW_IO_T:
err = vdev_prop_get_int(vd, prop, &intval);
if (err && err != ENOENT)
break;


@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
case VDEV_PROP_IO_T:
propval = vd->vdev_io_t;
break;
case VDEV_PROP_SLOW_IO_N:
propval = vd->vdev_slow_io_n;
break;
case VDEV_PROP_SLOW_IO_T:
propval = vd->vdev_slow_io_t;
break;
default:
propval = propdef;
break;
@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
NULL);
}
if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
uint64_t slow_io_n, slow_io_t;
slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
DATA_TYPE_UINT64,
slow_io_n,
NULL);
slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
DATA_TYPE_UINT64,
slow_io_t,
NULL);
}
mutex_exit(&spa->spa_errlist_lock);
*ereport_out = ereport;


@ -605,6 +605,10 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
/*
* Defensive; should never happen as the array allocation
* occurs prior to inserting this handler on the list.


@ -104,7 +104,8 @@ tags = ['functional', 'devices']
[tests/functional/events:Linux]
tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill',
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config']
'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config',
'zed_slow_io', 'zed_slow_io_many_vdevs']
tags = ['functional', 'events']
[tests/functional/fadvise:Linux]


@ -80,7 +80,7 @@ export TESTPOOL=testpool
export TESTPOOL1=testpool1
export TESTPOOL2=testpool2
export TESTPOOL3=testpool3
export PERFPOOL=perfpool
export PERFPOOL=${PERFPOOL:-perfpool}
# some test file system names
export TESTFS=testfs


@ -1447,6 +1447,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/events/zed_fd_spill.ksh \
functional/events/zed_io_config.ksh \
functional/events/zed_rc_filter.ksh \
functional/events/zed_slow_io.ksh \
functional/events/zed_slow_io_many_vdevs.ksh \
functional/exec/cleanup.ksh \
functional/exec/exec_001_pos.ksh \
functional/exec/exec_002_neg.ksh \


@ -70,4 +70,6 @@ typeset -a properties=(
checksum_t
io_n
io_t
slow_io_n
slow_io_t
)


@ -26,8 +26,10 @@
. $STF_SUITE/include/libtest.shlib
zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
zed_stop
zed_cleanup all-debug.sh all-syslog.sh all-dumpfds
zed_events_drain
default_cleanup


@ -0,0 +1,205 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
# DESCRIPTION:
# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED.
#
# STRATEGY:
# 1. Create a pool with single vdev
# 2. Set slow_io_n/slow_io_t to non-default values
# 3. Inject slow io errors
# 4. Verify that ZED degrades vdev
#
. $STF_SUITE/include/libtest.shlib
TESTDIR="$TEST_BASE_DIR/zed_slow_io"
VDEV="$TEST_BASE_DIR/vdevfile.$$"
TESTPOOL="slow_io_pool"
FILEPATH="$TESTDIR/slow_io.testfile"
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
verify_runnable "both"
function do_setup
{
log_must truncate -s 1G $VDEV
default_setup_noexit $VDEV
zed_events_drain
log_must zfs set compression=off $TESTPOOL
log_must zfs set primarycache=none $TESTPOOL
log_must zfs set prefetch=none $TESTPOOL
log_must zfs set recordsize=512 $TESTPOOL
for i in {1..10}; do
dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null
done
zpool sync
}
# intermediate cleanup
function do_clean
{
log_must zinject -c all
log_must zpool destroy $TESTPOOL
log_must rm -f $VDEV
}
# final cleanup
function cleanup
{
log_must zinject -c all
# if pool still exists then something failed so log additional info
if poolexists $TESTPOOL ; then
log_note "$(zpool status -s $TESTPOOL)"
echo "=================== zed log search ==================="
grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
destroy_pool $TESTPOOL
fi
log_must zed_stop
log_must rm -f $VDEV
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
}
function start_slow_io
{
zpool sync
log_must set_tunable64 ZIO_SLOW_IO_MS 10
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL
zpool sync
}
function stop_slow_io
{
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
log_must zinject -c all
}
# Test default ZED settings:
# inject 10 events over 2.5 seconds, should not degrade.
function default_degrade
{
do_setup
start_slow_io
for i in {1..10}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.25
done
stop_slow_io
log_note "$(zpool status -s $TESTPOOL)"
# give slow ZED a chance to process the delay events
sleep 18
log_note "$(zpool status -s $TESTPOOL)"
degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
log_note $degrades vdev degrades in ZED log
[ $degrades -eq "0" ] || \
log_fail "expecting no degrade events, found $degrades"
do_clean
}
# change slow_io_n, slow_io_t to 5 events in 60 seconds
# fire more than 5 events, should degrade
function slow_io_degrade
{
do_setup
zpool set slow_io_n=5 $TESTPOOL $VDEV
zpool set slow_io_t=60 $TESTPOOL $VDEV
start_slow_io
for i in {1..16}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.5
done
stop_slow_io
zpool sync
#
# wait up to 60 seconds for kernel to produce at least 5 delay events
#
typeset -i i=0
typeset -i events=0
while [[ $i -lt 60 ]]; do
events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
[[ $events -ge "5" ]] && break
i=$((i+1))
sleep 1
done
log_note "$events delay events found"
if [[ $events -ge "5" ]]; then
log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10
fi
do_clean
}
# change slow_io_n, slow_io_t to 10 events in 1 second
# inject events spaced 0.5 seconds apart, should not degrade
function slow_io_no_degrade
{
do_setup
zpool set slow_io_n=10 $TESTPOOL $VDEV
zpool set slow_io_t=1 $TESTPOOL $VDEV
start_slow_io
for i in {1..16}; do
dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null
sleep 0.5
done
stop_slow_io
zpool sync
log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45
do_clean
}
log_assert "Test ZED slow io configurability"
log_onexit cleanup
log_must zed_events_drain
log_must zed_start
default_degrade
slow_io_degrade
slow_io_no_degrade
log_pass "Test ZED slow io configurability"


@ -0,0 +1,177 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2023, Klara Inc.
#
# DESCRIPTION:
# Verify that delay events from multiple vdevs don't degrade any single vdev
#
# STRATEGY:
# 1. Create a pool with a 3 disk raidz vdev
# 2. Inject slow io errors
# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs
#
. $STF_SUITE/include/libtest.shlib
TESTDIR="$TEST_BASE_DIR/zed_slow_io"
VDEV1="$TEST_BASE_DIR/vdevfile1.$$"
VDEV2="$TEST_BASE_DIR/vdevfile2.$$"
VDEV3="$TEST_BASE_DIR/vdevfile3.$$"
VDEV4="$TEST_BASE_DIR/vdevfile4.$$"
VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4"
TESTPOOL="slow_io_pool"
FILEPATH="$TESTDIR/slow_io.testfile"
OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS)
OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND)
verify_runnable "both"
function cleanup
{
log_must zinject -c all
# if pool still exists then something failed so log additional info
if poolexists $TESTPOOL ; then
log_note "$(zpool status -s $TESTPOOL)"
echo "=================== zed log search ==================="
grep "Diagnosis Engine" $ZEDLET_DIR/zed.log
destroy_pool $TESTPOOL
fi
log_must zed_stop
log_must rm -f $VDEVS
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
}
function start_slow_io
{
for vdev in $VDEVS
do
log_must zpool set slow_io_n=4 $TESTPOOL $vdev
log_must zpool set slow_io_t=60 $TESTPOOL $vdev
done
zpool sync
log_must set_tunable64 ZIO_SLOW_IO_MS 10
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000
for vdev in $VDEVS
do
log_must zinject -d $vdev -D10:1 $TESTPOOL
done
zpool sync
}
function stop_slow_io
{
log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO
log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS
log_must zinject -c all
}
function multiple_slow_vdevs_test
{
log_must truncate -s 1G $VDEVS
default_raidz_setup_noexit $VDEVS
log_must zpool events -c
log_must zfs set compression=off $TESTPOOL
log_must zfs set primarycache=none $TESTPOOL
log_must zfs set recordsize=4K $TESTPOOL
log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20
zpool sync
#
# Read the file with slow io injected on the disks
# This will cause multiple errors on each disk to trip ZED SERD
#
# pool: slow_io_pool
# state: ONLINE
# config:
#
# NAME STATE READ WRITE CKSUM SLOW
# slow_io_pool ONLINE 0 0 0 -
# raidz1-0 ONLINE 0 0 0 -
# /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113
# /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109
# /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96
# /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109
#
start_slow_io
dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null
stop_slow_io
# count events available for processing
typeset -i i=0
typeset -i events=0
while [[ $i -lt 60 ]]; do
events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l)
[[ $events -ge "50" ]] && break
i=$((i+1))
sleep 1
done
log_note "$events delay events found"
if [[ $events -lt "50" ]]; then
log_note "bailing: not enough events to complete the test"
destroy_pool $TESTPOOL
return
fi
#
# give slow ZED a chance to process the delay events
#
typeset -i i=0
typeset -i skips=0
while [[ $i -lt 75 ]]; do
skips=$(grep "retiring case" \
$ZEDLET_DIR/zed.log | wc -l)
[[ $skips -gt "0" ]] && break
i=$((i+1))
sleep 1
done
log_note $skips degrade skips in ZED log after $i seconds
[ $skips -gt "0" ] || log_fail "expecting to see skips"
degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l)
log_note $degrades vdev degrades in ZED log
[ $degrades -eq "0" ] || \
log_fail "expecting no degrade events, found $degrades"
destroy_pool $TESTPOOL
}
log_assert "Test ZED slow io across multiple vdevs"
log_onexit cleanup
log_must zed_events_drain
log_must zed_start
multiple_slow_vdevs_test
log_pass "Test ZED slow io across multiple vdevs"


@ -32,5 +32,6 @@ cleanup_devices $DISKS
zed_stop
zed_cleanup resilver_finish-start-scrub.sh
zed_events_drain
log_pass


@ -28,6 +28,7 @@
verify_runnable "global"
zed_events_drain
zed_setup resilver_finish-start-scrub.sh
zed_start


@ -162,6 +162,9 @@
/* blkdev_issue_discard() is available */
/* #undef HAVE_BLKDEV_ISSUE_DISCARD */
/* __blkdev_issue_discard() is available */
/* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC */
/* blkdev_issue_secure_erase() is available */
/* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */
@ -1152,7 +1155,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.2.99-338-FreeBSD_g229b9f4ed"
#define ZFS_META_ALIAS "zfs-2.2.99-345-FreeBSD_ge0bd8118d"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -1182,7 +1185,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "338-FreeBSD_g229b9f4ed"
#define ZFS_META_RELEASE "345-FreeBSD_ge0bd8118d"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"


@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.2.99-338-g229b9f4ed"
#define ZFS_META_GITREV "zfs-2.2.99-345-ge0bd8118d"