vdev probe to slow disk can stall mmp write checker

Simplify vdev probes in the zio_vdev_io_done context to
avoid holding the spa config lock for a long duration.

Also allow zpool clear if no evidence of another host
is using the pool.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Reviewed-by: Olaf Faaland <faaland1@llnl.gov>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Don Brady <don.brady@klarasystems.com>
Closes #15839
This commit is contained in:
Don Brady 2024-04-29 15:35:53 -06:00 committed by GitHub
parent b28461b7c6
commit c3f2f1aa2d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
16 changed files with 242 additions and 52 deletions

View file

@ -9050,7 +9050,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf_color(ANSI_BOLD, gettext("action: "));
printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices"
" are connected, then reboot your system and\n\timport the "
"pool.\n"));
"pool or run 'zpool clear' to resume the pool.\n"));
break;
case ZPOOL_STATUS_IO_FAILURE_WAIT:

View file

@ -770,7 +770,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
#define SPA_ASYNC_PROBE 0x04
#define SPA_ASYNC_FAULT_VDEV 0x04
#define SPA_ASYNC_RESILVER_DONE 0x08
#define SPA_ASYNC_RESILVER 0x10
#define SPA_ASYNC_AUTOEXPAND 0x20
@ -1123,6 +1123,8 @@ extern uint32_t spa_get_hostid(spa_t *spa);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);
extern boolean_t spa_mmp_remote_host_activity(spa_t *spa);
extern spa_mode_t spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);

View file

@ -50,20 +50,20 @@ extern "C" {
#define MMP_SEQ_VALID_BIT 0x02
#define MMP_FAIL_INT_VALID_BIT 0x04
#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \
ubp->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \
(ubp)->ub_mmp_magic == MMP_MAGIC)
#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_INTERVAL_VALID_BIT))
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_SEQ_VALID_BIT))
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \
MMP_FAIL_INT_VALID_BIT))
#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \
>> 8)
#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \
>> 32)
#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \
#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \
>> 48)
#define MMP_INTERVAL_SET(write) \

View file

@ -273,7 +273,7 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
boolean_t vdev_fault_wanted; /* async faulted wanted? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */

View file

@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the
devices can be accessed.
Pools with
.Sy multihost
enabled which have been suspended cannot be resumed.
While the pool was suspended, it may have been imported on
another host, and resuming I/O could result in pool damage.
enabled which have been suspended cannot be resumed when there is evidence
that the pool was imported by another host.
The same checks performed during an import will be applied before the clear
proceeds.
.Bl -tag -width Ds
.It Fl -power
Power on the devices's slot in the storage enclosure and wait for the device

View file

@ -664,12 +664,13 @@ mmp_thread(void *arg)
(gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
"mmp_last_write %llu mmp_interval %llu "
"mmp_fail_intervals %llu mmp_fail_ns %llu",
"mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu",
spa_name(spa), (u_longlong_t)gethrtime(),
(u_longlong_t)mmp->mmp_last_write,
(u_longlong_t)mmp_interval,
(u_longlong_t)mmp_fail_intervals,
(u_longlong_t)mmp_fail_ns);
(u_longlong_t)mmp_fail_ns,
(u_longlong_t)spa->spa_uberblock.ub_txg);
cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
"succeeded in over %llu ms; suspending pool. "
"Hrtime %llu",

View file

@ -3594,11 +3594,16 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
}
/*
* Perform the import activity check. If the user canceled the import or
* we detected activity then fail.
* Remote host activity check.
*
* error results:
* 0 - no activity detected
* EREMOTEIO - remote activity detected
* EINTR - user canceled the operation
*/
static int
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
boolean_t importing)
{
uint64_t txg = ub->ub_txg;
uint64_t timestamp = ub->ub_timestamp;
@ -3643,19 +3648,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
import_expire = gethrtime() + import_delay;
spa_import_progress_set_notes(spa, "Checking MMP activity, waiting "
"%llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
if (importing) {
spa_import_progress_set_notes(spa, "Checking MMP activity, "
"waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
}
int interations = 0;
int iterations = 0;
while ((now = gethrtime()) < import_expire) {
if (interations++ % 30 == 0) {
if (importing && iterations++ % 30 == 0) {
spa_import_progress_set_notes(spa, "Checking MMP "
"activity, %llu ms remaining",
(u_longlong_t)NSEC2MSEC(import_expire - now));
}
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
if (importing) {
(void) spa_import_progress_set_mmp_check(spa_guid(spa),
NSEC2SEC(import_expire - gethrtime()));
}
vdev_uberblock_load(rvd, ub, &mmp_label);
@ -3737,6 +3746,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
return (error);
}
/*
* Called from zfs_ioc_clear for a pool that was suspended
* after failing mmp write checks.
*/
boolean_t
spa_mmp_remote_host_activity(spa_t *spa)
{
ASSERT(spa_multihost(spa) && spa_suspended(spa));
nvlist_t *best_label;
uberblock_t best_ub;
/*
* Locate the best uberblock on disk
*/
vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
if (best_label) {
/*
* confirm that the best hostid matches our hostid
*/
if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
spa_get_hostid(spa) !=
fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
nvlist_free(best_label);
return (B_TRUE);
}
nvlist_free(best_label);
} else {
return (B_TRUE);
}
if (!MMP_VALID(&best_ub) ||
!MMP_FAIL_INT_VALID(&best_ub) ||
MMP_FAIL_INT(&best_ub) == 0) {
return (B_TRUE);
}
if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
zfs_dbgmsg("txg mismatch detected during pool clear "
"txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
(u_longlong_t)spa->spa_uberblock.ub_txg,
(u_longlong_t)best_ub.ub_txg,
(u_longlong_t)spa->spa_uberblock.ub_timestamp,
(u_longlong_t)best_ub.ub_timestamp);
return (B_TRUE);
}
/*
* Perform an activity check looking for any remote writer
*/
return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
B_FALSE) != 0);
}
static int
spa_verify_host(spa_t *spa, nvlist_t *mos_config)
{
@ -4063,7 +4127,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
}
int error = spa_activity_check(spa, ub, spa->spa_config);
int error =
spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
if (error) {
nvlist_free(label);
return (error);
@ -8771,15 +8836,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd)
}
static void
spa_async_probe(spa_t *spa, vdev_t *vd)
spa_async_fault_vdev(spa_t *spa, vdev_t *vd)
{
if (vd->vdev_probe_wanted) {
vd->vdev_probe_wanted = B_FALSE;
vdev_reopen(vd); /* vdev_open() does the actual probe */
if (vd->vdev_fault_wanted) {
vd->vdev_fault_wanted = B_FALSE;
vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
VDEV_AUX_ERR_EXCEEDED);
}
for (int c = 0; c < vd->vdev_children; c++)
spa_async_probe(spa, vd->vdev_child[c]);
spa_async_fault_vdev(spa, vd->vdev_child[c]);
}
static void
@ -8867,11 +8933,11 @@ spa_async_thread(void *arg)
}
/*
* See if any devices need to be probed.
* See if any devices need to be marked faulted.
*/
if (tasks & SPA_ASYNC_PROBE) {
if (tasks & SPA_ASYNC_FAULT_VDEV) {
spa_vdev_state_enter(spa, SCL_NONE);
spa_async_probe(spa, spa->spa_root_vdev);
spa_async_fault_vdev(spa, spa->spa_root_vdev);
(void) spa_vdev_state_exit(spa, NULL, 0);
}

View file

@ -550,6 +550,15 @@ txg_sync_thread(void *arg)
timer = (delta > timeout ? 0 : timeout - delta);
}
/*
* When we're suspended, nothing should be changing and for
* MMP we don't want to bump anything that would make it
* harder to detect if another host is changing it when
* resuming after a MMP suspend.
*/
if (spa_suspended(spa))
continue;
/*
* Wait until the quiesce thread hands off a txg to us,
* prompting it to do so if necessary.

View file

@ -1664,6 +1664,7 @@ vdev_metaslab_fini(vdev_t *vd)
typedef struct vdev_probe_stats {
boolean_t vps_readable;
boolean_t vps_writeable;
boolean_t vps_zio_done_probe;
int vps_flags;
} vdev_probe_stats_t;
@ -1709,6 +1710,17 @@ vdev_probe_done(zio_t *zio)
(void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
spa, vd, NULL, NULL, 0);
zio->io_error = SET_ERROR(ENXIO);
/*
* If this probe was initiated from zio pipeline, then
* change the state in a spa_async_request. Probes that
* were initiated from a vdev_open can change the state
* as part of the open call.
*/
if (vps->vps_zio_done_probe) {
vd->vdev_fault_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
}
}
mutex_enter(&vd->vdev_probe_lock);
@ -1759,6 +1771,7 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
vps->vps_zio_done_probe = (zio != NULL);
if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
/*
@ -1785,15 +1798,6 @@ vdev_probe(vdev_t *vd, zio_t *zio)
vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
vdev_probe_done, vps,
vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
/*
* We can't change the vdev state in this context, so we
* kick off an async task to do it on our behalf.
*/
if (zio != NULL) {
vd->vdev_probe_wanted = B_TRUE;
spa_async_request(spa, SPA_ASYNC_PROBE);
}
}
if (zio != NULL)

View file

@ -2027,6 +2027,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
/*
* If this isn't a resync due to I/O errors,
* and nothing changed in this transaction group,
* and multihost protection isn't enabled,
* and the vdev configuration hasn't changed,
* then there's nothing to do.
*/
@ -2034,7 +2035,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
txg, spa->spa_mmp.mmp_delay);
if (!changed && list_is_empty(&spa->spa_config_dirty_list))
if (!changed && list_is_empty(&spa->spa_config_dirty_list) &&
!spa_multihost(spa))
return (0);
}

View file

@ -5823,10 +5823,13 @@ zfs_ioc_clear(zfs_cmd_t *zc)
/*
* If multihost is enabled, resuming I/O is unsafe as another
* host may have imported the pool.
* host may have imported the pool. Check for remote activity.
*/
if (spa_multihost(spa) && spa_suspended(spa))
return (SET_ERROR(EINVAL));
if (spa_multihost(spa) && spa_suspended(spa) &&
spa_mmp_remote_host_activity(spa)) {
spa_close(spa, FTAG);
return (SET_ERROR(EREMOTEIO));
}
spa_vdev_state_enter(spa, SCL_NONE);

View file

@ -2532,8 +2532,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
"failure and the failure mode property for this pool "
"is set to panic.", spa_name(spa));
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
"failure and has been suspended.\n", spa_name(spa));
if (reason != ZIO_SUSPEND_MMP) {
cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable "
"I/O failure and has been suspended.\n", spa_name(spa));
}
(void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
NULL, NULL, 0);

View file

@ -607,9 +607,11 @@ zio_handle_io_delay(zio_t *zio)
if (vd->vdev_guid != handler->zi_record.zi_guid)
continue;
/* also match on I/O type (e.g., -T read) */
if (handler->zi_record.zi_iotype != ZIO_TYPES &&
handler->zi_record.zi_iotype != zio->io_type)
continue;
handler->zi_record.zi_iotype != zio->io_type) {
continue;
}
/*
* Defensive; should never happen as the array allocation

View file

@ -146,7 +146,7 @@ tags = ['functional', 'mmap']
tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval',
'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import',
'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history',
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid']
'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk']
tags = ['functional', 'mmp']
[tests/functional/mount:Linux]

View file

@ -1593,6 +1593,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/mmp/mmp_on_zdb.ksh \
functional/mmp/mmp_reset_interval.ksh \
functional/mmp/mmp_write_distribution.ksh \
functional/mmp/mmp_write_slow_disk.ksh \
functional/mmp/mmp_write_uberblocks.ksh \
functional/mmp/multihost_history.ksh \
functional/mmp/setup.ksh \

View file

@ -0,0 +1,97 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2024, Klara Inc
#
# DESCRIPTION:
# Verify that long VDEV probes do not cause MMP checks to suspend pool
# Note: without PR-15839 fix, this test will suspend the pool.
#
# A device that is returning unexpected errors will trigger a vdev_probe.
# When the device additionally has slow response times, the probe can hold
# the spa config lock as a writer for a long period of time such that the
# mmp uberblock updates stall when trying to acquire the spa config lock.
#
# STRATEGY:
# 1. Create a pool with multiple leaf vdevs
# 2. Enable multihost and multihost_history
# 3. Delay for MMP writes to occur
# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/mmp/mmp.cfg
. $STF_SUITE/tests/functional/mmp/mmp.kshlib
verify_runnable "both"
function cleanup
{
log_must zinject -c all
if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then
log_must zpool clear $MMP_POOL
zpool get state $MMP_POOL $MMP_DIR/file.3
zpool events | grep ".fs.zfs." | grep -v "history_event"
fi
poolexists $MMP_POOL && destroy_pool $MMP_POOL
log_must rm -r $MMP_DIR
log_must mmp_clear_hostid
}
log_assert "A long VDEV probe doesn't cause a MMP check suspend"
log_onexit cleanup
MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost
# Create a multiple drive pool
log_must zpool events -c
log_must mkdir -p $MMP_DIR
log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5}
log_must zpool create -f $MMP_POOL \
mirror $MMP_DIR/file.{0,1,2} \
mirror $MMP_DIR/file.{3,4,5}
# Enable MMP
log_must mmp_set_hostid $HOSTID1
log_must zpool set multihost=on $MMP_POOL
clear_mmp_history
# Inject vdev write error along with a delay
log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL
log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5
sleep 10
sync_pool $MMP_POOL
# Confirm mmp writes to the non-slow disks have taken place
for x in {0,1,2,4}; do
write_count=$(grep -c file.${x} $MMP_HISTORY_URL)
[[ $write_count -gt 0 ]] || log_fail "expecting mmp writes"
done
# Expect that the pool was not suspended
log_must check_state $MMP_POOL "" "ONLINE"
health=$(zpool list -H -o health $MMP_POOL)
log_note "$MMP_POOL health is $health"
[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected"
log_pass "A long VDEV probe doesn't cause a MMP check suspend"