diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 7a2ecffe81d4..c4a750901743 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@ nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
 	}
 }
 
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Make sure that all the ISRs are done before proceeding with the reset
+	 * (and also keep any stray interrupts that happen during this process
+	 * from racing this process). For startup, this is a nop, since the
+	 * hardware is in a good state. But for recovery, where we randomly
+	 * reset the hardware, this ensures that we're not racing the ISRs.
+	 */
+	mtx_lock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_lock(&ctrlr->ioq[i].recovery);
+	}
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Reset complete, unblock ISRs
+	 */
+	mtx_unlock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_unlock(&ctrlr->ioq[i].recovery);
+	}
+}
+
 static int
 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 {
@@ -427,9 +455,11 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
-		return err;
+		goto out;
 
 	err = nvme_ctrlr_enable(ctrlr);
+out:
+	TSEXIT();
 	return (err);
 }
 
@@ -1157,6 +1187,11 @@ nvme_ctrlr_start_config_hook(void *arg)
 
 	TSENTER();
 
+	/*
+	 * Don't call pre/post reset here. We've not yet created the qpairs,
+	 * haven't set up the ISRs, so there's no need to 'drain' them or
+	 * 'exclude' them.
+	 */
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 fail:
 		nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	int status;
 
 	nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+	nvme_pre_reset(ctrlr);
 	status = nvme_ctrlr_hw_reset(ctrlr);
-	/*
-	 * Use pause instead of DELAY, so that we yield to any nvme interrupt
-	 * handlers on this CPU that were blocked on a qpair lock. We want
-	 * all nvme interrupts completed before proceeding with restarting the
-	 * controller.
-	 *
-	 * XXX - any way to guarantee the interrupt handlers have quiesced?
-	 */
-	pause("nvmereset", hz / 10);
+	nvme_post_reset(ctrlr);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr, true);
 	else
@@ -1697,6 +1725,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 	if (ctrlr->is_failed)
 		return (0);
 
+	nvme_pre_reset(ctrlr);
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #endif
+	nvme_post_reset(ctrlr);
 
 	/*
 	 * Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@
 	 * the controller. However, we have to return success for the resume
 	 * itself, due to questionable APIs.
 	 */
+	nvme_post_reset(ctrlr);
 	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 	nvme_ctrlr_fail(ctrlr);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index e4b319b9d8b7..a6239f30f3bf 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@ struct nvme_qpair {
 	struct resource		*res;
 	void			*tag;
 
-	struct callout		timer;
-	sbintime_t		deadline;
-	bool			timer_armed;
-	enum nvme_recovery	recovery_state;
+	struct callout		timer;			/* recovery lock */
+	bool			timer_armed;		/* recovery lock */
+	enum nvme_recovery	recovery_state;		/* recovery lock */
 
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
@@ -182,6 +181,7 @@ struct nvme_qpair {
 	int64_t			num_retries;
 	int64_t			num_failures;
 	int64_t			num_ignored;
+	int64_t			num_recovery_nolock;
 
 	struct nvme_command	*cmd;
 	struct nvme_completion	*cpl;
@@ -200,7 +200,7 @@ struct nvme_qpair {
 	struct nvme_tracker	**act_tr;
 
 	struct mtx_padalign	lock;
-
+	struct mtx_padalign	recovery;
 } __aligned(CACHE_LINE_SIZE);
 
 struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 6d34c7ddba2d..b256c4713c8d 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
 	nvme_free_request(req);
 }
 
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 	struct nvme_completion	cpl;
-	int done = 0;
+	bool done = false;
 	bool in_panic = dumping || SCHEDULER_STOPPED();
 
+	mtx_assert(&qpair->recovery, MA_OWNED);
+
 	/*
 	 * qpair is not enabled, likely because a controller reset is in
 	 * progress.  Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		else
 			tr = NULL;
 
-		done++;
+		done = true;
 		if (tr != NULL) {
 			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
 			qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		}
 	}
 
-	if (done != 0) {
+	if (done) {
 		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
 		    qpair->cq_hdbl_off, qpair->cq_head);
 	}
 
-	return (done != 0);
+	return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+	bool done;
+
+	/*
+	 * Interlock with reset / recovery code. This is a usually
+	 * uncontended lock taken to make sure that we drain out of the ISRs
+	 * before we reset the card, and to prevent races with the recovery
+	 * process called from a timeout context.
+	 */
+	if (!mtx_trylock(&qpair->recovery)) {
+		qpair->num_recovery_nolock++;
+		return (false);
+	}
+
+	done = _nvme_qpair_process_completions(qpair);
+
+	mtx_unlock(&qpair->recovery);
+
+	return (done);
 }
 
 static void
@@ -699,6 +725,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->ctrlr = ctrlr;
 
 	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
 
 	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
-	callout_init(&qpair->timer, 1);
+	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
 	qpair->timer_armed = false;
 	qpair->recovery_state = RECOVERY_WAITING;
@@ -903,6 +930,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
 
 	if (mtx_initialized(&qpair->lock))
 		mtx_destroy(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_destroy(&qpair->recovery);
 
 	if (qpair->res) {
 		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@ nvme_qpair_timeout(void *arg)
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
 	struct nvme_tracker	*tr;
 	sbintime_t		now;
-	bool			idle;
+	bool			idle = false;
 	bool			needs_reset;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-
-	mtx_lock(&qpair->lock);
-	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+	mtx_assert(&qpair->recovery, MA_OWNED);
 
 	switch (qpair->recovery_state) {
 	case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@ nvme_qpair_timeout(void *arg)
 			goto do_reset;
 
 		/*
-		 * Next, check to see if we have any completions. If we do,
-		 * we've likely missed an interrupt, but the card is otherwise
-		 * fine. This will also catch all the commands that are about
-		 * to timeout (but there's still a tiny race). Since the timeout
-		 * is long relative to the race between here and the check below,
-		 * this is still a win.
+		 * Process completions. We already have the recovery lock, so
+		 * call the locked version.
 		 */
-		mtx_unlock(&qpair->lock);
-		nvme_qpair_process_completions(qpair);
-		mtx_lock(&qpair->lock);
-		if (qpair->recovery_state != RECOVERY_NONE) {
-			/*
-			 * Somebody else adjusted recovery state while unlocked,
-			 * we should bail. Unlock the qpair and return without
-			 * doing anything else.
-			 */
-			mtx_unlock(&qpair->lock);
-			return;
-		}
+		_nvme_qpair_process_completions(qpair);
 
 		/*
 		 * Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@
 		 */
 		now = getsbinuptime();
 		needs_reset = false;
+		idle = true;
+		mtx_lock(&qpair->lock);
 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+			/*
+			 * Skip async commands, they are posted to the card for
+			 * an indefinite amount of time and have no deadline.
+			 */
 			if (tr->deadline == SBT_MAX)
 				continue;
 			if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@
 				idle = false;
 			}
 		}
+		mtx_unlock(&qpair->lock);
 
 		if (!needs_reset)
 			break;
@@ -1076,6 +1095,7 @@
 		break;
 	case RECOVERY_WAITING:
 		nvme_printf(ctrlr, "waiting for reset to complete\n");
+		idle = false;		/* We want to keep polling */
 		break;
 	}
 
@@ -1087,7 +1107,6 @@ nvme_qpair_timeout(void *arg)
 	} else {
 		qpair->timer_armed = false;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 /*
@@ -1196,9 +1215,12 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 
 	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
-		 * No tracker is available, or the qpair is disabled due to
-		 * an in-progress controller-level reset or controller
-		 * failure.
+		 * No tracker is available, or the qpair is disabled due to an
+		 * in-progress controller-level reset or controller failure. If
+		 * we lose the race with recovery_state, then we may add an
+		 * extra request to the queue which will be resubmitted later.
+		 * We only set recovery_state to NONE with qpair->lock also
+		 * held.
 		 */
 
 		if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
-	mtx_assert(&qpair->lock, MA_OWNED);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
 
 	qpair->recovery_state = RECOVERY_NONE;
 }
@@ -1311,9 +1336,11 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
 
 		nvme_printf(qpair->ctrlr, "done aborting outstanding admin\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
@@ -1340,8 +1367,8 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 	if (report)
 		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
-
 	nvme_qpair_enable(qpair);
 
 	STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
 
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 static void
@@ -1367,27 +1395,40 @@ nvme_qpair_disable(struct nvme_qpair *qpair)
{
 	struct nvme_tracker	*tr, *tr_temp;
 
-	mtx_lock(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
+
 	qpair->recovery_state = RECOVERY_WAITING;
 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
 		tr->deadline = SBT_MAX;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 void
 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 	nvme_qpair_disable(qpair);
 	nvme_admin_qpair_abort_aers(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
 nvme_io_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 	nvme_qpair_disable(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index a1a8a968eebe..ac0d507e2337 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair)
 	qpair->num_retries = 0;
 	qpair->num_failures = 0;
 	qpair->num_ignored = 0;
+	qpair->num_recovery_nolock = 0;
 }
 
 static int
@@ -240,6 +241,21 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
 	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
 }
 
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+	struct nvme_controller *ctrlr = arg1;
+	int64_t num;
+	int i;
+
+	num = ctrlr->adminq.num_recovery_nolock;
+
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		num += ctrlr->ioq[i].num_recovery_nolock;
+
+	return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
 static int
 nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
 {
@@ -298,6 +314,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
 	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
 	    CTLFLAG_RD, &qpair->num_ignored,
 	    "Number of interrupts posted, but were administratively ignored");
+	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+	    CTLFLAG_RD, &qpair->num_recovery_nolock,
+	    "Number of times that we failed to lock recovery in the ISR");
 
 	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
 	    "dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
 	    ctrlr, 0, nvme_sysctl_num_ignored, "IU",
 	    "Number of interrupts ignored administratively");
 
+	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+	    "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+	    "Number of times that we failed to lock recovery in the ISR");
+
 	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
 	    "reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr, 0,
 	    nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");