diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 7a2ecffe81d4..c4a750901743 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -416,6 +416,34 @@ nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
 	}
 }
 
+static void
+nvme_pre_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Make sure that all the ISRs are done before proceeding with the reset
+	 * (and also keep any stray interrupts that happen during this process
+	 * from racing this process). For startup, this is a nop, since the
+	 * hardware is in a good state. But for recovery, where we randomly
+	 * reset the hardware, this ensures that we're not racing the ISRs.
+	 */
+	mtx_lock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_lock(&ctrlr->ioq[i].recovery);
+	}
+}
+
+static void
+nvme_post_reset(struct nvme_controller *ctrlr)
+{
+	/*
+	 * Reset complete, unblock ISRs
+	 */
+	mtx_unlock(&ctrlr->adminq.recovery);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_unlock(&ctrlr->ioq[i].recovery);
+	}
+}
+
 static int
 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 {
@@ -427,9 +455,11 @@ nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
-		return err;
+		goto out;
 
 	err = nvme_ctrlr_enable(ctrlr);
+out:
+	TSEXIT();
 	return (err);
 }
 
@@ -1157,6 +1187,11 @@ nvme_ctrlr_start_config_hook(void *arg)
 
 	TSENTER();
 
+	/*
+	 * Don't call pre/post reset here. We've not yet created the qpairs,
+	 * haven't set up the ISRs, so there's no need to 'drain' them or
+	 * 'exclude' them.
+	 */
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 fail:
 		nvme_ctrlr_fail(ctrlr);
@@ -1201,16 +1236,9 @@ nvme_ctrlr_reset_task(void *arg, int pending)
 	int status;
 
 	nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
+	nvme_pre_reset(ctrlr);
 	status = nvme_ctrlr_hw_reset(ctrlr);
-	/*
-	 * Use pause instead of DELAY, so that we yield to any nvme interrupt
-	 * handlers on this CPU that were blocked on a qpair lock. We want
-	 * all nvme interrupts completed before proceeding with restarting the
-	 * controller.
-	 *
-	 * XXX - any way to guarantee the interrupt handlers have quiesced?
-	 */
-	pause("nvmereset", hz / 10);
+	nvme_post_reset(ctrlr);
 	if (status == 0)
 		nvme_ctrlr_start(ctrlr, true);
 	else
@@ -1697,6 +1725,7 @@ nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 	if (ctrlr->is_failed)
 		return (0);
 
+	nvme_pre_reset(ctrlr);
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #ifdef NVME_2X_RESET
@@ -1708,6 +1737,7 @@
 	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 		goto fail;
 #endif
+	nvme_post_reset(ctrlr);
 
 	/*
 	 * Now that we've reset the hardware, we can restart the controller. Any
@@ -1724,6 +1754,7 @@
 	 * the controller. However, we have to return success for the resume
 	 * itself, due to questionable APIs.
 	 */
+	nvme_post_reset(ctrlr);
 	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 	nvme_ctrlr_fail(ctrlr);
 	(void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index e4b319b9d8b7..a6239f30f3bf 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -162,10 +162,9 @@ struct nvme_qpair {
 	struct resource		*res;
 	void			*tag;
 
-	struct callout		timer;
-	sbintime_t		deadline;
-	bool			timer_armed;
-	enum nvme_recovery	recovery_state;
+	struct callout		timer;			/* recovery lock */
+	bool			timer_armed;		/* recovery lock */
+	enum nvme_recovery	recovery_state;		/* recovery lock */
 
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
@@ -182,6 +181,7 @@ struct nvme_qpair {
 	int64_t			num_retries;
 	int64_t			num_failures;
 	int64_t			num_ignored;
+	int64_t			num_recovery_nolock;
 
 	struct nvme_command	*cmd;
 	struct nvme_completion	*cpl;
@@ -200,7 +200,7 @@ struct nvme_qpair {
 	struct nvme_tracker	**act_tr;
 
 	struct mtx_padalign	lock;
-
+	struct mtx_padalign	recovery;
 } __aligned(CACHE_LINE_SIZE);
 
 struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
index 6d34c7ddba2d..b256c4713c8d 100644
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -528,14 +528,17 @@ nvme_qpair_manual_complete_request(struct nvme_qpair *qpair,
 	nvme_free_request(req);
 }
 
-bool
-nvme_qpair_process_completions(struct nvme_qpair *qpair)
+/* Locked version of completion processor */
+static bool
+_nvme_qpair_process_completions(struct nvme_qpair *qpair)
 {
 	struct nvme_tracker	*tr;
 	struct nvme_completion	cpl;
-	int done = 0;
+	bool done = false;
 	bool in_panic = dumping || SCHEDULER_STOPPED();
 
+	mtx_assert(&qpair->recovery, MA_OWNED);
+
 	/*
 	 * qpair is not enabled, likely because a controller reset is in
 	 * progress.  Ignore the interrupt - any I/O that was associated with
@@ -629,7 +632,7 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		else
 			tr = NULL;
 
-		done++;
+		done = true;
 		if (tr != NULL) {
 			nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL);
 			qpair->sq_head = cpl.sqhd;
@@ -666,12 +669,35 @@ nvme_qpair_process_completions(struct nvme_qpair *qpair)
 		}
 	}
 
-	if (done != 0) {
+	if (done) {
 		bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle,
 		    qpair->cq_hdbl_off, qpair->cq_head);
 	}
 
-	return (done != 0);
+	return (done);
+}
+
+bool
+nvme_qpair_process_completions(struct nvme_qpair *qpair)
+{
+	bool done;
+
+	/*
+	 * Interlock with reset / recovery code. This is a usually
+	 * uncontended lock taken to make sure that we drain out of the ISRs
+	 * before we reset the card, and to prevent races with the recovery
+	 * process called from a timeout context.
+	 */
+	if (!mtx_trylock(&qpair->recovery)) {
+		qpair->num_recovery_nolock++;
+		return (false);
+	}
+
+	done = _nvme_qpair_process_completions(qpair);
+
+	mtx_unlock(&qpair->recovery);
+
+	return (done);
 }
 
 static void
@@ -699,6 +725,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->ctrlr = ctrlr;
 
 	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+	mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF);
 
 	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -765,7 +792,7 @@ nvme_qpair_construct(struct nvme_qpair *qpair,
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
-	callout_init(&qpair->timer, 1);
+	callout_init_mtx(&qpair->timer, &qpair->recovery, 0);
 	qpair->timer_armed = false;
 	qpair->recovery_state = RECOVERY_WAITING;
@@ -903,6 +930,8 @@ nvme_qpair_destroy(struct nvme_qpair *qpair)
 
 	if (mtx_initialized(&qpair->lock))
 		mtx_destroy(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_destroy(&qpair->recovery);
 
 	if (qpair->res) {
 		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -975,14 +1004,12 @@ nvme_qpair_timeout(void *arg)
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
 	struct nvme_tracker	*tr;
 	sbintime_t		now;
-	bool			idle;
+	bool			idle = false;
 	bool			needs_reset;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-
-	mtx_lock(&qpair->lock);
-	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+	mtx_assert(&qpair->recovery, MA_OWNED);
 
 	switch (qpair->recovery_state) {
 	case RECOVERY_NONE:
@@ -1003,25 +1030,10 @@ nvme_qpair_timeout(void *arg)
 			goto do_reset;
 
 		/*
-		 * Next, check to see if we have any completions. If we do,
-		 * we've likely missed an interrupt, but the card is otherwise
-		 * fine. This will also catch all the commands that are about
-		 * to timeout (but there's still a tiny race). Since the timeout
-		 * is long relative to the race between here and the check below,
-		 * this is still a win.
+		 * Process completions. We already have the recovery lock, so
+		 * call the locked version.
 		 */
-		mtx_unlock(&qpair->lock);
-		nvme_qpair_process_completions(qpair);
-		mtx_lock(&qpair->lock);
-		if (qpair->recovery_state != RECOVERY_NONE) {
-			/*
-			 * Somebody else adjusted recovery state while unlocked,
-			 * we should bail. Unlock the qpair and return without
-			 * doing anything else.
-			 */
-			mtx_unlock(&qpair->lock);
-			return;
-		}
+		_nvme_qpair_process_completions(qpair);
 
 		/*
 		 * Check to see if we need to timeout any commands. If we do, then
@@ -1029,7 +1041,13 @@
 		 */
 		now = getsbinuptime();
 		needs_reset = false;
+		idle = true;
+		mtx_lock(&qpair->lock);
 		TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+			/*
+			 * Skip async commands, they are posted to the card for
+			 * an indefinite amount of time and have no deadline.
+			 */
 			if (tr->deadline == SBT_MAX)
 				continue;
 			if (now > tr->deadline) {
@@ -1055,6 +1073,7 @@
 				idle = false;
 			}
 		}
+		mtx_unlock(&qpair->lock);
 
 		if (!needs_reset)
 			break;
@@ -1076,6 +1095,7 @@
 		break;
 	case RECOVERY_WAITING:
 		nvme_printf(ctrlr, "waiting for reset to complete\n");
+		idle = false;		/* We want to keep polling */
 		break;
 	}
 
@@ -1087,7 +1107,6 @@ nvme_qpair_timeout(void *arg)
 	} else {
 		qpair->timer_armed = false;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 /*
@@ -1196,9 +1215,12 @@ _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 
 	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
-		 * No tracker is available, or the qpair is disabled due to
-		 * an in-progress controller-level reset or controller
-		 * failure.
+		 * No tracker is available, or the qpair is disabled due to an
+		 * in-progress controller-level reset or controller failure. If
+		 * we lose the race with recovery_state, then we may add an
+		 * extra request to the queue which will be resubmitted later.
+		 * We only set recovery_state to NONE with qpair->lock also
+		 * held.
 		 */
 
 		if (qpair->ctrlr->is_failed) {
@@ -1260,7 +1282,10 @@ nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req)
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
-	mtx_assert(&qpair->lock, MA_OWNED);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
 
 	qpair->recovery_state = RECOVERY_NONE;
 }
@@ -1311,9 +1336,11 @@ nvme_admin_qpair_enable(struct nvme_qpair *qpair)
 
 		nvme_printf(qpair->ctrlr, "done aborting outstanding admin\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
@@ -1340,8 +1367,8 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 	if (report)
 		nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n");
 
+	mtx_lock(&qpair->recovery);
 	mtx_lock(&qpair->lock);
-
 	nvme_qpair_enable(qpair);
 
 	STAILQ_INIT(&temp);
@@ -1360,6 +1387,7 @@ nvme_io_qpair_enable(struct nvme_qpair *qpair)
 		nvme_printf(qpair->ctrlr, "done resubmitting i/o\n");
 
 	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 static void
@@ -1367,27 +1395,40 @@ nvme_qpair_disable(struct nvme_qpair *qpair)
{
 	struct nvme_tracker	*tr, *tr_temp;
 
-	mtx_lock(&qpair->lock);
+	if (mtx_initialized(&qpair->recovery))
+		mtx_assert(&qpair->recovery, MA_OWNED);
+	if (mtx_initialized(&qpair->lock))
+		mtx_assert(&qpair->lock, MA_OWNED);
+
 	qpair->recovery_state = RECOVERY_WAITING;
 	TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
 		tr->deadline = SBT_MAX;
 	}
-	mtx_unlock(&qpair->lock);
 }
 
 void
 nvme_admin_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 	nvme_qpair_disable(qpair);
 	nvme_admin_qpair_abort_aers(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
 nvme_io_qpair_disable(struct nvme_qpair *qpair)
 {
+	mtx_lock(&qpair->recovery);
+	mtx_lock(&qpair->lock);
 	nvme_qpair_disable(qpair);
+
+	mtx_unlock(&qpair->lock);
+	mtx_unlock(&qpair->recovery);
 }
 
 void
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
index a1a8a968eebe..ac0d507e2337 100644
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -163,6 +163,7 @@ nvme_qpair_reset_stats(struct nvme_qpair *qpair)
 	qpair->num_retries = 0;
 	qpair->num_failures = 0;
 	qpair->num_ignored = 0;
+	qpair->num_recovery_nolock = 0;
 }
 
 static int
@@ -240,6 +241,21 @@ nvme_sysctl_num_ignored(SYSCTL_HANDLER_ARGS)
 	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
 }
 
+static int
+nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS)
+{
+	struct nvme_controller *ctrlr = arg1;
+	int64_t num;
+	int i;
+
+	num = ctrlr->adminq.num_recovery_nolock;
+
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		num += ctrlr->ioq[i].num_recovery_nolock;
+
+	return (sysctl_handle_64(oidp, &num, 0, req));
+}
+
 static int
 nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
 {
@@ -298,6 +314,9 @@ nvme_sysctl_initialize_queue(struct nvme_qpair *qpair,
 	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
 	    CTLFLAG_RD, &qpair->num_ignored,
 	    "Number of interrupts posted, but were administratively ignored");
+	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock",
+	    CTLFLAG_RD, &qpair->num_recovery_nolock,
+	    "Number of times that we failed to lock recovery in the ISR");
 
 	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO,
 	    "dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -366,6 +385,11 @@ nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr)
 	    ctrlr, 0, nvme_sysctl_num_ignored, "IU",
 	    "Number of interrupts ignored administratively");
 
+	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+	    "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU",
+	    "Number of times that we failed to lock recovery in the ISR");
+
 	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
 	    "reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr, 0,
 	    nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");