nvmf: The in-kernel NVMe over Fabrics host

This is the client (initiator in SCSI terms) for NVMe over Fabrics.
Userland is responsible for creating a set of queue pairs and then
handing them off via an ioctl to this driver, e.g. via the 'connect'
command from nvmecontrol(8).  An nvmeX new-bus device is created
at the top level to represent the remote controller, similar to the
nvmeX devices created for PCI-express controllers.
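
The handoff path can be pictured from userland roughly as follows
(illustrative sketch only, not part of this commit; the structure and
ioctl names follow the nvmf_handoff_host / NVMF_HANDOFF_HOST interface
added below, and the transport-specific queue-pair fields are assumed
to have been filled in by the earlier Fabrics connect handshake):

#include <sys/ioctl.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

#include <dev/nvmf/nvmf.h>

/* Hand pre-connected queue pairs to the kernel via /dev/nvmf. */
static int
handoff_association(struct nvmf_handoff_host *hh)
{
	int error, fd;

	fd = open("/dev/nvmf", O_RDWR);
	if (fd == -1)
		return (errno);
	/*
	 * hh->trtype, hh->kato, hh->cdata, hh->admin, hh->num_io_queues,
	 * and hh->io were populated during the connect phase (not shown).
	 */
	error = (ioctl(fd, NVMF_HANDOFF_HOST, hh) == -1) ? errno : 0;
	close(fd);
	return (error);
}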

As with nvme(4), namespace devices named /dev/nvmeXnsY are created, and
passthrough commands can be submitted to either the namespace devices
or the controller device.  For example, 'nvmecontrol identify nvmeX'
works for a remote Fabrics controller the same as for a PCI-express
controller.
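
As a concrete (hypothetical) illustration of the passthrough path, the
following sketch issues an Identify Controller command against
/dev/nvme0 using the standard nvme(4) ioctl interface; it behaves the
same whether nvme0 is a PCI-express or a Fabrics controller:

#include <sys/endian.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#include <dev/nvme/nvme.h>

int
main(void)
{
	struct nvme_controller_data cdata;
	struct nvme_pt_command pt;
	int fd;

	fd = open("/dev/nvme0", O_RDWR);
	if (fd == -1)
		err(1, "open");
	memset(&pt, 0, sizeof(pt));
	pt.cmd.opc = NVME_OPC_IDENTIFY;
	pt.cmd.cdw10 = htole32(1);	/* CNS 1: controller data structure */
	pt.buf = &cdata;
	pt.len = sizeof(cdata);
	pt.is_read = 1;			/* data flows controller -> host */
	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) == -1)
		err(1, "NVME_PASSTHROUGH_CMD");
	close(fd);
	return (0);
}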

nvmf exports remote namespaces via nda(4) devices using the new NVMF
CAM transport.  nvmf does not support nvd(4), only nda(4).

Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44714
John Baldwin 2024-05-02 16:29:37 -07:00
parent 07c6a62bab
commit a1eda74167
14 changed files with 3082 additions and 2 deletions

share/man/man4/Makefile

@@ -408,6 +408,7 @@ MAN= aac.4 \
nvd.4 \
${_nvdimm.4} \
nvme.4 \
nvmf.4 \
nvmf_tcp.4 \
${_nvram.4} \
oce.4 \

share/man/man4/nvmf.4

@@ -0,0 +1,87 @@
.\"
.\" SPDX-License-Identifier: BSD-2-Clause
.\"
.\" Copyright (c) 2024 Chelsio Communications, Inc.
.\"
.Dd May 2, 2024
.Dt NVMF 4
.Os
.Sh NAME
.Nm nvmf
.Nd "NVM Express over Fabrics host driver"
.Sh SYNOPSIS
To compile the driver into the kernel,
place the following line in the
kernel configuration file:
.Bd -ragged -offset indent
.Cd "device nvmf"
.Ed
.Pp
Alternatively, to load the driver as a
module at boot time, place the following line in
.Xr loader.conf 5 :
.Bd -literal -offset indent
nvmf_load="YES"
.Ed
.Sh DESCRIPTION
The
.Nm
driver provides the kernel component of an NVM Express over Fabrics
host.
The NVMeoF host is the client which provides local access to
namespaces exported by a remote controller.
.Pp
Associations between the local host and remote controllers are managed
using
.Xr nvmecontrol 8 .
New associations are created via the
.Cm connect
command and destroyed via the
.Cm disconnect
command.
If an association's connection is interrupted,
the
.Cm reconnect
command creates a new association to replace the interrupted association.
.Pp
Similar to
.Xr nvme 4 ,
.Nm
creates controller device nodes using the format
.Pa /dev/nvmeX
and namespace device nodes using the format
.Pa /dev/nvmeXnsY .
.Nm
also exports remote namespaces via the CAM
.Xr nda 4
peripheral driver.
Unlike
.Xr nvme 4 ,
.Nm
does not support the
.Xr nvd 4
disk driver.
.Pp
Associations require a supported transport such as
.Xr nvmf_tcp 4
for associations using TCP/IP.
.Sh SEE ALSO
.Xr nda 4 ,
.Xr nvme 4 ,
.Xr nvmf_tcp 4 ,
.Xr nvmft 4 ,
.Xr nvmecontrol 8
.Sh HISTORY
The
.Nm
module first appeared in
.Fx 15.0 .
.Sh AUTHORS
The
.Nm
driver was developed by
.An John Baldwin Aq Mt jhb@FreeBSD.org
under sponsorship from Chelsio Communications, Inc.
.Sh BUGS
.Nm
only supports a single I/O queue pair per association.
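
The namespace device nodes described in the manual page behave like
local nvme(4) namespaces.  A minimal userland sketch (illustrative
only, not part of the manual page) that queries one of them using the
disk and NVMe ioctls implemented by nvmf_ns.c later in this commit:

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#include <dev/nvme/nvme.h>

int
main(void)
{
	struct nvme_get_nsid gnsid;
	off_t mediasize;
	u_int sectorsize;
	int fd;

	fd = open("/dev/nvme0ns1", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1 ||
	    ioctl(fd, DIOCGSECTORSIZE, &sectorsize) == -1 ||
	    ioctl(fd, NVME_GET_NSID, &gnsid) == -1)
		err(1, "ioctl");
	printf("%s nsid %u: %ju bytes, %u bytes per sector\n", gnsid.cdev,
	    gnsid.nsid, (uintmax_t)mediasize, sectorsize);
	close(fd);
	return (0);
}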

sys/conf/NOTES

@@ -1676,12 +1676,14 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
# NVM Express
#
# nvme: PCI-express NVM Express host controllers
# nvmf: NVM Express over Fabrics host
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
device nvme # base NVMe driver
device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
device nvmf # NVMeoF host driver
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme

sys/conf/files

@@ -2533,7 +2533,15 @@ dev/nvme/nvme_test.c optional nvme
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
dev/nvmf/host/nvmf.c optional nvmf
dev/nvmf/host/nvmf_aer.c optional nvmf
dev/nvmf/host/nvmf_cmd.c optional nvmf
dev/nvmf/host/nvmf_ctldev.c optional nvmf
dev/nvmf/host/nvmf_ns.c optional nvmf
dev/nvmf/host/nvmf_qpair.c optional nvmf
dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
dev/nvmf/nvmf_transport.c optional nvmf
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci

sys/dev/nvmf/host/nvmf.c

@@ -0,0 +1,939 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>
static struct cdevsw nvmf_cdevsw;
MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host");
static void nvmf_disconnect_task(void *arg, int pending);
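/*
 * Synchronous command helpers: callers initialize an on-stack
 * nvmf_completion_status, pass nvmf_complete() (and nvmf_io_complete()
 * for commands with a data transfer) as the completion callbacks, and
 * block in nvmf_wait_for_reply() until both the CQE and any data
 * transfer have been reported.  A pooled sleep mutex keyed on the
 * address of the status structure avoids a dedicated lock per caller.
 */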
void
nvmf_complete(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_completion_status *status = arg;
struct mtx *mtx;
status->cqe = *cqe;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
status->done = true;
mtx_unlock(mtx);
wakeup(status);
}
void
nvmf_io_complete(void *arg, size_t xfered, int error)
{
struct nvmf_completion_status *status = arg;
struct mtx *mtx;
status->io_error = error;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
status->io_done = true;
mtx_unlock(mtx);
wakeup(status);
}
void
nvmf_wait_for_reply(struct nvmf_completion_status *status)
{
struct mtx *mtx;
mtx = mtx_pool_find(mtxpool_sleep, status);
mtx_lock(mtx);
while (!status->done || !status->io_done)
mtx_sleep(status, mtx, 0, "nvmfcmd", 0);
mtx_unlock(mtx);
}
static int
nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t *value)
{
const struct nvmf_fabric_prop_get_rsp *rsp;
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe;
if (size == 8)
*value = le64toh(rsp->value.u64);
else
*value = le32toh(rsp->value.u32.low);
return (0);
}
static int
nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t value)
{
struct nvmf_completion_status status;
nvmf_status_init(&status);
if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status,
M_WAITOK))
return (ECONNABORTED);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
return (0);
}
static void
nvmf_shutdown_controller(struct nvmf_softc *sc)
{
uint64_t cc;
int error;
error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch CC for shutdown\n");
return;
}
cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL);
error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc);
if (error != 0)
device_printf(sc->dev,
"Failed to set CC to trigger shutdown\n");
}
static void
nvmf_check_keep_alive(void *arg)
{
struct nvmf_softc *sc = arg;
int traffic;
traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic);
if (traffic == 0) {
device_printf(sc->dev,
"disconnecting due to KeepAlive timeout\n");
nvmf_disconnect(sc);
return;
}
callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}
static void
nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_softc *sc = arg;
atomic_store_int(&sc->ka_active_rx_traffic, 1);
if (cqe->status != 0) {
device_printf(sc->dev,
"KeepAlive response reported status %#x\n",
le16toh(cqe->status));
}
}
static void
nvmf_send_keep_alive(void *arg)
{
struct nvmf_softc *sc = arg;
int traffic;
/*
* Don't bother sending a KeepAlive command if TKAS is active
* and another command has been sent during the interval.
*/
traffic = atomic_load_int(&sc->ka_active_tx_traffic);
if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete,
sc, M_NOWAIT))
device_printf(sc->dev,
"Failed to allocate KeepAlive command\n");
/* Clear ka_active_tx_traffic after sending the keep alive command. */
atomic_store_int(&sc->ka_active_tx_traffic, 0);
callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}
int
nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh)
{
size_t len;
u_int i;
int error;
memset(ivars, 0, sizeof(*ivars));
if (!hh->admin.admin || hh->num_io_queues < 1)
return (EINVAL);
ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK);
error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata));
if (error != 0)
goto out;
nvme_controller_data_swapbytes(ivars->cdata);
len = hh->num_io_queues * sizeof(*ivars->io_params);
ivars->io_params = malloc(len, M_NVMF, M_WAITOK);
error = copyin(hh->io, ivars->io_params, len);
if (error != 0)
goto out;
for (i = 0; i < hh->num_io_queues; i++) {
if (ivars->io_params[i].admin) {
error = EINVAL;
goto out;
}
/* Require all I/O queues to be the same size. */
if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) {
error = EINVAL;
goto out;
}
}
ivars->hh = hh;
return (0);
out:
free(ivars->io_params, M_NVMF);
free(ivars->cdata, M_NVMF);
return (error);
}
void
nvmf_free_ivars(struct nvmf_ivars *ivars)
{
free(ivars->io_params, M_NVMF);
free(ivars->cdata, M_NVMF);
}
static int
nvmf_probe(device_t dev)
{
struct nvmf_ivars *ivars = device_get_ivars(dev);
char desc[260];
if (ivars == NULL)
return (ENXIO);
snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn);
device_set_desc_copy(dev, desc);
return (BUS_PROBE_DEFAULT);
}
static int
nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars)
{
char name[16];
/* Setup the admin queue. */
sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin,
"admin queue");
if (sc->admin == NULL) {
device_printf(sc->dev, "Failed to setup admin queue\n");
return (ENXIO);
}
/* Setup I/O queues. */
sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF,
M_WAITOK | M_ZERO);
sc->num_io_queues = ivars->hh->num_io_queues;
for (u_int i = 0; i < sc->num_io_queues; i++) {
snprintf(name, sizeof(name), "I/O queue %u", i);
sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype,
&ivars->io_params[i], name);
if (sc->io[i] == NULL) {
device_printf(sc->dev, "Failed to setup I/O queue %u\n",
i + 1);
return (ENXIO);
}
}
/* Start KeepAlive timers. */
if (ivars->hh->kato != 0) {
sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS,
sc->cdata->ctratt) != 0;
sc->ka_rx_sbt = mstosbt(ivars->hh->kato);
sc->ka_tx_sbt = sc->ka_rx_sbt / 2;
callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0,
nvmf_check_keep_alive, sc, C_HARDCLOCK);
callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0,
nvmf_send_keep_alive, sc, C_HARDCLOCK);
}
return (0);
}
static bool
nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist,
struct nvme_namespace_data *data, uint32_t *nsidp)
{
struct nvmf_completion_status status;
uint32_t nsid;
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist,
nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY active namespaces command\n");
return (false);
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY active namespaces failed, status %#x\n",
le16toh(status.cqe.status));
return (false);
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY active namespaces failed with I/O error %d\n",
status.io_error);
return (false);
}
for (u_int i = 0; i < nitems(nslist->ns); i++) {
nsid = nslist->ns[i];
if (nsid == 0) {
*nsidp = 0;
return (true);
}
if (sc->ns[nsid - 1] != NULL) {
device_printf(sc->dev,
"duplicate namespace %u in active namespace list\n",
nsid);
return (false);
}
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
&status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY namespace %u command\n",
nsid);
return (false);
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed, status %#x\n", nsid,
le16toh(status.cqe.status));
return (false);
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed with I/O error %d\n",
nsid, status.io_error);
return (false);
}
/*
* As in nvme_ns_construct, a size of zero indicates an
* invalid namespace.
*/
nvme_namespace_data_swapbytes(data);
if (data->nsze == 0) {
device_printf(sc->dev,
"ignoring active namespace %u with zero size\n",
nsid);
continue;
}
sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
nvmf_sim_rescan_ns(sc, nsid);
}
MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0);
if (nsid >= 0xfffffffd)
*nsidp = 0;
else
*nsidp = nsid + 1;
return (true);
}
static bool
nvmf_add_namespaces(struct nvmf_softc *sc)
{
struct nvme_namespace_data *data;
struct nvme_ns_list *nslist;
uint32_t nsid;
bool retval;
sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF,
M_WAITOK | M_ZERO);
nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK);
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nsid = 0;
retval = true;
for (;;) {
if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) {
retval = false;
break;
}
if (nsid == 0)
break;
}
free(data, M_NVMF);
free(nslist, M_NVMF);
return (retval);
}
static int
nvmf_attach(device_t dev)
{
struct make_dev_args mda;
struct nvmf_softc *sc = device_get_softc(dev);
struct nvmf_ivars *ivars = device_get_ivars(dev);
uint64_t val;
u_int i;
int error;
if (ivars == NULL)
return (ENXIO);
sc->dev = dev;
sc->trtype = ivars->hh->trtype;
callout_init(&sc->ka_rx_timer, 1);
callout_init(&sc->ka_tx_timer, 1);
sx_init(&sc->connection_lock, "nvmf connection");
TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc);
/* Claim the cdata pointer from ivars. */
sc->cdata = ivars->cdata;
ivars->cdata = NULL;
nvmf_init_aer(sc);
/* TODO: Multiqueue support. */
sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */;
error = nvmf_establish_connection(sc, ivars);
if (error != 0)
goto out;
error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch CAP\n");
error = ENXIO;
goto out;
}
error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val);
if (error != 0) {
device_printf(sc->dev, "Failed to fetch VS\n");
error = ENXIO;
goto out;
}
sc->vs = val;
/* Honor MDTS if it is set. */
sc->max_xfer_size = maxphys;
if (sc->cdata->mdts != 0) {
sc->max_xfer_size = ulmin(sc->max_xfer_size,
1 << (sc->cdata->mdts + NVME_MPS_SHIFT +
NVME_CAP_HI_MPSMIN(sc->cap >> 32)));
}
error = nvmf_init_sim(sc);
if (error != 0)
goto out;
error = nvmf_start_aer(sc);
if (error != 0) {
nvmf_destroy_sim(sc);
goto out;
}
if (!nvmf_add_namespaces(sc)) {
nvmf_destroy_sim(sc);
goto out;
}
make_dev_args_init(&mda);
mda.mda_devsw = &nvmf_cdevsw;
mda.mda_uid = UID_ROOT;
mda.mda_gid = GID_WHEEL;
mda.mda_mode = 0600;
mda.mda_si_drv1 = sc;
error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev));
if (error != 0) {
nvmf_destroy_sim(sc);
goto out;
}
return (0);
out:
if (sc->ns != NULL) {
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_destroy_ns(sc->ns[i]);
}
free(sc->ns, M_NVMF);
}
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
if (sc->admin != NULL)
nvmf_shutdown_controller(sc);
for (i = 0; i < sc->num_io_queues; i++) {
if (sc->io[i] != NULL)
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
nvmf_destroy_aer(sc);
taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
sx_destroy(&sc->connection_lock);
free(sc->cdata, M_NVMF);
return (error);
}
void
nvmf_disconnect(struct nvmf_softc *sc)
{
taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task);
}
static void
nvmf_disconnect_task(void *arg, int pending __unused)
{
struct nvmf_softc *sc = arg;
u_int i;
sx_xlock(&sc->connection_lock);
if (sc->admin == NULL) {
/*
* Ignore transport errors if there is no active
* association.
*/
sx_xunlock(&sc->connection_lock);
return;
}
if (sc->detaching) {
if (sc->admin != NULL) {
/*
* This unsticks the detach process if a
* transport error occurs during detach.
*/
nvmf_shutdown_qp(sc->admin);
}
sx_xunlock(&sc->connection_lock);
return;
}
if (sc->cdev == NULL) {
/*
* Transport error occurred during attach (nvmf_add_namespaces).
* Shutdown the admin queue.
*/
nvmf_shutdown_qp(sc->admin);
sx_xunlock(&sc->connection_lock);
return;
}
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
sc->ka_traffic = false;
/* Quiesce namespace consumers. */
nvmf_disconnect_sim(sc);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_disconnect_ns(sc->ns[i]);
}
/* Shutdown the existing qpairs. */
for (i = 0; i < sc->num_io_queues; i++) {
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
sc->io = NULL;
sc->num_io_queues = 0;
nvmf_destroy_qp(sc->admin);
sc->admin = NULL;
sx_xunlock(&sc->connection_lock);
}
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
struct nvmf_ivars ivars;
u_int i;
int error;
/* XXX: Should we permit changing the transport type? */
if (sc->trtype != hh->trtype) {
device_printf(sc->dev,
"transport type mismatch on reconnect\n");
return (EINVAL);
}
error = nvmf_init_ivars(&ivars, hh);
if (error != 0)
return (error);
sx_xlock(&sc->connection_lock);
if (sc->admin != NULL || sc->detaching) {
error = EBUSY;
goto out;
}
/*
* Ensure this is for the same controller. Note that the
* controller ID can vary across associations if the remote
* system is using the dynamic controller model. This merely
* ensures the new association is connected to the same NVMe
* subsystem.
*/
if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
sizeof(ivars.cdata->subnqn)) != 0) {
device_printf(sc->dev,
"controller subsystem NQN mismatch on reconnect\n");
error = EINVAL;
goto out;
}
/*
* XXX: Require same number and size of I/O queues so that
* max_pending_io is still correct?
*/
error = nvmf_establish_connection(sc, &ivars);
if (error != 0)
goto out;
error = nvmf_start_aer(sc);
if (error != 0)
goto out;
device_printf(sc->dev,
"established new association with %u I/O queues\n",
sc->num_io_queues);
/* Restart namespace consumers. */
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_reconnect_ns(sc->ns[i]);
}
nvmf_reconnect_sim(sc);
out:
sx_xunlock(&sc->connection_lock);
nvmf_free_ivars(&ivars);
return (error);
}
static int
nvmf_detach(device_t dev)
{
struct nvmf_softc *sc = device_get_softc(dev);
u_int i;
destroy_dev(sc->cdev);
sx_xlock(&sc->connection_lock);
sc->detaching = true;
sx_xunlock(&sc->connection_lock);
nvmf_destroy_sim(sc);
for (i = 0; i < sc->cdata->nn; i++) {
if (sc->ns[i] != NULL)
nvmf_destroy_ns(sc->ns[i]);
}
free(sc->ns, M_NVMF);
callout_drain(&sc->ka_tx_timer);
callout_drain(&sc->ka_rx_timer);
if (sc->admin != NULL)
nvmf_shutdown_controller(sc);
for (i = 0; i < sc->num_io_queues; i++) {
nvmf_destroy_qp(sc->io[i]);
}
free(sc->io, M_NVMF);
taskqueue_drain(taskqueue_thread, &sc->disconnect_task);
if (sc->admin != NULL)
nvmf_destroy_qp(sc->admin);
nvmf_destroy_aer(sc);
sx_destroy(&sc->connection_lock);
free(sc->cdata, M_NVMF);
return (0);
}
void
nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid)
{
struct nvmf_completion_status status;
struct nvme_namespace_data *data;
struct nvmf_namespace *ns;
data = malloc(sizeof(*data), M_NVMF, M_WAITOK);
nvmf_status_init(&status);
nvmf_status_wait_io(&status);
if (!nvmf_cmd_identify_namespace(sc, nsid, data, nvmf_complete,
&status, nvmf_io_complete, &status, M_WAITOK)) {
device_printf(sc->dev,
"failed to send IDENTIFY namespace %u command\n", nsid);
free(data, M_NVMF);
return;
}
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed, status %#x\n", nsid,
le16toh(status.cqe.status));
free(data, M_NVMF);
return;
}
if (status.io_error != 0) {
device_printf(sc->dev,
"IDENTIFY namespace %u failed with I/O error %d\n",
nsid, status.io_error);
free(data, M_NVMF);
return;
}
nvme_namespace_data_swapbytes(data);
/* XXX: Needs locking around sc->ns[]. */
ns = sc->ns[nsid - 1];
if (data->nsze == 0) {
/* XXX: Needs locking */
if (ns != NULL) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
}
} else {
/* XXX: Needs locking */
if (ns == NULL) {
sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data);
} else {
if (!nvmf_update_ns(ns, data)) {
nvmf_destroy_ns(ns);
sc->ns[nsid - 1] = NULL;
}
}
}
free(data, M_NVMF);
nvmf_sim_rescan_ns(sc, nsid);
}
int
nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
bool admin)
{
struct nvmf_completion_status status;
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_host_qpair *qp;
struct nvmf_request *req;
void *buf;
int error;
if (pt->len > sc->max_xfer_size)
return (EINVAL);
buf = NULL;
if (pt->len != 0) {
/*
* XXX: Depending on the size we may want to pin the
* user pages and use a memdesc with vm_page_t's
* instead.
*/
buf = malloc(pt->len, M_NVMF, M_WAITOK);
if (pt->is_read == 0) {
error = copyin(pt->buf, buf, pt->len);
if (error != 0) {
free(buf, M_NVMF);
return (error);
}
} else {
/* Ensure no kernel data is leaked to userland. */
memset(buf, 0, pt->len);
}
}
memset(&cmd, 0, sizeof(cmd));
cmd.opc = pt->cmd.opc;
cmd.fuse = pt->cmd.fuse;
cmd.nsid = pt->cmd.nsid;
cmd.cdw10 = pt->cmd.cdw10;
cmd.cdw11 = pt->cmd.cdw11;
cmd.cdw12 = pt->cmd.cdw12;
cmd.cdw13 = pt->cmd.cdw13;
cmd.cdw14 = pt->cmd.cdw14;
cmd.cdw15 = pt->cmd.cdw15;
if (admin)
qp = sc->admin;
else
qp = nvmf_select_io_queue(sc);
nvmf_status_init(&status);
req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK);
if (req == NULL) {
device_printf(sc->dev, "failed to send passthrough command\n");
error = ECONNABORTED;
goto error;
}
if (pt->len != 0) {
mem = memdesc_vaddr(buf, pt->len);
nvmf_capsule_append_data(req->nc, &mem, pt->len,
pt->is_read == 0, nvmf_io_complete, &status);
nvmf_status_wait_io(&status);
}
nvmf_submit_request(req);
nvmf_wait_for_reply(&status);
memset(&pt->cpl, 0, sizeof(pt->cpl));
pt->cpl.cdw0 = status.cqe.cdw0;
pt->cpl.status = status.cqe.status;
error = status.io_error;
if (error == 0 && pt->len != 0 && pt->is_read != 0)
error = copyout(buf, pt->buf, pt->len);
error:
free(buf, M_NVMF);
return (error);
}
static int
nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct nvmf_softc *sc = cdev->si_drv1;
struct nvme_get_nsid *gnsid;
struct nvme_pt_command *pt;
struct nvmf_reconnect_params *rp;
struct nvmf_handoff_host *hh;
switch (cmd) {
case NVME_PASSTHROUGH_CMD:
pt = (struct nvme_pt_command *)arg;
return (nvmf_passthrough_cmd(sc, pt, true));
case NVME_GET_NSID:
gnsid = (struct nvme_get_nsid *)arg;
strncpy(gnsid->cdev, device_get_nameunit(sc->dev),
sizeof(gnsid->cdev));
gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = 0;
return (0);
case NVME_GET_MAX_XFER_SIZE:
*(uint64_t *)arg = sc->max_xfer_size;
return (0);
case NVMF_RECONNECT_PARAMS:
rp = (struct nvmf_reconnect_params *)arg;
if ((sc->cdata->fcatt & 1) == 0)
rp->cntlid = NVMF_CNTLID_DYNAMIC;
else
rp->cntlid = sc->cdata->ctrlr_id;
memcpy(rp->subnqn, sc->cdata->subnqn, sizeof(rp->subnqn));
return (0);
case NVMF_RECONNECT_HOST:
hh = (struct nvmf_handoff_host *)arg;
return (nvmf_reconnect_host(sc, hh));
default:
return (ENOTTY);
}
}
static struct cdevsw nvmf_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = nvmf_ioctl
};
static int
nvmf_modevent(module_t mod, int what, void *arg)
{
switch (what) {
case MOD_LOAD:
return (nvmf_ctl_load());
case MOD_QUIESCE:
return (0);
case MOD_UNLOAD:
nvmf_ctl_unload();
destroy_dev_drain(&nvmf_cdevsw);
return (0);
default:
return (EOPNOTSUPP);
}
}
static device_method_t nvmf_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, nvmf_probe),
DEVMETHOD(device_attach, nvmf_attach),
DEVMETHOD(device_detach, nvmf_detach),
#if 0
DEVMETHOD(device_shutdown, nvmf_shutdown),
#endif
DEVMETHOD_END
};
driver_t nvme_nvmf_driver = {
"nvme",
nvmf_methods,
sizeof(struct nvmf_softc),
};
DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL);
MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1);

sys/dev/nvmf/host/nvmf_aer.c

@@ -0,0 +1,290 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/types.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/taskqueue.h>
#include <dev/nvmf/host/nvmf_var.h>
struct nvmf_aer {
struct nvmf_softc *sc;
uint8_t log_page_id;
uint8_t info;
uint8_t type;
u_int page_len;
void *page;
int error;
uint16_t status;
int pending;
struct mtx *lock;
struct task complete_task;
struct task finish_page_task;
};
#define MAX_LOG_PAGE_SIZE 4096
static void nvmf_complete_aer(void *arg, const struct nvme_completion *cqe);
static void
nvmf_submit_aer(struct nvmf_softc *sc, struct nvmf_aer *aer)
{
struct nvmf_request *req;
struct nvme_command cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete_aer, aer,
M_WAITOK);
if (req == NULL)
return;
req->aer = true;
nvmf_submit_request(req);
}
static void
nvmf_handle_changed_namespaces(struct nvmf_softc *sc,
struct nvme_ns_list *ns_list)
{
uint32_t nsid;
/*
* If more than 1024 namespaces have changed, we should
* probably just rescan the entire set of namespaces.
*/
if (ns_list->ns[0] == 0xffffffff) {
device_printf(sc->dev, "too many changed namespaces\n");
return;
}
for (u_int i = 0; i < nitems(ns_list->ns); i++) {
if (ns_list->ns[i] == 0)
break;
nsid = le32toh(ns_list->ns[i]);
nvmf_rescan_ns(sc, nsid);
}
}
static void
nvmf_finish_aer_page(struct nvmf_softc *sc, struct nvmf_aer *aer)
{
/* If an error occurred fetching the page, just bail. */
if (aer->error != 0 || aer->status != 0)
return;
taskqueue_enqueue(taskqueue_thread, &aer->finish_page_task);
}
static void
nvmf_finish_aer_page_task(void *arg, int pending)
{
struct nvmf_aer *aer = arg;
struct nvmf_softc *sc = aer->sc;
switch (aer->log_page_id) {
case NVME_LOG_ERROR:
/* TODO: Should we log these? */
break;
case NVME_LOG_CHANGED_NAMESPACE:
nvmf_handle_changed_namespaces(sc, aer->page);
break;
}
/* Resubmit this AER command. */
nvmf_submit_aer(sc, aer);
}
static void
nvmf_io_complete_aer_page(void *arg, size_t xfered, int error)
{
struct nvmf_aer *aer = arg;
struct nvmf_softc *sc = aer->sc;
mtx_lock(aer->lock);
aer->error = error;
aer->pending--;
if (aer->pending == 0) {
mtx_unlock(aer->lock);
nvmf_finish_aer_page(sc, aer);
} else
mtx_unlock(aer->lock);
}
static void
nvmf_complete_aer_page(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_aer *aer = arg;
struct nvmf_softc *sc = aer->sc;
mtx_lock(aer->lock);
aer->status = cqe->status;
aer->pending--;
if (aer->pending == 0) {
mtx_unlock(aer->lock);
nvmf_finish_aer_page(sc, aer);
} else
mtx_unlock(aer->lock);
}
static u_int
nvmf_log_page_size(struct nvmf_softc *sc, uint8_t log_page_id)
{
switch (log_page_id) {
case NVME_LOG_ERROR:
return ((sc->cdata->elpe + 1) *
sizeof(struct nvme_error_information_entry));
case NVME_LOG_CHANGED_NAMESPACE:
return (sizeof(struct nvme_ns_list));
default:
return (0);
}
}
static void
nvmf_complete_aer(void *arg, const struct nvme_completion *cqe)
{
struct nvmf_aer *aer = arg;
struct nvmf_softc *sc = aer->sc;
uint32_t cdw0;
/*
* The only error defined for AER is an abort due to
* submitting too many AER commands. Just discard this AER
* without resubmitting if we get an error.
*
* NB: Pending AER commands are aborted during controller
* shutdown, so discard aborted commands silently.
*/
if (cqe->status != 0) {
if (!nvmf_cqe_aborted(cqe))
device_printf(sc->dev, "Ignoring error %#x for AER\n",
le16toh(cqe->status));
return;
}
cdw0 = le32toh(cqe->cdw0);
aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cdw0);
aer->info = NVMEV(NVME_ASYNC_EVENT_INFO, cdw0);
aer->type = NVMEV(NVME_ASYNC_EVENT_TYPE, cdw0);
device_printf(sc->dev, "AER type %u, info %#x, page %#x\n",
aer->type, aer->info, aer->log_page_id);
aer->page_len = nvmf_log_page_size(sc, aer->log_page_id);
taskqueue_enqueue(taskqueue_thread, &aer->complete_task);
}
static void
nvmf_complete_aer_task(void *arg, int pending)
{
struct nvmf_aer *aer = arg;
struct nvmf_softc *sc = aer->sc;
if (aer->page_len != 0) {
/* Read the associated log page. */
aer->page_len = MIN(aer->page_len, MAX_LOG_PAGE_SIZE);
aer->pending = 2;
(void) nvmf_cmd_get_log_page(sc, NVME_GLOBAL_NAMESPACE_TAG,
aer->log_page_id, 0, aer->page, aer->page_len,
nvmf_complete_aer_page, aer, nvmf_io_complete_aer_page,
aer, M_WAITOK);
} else {
/* Resubmit this AER command. */
nvmf_submit_aer(sc, aer);
}
}
static int
nvmf_set_async_event_config(struct nvmf_softc *sc, uint32_t config)
{
struct nvme_command cmd;
struct nvmf_completion_status status;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_SET_FEATURES;
cmd.cdw10 = htole32(NVME_FEAT_ASYNC_EVENT_CONFIGURATION);
cmd.cdw11 = htole32(config);
nvmf_status_init(&status);
req = nvmf_allocate_request(sc->admin, &cmd, nvmf_complete, &status,
M_WAITOK);
if (req == NULL) {
device_printf(sc->dev,
"failed to allocate SET_FEATURES (ASYNC_EVENT_CONFIGURATION) command\n");
return (ECONNABORTED);
}
nvmf_submit_request(req);
nvmf_wait_for_reply(&status);
if (status.cqe.status != 0) {
device_printf(sc->dev,
"SET_FEATURES (ASYNC_EVENT_CONFIGURATION) failed, status %#x\n",
le16toh(status.cqe.status));
return (EIO);
}
return (0);
}
void
nvmf_init_aer(struct nvmf_softc *sc)
{
/* 8 matches NVME_MAX_ASYNC_EVENTS */
sc->num_aer = min(8, sc->cdata->aerl + 1);
sc->aer = mallocarray(sc->num_aer, sizeof(*sc->aer), M_NVMF,
M_WAITOK | M_ZERO);
for (u_int i = 0; i < sc->num_aer; i++) {
sc->aer[i].sc = sc;
sc->aer[i].page = malloc(MAX_LOG_PAGE_SIZE, M_NVMF, M_WAITOK);
sc->aer[i].lock = mtx_pool_find(mtxpool_sleep, &sc->aer[i]);
TASK_INIT(&sc->aer[i].complete_task, 0, nvmf_complete_aer_task,
&sc->aer[i]);
TASK_INIT(&sc->aer[i].finish_page_task, 0,
nvmf_finish_aer_page_task, &sc->aer[i]);
}
}
int
nvmf_start_aer(struct nvmf_softc *sc)
{
uint32_t async_event_config;
int error;
async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
NVME_CRIT_WARN_ST_READ_ONLY |
NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
if (sc->cdata->ver >= NVME_REV(1, 2))
async_event_config |=
sc->cdata->oaes & NVME_ASYNC_EVENT_NS_ATTRIBUTE;
error = nvmf_set_async_event_config(sc, async_event_config);
if (error != 0)
return (error);
for (u_int i = 0; i < sc->num_aer; i++)
nvmf_submit_aer(sc, &sc->aer[i]);
return (0);
}
void
nvmf_destroy_aer(struct nvmf_softc *sc)
{
for (u_int i = 0; i < sc->num_aer; i++) {
taskqueue_drain(taskqueue_thread, &sc->aer[i].complete_task);
taskqueue_drain(taskqueue_thread, &sc->aer[i].finish_page_task);
free(sc->aer[i].page, M_NVMF);
}
free(sc->aer, M_NVMF);
}

sys/dev/nvmf/host/nvmf_cmd.c

@@ -0,0 +1,171 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/types.h>
#include <sys/memdesc.h>
#include <sys/systm.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_proto.h>
#include <dev/nvmf/host/nvmf_var.h>
bool
nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
nvmf_request_complete_t *cb, void *cb_arg, int how)
{
struct nvmf_fabric_prop_get_cmd cmd;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_GET;
switch (size) {
case 4:
cmd.attrib.size = NVMF_PROP_SIZE_4;
break;
case 8:
cmd.attrib.size = NVMF_PROP_SIZE_8;
break;
default:
panic("Invalid property size");
}
cmd.ofst = htole32(offset);
req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
if (req != NULL)
nvmf_submit_request(req);
return (req != NULL);
}
bool
nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size,
uint64_t value, nvmf_request_complete_t *cb, void *cb_arg, int how)
{
struct nvmf_fabric_prop_set_cmd cmd;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = NVME_OPC_FABRICS_COMMANDS;
cmd.fctype = NVMF_FABRIC_COMMAND_PROPERTY_SET;
switch (size) {
case 4:
cmd.attrib.size = NVMF_PROP_SIZE_4;
cmd.value.u32.low = htole32(value);
break;
case 8:
cmd.attrib.size = NVMF_PROP_SIZE_8;
cmd.value.u64 = htole64(value);
break;
default:
panic("Invalid property size");
}
cmd.ofst = htole32(offset);
req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
if (req != NULL)
nvmf_submit_request(req);
return (req != NULL);
}
bool
nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
void *cb_arg, int how)
{
struct nvme_command cmd;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_KEEP_ALIVE;
req = nvmf_allocate_request(sc->admin, &cmd, cb, cb_arg, how);
if (req != NULL)
nvmf_submit_request(req);
return (req != NULL);
}
bool
nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
{
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_IDENTIFY;
/* 5.15.1 Use CNS of 0x02 to fetch the active namespace ID list. */
cmd.cdw10 = htole32(2);
cmd.nsid = htole32(id);
req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
if (req == NULL)
return (false);
mem = memdesc_vaddr(nslist, sizeof(*nslist));
nvmf_capsule_append_data(req->nc, &mem, sizeof(*nslist), false,
io_cb, io_cb_arg);
nvmf_submit_request(req);
return (true);
}
bool
nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
{
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_request *req;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_IDENTIFY;
/* 5.15.1 Use CNS of 0x00 for namespace data. */
cmd.cdw10 = htole32(0);
cmd.nsid = htole32(id);
req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
if (req == NULL)
return (false);
mem = memdesc_vaddr(nsdata, sizeof(*nsdata));
nvmf_capsule_append_data(req->nc, &mem, sizeof(*nsdata), false,
io_cb, io_cb_arg);
nvmf_submit_request(req);
return (true);
}
bool
nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how)
{
struct nvme_command cmd;
struct memdesc mem;
struct nvmf_request *req;
size_t numd;
MPASS(len != 0 && len % 4 == 0);
MPASS(offset % 4 == 0);
numd = (len / 4) - 1;
memset(&cmd, 0, sizeof(cmd));
cmd.opc = NVME_OPC_GET_LOG_PAGE;
cmd.nsid = htole32(nsid);
cmd.cdw10 = htole32(numd << 16 | lid);
cmd.cdw11 = htole32(numd >> 16);
cmd.cdw12 = htole32(offset);
cmd.cdw13 = htole32(offset >> 32);
req = nvmf_allocate_request(sc->admin, &cmd, req_cb, req_cb_arg, how);
if (req == NULL)
return (false);
mem = memdesc_vaddr(buf, len);
nvmf_capsule_append_data(req->nc, &mem, len, false, io_cb, io_cb_arg);
nvmf_submit_request(req);
return (true);
}

sys/dev/nvmf/host/nvmf_ctldev.c

@@ -0,0 +1,159 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/malloc.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>
static struct cdev *nvmf_cdev;
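/*
 * /dev/nvmf is the control device used by nvmecontrol(8): a handoff of
 * pre-connected queue pairs (NVMF_HANDOFF_HOST) creates an nvmeX child
 * of root_bus for the remote controller, while NVMF_DISCONNECT_HOST and
 * NVMF_DISCONNECT_ALL detach controllers by name or subsystem NQN.
 */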
static int
nvmf_handoff_host(struct nvmf_handoff_host *hh)
{
struct nvmf_ivars ivars;
device_t dev;
int error;
error = nvmf_init_ivars(&ivars, hh);
if (error != 0)
return (error);
bus_topo_lock();
dev = device_add_child(root_bus, "nvme", -1);
if (dev == NULL) {
bus_topo_unlock();
error = ENXIO;
goto out;
}
device_set_ivars(dev, &ivars);
error = device_probe_and_attach(dev);
device_set_ivars(dev, NULL);
if (error != 0)
device_delete_child(root_bus, dev);
bus_topo_unlock();
out:
nvmf_free_ivars(&ivars);
return (error);
}
static bool
nvmf_matches(device_t dev, char *name)
{
struct nvmf_softc *sc = device_get_softc(dev);
if (strcmp(device_get_nameunit(dev), name) == 0)
return (true);
if (strcmp(sc->cdata->subnqn, name) == 0)
return (true);
return (false);
}
static int
nvmf_disconnect_by_name(char *name)
{
devclass_t dc;
device_t dev;
int error, unit;
bool found;
found = false;
error = 0;
bus_topo_lock();
dc = devclass_find("nvme");
if (dc == NULL)
goto out;
for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
dev = devclass_get_device(dc, unit);
if (dev == NULL)
continue;
if (device_get_driver(dev) != &nvme_nvmf_driver)
continue;
if (device_get_parent(dev) != root_bus)
continue;
if (name != NULL && !nvmf_matches(dev, name))
continue;
error = device_delete_child(root_bus, dev);
if (error != 0)
break;
found = true;
}
out:
bus_topo_unlock();
if (error == 0 && !found)
error = ENOENT;
return (error);
}
static int
nvmf_disconnect_host(const char **namep)
{
char *name;
int error;
name = malloc(PATH_MAX, M_NVMF, M_WAITOK);
error = copyinstr(*namep, name, PATH_MAX, NULL);
if (error == 0)
error = nvmf_disconnect_by_name(name);
free(name, M_NVMF);
return (error);
}
static int
nvmf_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
switch (cmd) {
case NVMF_HANDOFF_HOST:
return (nvmf_handoff_host((struct nvmf_handoff_host *)arg));
case NVMF_DISCONNECT_HOST:
return (nvmf_disconnect_host((const char **)arg));
case NVMF_DISCONNECT_ALL:
return (nvmf_disconnect_by_name(NULL));
default:
return (ENOTTY);
}
}
static struct cdevsw nvmf_ctl_cdevsw = {
.d_version = D_VERSION,
.d_ioctl = nvmf_ctl_ioctl
};
int
nvmf_ctl_load(void)
{
struct make_dev_args mda;
int error;
make_dev_args_init(&mda);
mda.mda_devsw = &nvmf_ctl_cdevsw;
mda.mda_uid = UID_ROOT;
mda.mda_gid = GID_WHEEL;
mda.mda_mode = 0600;
error = make_dev_s(&mda, &nvmf_cdev, "nvmf");
if (error != 0)
nvmf_cdev = NULL;
return (error);
}
void
nvmf_ctl_unload(void)
{
if (nvmf_cdev != NULL) {
destroy_dev(nvmf_cdev);
nvmf_cdev = NULL;
}
}
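
A hypothetical userland counterpart of nvmf_disconnect_host() above
(illustrative only; nvmecontrol's actual disconnect code is not part of
this commit, and the ioctl definitions are assumed to come from
<dev/nvmf/nvmf.h>).  The ioctl argument carries a pointer to the
controller name or subsystem NQN string, which the kernel copies in
with copyinstr() and matches via nvmf_matches():

#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

#include <dev/nvmf/nvmf.h>

/* Detach a remote controller by device name (e.g. "nvme0") or NQN. */
static void
disconnect_host(const char *name)
{
	int fd;

	fd = open("/dev/nvmf", O_RDWR);
	if (fd == -1)
		err(1, "open(/dev/nvmf)");
	if (ioctl(fd, NVMF_DISCONNECT_HOST, &name) == -1)
		err(1, "NVMF_DISCONNECT_HOST");
	close(fd);
}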

sys/dev/nvmf/host/nvmf_ns.c

@@ -0,0 +1,483 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/bio.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <machine/stdarg.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/host/nvmf_var.h>
struct nvmf_namespace {
struct nvmf_softc *sc;
uint64_t size;
uint32_t id;
u_int flags;
uint32_t lba_size;
bool disconnected;
TAILQ_HEAD(, bio) pending_bios;
struct mtx lock;
volatile u_int active_bios;
struct cdev *cdev;
};
static void nvmf_ns_strategy(struct bio *bio);
static void
ns_printf(struct nvmf_namespace *ns, const char *fmt, ...)
{
char buf[128];
struct sbuf sb;
va_list ap;
sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
sbuf_set_drain(&sb, sbuf_printf_drain, NULL);
sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev),
ns->id);
va_start(ap, fmt);
sbuf_vprintf(&sb, fmt, ap);
va_end(ap);
sbuf_finish(&sb);
sbuf_delete(&sb);
}
/*
* The I/O completion may trigger after the received CQE if the I/O
* used a zero-copy mbuf that isn't harvested until after the NIC
* driver processes TX completions. Abuse bio_driver1 as a refcount.
* Store I/O errors in bio_driver2.
*/
static __inline u_int *
bio_refs(struct bio *bio)
{
return ((u_int *)&bio->bio_driver1);
}
static void
nvmf_ns_biodone(struct bio *bio)
{
struct nvmf_namespace *ns;
int error;
if (!refcount_release(bio_refs(bio)))
return;
ns = bio->bio_dev->si_drv1;
/* If a request is aborted, resubmit or queue it for resubmission. */
if (bio->bio_error == ECONNABORTED) {
bio->bio_error = 0;
bio->bio_driver2 = 0;
mtx_lock(&ns->lock);
if (ns->disconnected) {
TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
mtx_unlock(&ns->lock);
} else {
mtx_unlock(&ns->lock);
nvmf_ns_strategy(bio);
}
} else {
/*
* I/O errors take precedence over generic EIO from
* CQE errors.
*/
error = (intptr_t)bio->bio_driver2;
if (error != 0)
bio->bio_error = error;
if (bio->bio_error != 0)
bio->bio_flags |= BIO_ERROR;
biodone(bio);
}
if (refcount_release(&ns->active_bios))
wakeup(ns);
}
static void
nvmf_ns_io_complete(void *arg, size_t xfered, int error)
{
struct bio *bio = arg;
KASSERT(xfered <= bio->bio_bcount,
("%s: xfered > bio_bcount", __func__));
bio->bio_driver2 = (void *)(intptr_t)error;
bio->bio_resid = bio->bio_bcount - xfered;
nvmf_ns_biodone(bio);
}
static void
nvmf_ns_delete_complete(void *arg, size_t xfered, int error)
{
struct bio *bio = arg;
if (error != 0)
bio->bio_resid = bio->bio_bcount;
else
bio->bio_resid = 0;
free(bio->bio_driver2, M_NVMF);
bio->bio_driver2 = (void *)(intptr_t)error;
nvmf_ns_biodone(bio);
}
static void
nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe)
{
struct bio *bio = arg;
if (nvmf_cqe_aborted(cqe))
bio->bio_error = ECONNABORTED;
else if (cqe->status != 0)
bio->bio_error = EIO;
nvmf_ns_biodone(bio);
}
static int
nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio)
{
struct nvme_command cmd;
struct nvmf_request *req;
struct nvme_dsm_range *dsm_range;
struct memdesc mem;
uint64_t lba, lba_count;
dsm_range = NULL;
memset(&cmd, 0, sizeof(cmd));
switch (bio->bio_cmd) {
case BIO_READ:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_WRITE:
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count);
break;
case BIO_FLUSH:
nvme_ns_flush_cmd(&cmd, ns->id);
break;
case BIO_DELETE:
dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT |
M_ZERO);
if (dsm_range == NULL)
return (ENOMEM);
lba = bio->bio_offset / ns->lba_size;
lba_count = bio->bio_bcount / ns->lba_size;
dsm_range->starting_lba = htole64(lba);
dsm_range->length = htole32(lba_count);
cmd.opc = NVME_OPC_DATASET_MANAGEMENT;
cmd.nsid = htole32(ns->id);
cmd.cdw10 = htole32(0); /* 1 range */
cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
break;
default:
return (EOPNOTSUPP);
}
mtx_lock(&ns->lock);
if (ns->disconnected) {
TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue);
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (0);
}
req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd,
nvmf_ns_bio_complete, bio, M_NOWAIT);
if (req == NULL) {
mtx_unlock(&ns->lock);
free(dsm_range, M_NVMF);
return (ENOMEM);
}
switch (bio->bio_cmd) {
case BIO_READ:
case BIO_WRITE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_bio(bio);
nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount,
bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio);
break;
case BIO_DELETE:
refcount_init(bio_refs(bio), 2);
mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range));
nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range),
true, nvmf_ns_delete_complete, bio);
bio->bio_driver2 = dsm_range;
break;
default:
refcount_init(bio_refs(bio), 1);
KASSERT(bio->bio_resid == 0,
("%s: input bio_resid != 0", __func__));
break;
}
refcount_acquire(&ns->active_bios);
nvmf_submit_request(req);
mtx_unlock(&ns->lock);
return (0);
}
static int
nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag,
struct thread *td)
{
struct nvmf_namespace *ns = dev->si_drv1;
struct nvme_get_nsid *gnsid;
struct nvme_pt_command *pt;
switch (cmd) {
case NVME_PASSTHROUGH_CMD:
pt = (struct nvme_pt_command *)arg;
pt->cmd.nsid = htole32(ns->id);
return (nvmf_passthrough_cmd(ns->sc, pt, false));
case NVME_GET_NSID:
gnsid = (struct nvme_get_nsid *)arg;
strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev),
sizeof(gnsid->cdev));
gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
gnsid->nsid = ns->id;
return (0);
case DIOCGMEDIASIZE:
*(off_t *)arg = ns->size;
return (0);
case DIOCGSECTORSIZE:
*(u_int *)arg = ns->lba_size;
return (0);
default:
return (ENOTTY);
}
}
static int
nvmf_ns_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
int error;
error = 0;
if ((oflags & FWRITE) != 0)
error = securelevel_gt(td->td_ucred, 0);
return (error);
}
void
nvmf_ns_strategy(struct bio *bio)
{
struct nvmf_namespace *ns;
int error;
ns = bio->bio_dev->si_drv1;
error = nvmf_ns_submit_bio(ns, bio);
if (error != 0) {
bio->bio_error = error;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
}
}
static struct cdevsw nvmf_ns_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_DISK,
.d_open = nvmf_ns_open,
.d_read = physread,
.d_write = physwrite,
.d_strategy = nvmf_ns_strategy,
.d_ioctl = nvmf_ns_ioctl
};
struct nvmf_namespace *
nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
struct nvme_namespace_data *data)
{
struct make_dev_args mda;
struct nvmf_namespace *ns;
int error;
uint8_t lbads, lbaf;
ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO);
ns->sc = sc;
ns->id = id;
TAILQ_INIT(&ns->pending_bios);
mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF);
/* One dummy bio avoids dropping to 0 until destroy. */
refcount_init(&ns->active_bios, 1);
if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
ns_printf(ns, "End-to-end data protection not supported\n");
goto fail;
}
lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
if (lbaf > data->nlbaf) {
ns_printf(ns, "Invalid LBA format index\n");
goto fail;
}
if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
ns_printf(ns, "Namespaces with metadata are not supported\n");
goto fail;
}
lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
if (lbads == 0) {
ns_printf(ns, "Invalid LBA format index\n");
goto fail;
}
ns->lba_size = 1 << lbads;
ns->size = data->nsze * ns->lba_size;
if (nvme_ctrlr_has_dataset_mgmt(sc->cdata))
ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED;
if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0)
ns->flags |= NVME_NS_FLUSH_SUPPORTED;
/*
* XXX: Does any of the boundary splitting for NOIOB make any
* sense for Fabrics?
*/
make_dev_args_init(&mda);
mda.mda_devsw = &nvmf_ns_cdevsw;
mda.mda_uid = UID_ROOT;
mda.mda_gid = GID_WHEEL;
mda.mda_mode = 0600;
mda.mda_si_drv1 = ns;
error = make_dev_s(&mda, &ns->cdev, "%sns%u",
device_get_nameunit(sc->dev), id);
if (error != 0)
goto fail;
ns->cdev->si_flags |= SI_UNMAPPED;
return (ns);
fail:
mtx_destroy(&ns->lock);
free(ns, M_NVMF);
return (NULL);
}
void
nvmf_disconnect_ns(struct nvmf_namespace *ns)
{
mtx_lock(&ns->lock);
ns->disconnected = true;
mtx_unlock(&ns->lock);
}
void
nvmf_reconnect_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
mtx_lock(&ns->lock);
ns->disconnected = false;
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
nvmf_ns_strategy(bio);
}
}
void
nvmf_destroy_ns(struct nvmf_namespace *ns)
{
TAILQ_HEAD(, bio) bios;
struct bio *bio;
destroy_dev(ns->cdev);
/*
* Wait for active I/O requests to drain.  The release drops
* the "dummy bio" reference that was taken when the namespace
* was created.
*/
mtx_lock(&ns->lock);
if (!refcount_release(&ns->active_bios)) {
while (ns->active_bios != 0)
mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0);
}
/* Abort any pending I/O requests. */
TAILQ_INIT(&bios);
TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue);
mtx_unlock(&ns->lock);
while (!TAILQ_EMPTY(&bios)) {
bio = TAILQ_FIRST(&bios);
TAILQ_REMOVE(&bios, bio, bio_queue);
bio->bio_error = ECONNABORTED;
bio->bio_flags |= BIO_ERROR;
bio->bio_resid = bio->bio_bcount;
biodone(bio);
}
mtx_destroy(&ns->lock);
free(ns, M_NVMF);
}
bool
nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data)
{
uint8_t lbads, lbaf;
if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) {
ns_printf(ns, "End-to-end data protection not supported\n");
return (false);
}
lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas);
if (lbaf > data->nlbaf) {
ns_printf(ns, "Invalid LBA format index\n");
return (false);
}
if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) {
ns_printf(ns, "Namespaces with metadata are not supported\n");
return (false);
}
lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]);
if (lbads == 0) {
ns_printf(ns, "Invalid LBA format index\n");
return (false);
}
ns->lba_size = 1 << lbads;
ns->size = data->nsze * ns->lba_size;
return (true);
}

sys/dev/nvmf/host/nvmf_qpair.c

@@ -0,0 +1,386 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/types.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/host/nvmf_var.h>
struct nvmf_host_command {
struct nvmf_request *req;
TAILQ_ENTRY(nvmf_host_command) link;
uint16_t cid;
};
struct nvmf_host_qpair {
struct nvmf_softc *sc;
struct nvmf_qpair *qp;
bool sq_flow_control;
bool shutting_down;
u_int allocating;
u_int num_commands;
uint16_t sqhd;
uint16_t sqtail;
struct mtx lock;
TAILQ_HEAD(, nvmf_host_command) free_commands;
STAILQ_HEAD(, nvmf_request) pending_requests;
/* Indexed by cid. */
struct nvmf_host_command **active_commands;
char name[16];
};
struct nvmf_request *
nvmf_allocate_request(struct nvmf_host_qpair *qp, void *sqe,
nvmf_request_complete_t *cb, void *cb_arg, int how)
{
struct nvmf_request *req;
struct nvmf_qpair *nq;
KASSERT(how == M_WAITOK || how == M_NOWAIT,
("%s: invalid how", __func__));
req = malloc(sizeof(*req), M_NVMF, how | M_ZERO);
if (req == NULL)
return (NULL);
mtx_lock(&qp->lock);
nq = qp->qp;
if (nq == NULL) {
mtx_unlock(&qp->lock);
free(req, M_NVMF);
return (NULL);
}
qp->allocating++;
MPASS(qp->allocating != 0);
mtx_unlock(&qp->lock);
req->qp = qp;
req->cb = cb;
req->cb_arg = cb_arg;
req->nc = nvmf_allocate_command(nq, sqe, how);
if (req->nc == NULL) {
free(req, M_NVMF);
req = NULL;
}
mtx_lock(&qp->lock);
qp->allocating--;
if (qp->allocating == 0 && qp->shutting_down)
wakeup(qp);
mtx_unlock(&qp->lock);
return (req);
}
static void
nvmf_abort_request(struct nvmf_request *req, uint16_t cid)
{
struct nvme_completion cqe;
memset(&cqe, 0, sizeof(cqe));
cqe.cid = cid;
cqe.status = htole16(NVMEF(NVME_STATUS_SCT, NVME_SCT_PATH_RELATED) |
NVMEF(NVME_STATUS_SC, NVME_SC_COMMAND_ABORTED_BY_HOST));
req->cb(req->cb_arg, &cqe);
}
void
nvmf_free_request(struct nvmf_request *req)
{
if (req->nc != NULL)
nvmf_free_capsule(req->nc);
free(req, M_NVMF);
}
static void
nvmf_dispatch_command(struct nvmf_host_qpair *qp, struct nvmf_host_command *cmd)
{
struct nvmf_softc *sc = qp->sc;
struct nvme_command *sqe;
struct nvmf_capsule *nc;
int error;
nc = cmd->req->nc;
sqe = nvmf_capsule_sqe(nc);
/*
* NB: Don't bother byte-swapping the cid so that receive
* doesn't have to swap.
*/
sqe->cid = cmd->cid;
error = nvmf_transmit_capsule(nc);
if (error != 0) {
device_printf(sc->dev,
"failed to transmit capsule: %d, disconnecting\n", error);
nvmf_disconnect(sc);
return;
}
if (sc->ka_traffic)
atomic_store_int(&sc->ka_active_tx_traffic, 1);
}
static void
nvmf_qp_error(void *arg, int error)
{
struct nvmf_host_qpair *qp = arg;
struct nvmf_softc *sc = qp->sc;
/* Ignore simple close of queue pairs during shutdown. */
if (!(sc->detaching && error == 0))
device_printf(sc->dev, "error %d on %s, disconnecting\n", error,
qp->name);
nvmf_disconnect(sc);
}
static void
nvmf_receive_capsule(void *arg, struct nvmf_capsule *nc)
{
struct nvmf_host_qpair *qp = arg;
struct nvmf_softc *sc = qp->sc;
struct nvmf_host_command *cmd;
struct nvmf_request *req;
const struct nvme_completion *cqe;
uint16_t cid;
cqe = nvmf_capsule_cqe(nc);
if (sc->ka_traffic)
atomic_store_int(&sc->ka_active_rx_traffic, 1);
/*
* NB: Don't bother byte-swapping the cid as transmit doesn't
* swap either.
*/
cid = cqe->cid;
if (cid > qp->num_commands) {
device_printf(sc->dev,
"received invalid CID %u, disconnecting\n", cid);
nvmf_disconnect(sc);
nvmf_free_capsule(nc);
return;
}
/*
* If the queue has been shutdown due to an error, silently
* drop the response.
*/
mtx_lock(&qp->lock);
if (qp->qp == NULL) {
device_printf(sc->dev,
"received completion for CID %u on shutdown %s\n", cid,
qp->name);
mtx_unlock(&qp->lock);
nvmf_free_capsule(nc);
return;
}
cmd = qp->active_commands[cid];
if (cmd == NULL) {
mtx_unlock(&qp->lock);
device_printf(sc->dev,
"received completion for inactive CID %u, disconnecting\n",
cid);
nvmf_disconnect(sc);
nvmf_free_capsule(nc);
return;
}
KASSERT(cmd->cid == cid, ("%s: CID mismatch", __func__));
req = cmd->req;
cmd->req = NULL;
if (STAILQ_EMPTY(&qp->pending_requests)) {
qp->active_commands[cid] = NULL;
TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
mtx_unlock(&qp->lock);
} else {
cmd->req = STAILQ_FIRST(&qp->pending_requests);
STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
mtx_unlock(&qp->lock);
nvmf_dispatch_command(qp, cmd);
}
req->cb(req->cb_arg, cqe);
nvmf_free_capsule(nc);
nvmf_free_request(req);
}
struct nvmf_host_qpair *
nvmf_init_qp(struct nvmf_softc *sc, enum nvmf_trtype trtype,
struct nvmf_handoff_qpair_params *handoff, const char *name)
{
struct nvmf_host_command *cmd, *ncmd;
struct nvmf_host_qpair *qp;
u_int i;
qp = malloc(sizeof(*qp), M_NVMF, M_WAITOK | M_ZERO);
qp->sc = sc;
qp->sq_flow_control = handoff->sq_flow_control;
qp->sqhd = handoff->sqhd;
qp->sqtail = handoff->sqtail;
strlcpy(qp->name, name, sizeof(qp->name));
mtx_init(&qp->lock, "nvmf qp", NULL, MTX_DEF);
/*
* Allocate a spare command slot for each pending AER command
* on the admin queue.
*/
qp->num_commands = handoff->qsize - 1;
if (handoff->admin)
qp->num_commands += sc->num_aer;
qp->active_commands = malloc(sizeof(*qp->active_commands) *
qp->num_commands, M_NVMF, M_WAITOK | M_ZERO);
TAILQ_INIT(&qp->free_commands);
for (i = 0; i < qp->num_commands; i++) {
cmd = malloc(sizeof(*cmd), M_NVMF, M_WAITOK | M_ZERO);
cmd->cid = i;
TAILQ_INSERT_TAIL(&qp->free_commands, cmd, link);
}
STAILQ_INIT(&qp->pending_requests);
qp->qp = nvmf_allocate_qpair(trtype, false, handoff, nvmf_qp_error,
qp, nvmf_receive_capsule, qp);
if (qp->qp == NULL) {
TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
TAILQ_REMOVE(&qp->free_commands, cmd, link);
free(cmd, M_NVMF);
}
free(qp->active_commands, M_NVMF);
mtx_destroy(&qp->lock);
free(qp, M_NVMF);
return (NULL);
}
return (qp);
}
void
nvmf_shutdown_qp(struct nvmf_host_qpair *qp)
{
struct nvmf_host_command *cmd;
struct nvmf_request *req;
struct nvmf_qpair *nq;
mtx_lock(&qp->lock);
nq = qp->qp;
qp->qp = NULL;
if (nq == NULL) {
while (qp->shutting_down)
mtx_sleep(qp, &qp->lock, 0, "nvmfqpsh", 0);
mtx_unlock(&qp->lock);
return;
}
qp->shutting_down = true;
while (qp->allocating != 0)
mtx_sleep(qp, &qp->lock, 0, "nvmfqpqu", 0);
mtx_unlock(&qp->lock);
nvmf_free_qpair(nq);
/*
* Abort outstanding requests. Active requests will have
* their I/O completions invoked and associated capsules freed
* by the transport layer via nvmf_free_qpair. Pending
* requests must have their I/O completion invoked via
* nvmf_abort_capsule_data.
*/
for (u_int i = 0; i < qp->num_commands; i++) {
cmd = qp->active_commands[i];
if (cmd != NULL) {
if (!cmd->req->aer)
printf("%s: aborted active command %p (CID %u)\n",
__func__, cmd->req, cmd->cid);
/* This was freed by nvmf_free_qpair. */
cmd->req->nc = NULL;
nvmf_abort_request(cmd->req, cmd->cid);
nvmf_free_request(cmd->req);
free(cmd, M_NVMF);
}
}
while (!STAILQ_EMPTY(&qp->pending_requests)) {
req = STAILQ_FIRST(&qp->pending_requests);
STAILQ_REMOVE_HEAD(&qp->pending_requests, link);
if (!req->aer)
printf("%s: aborted pending command %p\n", __func__,
req);
nvmf_abort_capsule_data(req->nc, ECONNABORTED);
nvmf_abort_request(req, 0);
nvmf_free_request(req);
}
mtx_lock(&qp->lock);
qp->shutting_down = false;
mtx_unlock(&qp->lock);
wakeup(qp);
}
void
nvmf_destroy_qp(struct nvmf_host_qpair *qp)
{
struct nvmf_host_command *cmd, *ncmd;
nvmf_shutdown_qp(qp);
TAILQ_FOREACH_SAFE(cmd, &qp->free_commands, link, ncmd) {
TAILQ_REMOVE(&qp->free_commands, cmd, link);
free(cmd, M_NVMF);
}
free(qp->active_commands, M_NVMF);
mtx_destroy(&qp->lock);
free(qp, M_NVMF);
}
void
nvmf_submit_request(struct nvmf_request *req)
{
struct nvmf_host_qpair *qp;
struct nvmf_host_command *cmd;
qp = req->qp;
mtx_lock(&qp->lock);
if (qp->qp == NULL) {
mtx_unlock(&qp->lock);
printf("%s: aborted pending command %p\n", __func__, req);
nvmf_abort_capsule_data(req->nc, ECONNABORTED);
nvmf_abort_request(req, 0);
nvmf_free_request(req);
return;
}
cmd = TAILQ_FIRST(&qp->free_commands);
if (cmd == NULL) {
/*
* Queue this request. Will be sent after enough
* in-flight requests have completed.
*/
STAILQ_INSERT_TAIL(&qp->pending_requests, req, link);
mtx_unlock(&qp->lock);
return;
}
TAILQ_REMOVE(&qp->free_commands, cmd, link);
KASSERT(qp->active_commands[cmd->cid] == NULL,
("%s: CID already busy", __func__));
qp->active_commands[cmd->cid] = cmd;
cmd->req = req;
mtx_unlock(&qp->lock);
nvmf_dispatch_command(qp, cmd);
}
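
The dispatch-or-queue logic above pairs with the completion path earlier in this file: each queue pair owns a fixed set of command IDs, a submitted request either claims a free slot immediately or waits on pending_requests, and a completing command hands its slot to the next pending request. The following self-contained userland sketch models that flow with queue(3) macros; it is illustrative only and the names (model_req, model_cmd, submit, complete) are invented for the example, not part of this commit:

/* Illustrative userland model of the qpair slot/pending-request flow. */
#include <sys/queue.h>
#include <stdio.h>

struct model_req {
	int id;
	STAILQ_ENTRY(model_req) link;
};

struct model_cmd {
	unsigned cid;
	struct model_req *req;
	TAILQ_ENTRY(model_cmd) link;
};

static TAILQ_HEAD(, model_cmd) free_commands =
    TAILQ_HEAD_INITIALIZER(free_commands);
static STAILQ_HEAD(, model_req) pending_requests =
    STAILQ_HEAD_INITIALIZER(pending_requests);

static void
submit(struct model_req *req)
{
	struct model_cmd *cmd;

	cmd = TAILQ_FIRST(&free_commands);
	if (cmd == NULL) {
		/* No free CID: queue until an in-flight command completes. */
		STAILQ_INSERT_TAIL(&pending_requests, req, link);
		return;
	}
	TAILQ_REMOVE(&free_commands, cmd, link);
	cmd->req = req;
	printf("dispatch request %d on CID %u\n", req->id, cmd->cid);
}

static void
complete(struct model_cmd *cmd)
{
	struct model_req *req;

	printf("complete request %d on CID %u\n", cmd->req->id, cmd->cid);
	req = STAILQ_FIRST(&pending_requests);
	if (req == NULL) {
		/* Nothing pending: return the slot to the free list. */
		cmd->req = NULL;
		TAILQ_INSERT_HEAD(&free_commands, cmd, link);
		return;
	}
	/* Reuse the slot for the next pending request. */
	STAILQ_REMOVE_HEAD(&pending_requests, link);
	cmd->req = req;
	printf("dispatch pending request %d on CID %u\n", req->id, cmd->cid);
}

int
main(void)
{
	struct model_cmd cmds[2] = { { .cid = 0 }, { .cid = 1 } };
	struct model_req reqs[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };

	TAILQ_INSERT_TAIL(&free_commands, &cmds[0], link);
	TAILQ_INSERT_TAIL(&free_commands, &cmds[1], link);

	submit(&reqs[0]);	/* takes CID 0 */
	submit(&reqs[1]);	/* takes CID 1 */
	submit(&reqs[2]);	/* no free CID, queued */
	complete(&cmds[0]);	/* hands CID 0 to request 3 */
	complete(&cmds[1]);
	complete(&cmds[0]);
	return (0);
}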


@@ -0,0 +1,332 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/memdesc.h>
#include <sys/refcount.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/cam_sim.h>
#include <cam/cam_xpt_sim.h>
#include <cam/cam_debug.h>
#include <dev/nvmf/host/nvmf_var.h>
/*
* The I/O completion may trigger after the received CQE if the I/O
* used a zero-copy mbuf that isn't harvested until after the NIC
* driver processes TX completions. Use spriv_field0 as a refcount.
*
* Store any I/O error returned in spriv_field1.
*/
static __inline u_int *
ccb_refs(union ccb *ccb)
{
return ((u_int *)&ccb->ccb_h.spriv_field0);
}
#define spriv_ioerror spriv_field1
static void
nvmf_ccb_done(union ccb *ccb)
{
if (!refcount_release(ccb_refs(ccb)))
return;
if (nvmf_cqe_aborted(&ccb->nvmeio.cpl)) {
ccb->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
} else if (ccb->nvmeio.cpl.status != 0) {
ccb->ccb_h.status = CAM_NVME_STATUS_ERROR;
xpt_done(ccb);
} else if (ccb->ccb_h.spriv_ioerror != 0) {
KASSERT(ccb->ccb_h.spriv_ioerror != EJUSTRETURN,
("%s: zero sized transfer without CQE error", __func__));
ccb->ccb_h.status = CAM_REQ_CMP_ERR;
xpt_done(ccb);
} else {
ccb->ccb_h.status = CAM_REQ_CMP;
xpt_done_direct(ccb);
}
}
static void
nvmf_ccb_io_complete(void *arg, size_t xfered, int error)
{
union ccb *ccb = arg;
/*
* TODO: Reporting partial completions requires extending
* nvmeio to support resid and updating nda to handle partial
* reads, either by returning partial success (or an error) to
* the caller, or retrying all or part of the request.
*/
ccb->ccb_h.spriv_ioerror = error;
if (error == 0) {
if (xfered == 0) {
#ifdef INVARIANTS
/*
* If the request fails with an error in the CQE
* there will be no data transferred but also no
* I/O error.
*/
ccb->ccb_h.spriv_ioerror = EJUSTRETURN;
#endif
} else
KASSERT(xfered == ccb->nvmeio.dxfer_len,
("%s: partial CCB completion", __func__));
}
nvmf_ccb_done(ccb);
}
static void
nvmf_ccb_complete(void *arg, const struct nvme_completion *cqe)
{
union ccb *ccb = arg;
ccb->nvmeio.cpl = *cqe;
nvmf_ccb_done(ccb);
}
static void
nvmf_sim_io(struct nvmf_softc *sc, union ccb *ccb)
{
struct ccb_nvmeio *nvmeio = &ccb->nvmeio;
struct memdesc mem;
struct nvmf_request *req;
struct nvmf_host_qpair *qp;
mtx_lock(&sc->sim_mtx);
if (sc->sim_disconnected) {
mtx_unlock(&sc->sim_mtx);
nvmeio->ccb_h.status = CAM_REQUEUE_REQ;
xpt_done(ccb);
return;
}
if (nvmeio->ccb_h.func_code == XPT_NVME_IO)
qp = nvmf_select_io_queue(sc);
else
qp = sc->admin;
req = nvmf_allocate_request(qp, &nvmeio->cmd, nvmf_ccb_complete,
ccb, M_NOWAIT);
if (req == NULL) {
mtx_unlock(&sc->sim_mtx);
nvmeio->ccb_h.status = CAM_RESRC_UNAVAIL;
xpt_done(ccb);
return;
}
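	/*
	 * A CCB with a data transfer holds two references: one is
	 * released by the CQE callback (nvmf_ccb_complete) and one by
	 * the data completion callback (nvmf_ccb_io_complete).  The
	 * final release in nvmf_ccb_done completes the CCB.
	 */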
if (nvmeio->dxfer_len != 0) {
refcount_init(ccb_refs(ccb), 2);
mem = memdesc_ccb(ccb);
nvmf_capsule_append_data(req->nc, &mem, nvmeio->dxfer_len,
(ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_OUT,
nvmf_ccb_io_complete, ccb);
} else
refcount_init(ccb_refs(ccb), 1);
/*
* Clear spriv_ioerror as it can hold an earlier error if this
* CCB was aborted and has been retried.
*/
ccb->ccb_h.spriv_ioerror = 0;
KASSERT(ccb->ccb_h.status == CAM_REQ_INPROG,
("%s: incoming CCB is not in-progress", __func__));
ccb->ccb_h.status |= CAM_SIM_QUEUED;
nvmf_submit_request(req);
mtx_unlock(&sc->sim_mtx);
}
static void
nvmf_sim_action(struct cam_sim *sim, union ccb *ccb)
{
struct nvmf_softc *sc = cam_sim_softc(sim);
CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE,
("nvmf_sim_action: func= %#x\n",
ccb->ccb_h.func_code));
switch (ccb->ccb_h.func_code) {
case XPT_PATH_INQ: /* Path routing inquiry */
{
struct ccb_pathinq *cpi = &ccb->cpi;
cpi->version_num = 1;
cpi->hba_inquiry = 0;
cpi->target_sprt = 0;
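		/*
		 * PIM_NOSCAN: namespaces are enumerated by explicit
		 * rescans via nvmf_sim_rescan_ns() rather than by an
		 * initial full-bus scan.
		 */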
cpi->hba_misc = PIM_UNMAPPED | PIM_NOSCAN;
cpi->hba_eng_cnt = 0;
cpi->max_target = 0;
cpi->max_lun = sc->cdata->nn;
cpi->async_flags = 0;
cpi->hpath_id = 0;
cpi->initiator_id = 0;
strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN);
strlcpy(cpi->hba_vid, "NVMeoF", HBA_IDLEN);
strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN);
cpi->unit_number = cam_sim_unit(sim);
cpi->bus_id = 0;
/* XXX: Same as iSCSI. */
cpi->base_transfer_speed = 150000;
cpi->protocol = PROTO_NVME;
cpi->protocol_version = sc->vs;
cpi->transport = XPORT_NVMF;
cpi->transport_version = sc->vs;
cpi->xport_specific.nvmf.nsid =
xpt_path_lun_id(ccb->ccb_h.path);
cpi->xport_specific.nvmf.trtype = sc->trtype;
strncpy(cpi->xport_specific.nvmf.dev_name,
device_get_nameunit(sc->dev),
sizeof(cpi->xport_specific.nvmf.dev_name));
cpi->maxio = sc->max_xfer_size;
cpi->hba_vendor = 0;
cpi->hba_device = 0;
cpi->hba_subvendor = 0;
cpi->hba_subdevice = 0;
cpi->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_GET_TRAN_SETTINGS: /* Get transport settings */
{
struct ccb_trans_settings *cts = &ccb->cts;
struct ccb_trans_settings_nvme *nvme;
struct ccb_trans_settings_nvmf *nvmf;
cts->protocol = PROTO_NVME;
cts->protocol_version = sc->vs;
cts->transport = XPORT_NVMF;
cts->transport_version = sc->vs;
nvme = &cts->proto_specific.nvme;
nvme->valid = CTS_NVME_VALID_SPEC;
nvme->spec = sc->vs;
nvmf = &cts->xport_specific.nvmf;
nvmf->valid = CTS_NVMF_VALID_TRTYPE;
nvmf->trtype = sc->trtype;
cts->ccb_h.status = CAM_REQ_CMP;
break;
}
case XPT_SET_TRAN_SETTINGS: /* Set transport settings */
/*
* No transfer settings can be set, but nvme_xpt sends
* this anyway.
*/
ccb->ccb_h.status = CAM_REQ_CMP;
break;
case XPT_NVME_IO: /* Execute the requested I/O */
case XPT_NVME_ADMIN: /* or Admin operation */
nvmf_sim_io(sc, ccb);
return;
default:
/* XXX */
device_printf(sc->dev, "unhandled sim function %#x\n",
ccb->ccb_h.func_code);
ccb->ccb_h.status = CAM_REQ_INVALID;
break;
}
xpt_done(ccb);
}
int
nvmf_init_sim(struct nvmf_softc *sc)
{
struct cam_devq *devq;
int max_trans;
max_trans = sc->max_pending_io * 3 / 4;
devq = cam_simq_alloc(max_trans);
if (devq == NULL) {
device_printf(sc->dev, "Failed to allocate CAM simq\n");
return (ENOMEM);
}
mtx_init(&sc->sim_mtx, "nvmf sim", NULL, MTX_DEF);
sc->sim = cam_sim_alloc(nvmf_sim_action, NULL, "nvme", sc,
device_get_unit(sc->dev), NULL, max_trans, max_trans, devq);
if (sc->sim == NULL) {
device_printf(sc->dev, "Failed to allocate CAM sim\n");
cam_simq_free(devq);
mtx_destroy(&sc->sim_mtx);
return (ENXIO);
}
if (xpt_bus_register(sc->sim, sc->dev, 0) != CAM_SUCCESS) {
device_printf(sc->dev, "Failed to create CAM bus\n");
cam_sim_free(sc->sim, TRUE);
mtx_destroy(&sc->sim_mtx);
return (ENXIO);
}
if (xpt_create_path(&sc->path, NULL, cam_sim_path(sc->sim),
CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) {
device_printf(sc->dev, "Failed to create CAM path\n");
xpt_bus_deregister(cam_sim_path(sc->sim));
cam_sim_free(sc->sim, TRUE);
mtx_destroy(&sc->sim_mtx);
return (ENXIO);
}
return (0);
}
void
nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id)
{
union ccb *ccb;
ccb = xpt_alloc_ccb_nowait();
if (ccb == NULL) {
device_printf(sc->dev,
"unable to alloc CCB for rescan of namespace %u\n", id);
return;
}
/*
* As with nvme_sim, map NVMe namespace IDs onto CAM unit
* LUNs.
*/
if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(sc->sim), 0,
id) != CAM_REQ_CMP) {
device_printf(sc->dev,
"Unable to create path for rescan of namespace %u\n", id);
xpt_free_ccb(ccb);
return;
}
xpt_rescan(ccb);
}
void
nvmf_disconnect_sim(struct nvmf_softc *sc)
{
mtx_lock(&sc->sim_mtx);
sc->sim_disconnected = true;
xpt_freeze_simq(sc->sim, 1);
mtx_unlock(&sc->sim_mtx);
}
void
nvmf_reconnect_sim(struct nvmf_softc *sc)
{
mtx_lock(&sc->sim_mtx);
sc->sim_disconnected = false;
mtx_unlock(&sc->sim_mtx);
xpt_release_simq(sc->sim, 1);
}
void
nvmf_destroy_sim(struct nvmf_softc *sc)
{
xpt_async(AC_LOST_DEVICE, sc->path, NULL);
if (sc->sim_disconnected)
xpt_release_simq(sc->sim, 1);
xpt_free_path(sc->path);
xpt_bus_deregister(cam_sim_path(sc->sim));
cam_sim_free(sc->sim, TRUE);
mtx_destroy(&sc->sim_mtx);
}
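
nvmf_ccb_done() above completes a CCB only on the final reference release, so a CCB with a data transfer tolerates its CQE callback and its data-completion callback firing in either order. A self-contained userland sketch of the same pattern follows; it is illustrative only, with invented names (model_ccb, model_cqe_complete, model_io_complete):

/* Illustrative userland model of the two-reference CCB completion. */
#include <stdatomic.h>
#include <stdio.h>

struct model_ccb {
	atomic_uint refs;	/* 2 if a data transfer is attached, else 1 */
	int cqe_status;
	int io_error;
};

static void
model_ccb_done(struct model_ccb *ccb)
{
	/* Only the caller dropping the last reference completes the CCB. */
	if (atomic_fetch_sub(&ccb->refs, 1) != 1)
		return;
	if (ccb->cqe_status != 0)
		printf("completed with NVMe status %#x\n", ccb->cqe_status);
	else if (ccb->io_error != 0)
		printf("completed with I/O error %d\n", ccb->io_error);
	else
		printf("completed successfully\n");
}

/* Invoked when the CQE for the command arrives. */
static void
model_cqe_complete(struct model_ccb *ccb, int status)
{
	ccb->cqe_status = status;
	model_ccb_done(ccb);
}

/* Invoked when the data transfer finishes (possibly after the CQE). */
static void
model_io_complete(struct model_ccb *ccb, int error)
{
	ccb->io_error = error;
	model_ccb_done(ccb);
}

int
main(void)
{
	struct model_ccb ccb = { .refs = 2 };

	/* The CQE may arrive before the zero-copy buffer is released. */
	model_cqe_complete(&ccb, 0);
	model_io_complete(&ccb, 0);
	return (0);
}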


@@ -0,0 +1,208 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#ifndef __NVMF_VAR_H__
#define __NVMF_VAR_H__
#include <sys/_callout.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/_sx.h>
#include <sys/_task.h>
#include <sys/queue.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf_transport.h>
struct nvmf_aer;
struct nvmf_capsule;
struct nvmf_host_qpair;
struct nvmf_namespace;
typedef void nvmf_request_complete_t(void *, const struct nvme_completion *);
struct nvmf_ivars {
struct nvmf_handoff_host *hh;
struct nvmf_handoff_qpair_params *io_params;
struct nvme_controller_data *cdata;
};
struct nvmf_softc {
device_t dev;
struct nvmf_host_qpair *admin;
struct nvmf_host_qpair **io;
u_int num_io_queues;
enum nvmf_trtype trtype;
struct cam_sim *sim;
struct cam_path *path;
struct mtx sim_mtx;
bool sim_disconnected;
struct nvmf_namespace **ns;
struct nvme_controller_data *cdata;
uint64_t cap;
uint32_t vs;
u_int max_pending_io;
u_long max_xfer_size;
struct cdev *cdev;
/*
* Keep Alive support depends on two timers. The 'tx' timer
* is responsible for sending KeepAlive commands and runs at
* half the timeout interval. The 'rx' timer is responsible
* for detecting an actual timeout.
*
* For efficient support of TKAS, the host does not reschedule
* these timers every time new commands are scheduled.
* Instead, the host sets the *_traffic flags when commands
* are sent and received. The timeout handlers check and
* clear these flags. This does mean it can take up to twice
* the timeout time to detect an AWOL controller.
*/
bool ka_traffic; /* Using TKAS? */
volatile int ka_active_tx_traffic;
struct callout ka_tx_timer;
sbintime_t ka_tx_sbt;
volatile int ka_active_rx_traffic;
struct callout ka_rx_timer;
sbintime_t ka_rx_sbt;
struct sx connection_lock;
struct task disconnect_task;
bool detaching;
u_int num_aer;
struct nvmf_aer *aer;
};
struct nvmf_request {
struct nvmf_host_qpair *qp;
struct nvmf_capsule *nc;
nvmf_request_complete_t *cb;
void *cb_arg;
bool aer;
STAILQ_ENTRY(nvmf_request) link;
};
struct nvmf_completion_status {
struct nvme_completion cqe;
bool done;
bool io_done;
int io_error;
};
static __inline struct nvmf_host_qpair *
nvmf_select_io_queue(struct nvmf_softc *sc)
{
/* TODO: Support multiple queues? */
return (sc->io[0]);
}
static __inline bool
nvmf_cqe_aborted(const struct nvme_completion *cqe)
{
uint16_t status;
status = le16toh(cqe->status);
return (NVME_STATUS_GET_SCT(status) == NVME_SCT_PATH_RELATED &&
NVME_STATUS_GET_SC(status) == NVME_SC_COMMAND_ABORTED_BY_HOST);
}
static __inline void
nvmf_status_init(struct nvmf_completion_status *status)
{
status->done = false;
status->io_done = true;
status->io_error = 0;
}
static __inline void
nvmf_status_wait_io(struct nvmf_completion_status *status)
{
status->io_done = false;
}
#ifdef DRIVER_MODULE
extern driver_t nvme_nvmf_driver;
#endif
#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_NVMF);
#endif
/* nvmf.c */
void nvmf_complete(void *arg, const struct nvme_completion *cqe);
void nvmf_io_complete(void *arg, size_t xfered, int error);
void nvmf_wait_for_reply(struct nvmf_completion_status *status);
int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh);
void nvmf_free_ivars(struct nvmf_ivars *ivars);
void nvmf_disconnect(struct nvmf_softc *sc);
void nvmf_rescan_ns(struct nvmf_softc *sc, uint32_t nsid);
int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt,
bool admin);
/* nvmf_aer.c */
void nvmf_init_aer(struct nvmf_softc *sc);
int nvmf_start_aer(struct nvmf_softc *sc);
void nvmf_destroy_aer(struct nvmf_softc *sc);
/* nvmf_cmd.c */
bool nvmf_cmd_get_property(struct nvmf_softc *sc, uint32_t offset,
uint8_t size, nvmf_request_complete_t *cb, void *cb_arg, int how);
bool nvmf_cmd_set_property(struct nvmf_softc *sc, uint32_t offset,
uint8_t size, uint64_t value, nvmf_request_complete_t *cb, void *cb_arg,
int how);
bool nvmf_cmd_keep_alive(struct nvmf_softc *sc, nvmf_request_complete_t *cb,
void *cb_arg, int how);
bool nvmf_cmd_identify_active_namespaces(struct nvmf_softc *sc, uint32_t id,
struct nvme_ns_list *nslist, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
bool nvmf_cmd_identify_namespace(struct nvmf_softc *sc, uint32_t id,
struct nvme_namespace_data *nsdata, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
bool nvmf_cmd_get_log_page(struct nvmf_softc *sc, uint32_t nsid, uint8_t lid,
uint64_t offset, void *buf, size_t len, nvmf_request_complete_t *req_cb,
void *req_cb_arg, nvmf_io_complete_t *io_cb, void *io_cb_arg, int how);
/* nvmf_ctldev.c */
int nvmf_ctl_load(void);
void nvmf_ctl_unload(void);
/* nvmf_ns.c */
struct nvmf_namespace *nvmf_init_ns(struct nvmf_softc *sc, uint32_t id,
struct nvme_namespace_data *data);
void nvmf_disconnect_ns(struct nvmf_namespace *ns);
void nvmf_reconnect_ns(struct nvmf_namespace *ns);
void nvmf_destroy_ns(struct nvmf_namespace *ns);
bool nvmf_update_ns(struct nvmf_namespace *ns,
struct nvme_namespace_data *data);
/* nvmf_qpair.c */
struct nvmf_host_qpair *nvmf_init_qp(struct nvmf_softc *sc,
enum nvmf_trtype trtype, struct nvmf_handoff_qpair_params *handoff,
const char *name);
void nvmf_shutdown_qp(struct nvmf_host_qpair *qp);
void nvmf_destroy_qp(struct nvmf_host_qpair *qp);
struct nvmf_request *nvmf_allocate_request(struct nvmf_host_qpair *qp,
void *sqe, nvmf_request_complete_t *cb, void *cb_arg, int how);
void nvmf_submit_request(struct nvmf_request *req);
void nvmf_free_request(struct nvmf_request *req);
/* nvmf_sim.c */
int nvmf_init_sim(struct nvmf_softc *sc);
void nvmf_disconnect_sim(struct nvmf_softc *sc);
void nvmf_reconnect_sim(struct nvmf_softc *sc);
void nvmf_destroy_sim(struct nvmf_softc *sc);
void nvmf_sim_rescan_ns(struct nvmf_softc *sc, uint32_t id);
#endif /* !__NVMF_VAR_H__ */
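
The ka_*_traffic flags above drive the traffic-based Keep Alive scheme: the tx handler skips sending a Keep Alive command when other commands went out during the interval, while the rx handler declares the controller dead only if nothing was received at all. The handlers themselves live in nvmf.c and are not part of this hunk; the sketch below is only an illustration of how they could be written against the declarations in this header. The handler and callback names are invented, locking and atomics on the flags are omitted, and the ka_traffic/TKAS distinction is glossed over for brevity:

static void
nvmf_ka_tx_done(void *arg, const struct nvme_completion *cqe)
{
	/* The received CQE itself counts as rx traffic; nothing to do. */
}

static void
nvmf_ka_tx_handler(void *arg)
{
	struct nvmf_softc *sc = arg;

	if (sc->ka_active_tx_traffic == 0) {
		/* No commands sent this interval: send an explicit Keep Alive. */
		if (!nvmf_cmd_keep_alive(sc, nvmf_ka_tx_done, sc, M_NOWAIT))
			device_printf(sc->dev,
			    "failed to allocate Keep Alive command\n");
	} else
		sc->ka_active_tx_traffic = 0;
	callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK);
}

static void
nvmf_ka_rx_handler(void *arg)
{
	struct nvmf_softc *sc = arg;

	if (sc->ka_active_rx_traffic == 0) {
		/* Nothing received for a full interval: controller is AWOL. */
		device_printf(sc->dev,
		    "Keep Alive timeout, disconnecting\n");
		nvmf_disconnect(sc);
		return;
	}
	sc->ka_active_rx_traffic = 0;
	callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK);
}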


@@ -1,4 +1,5 @@
SUBDIR= nvmf_tcp \
SUBDIR= nvmf \
nvmf_tcp \
nvmf_transport
.include <bsd.subdir.mk>


@@ -0,0 +1,13 @@
.PATH: ${SRCTOP}/sys/dev/nvmf/host
KMOD= nvmf
SRCS= nvmf.c \
nvmf_aer.c \
nvmf_cmd.c \
nvmf_ctldev.c \
nvmf_ns.c \
nvmf_qpair.c \
nvmf_sim.c
.include <bsd.kmod.mk>