nvmf: Add infrastructure kernel module for NVMe over Fabrics

nvmf_transport.ko provides routines for managing NVMeoF queue pairs
and capsules.  It provides a glue layer between transports (such as
TCP or RDMA) and an NVMeoF host (initiator) and controller (target).

Unlike the synchronous API exposed to the host and controller by
libnvmf, the kernel's transport layer uses an asynchronous API built
on callbacks.  Upper layers provide callbacks on queue pairs that are
invoked for transport errors (error_cb) or anytime a capsule is
received (receive_cb).

Data transfers for a command are usually associated with a callback
that is invoked once a transfer has finished either due to an error
or successful completion.

For an upper layer that is a host, command capsules are allocated and
populated with an NVMe SQE by calling nvmf_allocate_command.  A data
buffer (described by a struct memdesc) can be associated with a
command capsule before it is transmitted via nvmf_capsule_append_data.
This function accepts a direction (send vs receive) as well as the
data transfer callback.  The host then transmits the command via
nvmf_transmit_capsule.  The host must ensure that the data buffer
described by the 'struct memdesc' remains valid until the data
transfer callback is called.  The queue pair's receive_cb callback
should match received response capsules up with previously transmitted
commands.

For the controller, incoming commands are received via the queue
pair's receive_cb callback.  nvmf_receive_controller_data is used to
retrieve any data from a command (e.g. the data for a WRITE command).
It can be called multiple times to split the data transfer into
smaller sizes.  This function accepts an I/O completion callback that
is invoked once the data transfer has completed.
nvmf_send_controller_data is used to send data to a remote host in
response to a command.  In this case a callback function is not used
but the status is returned synchronously.  Finally, the controller can
allocate a response capsule via nvmf_allocate_response populated with
a supplied CQE and send the response via nvmf_transmit_capsule.

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44711
This commit is contained in:
John Baldwin 2024-05-02 16:28:32 -07:00
parent 2da066ef6d
commit aa1207ea4f
6 changed files with 625 additions and 0 deletions

View file

@ -0,0 +1,344 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2022-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/refcount.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <dev/nvme/nvme.h>
#include <dev/nvmf/nvmf.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/nvmf_transport_internal.h>
/* Transport-independent support for fabrics queue pairs and commands. */
/*
 * A registered transport implementation.  One instance is allocated
 * for each transport module when it is loaded and is linked into
 * nvmf_transports[trtype] until the module is unloaded.
 */
struct nvmf_transport {
/* Operations vector supplied by the transport module. */
struct nvmf_transport_ops *nt_ops;
/* Number of qpairs allocated from this transport and not yet freed. */
volatile u_int nt_active_qpairs;
/* Linkage on the per-transport-type list. */
SLIST_ENTRY(nvmf_transport) nt_link;
};
/*
 * nvmf_transports[nvmf_trtype] is sorted by priority: registration
 * inserts a new transport ahead of the first entry with a strictly
 * lower priority, and nvmf_allocate_qpair tries entries in list order.
 * The array has one list for each transport type up to and including
 * NVMF_TRTYPE_TCP.
 */
static SLIST_HEAD(, nvmf_transport) nvmf_transports[NVMF_TRTYPE_TCP + 1];
/*
 * Protects nvmf_transports[].  Taken shared for lookups and exclusive
 * when a transport is registered or unregistered.
 */
static struct sx nvmf_transports_lock;
/* malloc(9) type used for struct nvmf_transport allocations. */
static MALLOC_DEFINE(M_NVMF_TRANSPORT, "nvmf_xport",
"NVMe over Fabrics transport");
/* Root of the kern.nvmf sysctl tree. */
SYSCTL_NODE(_kern, OID_AUTO, nvmf, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"NVMe over Fabrics");
/*
 * Report whether 'trtype' is a valid index into nvmf_transports[].
 */
static bool
nvmf_supported_trtype(enum nvmf_trtype trtype)
{
	const size_t ntypes = nitems(nvmf_transports);

	return (trtype < ntypes);
}
/*
 * Allocate a queue pair from a transport of type 'trtype'.
 *
 * Each transport registered for 'trtype' is asked in priority order to
 * allocate the qpair; the first one that succeeds owns it and has its
 * active qpair count bumped.  The generic qpair fields are then filled
 * in from the arguments: 'controller' selects the controller vs host
 * role, 'error_cb'/'error_cb_arg' and 'receive_cb'/'receive_cb_arg'
 * are the upper layer's callbacks, and the admin flag is copied from
 * 'params'.
 *
 * Returns the new qpair, or NULL if 'trtype' is unsupported, no
 * transport is registered for it, or every registered transport failed
 * to allocate a qpair.
 */
struct nvmf_qpair *
nvmf_allocate_qpair(enum nvmf_trtype trtype, bool controller,
    const struct nvmf_handoff_qpair_params *params,
    nvmf_qpair_error_t *error_cb, void *error_cb_arg,
    nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg)
{
	struct nvmf_transport *nt;
	struct nvmf_qpair *qp;

	if (!nvmf_supported_trtype(trtype))
		return (NULL);

	/*
	 * qp must start out NULL: if no transport is registered for
	 * this type the loop body never runs and qp would otherwise be
	 * tested below while uninitialized.
	 */
	qp = NULL;
	sx_slock(&nvmf_transports_lock);
	SLIST_FOREACH(nt, &nvmf_transports[trtype], nt_link) {
		qp = nt->nt_ops->allocate_qpair(controller, params);
		if (qp != NULL) {
			/* Hold the transport until nvmf_free_qpair. */
			refcount_acquire(&nt->nt_active_qpairs);
			break;
		}
	}
	sx_sunlock(&nvmf_transports_lock);
	if (qp == NULL)
		return (NULL);

	qp->nq_transport = nt;
	qp->nq_ops = nt->nt_ops;
	qp->nq_controller = controller;
	qp->nq_error = error_cb;
	qp->nq_error_arg = error_cb_arg;
	qp->nq_receive = receive_cb;
	qp->nq_receive_arg = receive_cb_arg;
	qp->nq_admin = params->admin;
	return (qp);
}
/*
 * Release a qpair through its transport and drop the reference the
 * qpair held on that transport.  When the last reference goes away,
 * wake any thread sleeping on the transport (the MOD_UNLOAD path in
 * nvmf_transport_module_handler sleeps on it).
 */
void
nvmf_free_qpair(struct nvmf_qpair *qp)
{
	/* Fetched up front; qp is not referenced after free_qpair(). */
	struct nvmf_transport *transport = qp->nq_transport;

	qp->nq_ops->free_qpair(qp);
	if (!refcount_release(&transport->nt_active_qpairs))
		return;
	wakeup(transport);
}
/*
 * Allocate a command capsule on 'qp' and initialize it with a copy of
 * the submission queue entry at 'sqe'.  'how' must be M_WAITOK or
 * M_NOWAIT and is passed through to the transport's allocator.
 *
 * Returns NULL if the transport could not allocate a capsule.
 */
struct nvmf_capsule *
nvmf_allocate_command(struct nvmf_qpair *qp, const void *sqe, int how)
{
	struct nvmf_capsule *cmd;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));

	cmd = qp->nq_ops->allocate_capsule(qp, how);
	if (cmd != NULL) {
		cmd->nc_qpair = qp;
		cmd->nc_qe_len = sizeof(cmd->nc_sqe);
		memcpy(&cmd->nc_sqe, sqe, sizeof(cmd->nc_sqe));

		/* 4.2 of NVMe base spec: Fabrics always uses SGL. */
		cmd->nc_sqe.fuse &= ~NVMEM(NVME_CMD_PSDT);
		cmd->nc_sqe.fuse |= NVMEF(NVME_CMD_PSDT, NVME_PSDT_SGL);
	}
	return (cmd);
}
/*
 * Allocate a response capsule on 'qp' and initialize it with a copy of
 * the completion queue entry at 'cqe'.  'how' must be M_WAITOK or
 * M_NOWAIT and is passed through to the transport's allocator.
 *
 * Returns NULL if the transport could not allocate a capsule.
 */
struct nvmf_capsule *
nvmf_allocate_response(struct nvmf_qpair *qp, const void *cqe, int how)
{
	struct nvmf_capsule *rsp;

	KASSERT(how == M_WAITOK || how == M_NOWAIT,
	    ("%s: invalid how", __func__));

	rsp = qp->nq_ops->allocate_capsule(qp, how);
	if (rsp != NULL) {
		rsp->nc_qpair = qp;
		rsp->nc_qe_len = sizeof(rsp->nc_cqe);
		memcpy(&rsp->nc_cqe, cqe, sizeof(rsp->nc_cqe));
	}
	return (rsp);
}
/*
 * Attach a data buffer to a capsule.  'mem' describes the buffer (the
 * descriptor is copied into the capsule), 'len' is its length, 'send'
 * records the transfer direction, and 'complete_cb'/'cb_arg' are
 * stored as the I/O completion callback.
 *
 * Returns EBUSY if the capsule already has a non-empty data buffer
 * attached, otherwise 0.
 */
int
nvmf_capsule_append_data(struct nvmf_capsule *nc, struct memdesc *mem,
    size_t len, bool send, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	struct nvmf_io_request *io = &nc->nc_data;

	if (io->io_len != 0)
		return (EBUSY);

	io->io_mem = *mem;
	io->io_len = len;
	io->io_complete = complete_cb;
	io->io_complete_arg = cb_arg;
	nc->nc_send_data = send;
	return (0);
}
void
nvmf_free_capsule(struct nvmf_capsule *nc)
{
nc->nc_qpair->nq_ops->free_capsule(nc);
}
int
nvmf_transmit_capsule(struct nvmf_capsule *nc)
{
return (nc->nc_qpair->nq_ops->transmit_capsule(nc));
}
/*
 * Abort the data transfer attached to a capsule, if any.  The
 * capsule's completion callback is invoked with zero bytes transferred
 * and the supplied error.  Capsules without a data buffer are left
 * untouched.
 */
void
nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error)
{
	if (nc->nc_data.io_len == 0)
		return;
	nvmf_complete_io_request(&nc->nc_data, 0, error);
}
/*
 * Return a pointer to the submission queue entry stored in a command
 * capsule.  Asserts that the capsule holds a command.
 */
void *
nvmf_capsule_sqe(struct nvmf_capsule *nc)
{
	void *sqe;

	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));
	sqe = &nc->nc_sqe;
	return (sqe);
}
/*
 * Return a pointer to the completion queue entry stored in a response
 * capsule.  Asserts that the capsule holds a response.
 */
void *
nvmf_capsule_cqe(struct nvmf_capsule *nc)
{
	void *cqe;

	KASSERT(nc->nc_qe_len == sizeof(struct nvme_completion),
	    ("%s: capsule %p is not a response capsule", __func__, nc));
	cqe = &nc->nc_cqe;
	return (cqe);
}
/*
 * Validate a received command capsule.  A command whose PSDT field
 * does not select SGLs is rejected with NVME_SC_INVALID_FIELD;
 * otherwise the verdict of the transport's own validation method is
 * returned.
 */
uint8_t
nvmf_validate_command_capsule(struct nvmf_capsule *nc)
{
	KASSERT(nc->nc_qe_len == sizeof(struct nvme_command),
	    ("%s: capsule %p is not a command capsule", __func__, nc));

	if (NVMEV(NVME_CMD_PSDT, nc->nc_sqe.fuse) == NVME_PSDT_SGL)
		return (nc->nc_qpair->nq_ops->validate_command_capsule(nc));
	return (NVME_SC_INVALID_FIELD);
}
size_t
nvmf_capsule_data_len(const struct nvmf_capsule *nc)
{
return (nc->nc_qpair->nq_ops->capsule_data_len(nc));
}
/*
 * Start receiving 'len' bytes of data for command capsule 'nc',
 * beginning at 'data_offset', into the buffer described by 'mem'.
 * 'complete_cb'/'cb_arg' are packaged into the I/O request handed to
 * the transport.  The return value is whatever the transport's
 * receive_controller_data method returns.
 *
 * NOTE(review): the request is built on this function's stack, so the
 * transport presumably copies it before returning -- confirm against
 * the transport implementations.
 */
int
nvmf_receive_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
    struct memdesc *mem, size_t len, nvmf_io_complete_t *complete_cb,
    void *cb_arg)
{
	struct nvmf_transport_ops *ops = nc->nc_qpair->nq_ops;
	struct nvmf_io_request req = {
		.io_mem = *mem,
		.io_len = len,
		.io_complete = complete_cb,
		.io_complete_arg = cb_arg,
	};

	return (ops->receive_controller_data(nc, data_offset, &req));
}
u_int
nvmf_send_controller_data(struct nvmf_capsule *nc, uint32_t data_offset,
struct mbuf *m, size_t len)
{
MPASS(m_length(m, NULL) == len);
return (nc->nc_qpair->nq_ops->send_controller_data(nc, data_offset, m,
len));
}
/*
 * Module event handler shared by every transport module declared with
 * NVMF_TRANSPORT().  'arg' is the transport's struct
 * nvmf_transport_ops.
 *
 * MOD_LOAD:    Register the transport on the list for its transport
 *              type, keeping the list ordered by descending priority.
 *              Fails with EINVAL for an unsupported transport type.
 * MOD_QUIESCE: Return EBUSY while the transport still has active
 *              qpairs, 0 otherwise (including when the transport is
 *              not registered).
 * MOD_UNLOAD:  Unlink the transport so no new qpairs can be allocated
 *              from it, wait for the remaining qpairs to be freed, and
 *              release the registration.  Returns the sleep error if
 *              the wait is interrupted.
 *
 * Any other event returns EOPNOTSUPP.
 */
int
nvmf_transport_module_handler(struct module *mod, int what, void *arg)
{
	struct nvmf_transport_ops *ops = arg;
	struct nvmf_transport *nt, *nt2, *prev;
	int error;

	switch (what) {
	case MOD_LOAD:
		if (!nvmf_supported_trtype(ops->trtype)) {
			printf("NVMF: Unsupported transport %u\n",
			    ops->trtype);
			return (EINVAL);
		}

		nt = malloc(sizeof(*nt), M_NVMF_TRANSPORT, M_WAITOK | M_ZERO);
		nt->nt_ops = ops;

		sx_xlock(&nvmf_transports_lock);

		/*
		 * Find the last entry whose priority is not lower than
		 * ours.  On an empty list the loop does not run and
		 * prev stays NULL, so the new entry becomes the head.
		 */
		prev = NULL;
		SLIST_FOREACH(nt2, &nvmf_transports[ops->trtype], nt_link) {
			if (ops->priority > nt2->nt_ops->priority)
				break;
			prev = nt2;
		}
		if (prev == NULL)
			SLIST_INSERT_HEAD(&nvmf_transports[ops->trtype], nt,
			    nt_link);
		else
			SLIST_INSERT_AFTER(prev, nt, nt_link);
		sx_xunlock(&nvmf_transports_lock);
		return (0);

	case MOD_QUIESCE:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_slock(&nvmf_transports_lock);
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
		}
		if (nt == NULL) {
			sx_sunlock(&nvmf_transports_lock);
			return (0);
		}
		if (nt->nt_active_qpairs != 0) {
			sx_sunlock(&nvmf_transports_lock);
			return (EBUSY);
		}
		sx_sunlock(&nvmf_transports_lock);
		return (0);

	case MOD_UNLOAD:
		if (!nvmf_supported_trtype(ops->trtype))
			return (0);

		sx_xlock(&nvmf_transports_lock);
		prev = NULL;
		SLIST_FOREACH(nt, &nvmf_transports[ops->trtype], nt_link) {
			if (nt->nt_ops == ops)
				break;
			prev = nt;
		}
		if (nt == NULL) {
			/*
			 * Never registered (or already removed).  There
			 * is no transport object to inspect here, so
			 * nt must not be dereferenced.
			 */
			sx_xunlock(&nvmf_transports_lock);
			return (0);
		}

		if (prev == NULL)
			SLIST_REMOVE_HEAD(&nvmf_transports[ops->trtype],
			    nt_link);
		else
			SLIST_REMOVE_AFTER(prev, nt_link);

		/*
		 * Wait for nvmf_free_qpair to drop the last reference.
		 *
		 * NOTE(review): nvmf_free_qpair calls wakeup() without
		 * holding nvmf_transports_lock; confirm a wakeup cannot
		 * be missed between the check and the sleep.
		 */
		error = 0;
		while (nt->nt_active_qpairs != 0 && error == 0)
			error = sx_sleep(nt, &nvmf_transports_lock, PCATCH,
			    "nftunld", 0);
		sx_xunlock(&nvmf_transports_lock);

		/*
		 * NOTE(review): if the sleep was interrupted the
		 * transport has already been unlinked but is neither
		 * freed nor re-registered; confirm this is intended.
		 */
		if (error != 0)
			return (error);
		free(nt, M_NVMF_TRANSPORT);
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}
/*
 * Module event handler for nvmf_transport itself.  On load the
 * per-type transport lists and the lock protecting them are
 * initialized; every other event is rejected with EOPNOTSUPP.
 */
static int
nvmf_transport_modevent(module_t mod __unused, int what, void *arg __unused)
{
	u_int trtype;

	if (what != MOD_LOAD)
		return (EOPNOTSUPP);

	for (trtype = 0; trtype < nitems(nvmf_transports); trtype++)
		SLIST_INIT(&nvmf_transports[trtype]);
	sx_init(&nvmf_transports_lock, "nvmf transports");
	return (0);
}
/*
 * Module registration for nvmf_transport.  The module is declared at
 * SI_SUB_DRIVERS / SI_ORDER_FIRST while transports declared with
 * NVMF_TRANSPORT() use SI_ORDER_ANY.
 * NOTE(review): presumably this ordering ensures the lists and lock
 * are initialized before any transport registers -- confirm.
 */
static moduledata_t nvmf_transport_mod = {
"nvmf_transport",
nvmf_transport_modevent,
0
};
DECLARE_MODULE(nvmf_transport, nvmf_transport_mod, SI_SUB_DRIVERS,
SI_ORDER_FIRST);
/* Version 1; transports depend on exactly this version via MODULE_DEPEND. */
MODULE_VERSION(nvmf_transport, 1);

View file

@ -0,0 +1,140 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2022-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#ifndef __NVMF_TRANSPORT_H__
#define __NVMF_TRANSPORT_H__
/*
* Interface used by the Fabrics host (initiator) and controller
* (target) to send and receive capsules and associated data.
*/
#include <sys/sysctl.h>
#include <dev/nvmf/nvmf_proto.h>
/* Forward declarations; this header only uses pointers to these types. */
struct mbuf;
struct memdesc;
struct nvmf_capsule;
struct nvmf_connection;
struct nvmf_qpair;
struct nvmf_handoff_qpair_params;
/* Declaration of the kern.nvmf sysctl node for use by other files. */
SYSCTL_DECL(_kern_nvmf);
/*
 * Callback to invoke when an error occurs on a qpair.  The first
 * parameter is the callback argument supplied when the qpair was
 * allocated.  The last parameter is an error value.  If the error
 * value is zero, the qpair has been closed at the transport level
 * rather than a transport error occurring.
 */
typedef void nvmf_qpair_error_t(void *, int);
/*
 * Callback to invoke when a capsule is received.  The first parameter
 * is the callback argument supplied when the qpair was allocated; the
 * second is the received capsule.
 */
typedef void nvmf_capsule_receive_t(void *, struct nvmf_capsule *);
/*
 * Callback to invoke when an I/O request has completed.  The first
 * parameter is the callback argument supplied with the request.  The
 * second parameter is the amount of data transferred.  The last
 * parameter is an error value which is non-zero if the request did
 * not complete successfully.  A request with an error may complete
 * partially.
 */
typedef void nvmf_io_complete_t(void *, size_t, int);
/*
 * A queue pair represents either an Admin or I/O
 * submission/completion queue pair.  The params contains negotiated
 * values passed in from userland.
 *
 * Unlike libnvmf in userland, the kernel transport interface does not
 * have any notion of an association.  Instead, qpairs are
 * independent.
 *
 * nvmf_allocate_qpair returns NULL if the transport type is not
 * supported or if no registered transport of that type was able to
 * allocate a qpair.  'controller' selects the controller (target)
 * role rather than the host role.  The error and receive callbacks
 * are invoked with their respective '_arg' values.
 */
struct nvmf_qpair *nvmf_allocate_qpair(enum nvmf_trtype trtype,
bool controller, const struct nvmf_handoff_qpair_params *params,
nvmf_qpair_error_t *error_cb, void *error_cb_arg,
nvmf_capsule_receive_t *receive_cb, void *receive_cb_arg);
/* Frees a qpair returned by nvmf_allocate_qpair. */
void nvmf_free_qpair(struct nvmf_qpair *qp);
/*
 * Capsules are either commands (host -> controller) or responses
 * (controller -> host).  A data buffer may be associated with a
 * command capsule.  Transmitted data is not copied by this API but
 * instead must be preserved until the completion callback is invoked
 * to indicate capsule transmission has completed.
 *
 * The allocation functions copy the supplied SQE or CQE into the new
 * capsule and return NULL if the transport cannot allocate one.
 * 'how' must be either M_WAITOK or M_NOWAIT.
 */
struct nvmf_capsule *nvmf_allocate_command(struct nvmf_qpair *qp,
const void *sqe, int how);
struct nvmf_capsule *nvmf_allocate_response(struct nvmf_qpair *qp,
const void *cqe, int how);
void nvmf_free_capsule(struct nvmf_capsule *nc);
/*
 * Associates a data buffer with a capsule.  'send' gives the
 * direction of the transfer.  Returns EBUSY if the capsule already
 * has a non-empty data buffer, otherwise 0.
 */
int nvmf_capsule_append_data(struct nvmf_capsule *nc,
struct memdesc *mem, size_t len, bool send,
nvmf_io_complete_t *complete_cb, void *cb_arg);
int nvmf_transmit_capsule(struct nvmf_capsule *nc);
/*
 * If a data buffer is associated with the capsule, invokes its
 * completion callback with zero bytes transferred and 'error'.
 */
void nvmf_abort_capsule_data(struct nvmf_capsule *nc, int error);
/*
 * Accessors for the SQE of a command capsule and the CQE of a
 * response capsule.
 */
void *nvmf_capsule_sqe(struct nvmf_capsule *nc);
void *nvmf_capsule_cqe(struct nvmf_capsule *nc);
/* Controller-specific APIs. */
/*
 * A controller calls this function to check for any
 * transport-specific errors (invalid fields) in a received command
 * capsule.  This function returns a generic command status value:
 * NVME_SC_SUCCESS if no error is found.  A command that does not use
 * SGLs is rejected with NVME_SC_INVALID_FIELD before the transport is
 * consulted.
 */
uint8_t nvmf_validate_command_capsule(struct nvmf_capsule *nc);
/*
 * A controller calls this function to query the amount of data
 * associated with a command capsule.  The received command capsule
 * should be passed in 'nc'.
 */
size_t nvmf_capsule_data_len(const struct nvmf_capsule *nc);
/*
 * A controller calls this function to receive data associated with a
 * command capsule (e.g. the data for a WRITE command).  This can
 * either return in-capsule data or fetch data from the host
 * (e.g. using a R2T PDU over TCP).  The received command capsule
 * should be passed in 'nc'.  The received data is stored in 'mem'.
 * 'data_offset' and 'len' select the portion of the command's data to
 * receive.  If this function returns success, then the callback will
 * be invoked once the operation has completed.  Note that the
 * callback might be invoked before this function returns.
 */
int nvmf_receive_controller_data(struct nvmf_capsule *nc,
uint32_t data_offset, struct memdesc *mem, size_t len,
nvmf_io_complete_t *complete_cb, void *cb_arg);
/*
 * A controller calls this function to send data in response to a
 * command prior to sending a response capsule.  If an error occurs,
 * the function returns a generic status completion code to be sent in
 * the following CQE.  Note that the transfer might send a subset of
 * the data requested by nc.  If the transfer succeeds, this function
 * can return one of the following values:
 *
 * - NVME_SC_SUCCESS: The transfer has completed successfully and the
 * caller should send a success CQE in a response capsule.
 *
 * - NVMF_SUCCESS_SENT: The transfer has completed successfully and
 * the transport layer has sent an implicit success CQE to the
 * remote host (e.g. the SUCCESS flag for TCP).  The caller should
 * not send a response capsule.
 *
 * - NVMF_MORE: The transfer has completed successfully, but the
 * transfer did not complete the data buffer.
 *
 * The mbuf chain in 'm' is consumed by this function even if an error
 * is returned.  'len' must equal the total length of the mbuf chain.
 */
u_int nvmf_send_controller_data(struct nvmf_capsule *nc,
uint32_t data_offset, struct mbuf *m, size_t len);
/*
 * Additional return values of nvmf_send_controller_data.
 * NOTE(review): these lie above the 8-bit range, presumably so they
 * cannot collide with generic NVMe status codes -- confirm.
 */
#define NVMF_SUCCESS_SENT 0x100
#define NVMF_MORE 0x101
#endif /* !__NVMF_TRANSPORT_H__ */

View file

@ -0,0 +1,128 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2022-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#ifndef __NVMF_TRANSPORT_INTERNAL_H__
#define __NVMF_TRANSPORT_INTERNAL_H__
#include <sys/memdesc.h>
/*
* Interface between the transport-independent APIs in
* nvmf_transport.c and individual transports.
*/
struct module;
struct nvmf_io_request;
/*
 * Methods and attributes supplied by a transport.  The
 * transport-independent functions in nvmf_transport.c dispatch to
 * these methods through the qpair's nq_ops pointer.
 */
struct nvmf_transport_ops {
/* Queue pair management. */
/* Returns NULL if this transport cannot provide the qpair. */
struct nvmf_qpair *(*allocate_qpair)(bool controller,
const struct nvmf_handoff_qpair_params *params);
void (*free_qpair)(struct nvmf_qpair *qp);
/* Capsule operations. */
/* 'how' is M_WAITOK or M_NOWAIT; may return NULL. */
struct nvmf_capsule *(*allocate_capsule)(struct nvmf_qpair *qp,
int how);
void (*free_capsule)(struct nvmf_capsule *nc);
int (*transmit_capsule)(struct nvmf_capsule *nc);
uint8_t (*validate_command_capsule)(struct nvmf_capsule *nc);
/* Transferring controller data. */
size_t (*capsule_data_len)(const struct nvmf_capsule *nc);
int (*receive_controller_data)(struct nvmf_capsule *nc,
uint32_t data_offset, struct nvmf_io_request *io);
u_int (*send_controller_data)(struct nvmf_capsule *nc,
uint32_t data_offset, struct mbuf *m, size_t len);
/* Transport type; selects the list this transport is registered on. */
enum nvmf_trtype trtype;
/* Transports with a higher priority are tried first for a given type. */
int priority;
};
/*
 * Either an Admin or I/O Submission/Completion Queue pair.  The
 * fields below are filled in by nvmf_allocate_qpair after the
 * transport's allocate_qpair method returns the structure.
 */
struct nvmf_qpair {
/* Transport that allocated this qpair and its methods. */
struct nvmf_transport *nq_transport;
struct nvmf_transport_ops *nq_ops;
/* True if the qpair was allocated for the controller role. */
bool nq_controller;
/* Callback to invoke for a received capsule. */
nvmf_capsule_receive_t *nq_receive;
void *nq_receive_arg;
/* Callback to invoke for an error. */
nvmf_qpair_error_t *nq_error;
void *nq_error_arg;
/* Copied from the 'admin' member of the handoff parameters. */
bool nq_admin;
};
/* A data transfer and the callback to invoke when it completes. */
struct nvmf_io_request {
/*
 * Data buffer contains io_len bytes in the backing store
 * described by io_mem.
 */
struct memdesc io_mem;
size_t io_len;
/* Completion callback and its first argument. */
nvmf_io_complete_t *io_complete;
void *io_complete_arg;
};
/*
 * Fabrics Command and Response Capsules.  The Fabrics host
 * (initiator) and controller (target) drivers work with capsules that
 * are transmitted and received by a specific transport.
 */
struct nvmf_capsule {
/* Queue pair the capsule was allocated on. */
struct nvmf_qpair *nc_qpair;
/* Either a SQE or CQE. */
union {
struct nvme_command nc_sqe;
struct nvme_completion nc_cqe;
};
/*
 * Size of the queue entry stored above; the accessors compare it
 * against the SQE and CQE sizes to tell commands from responses.
 */
int nc_qe_len;
/*
 * Is SQHD in received capsule valid? False for locally-
 * synthesized responses.
 */
bool nc_sqhd_valid;
/* Direction recorded by nvmf_capsule_append_data ('send' argument). */
bool nc_send_data;
/* Associated data buffer; io_len is zero when none is attached. */
struct nvmf_io_request nc_data;
};
static void __inline
nvmf_qpair_error(struct nvmf_qpair *nq, int error)
{
nq->nq_error(nq->nq_error_arg, error);
}
static void __inline
nvmf_capsule_received(struct nvmf_qpair *nq, struct nvmf_capsule *nc)
{
nq->nq_receive(nq->nq_receive_arg, nc);
}
static void __inline
nvmf_complete_io_request(struct nvmf_io_request *io, size_t xfered, int error)
{
io->io_complete(io->io_complete_arg, xfered, error);
}
/* Common module event handler used by all transport modules. */
int nvmf_transport_module_handler(struct module *, int, void *);
/*
 * Declares a kernel module named "nvmf/<name>" for a transport.
 * 'ops' is the transport's struct nvmf_transport_ops; its address is
 * passed to nvmf_transport_module_handler as the module argument.
 * The module depends on version 1 of nvmf_transport.
 */
#define NVMF_TRANSPORT(name, ops) \
static moduledata_t nvmf_transport_##name##_mod = { \
"nvmf/" #name, \
nvmf_transport_module_handler, \
&(ops) \
}; \
DECLARE_MODULE(nvmf_transport_##name, nvmf_transport_##name##_mod, \
SI_SUB_DRIVERS, SI_ORDER_ANY); \
MODULE_DEPEND(nvmf_transport_##name, nvmf_transport, 1, 1, 1)
#endif /* !__NVMF_TRANSPORT_INTERNAL_H__ */

View file

@ -296,6 +296,7 @@ SUBDIR= \
nvd \
${_nvdimm} \
nvme \
nvmf \
${_nvram} \
oce \
${_ocs_fc} \

View file

@ -0,0 +1,3 @@
# NVMe over Fabrics kernel modules to build.
SUBDIR= nvmf_transport
.include <bsd.subdir.mk>

View file

@ -0,0 +1,9 @@
# Builds nvmf_transport.ko from the sources in sys/dev/nvmf.
.PATH: ${SRCTOP}/sys/dev/nvmf
KMOD= nvmf_transport
SRCS= nvmf_transport.c
# NOTE(review): presumably exports all global symbols so that transport,
# host and controller modules can link against this module -- confirm
# against bsd.kmod.mk.
EXPORT_SYMS= YES
.include <bsd.kmod.mk>