nvmft: The in-kernel NVMe over Fabrics controller

This is the server (target in SCSI terms) for NVMe over Fabrics.
Userland is responsible for accepting a new queue pair and receiving
the initial Connect command before handing the queue pair off via an
ioctl to this CTL frontend.

This frontend exposes CTL LUNs as NVMe namespaces to remote hosts.
Users can add LUNs to CTL that can be shared via either iSCSI or
NVMeoF.
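
A LUN shared this way might be created as follows (a sketch only; the
block-backend path is illustrative):

    ctladm create -b block -o file=/dev/zvol/tank/vol0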

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44726
John Baldwin 2024-05-02 16:34:45 -07:00
parent 51346bd594
commit a15f7c96a2
10 changed files with 2893 additions and 2 deletions

@@ -410,6 +410,7 @@ MAN= aac.4 \
nvme.4 \
nvmf.4 \
nvmf_tcp.4 \
nvmft.4 \
${_nvram.4} \
oce.4 \
ocs_fc.4 \

share/man/man4/nvmft.4 (new file)

@@ -0,0 +1,85 @@
.\"
.\" SPDX-License-Identifier: BSD-2-Clause
.\"
.\" Copyright (c) 2024 Chelsio Communications, Inc.
.\"
.Dd May 2, 2024
.Dt NVMFT 4
.Os
.Sh NAME
.Nm nvmft
.Nd "NVM Express over Fabrics CAM Target Layer frontend"
.Sh SYNOPSIS
To compile the subsystem into the kernel,
place the following lines in the
kernel configuration file:
.Bd -ragged -offset indent
.Cd "device nvmft"
.Cd "device ctl"
.Ed
.Pp
Alternatively, to load the subsystem as a
module at boot time, place the following line in
.Xr loader.conf 5 :
.Bd -literal -offset indent
nvmft_load="YES"
.Ed
.Sh DESCRIPTION
The
.Nm
driver provides the kernel component of an NVM Express over Fabrics
controller.
The NVMeoF controller is the server exporting namespaces backed by
local files and volumes to remote hosts.
.Nm
follows the dynamic controller model and creates a new dynamic controller
for each association.
.Pp
.Nm
is implemented as a
.Xr ctl 4
frontend and exports CAM Target Layer LUNs as namespaces to remote hosts.
LUNs can be configured via
.Xr ctladm 8 .
.Pp
Associations between the local controller and remote hosts are managed
using both the
.Xr nvmfd 8
daemon and the
.Xr ctladm 8
utility.
The
.Xr nvmfd 8
daemon listens for new associations and handles transport-specific
negotiation before handing off connected queue pairs to
.Nm ,
which associates queue pairs with a suitable controller instance.
The
.Cm nvlist
.Xr ctladm 8
command lists active controllers.
The
.Cm nvterminate
command terminates one or more associations between a local controller
and a remote host.
.Pp
Associations require a supported transport, such as
.Xr nvmf_tcp 4
for associations over TCP/IP.
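.Sh EXAMPLES
Load the transport and target modules at runtime and list active
controllers (a minimal sketch, assuming an association has already
been handed off by
.Xr nvmfd 8 ):
.Bd -literal -offset indent
kldload nvmft nvmf_tcp
ctladm nvlist
.Ed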
.Sh SEE ALSO
.Xr ctl 4 ,
.Xr nvmf 4 ,
.Xr nvmf_tcp 4 ,
.Xr ctladm 8 ,
.Xr nvmfd 8
.Sh HISTORY
The
.Nm
module first appeared in
.Fx 15.0 .
.Sh AUTHORS
The
.Nm
subsystem was developed by
.An John Baldwin Aq Mt jhb@FreeBSD.org
under sponsorship from Chelsio Communications, Inc.

@@ -1677,6 +1677,7 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
#
# nvme: PCI-express NVM Express host controllers
# nvmf: NVM Express over Fabrics host
# nvmft: NVM Express over Fabrics CAM Target Layer frontend
# nvmf_tcp: TCP transport for NVM Express over Fabrics
# nda: CAM NVMe disk driver
# nvd: non-CAM NVMe disk driver
@@ -1684,6 +1685,7 @@ device mrsas # LSI/Avago MegaRAID SAS/SATA, 6Gb/s and 12Gb/s
device nvme # PCI-express NVMe host driver
options NVME_USE_NVD=1 # Use nvd(4) instead of the CAM nda(4) driver
device nvmf # NVMeoF host driver
device nvmft # NVMeoF ctl(4) frontend
device nvmf_tcp # NVMeoF TCP transport
device nda # NVMe direct access devices (aka disks)
device nvd # expose NVMe namespaces as disks, depends on nvme

@@ -2535,6 +2535,10 @@ dev/nvme/nvme_test.c optional nvme
dev/nvme/nvme_util.c optional nvme
dev/nvmem/nvmem.c optional nvmem fdt
dev/nvmem/nvmem_if.m optional nvmem
dev/nvmf/controller/ctl_frontend_nvmf.c optional nvmft
dev/nvmf/controller/nvmft_controller.c optional nvmft
dev/nvmf/controller/nvmft_subr.c optional nvmft
dev/nvmf/controller/nvmft_qpair.c optional nvmft
dev/nvmf/host/nvmf.c optional nvmf
dev/nvmf/host/nvmf_aer.c optional nvmf
dev/nvmf/host/nvmf_cmd.c optional nvmf
@@ -2543,7 +2547,7 @@ dev/nvmf/host/nvmf_ns.c optional nvmf
dev/nvmf/host/nvmf_qpair.c optional nvmf
dev/nvmf/host/nvmf_sim.c optional nvmf
dev/nvmf/nvmf_tcp.c optional nvmf_tcp
dev/nvmf/nvmf_transport.c optional nvmf
dev/nvmf/nvmf_transport.c optional nvmf | nvmft
dev/oce/oce_hw.c optional oce pci
dev/oce/oce_if.c optional oce pci
dev/oce/oce_mbox.c optional oce pci

File diff suppressed because it is too large.

File diff suppressed because it is too large.

@@ -0,0 +1,361 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#include <sys/types.h>
#include <sys/_bitset.h>
#include <sys/bitset.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <dev/nvmf/nvmf_transport.h>
#include <dev/nvmf/controller/nvmft_var.h>
/*
 * A bitmask of command ID values.  This is used to detect duplicate
 * commands with the same ID.  The set covers all 2^16 possible CIDs,
 * costing 8 KiB of bits per queue pair.
 */
#define NUM_CIDS (UINT16_MAX + 1)
BITSET_DEFINE(cidset, NUM_CIDS);
struct nvmft_qpair {
struct nvmft_controller *ctrlr;
struct nvmf_qpair *qp;
struct cidset *cids;
bool admin;
bool sq_flow_control;
uint16_t qid;
u_int qsize;
uint16_t sqhd;
uint16_t sqtail;
volatile u_int qp_refs; /* Internal references on 'qp'. */
struct mtx lock;
char name[16];
};
static int _nvmft_send_generic_error(struct nvmft_qpair *qp,
struct nvmf_capsule *nc, uint8_t sc_status);
static void
nvmft_qpair_error(void *arg, int error)
{
struct nvmft_qpair *qp = arg;
struct nvmft_controller *ctrlr = qp->ctrlr;
/*
* XXX: The Linux TCP initiator sends a RST immediately after
* the FIN, so treat ECONNRESET as plain EOF to avoid spurious
* errors on shutdown.
*/
if (error == ECONNRESET)
error = 0;
if (error != 0)
nvmft_printf(ctrlr, "error %d on %s\n", error, qp->name);
nvmft_controller_error(ctrlr, qp, error);
}
static void
nvmft_receive_capsule(void *arg, struct nvmf_capsule *nc)
{
struct nvmft_qpair *qp = arg;
struct nvmft_controller *ctrlr = qp->ctrlr;
const struct nvme_command *cmd;
uint8_t sc_status;
cmd = nvmf_capsule_sqe(nc);
if (ctrlr == NULL) {
printf("NVMFT: %s received CID %u opcode %u on newborn queue\n",
qp->name, le16toh(cmd->cid), cmd->opc);
nvmf_free_capsule(nc);
return;
}
sc_status = nvmf_validate_command_capsule(nc);
if (sc_status != NVME_SC_SUCCESS) {
_nvmft_send_generic_error(qp, nc, sc_status);
nvmf_free_capsule(nc);
return;
}
/*
 * Don't bother byte-swapping CID; it is only used as an opaque
 * index into the bitset, so any consistent byte order works.
 */
if (BIT_TEST_SET_ATOMIC(NUM_CIDS, cmd->cid, qp->cids)) {
_nvmft_send_generic_error(qp, nc, NVME_SC_COMMAND_ID_CONFLICT);
nvmf_free_capsule(nc);
return;
}
if (qp->admin)
nvmft_handle_admin_command(ctrlr, nc);
else
nvmft_handle_io_command(qp, qp->qid, nc);
}
struct nvmft_qpair *
nvmft_qpair_init(enum nvmf_trtype trtype,
const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
const char *name)
{
struct nvmft_qpair *qp;
qp = malloc(sizeof(*qp), M_NVMFT, M_WAITOK | M_ZERO);
qp->admin = handoff->admin;
qp->sq_flow_control = handoff->sq_flow_control;
qp->qsize = handoff->qsize;
qp->qid = qid;
qp->sqhd = handoff->sqhd;
qp->sqtail = handoff->sqtail;
strlcpy(qp->name, name, sizeof(qp->name));
mtx_init(&qp->lock, "nvmft qp", NULL, MTX_DEF);
qp->cids = BITSET_ALLOC(NUM_CIDS, M_NVMFT, M_WAITOK | M_ZERO);
qp->qp = nvmf_allocate_qpair(trtype, true, handoff, nvmft_qpair_error,
qp, nvmft_receive_capsule, qp);
if (qp->qp == NULL) {
mtx_destroy(&qp->lock);
free(qp->cids, M_NVMFT);
free(qp, M_NVMFT);
return (NULL);
}
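/* The queue pair holds the initial reference; nvmft_qpair_shutdown() drops it. */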
refcount_init(&qp->qp_refs, 1);
return (qp);
}
void
nvmft_qpair_shutdown(struct nvmft_qpair *qp)
{
struct nvmf_qpair *nq;
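/* Detach the transport queue pair under the lock; senders that observe NULL fail with ENOTCONN. */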
mtx_lock(&qp->lock);
nq = qp->qp;
qp->qp = NULL;
mtx_unlock(&qp->lock);
if (nq != NULL && refcount_release(&qp->qp_refs))
nvmf_free_qpair(nq);
}
void
nvmft_qpair_destroy(struct nvmft_qpair *qp)
{
nvmft_qpair_shutdown(qp);
mtx_destroy(&qp->lock);
free(qp->cids, M_NVMFT);
free(qp, M_NVMFT);
}
struct nvmft_controller *
nvmft_qpair_ctrlr(struct nvmft_qpair *qp)
{
return (qp->ctrlr);
}
uint16_t
nvmft_qpair_id(struct nvmft_qpair *qp)
{
return (qp->qid);
}
const char *
nvmft_qpair_name(struct nvmft_qpair *qp)
{
return (qp->name);
}
static int
_nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
{
struct nvme_completion cpl;
struct nvmf_qpair *nq;
struct nvmf_capsule *rc;
int error;
memcpy(&cpl, cqe, sizeof(cpl));
mtx_lock(&qp->lock);
nq = qp->qp;
if (nq == NULL) {
mtx_unlock(&qp->lock);
return (ENOTCONN);
}
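/* Hold a transient reference so the transport qpair cannot be freed during transmit. */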
refcount_acquire(&qp->qp_refs);
/*
 * Set SQHD.  With SQ flow control enabled, SQHD advances modulo the
 * queue size so the host can reclaim submission queue slots;
 * otherwise the field is cleared.
 */
if (qp->sq_flow_control) {
qp->sqhd = (qp->sqhd + 1) % qp->qsize;
cpl.sqhd = htole16(qp->sqhd);
} else
cpl.sqhd = 0;
mtx_unlock(&qp->lock);
rc = nvmf_allocate_response(nq, &cpl, M_WAITOK);
error = nvmf_transmit_capsule(rc);
nvmf_free_capsule(rc);
if (refcount_release(&qp->qp_refs))
nvmf_free_qpair(nq);
return (error);
}
void
nvmft_command_completed(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
{
const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
/* Don't bother byte-swapping CID. */
KASSERT(BIT_ISSET(NUM_CIDS, cmd->cid, qp->cids),
("%s: CID %u not busy", __func__, cmd->cid));
BIT_CLR_ATOMIC(NUM_CIDS, cmd->cid, qp->cids);
}
int
nvmft_send_response(struct nvmft_qpair *qp, const void *cqe)
{
const struct nvme_completion *cpl = cqe;
/* Don't bother byte-swapping CID. */
KASSERT(BIT_ISSET(NUM_CIDS, cpl->cid, qp->cids),
("%s: CID %u not busy", __func__, cpl->cid));
BIT_CLR_ATOMIC(NUM_CIDS, cpl->cid, qp->cids);
return (_nvmft_send_response(qp, cqe));
}
void
nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status)
{
struct nvme_completion *cpl = cqe;
const struct nvme_command *cmd = nvmf_capsule_sqe(nc);
memset(cpl, 0, sizeof(*cpl));
cpl->cid = cmd->cid;
cpl->status = htole16(status);
}
int
nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
uint8_t sc_type, uint8_t sc_status)
{
struct nvme_completion cpl;
uint16_t status;
status = NVMEF(NVME_STATUS_SCT, sc_type) |
NVMEF(NVME_STATUS_SC, sc_status);
nvmft_init_cqe(&cpl, nc, status);
return (nvmft_send_response(qp, &cpl));
}
int
nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
uint8_t sc_status)
{
return (nvmft_send_error(qp, nc, NVME_SCT_GENERIC, sc_status));
}
/*
* This version doesn't clear CID in qp->cids and is used for errors
* before the CID is validated.
*/
static int
_nvmft_send_generic_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
uint8_t sc_status)
{
struct nvme_completion cpl;
uint16_t status;
status = NVMEF(NVME_STATUS_SCT, NVME_SCT_GENERIC) |
NVMEF(NVME_STATUS_SC, sc_status);
nvmft_init_cqe(&cpl, nc, status);
return (_nvmft_send_response(qp, &cpl));
}
int
nvmft_send_success(struct nvmft_qpair *qp, struct nvmf_capsule *nc)
{
return (nvmft_send_generic_error(qp, nc, NVME_SC_SUCCESS));
}
static void
nvmft_init_connect_rsp(struct nvmf_fabric_connect_rsp *rsp,
const struct nvmf_fabric_connect_cmd *cmd, uint16_t status)
{
memset(rsp, 0, sizeof(*rsp));
rsp->cid = cmd->cid;
rsp->status = htole16(status);
}
static int
nvmft_send_connect_response(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_rsp *rsp)
{
struct nvmf_capsule *rc;
struct nvmf_qpair *nq;
int error;
mtx_lock(&qp->lock);
nq = qp->qp;
if (nq == NULL) {
mtx_unlock(&qp->lock);
return (ENOTCONN);
}
refcount_acquire(&qp->qp_refs);
mtx_unlock(&qp->lock);
rc = nvmf_allocate_response(nq, rsp, M_WAITOK);
error = nvmf_transmit_capsule(rc);
nvmf_free_capsule(rc);
if (refcount_release(&qp->qp_refs))
nvmf_free_qpair(nq);
return (error);
}
void
nvmft_connect_error(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
uint8_t sc_status)
{
struct nvmf_fabric_connect_rsp rsp;
uint16_t status;
status = NVMEF(NVME_STATUS_SCT, sc_type) |
NVMEF(NVME_STATUS_SC, sc_status);
nvmft_init_connect_rsp(&rsp, cmd, status);
nvmft_send_connect_response(qp, &rsp);
}
void
nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset)
{
struct nvmf_fabric_connect_rsp rsp;
nvmft_init_connect_rsp(&rsp, cmd,
NVMEF(NVME_STATUS_SCT, NVME_SCT_COMMAND_SPECIFIC) |
NVMEF(NVME_STATUS_SC, NVMF_FABRIC_SC_INVALID_PARAM));
rsp.status_code_specific.invalid.ipo = htole16(offset);
rsp.status_code_specific.invalid.iattr = data ? 1 : 0;
nvmft_send_connect_response(qp, &rsp);
}
int
nvmft_finish_accept(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr)
{
struct nvmf_fabric_connect_rsp rsp;
qp->ctrlr = ctrlr;
nvmft_init_connect_rsp(&rsp, cmd, 0);
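/*
 * An SQHD value of 0xffff in the CONNECT response reports that SQ
 * flow control is disabled for this queue pair.
 */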
if (qp->sq_flow_control)
rsp.sqhd = htole16(qp->sqhd);
else
rsp.sqhd = htole16(0xffff);
rsp.status_code_specific.success.cntlid = htole16(ctrlr->cntlid);
return (nvmft_send_connect_response(qp, &rsp));
}

@@ -0,0 +1,174 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2023-2024 Chelsio Communications, Inc.
* Written by: John Baldwin <jhb@FreeBSD.org>
*/
#ifndef __NVMFT_VAR_H__
#define __NVMFT_VAR_H__
#include <sys/_callout.h>
#include <sys/refcount.h>
#include <sys/taskqueue.h>
#include <dev/nvmf/nvmf_proto.h>
#include <cam/ctl/ctl.h>
#include <cam/ctl/ctl_io.h>
#include <cam/ctl/ctl_frontend.h>
struct nvmf_capsule;
struct nvmft_controller;
struct nvmft_qpair;
#define NVMFT_NUM_AER 16
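/* Per-port ctl(4) frontend state, shared by all controllers on the port. */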
struct nvmft_port {
TAILQ_ENTRY(nvmft_port) link;
u_int refs;
struct ctl_port port;
struct nvme_controller_data cdata;
struct nvme_firmware_page fp;
uint64_t cap;
uint32_t max_io_qsize;
bool online;
struct sx lock;
struct unrhdr *ids;
TAILQ_HEAD(, nvmft_controller) controllers;
uint32_t *active_ns;
u_int num_ns;
};
struct nvmft_io_qpair {
struct nvmft_qpair *qp;
bool shutdown;
};
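/* Per-association controller state; one dynamic controller is created per association. */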
struct nvmft_controller {
struct nvmft_qpair *admin;
struct nvmft_io_qpair *io_qpairs;
u_int num_io_queues;
bool shutdown;
bool admin_closed;
uint16_t cntlid;
uint32_t cc;
uint32_t csts;
struct nvmft_port *np;
struct mtx lock;
struct nvme_controller_data cdata;
struct nvme_health_information_page hip;
sbintime_t create_time;
sbintime_t start_busy;
sbintime_t busy_total;
uint16_t partial_dur;
uint16_t partial_duw;
uint8_t hostid[16];
uint8_t hostnqn[NVME_NQN_FIELD_SIZE];
u_int trtype;
TAILQ_ENTRY(nvmft_controller) link;
/*
* Each queue can have at most UINT16_MAX commands, so the total
* across all queues will fit in a uint32_t.
*/
uint32_t pending_commands;
volatile int ka_active_traffic;
struct callout ka_timer;
sbintime_t ka_sbt;
/* AER fields: a ring of CIDs for outstanding Asynchronous Event Requests. */
uint32_t aer_mask;
uint16_t aer_cids[NVMFT_NUM_AER];
uint8_t aer_pending;
uint8_t aer_cidx;
uint8_t aer_pidx;
/* Changed namespace IDs. */
struct nvme_ns_list *changed_ns;
bool changed_ns_reported;
struct task shutdown_task;
struct timeout_task terminate_task;
};
MALLOC_DECLARE(M_NVMFT);
/* ctl_frontend_nvmf.c */
void nvmft_port_free(struct nvmft_port *np);
void nvmft_populate_active_nslist(struct nvmft_port *np, uint32_t nsid,
struct nvme_ns_list *nslist);
void nvmft_dispatch_command(struct nvmft_qpair *qp,
struct nvmf_capsule *nc, bool admin);
void nvmft_terminate_commands(struct nvmft_controller *ctrlr);
/* nvmft_controller.c */
void nvmft_controller_error(struct nvmft_controller *ctrlr,
struct nvmft_qpair *qp, int error);
void nvmft_controller_lun_changed(struct nvmft_controller *ctrlr,
int lun_id);
void nvmft_handle_admin_command(struct nvmft_controller *ctrlr,
struct nvmf_capsule *nc);
void nvmft_handle_io_command(struct nvmft_qpair *qp, uint16_t qid,
struct nvmf_capsule *nc);
int nvmft_handoff_admin_queue(struct nvmft_port *np,
const struct nvmf_handoff_controller_qpair *handoff,
const struct nvmf_fabric_connect_cmd *cmd,
const struct nvmf_fabric_connect_data *data);
int nvmft_handoff_io_queue(struct nvmft_port *np,
const struct nvmf_handoff_controller_qpair *handoff,
const struct nvmf_fabric_connect_cmd *cmd,
const struct nvmf_fabric_connect_data *data);
int nvmft_printf(struct nvmft_controller *ctrlr, const char *fmt, ...)
__printflike(2, 3);
/* nvmft_qpair.c */
struct nvmft_qpair *nvmft_qpair_init(enum nvmf_trtype trtype,
const struct nvmf_handoff_qpair_params *handoff, uint16_t qid,
const char *name);
void nvmft_qpair_shutdown(struct nvmft_qpair *qp);
void nvmft_qpair_destroy(struct nvmft_qpair *qp);
struct nvmft_controller *nvmft_qpair_ctrlr(struct nvmft_qpair *qp);
uint16_t nvmft_qpair_id(struct nvmft_qpair *qp);
const char *nvmft_qpair_name(struct nvmft_qpair *qp);
void nvmft_command_completed(struct nvmft_qpair *qp,
struct nvmf_capsule *nc);
int nvmft_send_response(struct nvmft_qpair *qp, const void *cqe);
void nvmft_init_cqe(void *cqe, struct nvmf_capsule *nc, uint16_t status);
int nvmft_send_error(struct nvmft_qpair *qp, struct nvmf_capsule *nc,
uint8_t sc_type, uint8_t sc_status);
int nvmft_send_generic_error(struct nvmft_qpair *qp,
struct nvmf_capsule *nc, uint8_t sc_status);
int nvmft_send_success(struct nvmft_qpair *qp,
struct nvmf_capsule *nc);
void nvmft_connect_error(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, uint8_t sc_type,
uint8_t sc_status);
void nvmft_connect_invalid_parameters(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, bool data, uint16_t offset);
int nvmft_finish_accept(struct nvmft_qpair *qp,
const struct nvmf_fabric_connect_cmd *cmd, struct nvmft_controller *ctrlr);
static __inline void
nvmft_port_ref(struct nvmft_port *np)
{
refcount_acquire(&np->refs);
}
static __inline void
nvmft_port_rele(struct nvmft_port *np)
{
if (refcount_release(&np->refs))
nvmft_port_free(np);
}
#endif /* !__NVMFT_VAR_H__ */

@@ -1,5 +1,6 @@
SUBDIR= nvmf \
nvmf_tcp \
nvmf_transport
nvmf_transport \
nvmft
.include <bsd.subdir.mk>

@@ -0,0 +1,10 @@
.PATH: ${SRCTOP}/sys/dev/nvmf/controller
KMOD= nvmft
SRCS= ctl_frontend_nvmf.c \
nvmft_controller.c \
nvmft_subr.c \
nvmft_qpair.c
.include <bsd.kmod.mk>