freebsd-src/usr.sbin/nvmfd/io.c
John Baldwin a8089ea5ae nvmfd: A simple userspace daemon for the NVMe over Fabrics controller
This daemon can operate as a purely userspace controller exporting one
or more simulated RAM disks or local block devices as NVMe namespaces
to a remote host.  In this case the daemon provides a discovery
controller with a single entry for an I/O controller.

nvmfd can also offload I/O controller queue pairs to the nvmft.ko
in-kernel Fabrics controller when -K is passed.  In this mode, nvmfd
still accepts connections and performs initial transport-specific
negotiation in userland.  The daemon still provides a userspace-only
discovery controller with a single entry for an I/O controller.
However, queue pairs for the I/O controller are handed off to the CTL
NVMF frontend.

Eventually ctld(8) should be refactored to provide an abstraction
for the frontend protocol, and the discovery and kernel modes of
this daemon should be merged into ctld(8).  At that point this daemon
can be moved to tools/tools/nvmf as a debugging tool (mostly as sample
code for a userspace controller using libnvmf).

Reviewed by:	imp
Sponsored by:	Chelsio Communications
Differential Revision:	https://reviews.freebsd.org/D44731
2024-05-02 16:38:39 -07:00


/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2023-2024 Chelsio Communications, Inc.
 * Written by: John Baldwin <jhb@FreeBSD.org>
 */

#include <sys/sysctl.h>

#include <err.h>
#include <errno.h>
#include <libnvmf.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "internal.h"
struct io_controller {
	struct controller *c;

	u_int num_io_queues;
	u_int active_io_queues;
	struct nvmf_qpair **io_qpairs;
	int *io_sockets;

	struct nvme_firmware_page fp;
	struct nvme_health_information_page hip;
	uint16_t partial_dur;
	uint16_t partial_duw;

	uint16_t cntlid;
	char hostid[16];
	char hostnqn[NVME_NQN_FIELD_SIZE];
};

static struct nvmf_association *io_na;
static pthread_cond_t io_cond;
static pthread_mutex_t io_na_mutex;
static struct io_controller *io_controller;
static const char *nqn;
static char serial[NVME_SERIAL_NUMBER_LENGTH];

void
init_io(const char *subnqn)
{
	struct nvmf_association_params aparams;
	u_long hostid;
	size_t len;

	memset(&aparams, 0, sizeof(aparams));
	aparams.sq_flow_control = !flow_control_disable;
	aparams.dynamic_controller_model = true;
	aparams.max_admin_qsize = NVME_MAX_ADMIN_ENTRIES;
	aparams.max_io_qsize = NVMF_MAX_IO_ENTRIES;
	aparams.tcp.pda = 0;
	aparams.tcp.header_digests = header_digests;
	aparams.tcp.data_digests = data_digests;
	aparams.tcp.maxr2t = 1;
	aparams.tcp.maxh2cdata = 256 * 1024;
	io_na = nvmf_allocate_association(NVMF_TRTYPE_TCP, true, &aparams);
	if (io_na == NULL)
		err(1, "Failed to create I/O controller association");

	nqn = subnqn;

	/* Generate a serial number from the kern.hostid node. */
	len = sizeof(hostid);
	if (sysctlbyname("kern.hostid", &hostid, &len, NULL, 0) == -1)
		err(1, "sysctl: kern.hostid");
	nvmf_controller_serial(serial, sizeof(serial), hostid);

	pthread_cond_init(&io_cond, NULL);
	pthread_mutex_init(&io_na_mutex, NULL);

	if (kernel_io)
		init_ctl_port(subnqn, &aparams);
}

void
shutdown_io(void)
{
	if (kernel_io)
		shutdown_ctl_port(nqn);
}
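
/*
 * GET_LOG_PAGE: the log page ID is in the low byte of CDW10, the
 * zero's based dword count (NUMDL/NUMDU) is split across the top half
 * of CDW10 and the bottom half of CDW11, and CDW12/CDW13 hold a
 * 64-bit byte offset into the page.
 */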
static void
handle_get_log_page(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	uint64_t offset;
	uint32_t numd;
	size_t len;
	uint8_t lid;

	lid = le32toh(cmd->cdw10) & 0xff;
	numd = le32toh(cmd->cdw10) >> 16 | le32toh(cmd->cdw11) << 16;
	offset = le32toh(cmd->cdw12) | (uint64_t)le32toh(cmd->cdw13) << 32;

	/* Log page offsets must be dword-aligned. */
	if (offset % 4 != 0)
		goto error;

	/* NUMD is a zero's based dword count. */
	len = (numd + 1) * 4;

	switch (lid) {
	case NVME_LOG_ERROR:
	{
		void *buf;

		if (len % sizeof(struct nvme_error_information_entry) != 0)
			goto error;

		/* No errors have been logged; return zeroed entries. */
		buf = calloc(1, len);
		nvmf_send_controller_data(nc, buf, len);
		free(buf);
		return;
	}
	case NVME_LOG_HEALTH_INFORMATION:
		if (len != sizeof(ioc->hip))
			goto error;

		nvmf_send_controller_data(nc, &ioc->hip, sizeof(ioc->hip));
		return;
	case NVME_LOG_FIRMWARE_SLOT:
		if (len != sizeof(ioc->fp))
			goto error;

		nvmf_send_controller_data(nc, &ioc->fp, sizeof(ioc->fp));
		return;
	default:
		warnx("Unsupported page %#x for GET_LOG_PAGE", lid);
		goto error;
	}

error:
	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
static bool
handle_io_identify_command(const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	struct nvme_namespace_data nsdata;
	struct nvme_ns_list nslist;
	uint32_t nsid;
	uint8_t cns;

	cns = le32toh(cmd->cdw10) & 0xFF;
	switch (cns) {
	case 0:	/* Namespace data. */
		if (!device_namespace_data(le32toh(cmd->nsid), &nsdata)) {
			nvmf_send_generic_error(nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			return (true);
		}

		nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata));
		return (true);
	case 2:	/* Active namespace list. */
		nsid = le32toh(cmd->nsid);
		if (nsid >= 0xfffffffe) {
			nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
			return (true);
		}

		device_active_nslist(nsid, &nslist);
		nvmf_send_controller_data(nc, &nslist, sizeof(nslist));
		return (true);
	case 3:	/* Namespace Identification Descriptor list. */
		if (!device_identification_descriptor(le32toh(cmd->nsid),
		    &nsdata)) {
			nvmf_send_generic_error(nc,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			return (true);
		}

		nvmf_send_controller_data(nc, &nsdata, sizeof(nsdata));
		return (true);
	default:
		return (false);
	}
}
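
/*
 * SET_FEATURES: a Fabrics host must set Number of Queues exactly once
 * before creating any I/O queues; the zero's based queue counts in
 * CDW11 size the qpair and socket arrays and are echoed back in CDW0
 * of the completion.
 */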
static void
handle_set_features(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	struct nvme_completion cqe;
	uint8_t fid;

	fid = NVMEV(NVME_FEAT_SET_FID, le32toh(cmd->cdw10));
	switch (fid) {
	case NVME_FEAT_NUMBER_OF_QUEUES:
	{
		uint32_t num_queues;

		if (ioc->num_io_queues != 0) {
			nvmf_send_generic_error(nc,
			    NVME_SC_COMMAND_SEQUENCE_ERROR);
			return;
		}

		num_queues = le32toh(cmd->cdw11) & 0xffff;

		/* 5.12.1.7: 65535 is invalid. */
		if (num_queues == 65535)
			goto error;

		/* Fabrics requires the same number of SQs and CQs. */
		if (le32toh(cmd->cdw11) >> 16 != num_queues)
			goto error;

		/* Convert to 1's based. */
		num_queues++;

		/* Lock to synchronize with handle_io_qpair. */
		pthread_mutex_lock(&io_na_mutex);
		ioc->num_io_queues = num_queues;
		ioc->io_qpairs = calloc(num_queues, sizeof(*ioc->io_qpairs));
		ioc->io_sockets = calloc(num_queues, sizeof(*ioc->io_sockets));
		pthread_mutex_unlock(&io_na_mutex);

		nvmf_init_cqe(&cqe, nc, 0);
		cqe.cdw0 = cmd->cdw11;
		nvmf_send_response(nc, &cqe);
		return;
	}
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
	{
		uint32_t aer_mask;

		aer_mask = le32toh(cmd->cdw11);

		/* Check for any reserved or unimplemented feature bits. */
		if ((aer_mask & 0xffffc000) != 0)
			goto error;

		/* No AERs are generated by this daemon. */
		nvmf_send_success(nc);
		return;
	}
	default:
		warnx("Unsupported feature ID %u for SET_FEATURES", fid);
		goto error;
	}

error:
	nvmf_send_generic_error(nc, NVME_SC_INVALID_FIELD);
}
static bool
admin_command(const struct nvmf_capsule *nc, const struct nvme_command *cmd,
    void *arg)
{
	struct io_controller *ioc = arg;

	switch (cmd->opc) {
	case NVME_OPC_GET_LOG_PAGE:
		handle_get_log_page(ioc, nc, cmd);
		return (true);
	case NVME_OPC_IDENTIFY:
		return (handle_io_identify_command(nc, cmd));
	case NVME_OPC_SET_FEATURES:
		handle_set_features(ioc, nc, cmd);
		return (true);
	case NVME_OPC_ASYNC_EVENT_REQUEST:
		/* Ignore and never complete. */
		return (true);
	case NVME_OPC_KEEP_ALIVE:
		nvmf_send_success(nc);
		return (true);
	default:
		return (false);
	}
}
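
/*
 * The admin queue thread runs for the life of the association.  Once
 * the admin queue disconnects, close any I/O queue sockets and wait
 * for the I/O threads to exit before freeing the controller.
 */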
static void
handle_admin_qpair(struct io_controller *ioc)
{
	pthread_setname_np(pthread_self(), "admin queue");

	controller_handle_admin_commands(ioc->c, admin_command, ioc);

	pthread_mutex_lock(&io_na_mutex);
	for (u_int i = 0; i < ioc->num_io_queues; i++) {
		if (ioc->io_qpairs[i] == NULL || ioc->io_sockets[i] == -1)
			continue;
		close(ioc->io_sockets[i]);
		ioc->io_sockets[i] = -1;
	}

	/* Wait for I/O threads to notice. */
	while (ioc->active_io_queues > 0)
		pthread_cond_wait(&io_cond, &io_na_mutex);

	io_controller = NULL;
	pthread_mutex_unlock(&io_na_mutex);

	free_controller(ioc->c);

	free(ioc);
}

static bool
handle_io_fabrics_command(const struct nvmf_capsule *nc,
    const struct nvmf_fabric_cmd *fc)
{
	switch (fc->fctype) {
	case NVMF_FABRIC_COMMAND_CONNECT:
		warnx("CONNECT command on connected queue");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		break;
	case NVMF_FABRIC_COMMAND_DISCONNECT:
	{
		const struct nvmf_fabric_disconnect_cmd *dis =
		    (const struct nvmf_fabric_disconnect_cmd *)fc;

		if (dis->recfmt != htole16(0)) {
			nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
			    NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT);
			break;
		}
		nvmf_send_success(nc);
		return (true);
	}
	default:
		warnx("Unsupported fabrics command %#x", fc->fctype);
		nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
		break;
	}

	return (false);
}
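
/*
 * Counters in the health information page are 128-bit little-endian
 * values stored as a pair of uint64_t's.  Propagate a carry into the
 * high word when the low word overflows.
 */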
static void
hip_add(uint64_t pair[2], uint64_t addend)
{
	uint64_t old, new;

	old = le64toh(pair[0]);
	new = old + addend;
	pair[0] = htole64(new);
	if (new < old)
		pair[1] = htole64(le64toh(pair[1]) + 1);
}

static uint64_t
cmd_lba(const struct nvme_command *cmd)
{
	return ((uint64_t)le32toh(cmd->cdw11) << 32 | le32toh(cmd->cdw10));
}

static u_int
cmd_nlb(const struct nvme_command *cmd)
{
	/* NLB is a zero's based count. */
	return ((le32toh(cmd->cdw12) & 0xffff) + 1);
}
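
/*
 * Data units in the health page are counted in units of 1000
 * 512-byte blocks, so the sub-unit remainder is carried between
 * commands in partial_dur/partial_duw.
 */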
static void
handle_read(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	size_t len;

	len = nvmf_capsule_data_len(nc);
	device_read(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc);
	hip_add(ioc->hip.host_read_commands, 1);

	len /= 512;
	len += ioc->partial_dur;
	if (len >= 1000)
		hip_add(ioc->hip.data_units_read, len / 1000);
	ioc->partial_dur = len % 1000;
}

static void
handle_write(struct io_controller *ioc, const struct nvmf_capsule *nc,
    const struct nvme_command *cmd)
{
	size_t len;

	len = nvmf_capsule_data_len(nc);
	device_write(le32toh(cmd->nsid), cmd_lba(cmd), cmd_nlb(cmd), nc);
	hip_add(ioc->hip.host_write_commands, 1);

	len /= 512;
	len += ioc->partial_duw;
	if (len >= 1000)
		hip_add(ioc->hip.data_units_written, len / 1000);
	ioc->partial_duw = len % 1000;
}

static void
handle_flush(const struct nvmf_capsule *nc, const struct nvme_command *cmd)
{
	device_flush(le32toh(cmd->nsid), nc);
}
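
/*
 * Receive and dispatch command capsules on an I/O queue until the
 * host disconnects or the connection drops.  Returns true if the
 * host sent an explicit DISCONNECT.
 */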
static bool
handle_io_commands(struct io_controller *ioc, struct nvmf_qpair *qp)
{
	const struct nvme_command *cmd;
	struct nvmf_capsule *nc;
	int error;
	bool disconnect;

	disconnect = false;

	while (!disconnect) {
		error = nvmf_controller_receive_capsule(qp, &nc);
		if (error != 0) {
			if (error != ECONNRESET)
				warnc(error, "Failed to read command capsule");
			break;
		}

		cmd = nvmf_capsule_sqe(nc);

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			if (cmd->nsid == htole32(0xffffffff)) {
				/* Broadcast flush is not supported. */
				nvmf_send_generic_error(nc,
				    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
				break;
			}
			handle_flush(nc, cmd);
			break;
		case NVME_OPC_WRITE:
			handle_write(ioc, nc, cmd);
			break;
		case NVME_OPC_READ:
			handle_read(ioc, nc, cmd);
			break;
		case NVME_OPC_FABRICS_COMMANDS:
			disconnect = handle_io_fabrics_command(nc,
			    (const struct nvmf_fabric_cmd *)cmd);
			break;
		default:
			warnx("Unsupported NVM opcode %#x", cmd->opc);
			nvmf_send_generic_error(nc, NVME_SC_INVALID_OPCODE);
			break;
		}
		nvmf_free_capsule(nc);
	}

	return (disconnect);
}

static void
handle_io_qpair(struct io_controller *ioc, struct nvmf_qpair *qp, int qid)
{
	char name[64];
	bool disconnect;

	snprintf(name, sizeof(name), "I/O queue %d", qid);
	pthread_setname_np(pthread_self(), name);

	disconnect = handle_io_commands(ioc, qp);

	pthread_mutex_lock(&io_na_mutex);
	if (disconnect)
		ioc->io_qpairs[qid - 1] = NULL;
	if (ioc->io_sockets[qid - 1] != -1) {
		close(ioc->io_sockets[qid - 1]);
		ioc->io_sockets[qid - 1] = -1;
	}
	ioc->active_io_queues--;
	if (ioc->active_io_queues == 0)
		pthread_cond_broadcast(&io_cond);
	pthread_mutex_unlock(&io_na_mutex);
}
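
/*
 * CONNECT for queue ID 0 creates the admin queue along with the
 * single dynamic I/O controller; this daemon always assigns the
 * controller ID 2.
 */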
static void
connect_admin_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc,
    const struct nvmf_fabric_connect_data *data)
{
	struct nvme_controller_data cdata;
	struct io_controller *ioc;
	int error;

	/* Can only have one active I/O controller at a time. */
	pthread_mutex_lock(&io_na_mutex);
	if (io_controller != NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		nvmf_send_error(nc, NVME_SCT_COMMAND_SPECIFIC,
		    NVMF_FABRIC_SC_CONTROLLER_BUSY);
		goto error;
	}

	error = nvmf_finish_accept(nc, 2);
	if (error != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnc(error, "Failed to send CONNECT response");
		goto error;
	}

	ioc = calloc(1, sizeof(*ioc));
	ioc->cntlid = 2;
	memcpy(ioc->hostid, data->hostid, sizeof(ioc->hostid));
	memcpy(ioc->hostnqn, data->hostnqn, sizeof(ioc->hostnqn));

	nvmf_init_io_controller_data(qp, serial, nqn, device_count(),
	    NVMF_IOCCSZ, &cdata);

	ioc->fp.afi = NVMEF(NVME_FIRMWARE_PAGE_AFI_SLOT, 1);
	memcpy(ioc->fp.revision[0], cdata.fr, sizeof(cdata.fr));

	ioc->hip.power_cycles[0] = 1;

	ioc->c = init_controller(qp, &cdata);

	io_controller = ioc;
	pthread_mutex_unlock(&io_na_mutex);

	nvmf_free_capsule(nc);

	handle_admin_qpair(ioc);
	close(s);
	return;

error:
	nvmf_free_capsule(nc);
	close(s);
}
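
/*
 * CONNECT for a nonzero queue ID attaches an I/O queue to the
 * existing controller.  The host ID, controller ID, and host NQN
 * must match the admin CONNECT, Number of Queues must have been set,
 * and the queue ID must be in range and not already in use.
 */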
static void
connect_io_qpair(int s, struct nvmf_qpair *qp, struct nvmf_capsule *nc,
    const struct nvmf_fabric_connect_data *data, uint16_t qid)
{
	struct io_controller *ioc;
	int error;

	pthread_mutex_lock(&io_na_mutex);
	if (io_controller == NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create I/O qpair without admin qpair");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}

	if (memcmp(io_controller->hostid, data->hostid,
	    sizeof(data->hostid)) != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("hostid mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, hostid));
		goto error;
	}
	if (le16toh(data->cntlid) != io_controller->cntlid) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("cntlid mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, cntlid));
		goto error;
	}
	if (memcmp(io_controller->hostnqn, data->hostnqn,
	    sizeof(data->hostnqn)) != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("host NQN mismatch for I/O qpair CONNECT");
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, hostnqn));
		goto error;
	}

	if (io_controller->num_io_queues == 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create I/O qpair without enabled queues");
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}
	if (qid > io_controller->num_io_queues) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to create invalid I/O qpair %u", qid);
		nvmf_connect_invalid_parameters(nc, false,
		    offsetof(struct nvmf_fabric_connect_cmd, qid));
		goto error;
	}
	if (io_controller->io_qpairs[qid - 1] != NULL) {
		pthread_mutex_unlock(&io_na_mutex);
		warnx("Attempt to re-create I/O qpair %u", qid);
		nvmf_send_generic_error(nc, NVME_SC_COMMAND_SEQUENCE_ERROR);
		goto error;
	}

	error = nvmf_finish_accept(nc, io_controller->cntlid);
	if (error != 0) {
		pthread_mutex_unlock(&io_na_mutex);
		warnc(error, "Failed to send CONNECT response");
		goto error;
	}

	ioc = io_controller;
	ioc->active_io_queues++;
	ioc->io_qpairs[qid - 1] = qp;
	ioc->io_sockets[qid - 1] = s;
	pthread_mutex_unlock(&io_na_mutex);

	nvmf_free_capsule(nc);

	handle_io_qpair(ioc, qp, qid);
	return;

error:
	nvmf_free_capsule(nc);
	close(s);
}
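
/*
 * Each accepted socket gets a detached thread that completes the
 * transport-level handshake and the CONNECT exchange.  In kernel
 * mode (-K), the queue pair is instead handed off to the in-kernel
 * CTL frontend once the CONNECT capsule arrives.
 */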
static void *
io_socket_thread(void *arg)
{
	struct nvmf_fabric_connect_data data;
	struct nvmf_qpair_params qparams;
	const struct nvmf_fabric_connect_cmd *cmd;
	struct nvmf_capsule *nc;
	struct nvmf_qpair *qp;
	int s;

	pthread_detach(pthread_self());

	s = (intptr_t)arg;
	memset(&qparams, 0, sizeof(qparams));
	qparams.tcp.fd = s;

	nc = NULL;
	qp = nvmf_accept(io_na, &qparams, &nc, &data);
	if (qp == NULL) {
		warnx("Failed to create I/O qpair: %s",
		    nvmf_association_error(io_na));
		goto error;
	}

	if (kernel_io) {
		ctl_handoff_qpair(qp, nvmf_capsule_sqe(nc), &data);
		goto error;
	}

	if (strcmp(data.subnqn, nqn) != 0) {
		warnx("I/O qpair with invalid SubNQN: %.*s",
		    (int)sizeof(data.subnqn), data.subnqn);
		nvmf_connect_invalid_parameters(nc, true,
		    offsetof(struct nvmf_fabric_connect_data, subnqn));
		goto error;
	}

	/* Is this an admin or I/O queue pair? */
	cmd = nvmf_capsule_sqe(nc);
	if (cmd->qid == 0)
		connect_admin_qpair(s, qp, nc, &data);
	else
		connect_io_qpair(s, qp, nc, &data, le16toh(cmd->qid));
	nvmf_free_qpair(qp);
	return (NULL);

error:
	if (nc != NULL)
		nvmf_free_capsule(nc);
	if (qp != NULL)
		nvmf_free_qpair(qp);
	close(s);
	return (NULL);
}

void
handle_io_socket(int s)
{
	pthread_t thr;
	int error;

	error = pthread_create(&thr, NULL, io_socket_thread,
	    (void *)(uintptr_t)s);
	if (error != 0) {
		warnc(error, "Failed to create I/O qpair thread");
		close(s);
	}
}