diff --git a/MAINTAINERS b/MAINTAINERS index 9cbc3766fd74..d8e882229a48 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13107,6 +13107,14 @@ F: drivers/nvme/host/ F: include/linux/nvme.h F: include/uapi/linux/nvme_ioctl.h +NVM EXPRESS TCP OFFLOAD TRANSPORT DRIVERS +M: Shai Malin +M: Ariel Elior +L: linux-nvme@lists.infradead.org +S: Supported +F: drivers/nvme/host/tcp-offload.c +F: drivers/nvme/host/tcp-offload.h + NVM EXPRESS FC TRANSPORT DRIVERS M: James Smart L: linux-nvme@lists.infradead.org diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index a44d49d63968..caedc35e1f0d 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -84,3 +84,20 @@ config NVME_TCP from https://github.com/linux-nvme/nvme-cli. If unsure, say N. + +config NVME_TCP_OFFLOAD + tristate "NVM Express over Fabrics TCP offload common layer" + default m + depends on BLOCK + depends on INET + select NVME_CORE + select NVME_FABRICS + help + This provides support for the NVMe over Fabrics protocol using + the TCP offload transport. This allows you to use remote block devices + exported using the NVMe protocol set. + + To configure a NVMe over Fabrics controller use the nvme-cli tool + from https://github.com/linux-nvme/nvme-cli. + + If unsure, say N. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cbc509784b2e..3c3fdf83ce38 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o +obj-$(CONFIG_NVME_TCP_OFFLOAD) += nvme-tcp-offload.o nvme-core-y := core.o ioctl.o nvme-core-$(CONFIG_TRACING) += trace.o @@ -26,3 +27,5 @@ nvme-rdma-y += rdma.o nvme-fc-y += fc.o nvme-tcp-y += tcp.o + +nvme-tcp-offload-y += tcp-offload.o diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a2bb7fc63a73..ceb263eb50fb 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -860,8 +860,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, return ret; } -static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, - unsigned int required_opts) +int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, + unsigned int required_opts) { if ((opts->mask & required_opts) != required_opts) { int i; @@ -879,6 +879,7 @@ static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, return 0; } +EXPORT_SYMBOL_GPL(nvmf_check_required_opts); bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts) @@ -942,13 +943,6 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) } EXPORT_SYMBOL_GPL(nvmf_free_options); -#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) -#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ - NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ - NVMF_OPT_DISABLE_SQFLOW |\ - NVMF_OPT_FAIL_FAST_TMO) - static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf) { diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index d7f7974dc208..8399fcc063ef 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -68,6 +68,13 @@ enum { NVMF_OPT_FAIL_FAST_TMO = 1 << 20, }; +#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) +#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ + NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ + NVMF_OPT_HOST_ID | 
NVMF_OPT_DUP_CONNECT |\
+				NVMF_OPT_DISABLE_SQFLOW |\
+				NVMF_OPT_FAIL_FAST_TMO)
+
 /**
  * struct nvmf_ctrl_options - Used to hold the options specified
  * with the parsing opts enum.
@@ -186,5 +193,7 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
 bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
 		struct nvmf_ctrl_options *opts);
+int nvmf_check_required_opts(struct nvmf_ctrl_options *opts,
+		unsigned int required_opts);
 
 #endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/tcp-offload.c b/drivers/nvme/host/tcp-offload.c
new file mode 100644
index 000000000000..c76822e5ada7
--- /dev/null
+++ b/drivers/nvme/host/tcp-offload.c
@@ -0,0 +1,1318 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Marvell. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+/* Kernel includes */
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* Driver includes */
+#include "tcp-offload.h"
+
+static LIST_HEAD(nvme_tcp_ofld_devices);
+static DEFINE_MUTEX(nvme_tcp_ofld_devices_mutex);
+static LIST_HEAD(nvme_tcp_ofld_ctrl_list);
+static DEFINE_MUTEX(nvme_tcp_ofld_ctrl_mutex);
+static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops;
+static struct blk_mq_ops nvme_tcp_ofld_mq_ops;
+
+static inline struct nvme_tcp_ofld_ctrl *to_tcp_ofld_ctrl(struct nvme_ctrl *nctrl)
+{
+	return container_of(nctrl, struct nvme_tcp_ofld_ctrl, nctrl);
+}
+
+static inline int nvme_tcp_ofld_qid(struct nvme_tcp_ofld_queue *queue)
+{
+	return queue - queue->ctrl->queues;
+}
+
+/**
+ * nvme_tcp_ofld_register_dev() - NVMeTCP Offload Library registration
+ * function.
+ * @dev:	NVMeTCP offload device instance to be registered to the
+ *		common tcp offload instance.
+ *
+ * API function that registers the type of vendor specific driver
+ * being implemented to the common NVMe over TCP offload library. Part of
+ * the overall init sequence of starting up an offload driver.
+ */
+int nvme_tcp_ofld_register_dev(struct nvme_tcp_ofld_dev *dev)
+{
+	struct nvme_tcp_ofld_ops *ops = dev->ops;
+
+	if (!ops->claim_dev ||
+	    !ops->setup_ctrl ||
+	    !ops->release_ctrl ||
+	    !ops->create_queue ||
+	    !ops->drain_queue ||
+	    !ops->destroy_queue ||
+	    !ops->poll_queue ||
+	    !ops->send_req)
+		return -EINVAL;
+
+	mutex_lock(&nvme_tcp_ofld_devices_mutex);
+	list_add_tail(&dev->entry, &nvme_tcp_ofld_devices);
+	mutex_unlock(&nvme_tcp_ofld_devices_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_tcp_ofld_register_dev);
+
+/**
+ * nvme_tcp_ofld_unregister_dev() - NVMeTCP Offload Library unregistration
+ * function.
+ * @dev:	NVMeTCP offload device instance to be unregistered from the
+ *		common tcp offload instance.
+ *
+ * API function that unregisters the type of vendor specific driver being
+ * implemented from the common NVMe over TCP offload library.
+ * Part of the overall exit sequence of unloading the implemented driver.
+ */
+void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev)
+{
+	mutex_lock(&nvme_tcp_ofld_devices_mutex);
+	list_del(&dev->entry);
+	mutex_unlock(&nvme_tcp_ofld_devices_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_tcp_ofld_unregister_dev);
+
+/**
+ * nvme_tcp_ofld_error_recovery() - NVMeTCP Offload library error recovery
+ * function.
+ * @nctrl:	NVMe controller instance to change to resetting.
+ *
+ * API function that changes the controller state to resetting.
+ * Part of the overall controller reset sequence.
+ */
+void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl)
+{
+	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_RESETTING))
+		return;
+
+	queue_work(nvme_reset_wq, &to_tcp_ofld_ctrl(nctrl)->err_work);
+}
+EXPORT_SYMBOL_GPL(nvme_tcp_ofld_error_recovery);
+
+/**
+ * nvme_tcp_ofld_report_queue_err() - NVMeTCP Offload report error event
+ * callback function. Pointed to by nvme_tcp_ofld_queue->report_err.
+ * @queue:	NVMeTCP offload queue instance on which the error has occurred.
+ *
+ * API function that allows the vendor specific offload driver to report errors
+ * to the common offload layer, to invoke error recovery.
+ */
+int nvme_tcp_ofld_report_queue_err(struct nvme_tcp_ofld_queue *queue)
+{
+	pr_err("nvme-tcp-offload queue error\n");
+	nvme_tcp_ofld_error_recovery(&queue->ctrl->nctrl);
+
+	return 0;
+}
+
+/**
+ * nvme_tcp_ofld_req_done() - NVMeTCP Offload request done callback
+ * function. Pointed to by nvme_tcp_ofld_req->done.
+ * Handles both NVME_TCP_F_DATA_SUCCESS flag and NVMe CQ.
+ * @req:	NVMeTCP offload request to complete.
+ * @result:	The nvme_result.
+ * @status:	The completion status.
+ *
+ * API function that allows the vendor specific offload driver to report request
+ * completions to the common offload layer.
+ */
+void nvme_tcp_ofld_req_done(struct nvme_tcp_ofld_req *req,
+			    union nvme_result *result,
+			    __le16 status)
+{
+	struct request *rq = blk_mq_rq_from_pdu(req);
+
+	if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), *result))
+		nvme_complete_rq(rq);
+}
+
+/**
+ * nvme_tcp_ofld_async_req_done() - NVMeTCP Offload request done callback
+ * function for async request. Pointed to by nvme_tcp_ofld_req->done.
+ * Handles both NVME_TCP_F_DATA_SUCCESS flag and NVMe CQ.
+ * @req:	NVMeTCP offload request to complete.
+ * @result:	The nvme_result.
+ * @status:	The completion status.
+ *
+ * API function that allows the vendor specific offload driver to report request
+ * completions to the common offload layer.
+ */ +void nvme_tcp_ofld_async_req_done(struct nvme_tcp_ofld_req *req, + union nvme_result *result, __le16 status) +{ + struct nvme_tcp_ofld_queue *queue = req->queue; + struct nvme_tcp_ofld_ctrl *ctrl = queue->ctrl; + + nvme_complete_async_event(&ctrl->nctrl, status, result); +} + +static struct nvme_tcp_ofld_dev * +nvme_tcp_ofld_lookup_dev(struct nvme_tcp_ofld_ctrl *ctrl) +{ + struct nvme_tcp_ofld_dev *dev; + + mutex_lock(&nvme_tcp_ofld_devices_mutex); + list_for_each_entry(dev, &nvme_tcp_ofld_devices, entry) { + if (dev->ops->claim_dev(dev, ctrl)) + goto out; + } + + dev = NULL; +out: + mutex_unlock(&nvme_tcp_ofld_devices_mutex); + + return dev; +} + +static struct blk_mq_tag_set * +nvme_tcp_ofld_alloc_tagset(struct nvme_ctrl *nctrl, bool admin) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct blk_mq_tag_set *set; + int rc; + + if (admin) { + set = &ctrl->admin_tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_ofld_admin_mq_ops; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->flags = BLK_MQ_F_BLOCKING; + set->cmd_size = sizeof(struct nvme_tcp_ofld_req); + set->driver_data = ctrl; + set->nr_hw_queues = 1; + set->timeout = NVME_ADMIN_TIMEOUT; + } else { + set = &ctrl->tag_set; + memset(set, 0, sizeof(*set)); + set->ops = &nvme_tcp_ofld_mq_ops; + set->queue_depth = nctrl->sqsize + 1; + set->reserved_tags = NVMF_RESERVED_TAGS; + set->numa_node = nctrl->numa_node; + set->flags = BLK_MQ_F_SHOULD_MERGE; + set->cmd_size = sizeof(struct nvme_tcp_ofld_req); + set->driver_data = ctrl; + set->nr_hw_queues = nctrl->queue_count - 1; + set->timeout = NVME_IO_TIMEOUT; + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; + } + + rc = blk_mq_alloc_tag_set(set); + if (rc) + return ERR_PTR(rc); + + return set; +} + +static void __nvme_tcp_ofld_stop_queue(struct nvme_tcp_ofld_queue *queue) +{ + queue->dev->ops->drain_queue(queue); +} + +static void nvme_tcp_ofld_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_queue *queue = &ctrl->queues[qid]; + + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_TCP_OFLD_Q_LIVE, &queue->flags)) + __nvme_tcp_ofld_stop_queue(queue); + mutex_unlock(&queue->queue_lock); +} + +static void nvme_tcp_ofld_stop_io_queues(struct nvme_ctrl *ctrl) +{ + int i; + + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_ofld_stop_queue(ctrl, i); +} + +static void __nvme_tcp_ofld_free_queue(struct nvme_tcp_ofld_queue *queue) +{ + queue->dev->ops->destroy_queue(queue); +} + +static void nvme_tcp_ofld_free_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_queue *queue = &ctrl->queues[qid]; + + if (test_and_clear_bit(NVME_TCP_OFLD_Q_ALLOCATED, &queue->flags)) { + __nvme_tcp_ofld_free_queue(queue); + mutex_destroy(&queue->queue_lock); + } +} + +static void +nvme_tcp_ofld_free_io_queues(struct nvme_ctrl *nctrl) +{ + int i; + + for (i = 1; i < nctrl->queue_count; i++) + nvme_tcp_ofld_free_queue(nctrl, i); +} + +static void nvme_tcp_ofld_destroy_io_queues(struct nvme_ctrl *nctrl, bool remove) +{ + nvme_tcp_ofld_stop_io_queues(nctrl); + if (remove) { + blk_cleanup_queue(nctrl->connect_q); + blk_mq_free_tag_set(nctrl->tagset); + } + nvme_tcp_ofld_free_io_queues(nctrl); +} + +static void nvme_tcp_ofld_destroy_admin_queue(struct nvme_ctrl *nctrl, bool remove) +{ + nvme_tcp_ofld_stop_queue(nctrl, 0); + 
if (remove) { + blk_cleanup_queue(nctrl->admin_q); + blk_cleanup_queue(nctrl->fabrics_q); + blk_mq_free_tag_set(nctrl->admin_tagset); + } + nvme_tcp_ofld_free_queue(nctrl, 0); +} + +static int nvme_tcp_ofld_start_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_queue *queue = &ctrl->queues[qid]; + int rc; + + queue = &ctrl->queues[qid]; + if (qid) { + queue->cmnd_capsule_len = nctrl->ioccsz * 16; + rc = nvmf_connect_io_queue(nctrl, qid, false); + } else { + queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; + rc = nvmf_connect_admin_queue(nctrl); + } + + if (!rc) { + set_bit(NVME_TCP_OFLD_Q_LIVE, &queue->flags); + } else { + if (test_bit(NVME_TCP_OFLD_Q_ALLOCATED, &queue->flags)) + __nvme_tcp_ofld_stop_queue(queue); + dev_err(nctrl->device, + "failed to connect queue: %d ret=%d\n", qid, rc); + } + + return rc; +} + +static int nvme_tcp_ofld_configure_admin_queue(struct nvme_ctrl *nctrl, + bool new) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_queue *queue = &ctrl->queues[0]; + int rc; + + mutex_init(&queue->queue_lock); + + rc = ctrl->dev->ops->create_queue(queue, 0, NVME_AQ_DEPTH); + if (rc) + return rc; + + set_bit(NVME_TCP_OFLD_Q_ALLOCATED, &queue->flags); + if (new) { + nctrl->admin_tagset = + nvme_tcp_ofld_alloc_tagset(nctrl, true); + if (IS_ERR(nctrl->admin_tagset)) { + rc = PTR_ERR(nctrl->admin_tagset); + nctrl->admin_tagset = NULL; + goto out_free_queue; + } + + nctrl->fabrics_q = blk_mq_init_queue(nctrl->admin_tagset); + if (IS_ERR(nctrl->fabrics_q)) { + rc = PTR_ERR(nctrl->fabrics_q); + nctrl->fabrics_q = NULL; + goto out_free_tagset; + } + + nctrl->admin_q = blk_mq_init_queue(nctrl->admin_tagset); + if (IS_ERR(nctrl->admin_q)) { + rc = PTR_ERR(nctrl->admin_q); + nctrl->admin_q = NULL; + goto out_cleanup_fabrics_q; + } + } + + rc = nvme_tcp_ofld_start_queue(nctrl, 0); + if (rc) + goto out_cleanup_queue; + + rc = nvme_enable_ctrl(nctrl); + if (rc) + goto out_stop_queue; + + blk_mq_unquiesce_queue(nctrl->admin_q); + + rc = nvme_init_ctrl_finish(nctrl); + if (rc) + goto out_quiesce_queue; + + return 0; + +out_quiesce_queue: + blk_mq_quiesce_queue(nctrl->admin_q); + blk_sync_queue(nctrl->admin_q); +out_stop_queue: + nvme_tcp_ofld_stop_queue(nctrl, 0); + nvme_cancel_admin_tagset(nctrl); +out_cleanup_queue: + if (new) + blk_cleanup_queue(nctrl->admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(nctrl->fabrics_q); +out_free_tagset: + if (new) + blk_mq_free_tag_set(nctrl->admin_tagset); +out_free_queue: + nvme_tcp_ofld_free_queue(nctrl, 0); + + return rc; +} + +static unsigned int nvme_tcp_ofld_nr_io_queues(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_dev *dev = ctrl->dev; + u32 hw_vectors = dev->num_hw_vectors; + u32 nr_write_queues, nr_poll_queues; + u32 nr_io_queues, nr_total_queues; + + nr_io_queues = min3(nctrl->opts->nr_io_queues, num_online_cpus(), + hw_vectors); + nr_write_queues = min3(nctrl->opts->nr_write_queues, num_online_cpus(), + hw_vectors); + nr_poll_queues = min3(nctrl->opts->nr_poll_queues, num_online_cpus(), + hw_vectors); + + nr_total_queues = nr_io_queues + nr_write_queues + nr_poll_queues; + + return nr_total_queues; +} + +static void +nvme_tcp_ofld_set_io_queues(struct nvme_ctrl *nctrl, unsigned int nr_io_queues) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvmf_ctrl_options *opts = nctrl->opts; + + if 
(opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_write_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_io_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } +} + +static int nvme_tcp_ofld_create_io_queues(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + int i, rc; + + for (i = 1; i < nctrl->queue_count; i++) { + mutex_init(&ctrl->queues[i].queue_lock); + + rc = ctrl->dev->ops->create_queue(&ctrl->queues[i], + i, nctrl->sqsize + 1); + if (rc) + goto out_free_queues; + + set_bit(NVME_TCP_OFLD_Q_ALLOCATED, &ctrl->queues[i].flags); + } + + return 0; + +out_free_queues: + for (i--; i >= 1; i--) + nvme_tcp_ofld_free_queue(nctrl, i); + + return rc; +} + +static int nvme_tcp_ofld_alloc_io_queues(struct nvme_ctrl *nctrl) +{ + unsigned int nr_io_queues; + int rc; + + nr_io_queues = nvme_tcp_ofld_nr_io_queues(nctrl); + rc = nvme_set_queue_count(nctrl, &nr_io_queues); + if (rc) + return rc; + + nctrl->queue_count = nr_io_queues + 1; + if (nctrl->queue_count < 2) { + dev_err(nctrl->device, + "unable to set any I/O queues\n"); + + return -ENOMEM; + } + + dev_info(nctrl->device, "creating %d I/O queues.\n", nr_io_queues); + nvme_tcp_ofld_set_io_queues(nctrl, nr_io_queues); + + return nvme_tcp_ofld_create_io_queues(nctrl); +} + +static int nvme_tcp_ofld_start_io_queues(struct nvme_ctrl *nctrl) +{ + int i, rc = 0; + + for (i = 1; i < nctrl->queue_count; i++) { + rc = nvme_tcp_ofld_start_queue(nctrl, i); + if (rc) + goto out_stop_queues; + } + + return 0; + +out_stop_queues: + for (i--; i >= 1; i--) + nvme_tcp_ofld_stop_queue(nctrl, i); + + return rc; +} + +static int +nvme_tcp_ofld_configure_io_queues(struct nvme_ctrl *nctrl, bool new) +{ + int rc = nvme_tcp_ofld_alloc_io_queues(nctrl); + + if (rc) + return rc; + + if (new) { + nctrl->tagset = nvme_tcp_ofld_alloc_tagset(nctrl, false); + if (IS_ERR(nctrl->tagset)) { + rc = PTR_ERR(nctrl->tagset); + nctrl->tagset = NULL; + goto out_free_io_queues; + } + + nctrl->connect_q = blk_mq_init_queue(nctrl->tagset); + if (IS_ERR(nctrl->connect_q)) { + rc = PTR_ERR(nctrl->connect_q); + nctrl->connect_q = NULL; + goto out_free_tag_set; + } + } + + rc = nvme_tcp_ofld_start_io_queues(nctrl); + if (rc) + goto out_cleanup_connect_q; + + if (!new) { + nvme_start_queues(nctrl); + if (!nvme_wait_freeze_timeout(nctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. 
+		 */
+			rc = -ENODEV;
+			goto out_wait_freeze_timed_out;
+		}
+		blk_mq_update_nr_hw_queues(nctrl->tagset, nctrl->queue_count - 1);
+		nvme_unfreeze(nctrl);
+	}
+
+	return 0;
+
+out_wait_freeze_timed_out:
+	nvme_stop_queues(nctrl);
+	nvme_sync_io_queues(nctrl);
+	nvme_tcp_ofld_stop_io_queues(nctrl);
+out_cleanup_connect_q:
+	nvme_cancel_tagset(nctrl);
+	if (new)
+		blk_cleanup_queue(nctrl->connect_q);
+out_free_tag_set:
+	if (new)
+		blk_mq_free_tag_set(nctrl->tagset);
+out_free_io_queues:
+	nvme_tcp_ofld_free_io_queues(nctrl);
+
+	return rc;
+}
+
+static void nvme_tcp_ofld_reconnect_or_remove(struct nvme_ctrl *nctrl)
+{
+	/* If we are resetting/deleting then do nothing */
+	if (nctrl->state != NVME_CTRL_CONNECTING) {
+		WARN_ON_ONCE(nctrl->state == NVME_CTRL_NEW ||
+			     nctrl->state == NVME_CTRL_LIVE);
+
+		return;
+	}
+
+	if (nvmf_should_reconnect(nctrl)) {
+		dev_info(nctrl->device, "Reconnecting in %d seconds...\n",
+			 nctrl->opts->reconnect_delay);
+		queue_delayed_work(nvme_wq,
+				   &to_tcp_ofld_ctrl(nctrl)->connect_work,
+				   nctrl->opts->reconnect_delay * HZ);
+	} else {
+		dev_info(nctrl->device, "Removing controller...\n");
+		nvme_delete_ctrl(nctrl);
+	}
+}
+
+static int
+nvme_tcp_ofld_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+			      unsigned int hctx_idx)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl = data;
+
+	hctx->driver_data = &ctrl->queues[0];
+
+	return 0;
+}
+
+static int nvme_tcp_ofld_setup_ctrl(struct nvme_ctrl *nctrl, bool new)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl);
+	struct nvmf_ctrl_options *opts = nctrl->opts;
+	int rc = 0;
+
+	rc = ctrl->dev->ops->setup_ctrl(ctrl);
+	if (rc)
+		return rc;
+
+	rc = nvme_tcp_ofld_configure_admin_queue(nctrl, new);
+	if (rc)
+		goto out_release_ctrl;
+
+	if (nctrl->icdoff) {
+		dev_err(nctrl->device, "icdoff is not supported!\n");
+		rc = -EINVAL;
+		goto destroy_admin;
+	}
+
+	if (!(nctrl->sgls & ((1 << 0) | (1 << 1)))) {
+		dev_err(nctrl->device, "Mandatory sgls are not supported!\n");
+		rc = -EINVAL;
+		goto destroy_admin;
+	}
+
+	if (opts->queue_size > nctrl->sqsize + 1)
+		dev_warn(nctrl->device,
+			 "queue_size %zu > ctrl sqsize %u, clamping down\n",
+			 opts->queue_size, nctrl->sqsize + 1);
+
+	if (nctrl->sqsize + 1 > nctrl->maxcmd) {
+		dev_warn(nctrl->device,
+			 "sqsize %u > ctrl maxcmd %u, clamping down\n",
+			 nctrl->sqsize + 1, nctrl->maxcmd);
+		nctrl->sqsize = nctrl->maxcmd - 1;
+	}
+
+	if (nctrl->queue_count > 1) {
+		rc = nvme_tcp_ofld_configure_io_queues(nctrl, new);
+		if (rc)
+			goto destroy_admin;
+	}
+
+	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_LIVE)) {
+		/*
+		 * state change failure is ok if we started ctrl delete,
+		 * unless we're during creation of a new controller to
+		 * avoid races with teardown flow.
+ */ + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING && + nctrl->state != NVME_CTRL_DELETING_NOIO); + WARN_ON_ONCE(new); + rc = -EINVAL; + goto destroy_io; + } + + nvme_start_ctrl(nctrl); + + return 0; + +destroy_io: + if (nctrl->queue_count > 1) { + nvme_stop_queues(nctrl); + nvme_sync_io_queues(nctrl); + nvme_tcp_ofld_stop_io_queues(nctrl); + nvme_cancel_tagset(nctrl); + nvme_tcp_ofld_destroy_io_queues(nctrl, new); + } +destroy_admin: + blk_mq_quiesce_queue(nctrl->admin_q); + blk_sync_queue(nctrl->admin_q); + nvme_tcp_ofld_stop_queue(nctrl, 0); + nvme_cancel_admin_tagset(nctrl); + nvme_tcp_ofld_destroy_admin_queue(nctrl, new); +out_release_ctrl: + ctrl->dev->ops->release_ctrl(ctrl); + + return rc; +} + +static int +nvme_tcp_ofld_check_dev_opts(struct nvmf_ctrl_options *opts, + struct nvme_tcp_ofld_ops *ofld_ops) +{ + unsigned int nvme_tcp_ofld_opt_mask = NVMF_ALLOWED_OPTS | + ofld_ops->allowed_opts | ofld_ops->required_opts; + struct nvmf_ctrl_options dev_opts_mask; + + if (opts->mask & ~nvme_tcp_ofld_opt_mask) { + pr_warn("One or more nvmf options missing from ofld drvr %s.\n", + ofld_ops->name); + + dev_opts_mask.mask = nvme_tcp_ofld_opt_mask; + + return nvmf_check_required_opts(&dev_opts_mask, opts->mask); + } + + return 0; +} + +static void nvme_tcp_ofld_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + struct nvme_tcp_ofld_dev *dev = ctrl->dev; + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + ctrl->dev->ops->release_ctrl(ctrl); + + mutex_lock(&nvme_tcp_ofld_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_tcp_ofld_ctrl_mutex); + + nvmf_free_options(nctrl->opts); +free_ctrl: + module_put(dev->ops->module); + kfree(ctrl->queues); + kfree(ctrl); +} + +static void nvme_tcp_ofld_set_sg_null(struct nvme_command *c) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = 0; + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | NVME_SGL_FMT_TRANSPORT_A; +} + +inline void nvme_tcp_ofld_set_sg_inline(struct nvme_tcp_ofld_queue *queue, + struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = cpu_to_le64(queue->ctrl->nctrl.icdoff); + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET; +} + +static void nvme_tcp_ofld_map_data(struct nvme_command *c, u32 data_len) +{ + struct nvme_sgl_desc *sg = &c->common.dptr.sgl; + + sg->addr = 0; + sg->length = cpu_to_le32(data_len); + sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) | NVME_SGL_FMT_TRANSPORT_A; +} + +static void nvme_tcp_ofld_submit_async_event(struct nvme_ctrl *arg) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(arg); + struct nvme_tcp_ofld_queue *queue = &ctrl->queues[0]; + struct nvme_tcp_ofld_dev *dev = queue->dev; + struct nvme_tcp_ofld_ops *ops = dev->ops; + + ctrl->async_req.nvme_cmd.common.opcode = nvme_admin_async_event; + ctrl->async_req.nvme_cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; + ctrl->async_req.nvme_cmd.common.flags |= NVME_CMD_SGL_METABUF; + + nvme_tcp_ofld_set_sg_null(&ctrl->async_req.nvme_cmd); + + ctrl->async_req.async = true; + ctrl->async_req.queue = queue; + ctrl->async_req.done = nvme_tcp_ofld_async_req_done; + + ops->send_req(&ctrl->async_req); +} + +static void +nvme_tcp_ofld_teardown_admin_queue(struct nvme_ctrl *nctrl, bool remove) +{ + blk_mq_quiesce_queue(nctrl->admin_q); + blk_sync_queue(nctrl->admin_q); + + nvme_tcp_ofld_stop_queue(nctrl, 0); + nvme_cancel_admin_tagset(nctrl); + + if (remove) + 
blk_mq_unquiesce_queue(nctrl->admin_q); + + nvme_tcp_ofld_destroy_admin_queue(nctrl, remove); +} + +static void +nvme_tcp_ofld_teardown_io_queues(struct nvme_ctrl *nctrl, bool remove) +{ + if (nctrl->queue_count <= 1) + return; + + blk_mq_quiesce_queue(nctrl->admin_q); + nvme_start_freeze(nctrl); + nvme_stop_queues(nctrl); + nvme_sync_io_queues(nctrl); + nvme_tcp_ofld_stop_io_queues(nctrl); + nvme_cancel_tagset(nctrl); + + if (remove) + nvme_start_queues(nctrl); + + nvme_tcp_ofld_destroy_io_queues(nctrl, remove); +} + +static void nvme_tcp_ofld_reconnect_ctrl_work(struct work_struct *work) +{ + struct nvme_tcp_ofld_ctrl *ctrl = + container_of(to_delayed_work(work), + struct nvme_tcp_ofld_ctrl, + connect_work); + struct nvme_ctrl *nctrl = &ctrl->nctrl; + + ++nctrl->nr_reconnects; + + if (nvme_tcp_ofld_setup_ctrl(nctrl, false)) + goto requeue; + + dev_info(nctrl->device, "Successfully reconnected (%d attempt)\n", + nctrl->nr_reconnects); + + nctrl->nr_reconnects = 0; + + return; + +requeue: + dev_info(nctrl->device, "Failed reconnect attempt %d\n", + nctrl->nr_reconnects); + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + +static void nvme_tcp_ofld_error_recovery_work(struct work_struct *work) +{ + struct nvme_tcp_ofld_ctrl *ctrl = + container_of(work, struct nvme_tcp_ofld_ctrl, err_work); + struct nvme_ctrl *nctrl = &ctrl->nctrl; + + nvme_stop_keep_alive(nctrl); + nvme_tcp_ofld_teardown_io_queues(nctrl, false); + /* unquiesce to fail fast pending requests */ + nvme_start_queues(nctrl); + nvme_tcp_ofld_teardown_admin_queue(nctrl, false); + blk_mq_unquiesce_queue(nctrl->admin_q); + + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started nctrl delete */ + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING && + nctrl->state != NVME_CTRL_DELETING_NOIO); + + return; + } + + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + +static void +nvme_tcp_ofld_teardown_ctrl(struct nvme_ctrl *nctrl, bool shutdown) +{ + struct nvme_tcp_ofld_ctrl *ctrl = to_tcp_ofld_ctrl(nctrl); + + cancel_work_sync(&ctrl->err_work); + cancel_delayed_work_sync(&ctrl->connect_work); + nvme_tcp_ofld_teardown_io_queues(nctrl, shutdown); + blk_mq_quiesce_queue(nctrl->admin_q); + if (shutdown) + nvme_shutdown_ctrl(nctrl); + else + nvme_disable_ctrl(nctrl); + nvme_tcp_ofld_teardown_admin_queue(nctrl, shutdown); +} + +static void nvme_tcp_ofld_delete_ctrl(struct nvme_ctrl *nctrl) +{ + nvme_tcp_ofld_teardown_ctrl(nctrl, true); +} + +static void nvme_tcp_ofld_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *nctrl = + container_of(work, struct nvme_ctrl, reset_work); + + nvme_stop_ctrl(nctrl); + nvme_tcp_ofld_teardown_ctrl(nctrl, false); + + if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) { + /* state change failure is ok if we started ctrl delete */ + WARN_ON_ONCE(nctrl->state != NVME_CTRL_DELETING && + nctrl->state != NVME_CTRL_DELETING_NOIO); + + return; + } + + if (nvme_tcp_ofld_setup_ctrl(nctrl, false)) + goto out_fail; + + return; + +out_fail: + ++nctrl->nr_reconnects; + nvme_tcp_ofld_reconnect_or_remove(nctrl); +} + +static int +nvme_tcp_ofld_init_request(struct blk_mq_tag_set *set, + struct request *rq, + unsigned int hctx_idx, + unsigned int numa_node) +{ + struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data; + int qid; + + qid = (set == &ctrl->tag_set) ? 
hctx_idx + 1 : 0; + req->queue = &ctrl->queues[qid]; + nvme_req(rq)->ctrl = &ctrl->nctrl; + nvme_req(rq)->cmd = &req->nvme_cmd; + req->done = nvme_tcp_ofld_req_done; + + return 0; +} + +inline size_t nvme_tcp_ofld_inline_data_size(struct nvme_tcp_ofld_queue *queue) +{ + return queue->cmnd_capsule_len - sizeof(struct nvme_command); +} +EXPORT_SYMBOL_GPL(nvme_tcp_ofld_inline_data_size); + +static blk_status_t +nvme_tcp_ofld_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(bd->rq); + struct nvme_tcp_ofld_queue *queue = hctx->driver_data; + struct nvme_tcp_ofld_ctrl *ctrl = queue->ctrl; + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_tcp_ofld_dev *dev = queue->dev; + struct nvme_tcp_ofld_ops *ops = dev->ops; + struct nvme_command *nvme_cmd; + struct request *rq = bd->rq; + bool queue_ready; + u32 data_len; + int rc; + + queue_ready = test_bit(NVME_TCP_OFLD_Q_LIVE, &queue->flags); + + req->async = false; + + if (!nvme_check_ready(&ctrl->nctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&ctrl->nctrl, rq); + + rc = nvme_setup_cmd(ns, rq); + if (unlikely(rc)) + return rc; + + blk_mq_start_request(rq); + + nvme_cmd = &req->nvme_cmd; + nvme_cmd->common.flags |= NVME_CMD_SGL_METABUF; + + data_len = blk_rq_nr_phys_segments(rq) ? blk_rq_payload_bytes(rq) : 0; + if (!data_len) + nvme_tcp_ofld_set_sg_null(&req->nvme_cmd); + else if ((rq_data_dir(rq) == WRITE) && + data_len <= nvme_tcp_ofld_inline_data_size(queue)) + nvme_tcp_ofld_set_sg_inline(queue, nvme_cmd, data_len); + else + nvme_tcp_ofld_map_data(nvme_cmd, data_len); + + rc = ops->send_req(req); + if (unlikely(rc)) + return rc; + + return BLK_STS_OK; +} + +static void +nvme_tcp_ofld_exit_request(struct blk_mq_tag_set *set, + struct request *rq, unsigned int hctx_idx) +{ + /* + * Nothing is allocated in nvme_tcp_ofld_init_request, + * hence empty. 
+ */ +} + +static int +nvme_tcp_ofld_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_tcp_ofld_ctrl *ctrl = data; + + hctx->driver_data = &ctrl->queues[hctx_idx + 1]; + + return 0; +} + +static int nvme_tcp_ofld_map_queues(struct blk_mq_tag_set *set) +{ + struct nvme_tcp_ofld_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->nctrl.opts; + + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; + set->map[HCTX_TYPE_READ].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + + dev_info(ctrl->nctrl.device, + "mapped %d/%d/%d default/read/poll queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); + + return 0; +} + +static int nvme_tcp_ofld_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_ofld_queue *queue = hctx->driver_data; + struct nvme_tcp_ofld_dev *dev = queue->dev; + struct nvme_tcp_ofld_ops *ops = dev->ops; + + return ops->poll_queue(queue); +} + +static void nvme_tcp_ofld_complete_timed_out(struct request *rq) +{ + struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *nctrl = &req->queue->ctrl->nctrl; + + nvme_tcp_ofld_stop_queue(nctrl, nvme_tcp_ofld_qid(req->queue)); + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) { + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; + blk_mq_complete_request(rq); + } +} + +static enum blk_eh_timer_return nvme_tcp_ofld_timeout(struct request *rq, bool reserved) +{ + struct nvme_tcp_ofld_req *req = blk_mq_rq_to_pdu(rq); + struct nvme_tcp_ofld_ctrl *ctrl = req->queue->ctrl; + + dev_warn(ctrl->nctrl.device, + "queue %d: timeout request %#x type %d\n", + nvme_tcp_ofld_qid(req->queue), rq->tag, req->nvme_cmd.common.opcode); + + if (ctrl->nctrl.state != NVME_CTRL_LIVE) { + /* + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. 
+ */ + nvme_tcp_ofld_complete_timed_out(rq); + + return BLK_EH_DONE; + } + + nvme_tcp_ofld_error_recovery(&ctrl->nctrl); + + return BLK_EH_RESET_TIMER; +} + +static struct blk_mq_ops nvme_tcp_ofld_mq_ops = { + .queue_rq = nvme_tcp_ofld_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_ofld_init_request, + .exit_request = nvme_tcp_ofld_exit_request, + .init_hctx = nvme_tcp_ofld_init_hctx, + .timeout = nvme_tcp_ofld_timeout, + .map_queues = nvme_tcp_ofld_map_queues, + .poll = nvme_tcp_ofld_poll, +}; + +static struct blk_mq_ops nvme_tcp_ofld_admin_mq_ops = { + .queue_rq = nvme_tcp_ofld_queue_rq, + .complete = nvme_complete_rq, + .init_request = nvme_tcp_ofld_init_request, + .exit_request = nvme_tcp_ofld_exit_request, + .init_hctx = nvme_tcp_ofld_init_admin_hctx, + .timeout = nvme_tcp_ofld_timeout, +}; + +static const struct nvme_ctrl_ops nvme_tcp_ofld_ctrl_ops = { + .name = "tcp_offload", + .module = THIS_MODULE, + .flags = NVME_F_FABRICS, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .free_ctrl = nvme_tcp_ofld_free_ctrl, + .submit_async_event = nvme_tcp_ofld_submit_async_event, + .delete_ctrl = nvme_tcp_ofld_delete_ctrl, + .get_address = nvmf_get_address, +}; + +static bool +nvme_tcp_ofld_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ofld_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_tcp_ofld_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list) { + found = nvmf_ip_options_match(&ctrl->nctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_tcp_ofld_ctrl_mutex); + + return found; +} + +static struct nvme_ctrl * +nvme_tcp_ofld_create_ctrl(struct device *ndev, struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ofld_queue *queue; + struct nvme_tcp_ofld_ctrl *ctrl; + struct nvme_tcp_ofld_dev *dev; + struct nvme_ctrl *nctrl; + int i, rc = 0; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ctrl->list); + nctrl = &ctrl->nctrl; + nctrl->opts = opts; + nctrl->queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; + nctrl->sqsize = opts->queue_size - 1; + nctrl->kato = opts->kato; + INIT_DELAYED_WORK(&ctrl->connect_work, + nvme_tcp_ofld_reconnect_ctrl_work); + INIT_WORK(&ctrl->err_work, nvme_tcp_ofld_error_recovery_work); + INIT_WORK(&nctrl->reset_work, nvme_tcp_ofld_reset_ctrl_work); + if (!(opts->mask & NVMF_OPT_TRSVCID)) { + opts->trsvcid = + kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL); + if (!opts->trsvcid) { + rc = -ENOMEM; + goto out_free_ctrl; + } + opts->mask |= NVMF_OPT_TRSVCID; + } + + rc = inet_pton_with_scope(&init_net, AF_UNSPEC, opts->traddr, + opts->trsvcid, + &ctrl->conn_params.remote_ip_addr); + if (rc) { + pr_err("malformed address passed: %s:%s\n", + opts->traddr, opts->trsvcid); + goto out_free_ctrl; + } + + if (opts->mask & NVMF_OPT_HOST_TRADDR) { + rc = inet_pton_with_scope(&init_net, AF_UNSPEC, + opts->host_traddr, NULL, + &ctrl->conn_params.local_ip_addr); + if (rc) { + pr_err("malformed src address passed: %s\n", + opts->host_traddr); + goto out_free_ctrl; + } + } + + if (!opts->duplicate_connect && + nvme_tcp_ofld_existing_controller(opts)) { + rc = -EALREADY; + goto out_free_ctrl; + } + + /* Find device that can reach the dest addr */ + dev = nvme_tcp_ofld_lookup_dev(ctrl); + if (!dev) { + pr_info("no device found for addr %s:%s.\n", + opts->traddr, opts->trsvcid); + rc = -EINVAL; + goto out_free_ctrl; + } + + /* Increase driver refcnt */ + 
if (!try_module_get(dev->ops->module)) {
+		pr_err("try_module_get failed\n");
+		dev = NULL;
+		rc = -EINVAL;
+		goto out_free_ctrl;
+	}
+
+	rc = nvme_tcp_ofld_check_dev_opts(opts, dev->ops);
+	if (rc)
+		goto out_module_put;
+
+	ctrl->dev = dev;
+
+	if (ctrl->dev->ops->max_hw_sectors)
+		nctrl->max_hw_sectors = ctrl->dev->ops->max_hw_sectors;
+	if (ctrl->dev->ops->max_segments)
+		nctrl->max_segments = ctrl->dev->ops->max_segments;
+
+	ctrl->queues = kcalloc(nctrl->queue_count,
+			       sizeof(struct nvme_tcp_ofld_queue),
+			       GFP_KERNEL);
+	if (!ctrl->queues) {
+		rc = -ENOMEM;
+		goto out_module_put;
+	}
+
+	for (i = 0; i < nctrl->queue_count; ++i) {
+		queue = &ctrl->queues[i];
+		queue->ctrl = ctrl;
+		queue->dev = dev;
+		queue->report_err = nvme_tcp_ofld_report_queue_err;
+	}
+
+	rc = nvme_init_ctrl(nctrl, ndev, &nvme_tcp_ofld_ctrl_ops, 0);
+	if (rc)
+		goto out_free_queues;
+
+	if (!nvme_change_ctrl_state(nctrl, NVME_CTRL_CONNECTING)) {
+		WARN_ON_ONCE(1);
+		rc = -EINTR;
+		goto out_uninit_ctrl;
+	}
+
+	rc = nvme_tcp_ofld_setup_ctrl(nctrl, true);
+	if (rc)
+		goto out_uninit_ctrl;
+
+	dev_info(nctrl->device, "new ctrl: NQN \"%s\", addr %pISp\n",
+		 opts->subsysnqn, &ctrl->conn_params.remote_ip_addr);
+
+	mutex_lock(&nvme_tcp_ofld_ctrl_mutex);
+	list_add_tail(&ctrl->list, &nvme_tcp_ofld_ctrl_list);
+	mutex_unlock(&nvme_tcp_ofld_ctrl_mutex);
+
+	return nctrl;
+
+out_uninit_ctrl:
+	nvme_uninit_ctrl(nctrl);
+	nvme_put_ctrl(nctrl);
+out_free_queues:
+	kfree(ctrl->queues);
+out_module_put:
+	module_put(dev->ops->module);
+out_free_ctrl:
+	kfree(ctrl);
+
+	return ERR_PTR(rc);
+}
+
+static struct nvmf_transport_ops nvme_tcp_ofld_transport = {
+	.name		= "tcp_offload",
+	.module		= THIS_MODULE,
+	.required_opts	= NVMF_OPT_TRADDR,
+	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_NR_WRITE_QUEUES |
+			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+			  NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HDR_DIGEST |
+			  NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_POLL_QUEUES |
+			  NVMF_OPT_TOS,
+	.create_ctrl	= nvme_tcp_ofld_create_ctrl,
+};
+
+static int __init nvme_tcp_ofld_init_module(void)
+{
+	nvmf_register_transport(&nvme_tcp_ofld_transport);
+
+	return 0;
+}
+
+static void __exit nvme_tcp_ofld_cleanup_module(void)
+{
+	struct nvme_tcp_ofld_ctrl *ctrl;
+
+	nvmf_unregister_transport(&nvme_tcp_ofld_transport);
+
+	mutex_lock(&nvme_tcp_ofld_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ofld_ctrl_list, list)
+		nvme_delete_ctrl(&ctrl->nctrl);
+	mutex_unlock(&nvme_tcp_ofld_ctrl_mutex);
+	flush_workqueue(nvme_delete_wq);
+}
+
+module_init(nvme_tcp_ofld_init_module);
+module_exit(nvme_tcp_ofld_cleanup_module);
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/tcp-offload.h b/drivers/nvme/host/tcp-offload.h
new file mode 100644
index 000000000000..2ac5b2428612
--- /dev/null
+++ b/drivers/nvme/host/tcp-offload.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2021 Marvell. All rights reserved.
+ */
+
+/* Linux includes */
+#include <linux/blk-mq.h>
+#include <linux/nvme-tcp.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
+
+/* Driver includes */
+#include "nvme.h"
+#include "fabrics.h"
+
+/* Forward declarations */
+struct nvme_tcp_ofld_ops;
+
+/* Representation of a vendor-specific device. This is the struct used to
+ * register to the offload layer by the vendor-specific driver during its probe
+ * function.
+ * Allocated by vendor-specific driver.
+ */ +struct nvme_tcp_ofld_dev { + struct list_head entry; + struct net_device *ndev; + struct nvme_tcp_ofld_ops *ops; + + /* Vendor specific driver context */ + int num_hw_vectors; +}; + +/* Per IO struct holding the nvme_request and command + * Allocated by blk-mq. + */ +struct nvme_tcp_ofld_req { + struct nvme_request req; + struct nvme_command nvme_cmd; + struct list_head queue_entry; + struct nvme_tcp_ofld_queue *queue; + + /* Vendor specific driver context */ + void *private_data; + + /* async flag is used to distinguish between async and IO flow + * in common send_req() of nvme_tcp_ofld_ops. + */ + bool async; + + void (*done)(struct nvme_tcp_ofld_req *req, + union nvme_result *result, + __le16 status); +}; + +enum nvme_tcp_ofld_queue_flags { + NVME_TCP_OFLD_Q_ALLOCATED = 0, + NVME_TCP_OFLD_Q_LIVE = 1, +}; + +/* Allocated by nvme_tcp_ofld */ +struct nvme_tcp_ofld_queue { + /* Offload device associated to this queue */ + struct nvme_tcp_ofld_dev *dev; + struct nvme_tcp_ofld_ctrl *ctrl; + unsigned long flags; + size_t cmnd_capsule_len; + + /* mutex used during stop_queue */ + struct mutex queue_lock; + + u8 hdr_digest; + u8 data_digest; + u8 tos; + + /* Vendor specific driver context */ + void *private_data; + + /* Error callback function */ + int (*report_err)(struct nvme_tcp_ofld_queue *queue); +}; + +/* Connectivity (routing) params used for establishing a connection */ +struct nvme_tcp_ofld_ctrl_con_params { + struct sockaddr_storage remote_ip_addr; + + /* If NVMF_OPT_HOST_TRADDR is provided it will be set in local_ip_addr + * in nvme_tcp_ofld_create_ctrl(). + * If NVMF_OPT_HOST_TRADDR is not provided the local_ip_addr will be + * initialized by claim_dev(). + */ + struct sockaddr_storage local_ip_addr; +}; + +/* Allocated by nvme_tcp_ofld */ +struct nvme_tcp_ofld_ctrl { + struct nvme_ctrl nctrl; + struct list_head list; + struct nvme_tcp_ofld_dev *dev; + + /* admin and IO queues */ + struct blk_mq_tag_set tag_set; + struct blk_mq_tag_set admin_tag_set; + struct nvme_tcp_ofld_queue *queues; + + struct work_struct err_work; + struct delayed_work connect_work; + + /* + * Each entry in the array indicates the number of queues of + * corresponding type. + */ + u32 io_queues[HCTX_MAX_TYPES]; + + /* Connectivity params */ + struct nvme_tcp_ofld_ctrl_con_params conn_params; + + struct nvme_tcp_ofld_req async_req; + + /* Vendor specific driver context */ + void *private_data; +}; + +struct nvme_tcp_ofld_ops { + const char *name; + struct module *module; + + /* For vendor-specific driver to report what opts it supports. + * It could be different than the ULP supported opts due to hardware + * limitations. Also it could be different among different vendor + * drivers. + */ + int required_opts; /* bitmap using enum nvmf_parsing_opts */ + int allowed_opts; /* bitmap using enum nvmf_parsing_opts */ + + /* For vendor-specific max num of segments and IO sizes */ + u32 max_hw_sectors; + u32 max_segments; + + /** + * claim_dev: Return True if addr is reachable via offload device. + * @dev: The offload device to check. + * @ctrl: The offload ctrl have the conn_params field. The + * conn_params is to be filled with routing params by the lower + * driver. + */ + int (*claim_dev)(struct nvme_tcp_ofld_dev *dev, + struct nvme_tcp_ofld_ctrl *ctrl); + + /** + * setup_ctrl: Setup device specific controller structures. + * @ctrl: The offload ctrl. + */ + int (*setup_ctrl)(struct nvme_tcp_ofld_ctrl *ctrl); + + /** + * release_ctrl: Release/Free device specific controller structures. 
+	 * @ctrl: The offload ctrl.
+	 */
+	int (*release_ctrl)(struct nvme_tcp_ofld_ctrl *ctrl);
+
+	/**
+	 * create_queue: Create offload queue and establish TCP + NVMeTCP
+	 * (icreq+icresp) connection. Return 0 on successful connection.
+	 * Based on nvme_tcp_alloc_queue.
+	 * @queue: The queue itself - used as input and output.
+	 * @qid: The queue ID associated with the requested queue.
+	 * @queue_size: The queue depth.
+	 */
+	int (*create_queue)(struct nvme_tcp_ofld_queue *queue, int qid,
+			    size_t queue_size);
+
+	/**
+	 * drain_queue: Drain a given queue - blocking function call.
+	 * Return from this function ensures that no additional
+	 * completions will arrive on this queue and that the HW will
+	 * not access host memory.
+	 * @queue: The queue to drain.
+	 */
+	void (*drain_queue)(struct nvme_tcp_ofld_queue *queue);
+
+	/**
+	 * destroy_queue: Close the TCP + NVMeTCP connection of a given queue
+	 * and make sure it's no longer active (no completions will arrive on
+	 * the queue).
+	 * @queue: The queue to destroy.
+	 */
+	void (*destroy_queue)(struct nvme_tcp_ofld_queue *queue);
+
+	/**
+	 * poll_queue: Poll a given queue for completions.
+	 * @queue: The queue to poll.
+	 */
+	int (*poll_queue)(struct nvme_tcp_ofld_queue *queue);
+
+	/**
+	 * send_req: Dispatch a request. Returns the execution status.
+	 * @req: Ptr to request to be sent.
+	 */
+	int (*send_req)(struct nvme_tcp_ofld_req *req);
+};
+
+/* Exported functions for lower vendor specific offload drivers */
+int nvme_tcp_ofld_register_dev(struct nvme_tcp_ofld_dev *dev);
+void nvme_tcp_ofld_unregister_dev(struct nvme_tcp_ofld_dev *dev);
+void nvme_tcp_ofld_error_recovery(struct nvme_ctrl *nctrl);
+inline size_t nvme_tcp_ofld_inline_data_size(struct nvme_tcp_ofld_queue *queue);
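
For reference, below is a minimal sketch of how a lower-level vendor driver might plug into this offload ULP. It is not part of the patch: it only exercises the API introduced here (struct nvme_tcp_ofld_ops, nvme_tcp_ofld_register_dev() and nvme_tcp_ofld_unregister_dev()), every foo_* identifier is hypothetical, and the callback bodies are empty stubs rather than a real hardware implementation.

/* Hypothetical vendor driver skeleton (illustration only). */
#include <linux/module.h>

#include "tcp-offload.h"

static int foo_claim_dev(struct nvme_tcp_ofld_dev *dev,
			 struct nvme_tcp_ofld_ctrl *ctrl)
{
	/* Return non-zero only if ctrl->conn_params.remote_ip_addr is
	 * reachable through this device; a real driver would also fill
	 * ctrl->conn_params.local_ip_addr here.
	 */
	return 0;
}

static int foo_setup_ctrl(struct nvme_tcp_ofld_ctrl *ctrl) { return 0; }
static int foo_release_ctrl(struct nvme_tcp_ofld_ctrl *ctrl) { return 0; }

static int foo_create_queue(struct nvme_tcp_ofld_queue *queue, int qid,
			    size_t queue_size)
{
	/* Establish the TCP connection and icreq/icresp exchange in HW. */
	return 0;
}

static void foo_drain_queue(struct nvme_tcp_ofld_queue *queue) { }
static void foo_destroy_queue(struct nvme_tcp_ofld_queue *queue) { }
static int foo_poll_queue(struct nvme_tcp_ofld_queue *queue) { return 0; }

static int foo_send_req(struct nvme_tcp_ofld_req *req)
{
	/* Post req->nvme_cmd to HW; completion is reported via req->done(). */
	return 0;
}

static struct nvme_tcp_ofld_ops foo_ofld_ops = {
	.name		= "foo",
	.module		= THIS_MODULE,
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_NR_WRITE_QUEUES |
			  NVMF_OPT_NR_POLL_QUEUES,
	.claim_dev	= foo_claim_dev,
	.setup_ctrl	= foo_setup_ctrl,
	.release_ctrl	= foo_release_ctrl,
	.create_queue	= foo_create_queue,
	.drain_queue	= foo_drain_queue,
	.destroy_queue	= foo_destroy_queue,
	.poll_queue	= foo_poll_queue,
	.send_req	= foo_send_req,
};

static struct nvme_tcp_ofld_dev foo_ofld_dev = {
	.ops		= &foo_ofld_ops,
	.num_hw_vectors	= 1,
};

static int __init foo_init(void)
{
	/* All eight mandatory callbacks must be set, or this returns -EINVAL. */
	return nvme_tcp_ofld_register_dev(&foo_ofld_dev);
}

static void __exit foo_exit(void)
{
	nvme_tcp_ofld_unregister_dev(&foo_ofld_dev);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL v2");

With such a module loaded alongside nvme-tcp-offload, a controller would be created through the usual nvme-cli path (nvme connect -t tcp_offload -a <traddr> -n <nqn>), and nvme_tcp_ofld_lookup_dev() would ask each registered device to claim the destination address via its claim_dev() callback before the queues are created.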