for-5.19/io_uring-passthrough-2022-05-22

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmKKovAQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpv9oD/4qCs7k3bPZZWZ6xoWb4EObyyWOUifi26lp
 vpsJHFUbA67S/i4++LV9H18SazWJ7h08ac4bjgZ+NQz40/1WkTN8/Fa76jo+BnNK
 7T10Wp4Ak6uwWVrKaA81pnT+G9+xmHlJ3X27aKxzLuT7BEPpShZ6ouFVjTkx9CzN
 LrLjuCDTOBBN+ZoaroWYfdLwTQX2VCAl9B15lOtQIlFvuuU8VlrvLboY+80K8TvY
 1wvTA2HTjnXoYx+/cTTMIFZIwQH3r1hsbwEDD8/YJj1+ouhSRQ1b0p/nk2pA+3ws
 HF5r/YS/rLBjlPF094IzeOBaUyA433AN1VhZqnII8ek7ViT3W3x+BRrgE9O6ZkWT
 0AjX1BXReI5rdFmxBmwsSdBnrSoGaJOf2GdsCCdubXBIi+F/RvyajrPf7PTB5zbW
 9WEK/uy3xvZsRVkUGAzOb9QGdvjcllgMzwPJsDegDCw5PdcPdT3mzy6KGIWipFLp
 j8R+br7hRMpOJv/YpihJDMzSDkQ/r1/SCwR4fpLid/QdSHG/eRTQK6c4Su5bNYEy
 QDy2F6kQdBVtEJCQHcEOsbhXzSTNBcdB+ujUUM5653FkaHe6y4JbomLrsNx407Id
 i/4ROwA5K1dioJx503Eap+OhbI5rV+PFytJTwxvLrNyVGccwbH2YOVq80fsVBP2e
 cZbn6EX4Vg==
 =/peE
 -----END PGP SIGNATURE-----

Merge tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block

Pull io_uring NVMe command passthrough from Jens Axboe:
 "On top of everything else, this adds support for passthrough for
  io_uring.

  The initial feature for this is NVMe passthrough support, which allows
  non-filesystem based IO commands and admin commands.

  To support this, io_uring grows support for SQE and CQE members that
  are twice as big, allowing to pass in a full NVMe command without
  having to copy data around. And to complete with more than just a
  single 32-bit value as the output"

* tag 'for-5.19/io_uring-passthrough-2022-05-22' of git://git.kernel.dk/linux-block: (22 commits)
  io_uring: cleanup handling of the two task_work lists
  nvme: enable uring-passthrough for admin commands
  nvme: helper for uring-passthrough checks
  blk-mq: fix passthrough plugging
  nvme: add vectored-io support for uring-cmd
  nvme: wire-up uring-cmd support for io-passthru on char-device.
  nvme: refactor nvme_submit_user_cmd()
  block: wire-up support for passthrough plugging
  fs,io_uring: add infrastructure for uring-cmd
  io_uring: support CQE32 for nop operation
  io_uring: enable CQE32
  io_uring: support CQE32 in /proc info
  io_uring: add tracing for additional CQE32 fields
  io_uring: overflow processing for CQE32
  io_uring: flush completions for CQE32
  io_uring: modify io_get_cqe for CQE32
  io_uring: add CQE32 completion processing
  io_uring: add CQE32 setup processing
  io_uring: change ring size calculation for CQE32
  io_uring: store add. return values for CQE32
  ...
This commit is contained in:
Linus Torvalds 2022-05-23 13:06:15 -07:00
commit 9836e93c0a
11 changed files with 809 additions and 141 deletions

View file

@ -1169,6 +1169,62 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
complete(waiting);
}
/*
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
* queues. This is important for md arrays to benefit from merging
* requests.
*/
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
if (plug->multiple_queues)
return BLK_MAX_REQUEST_COUNT * 2;
return BLK_MAX_REQUEST_COUNT;
}
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
struct request *last = rq_list_peek(&plug->mq_list);
if (!plug->rq_count) {
trace_block_plug(rq->q);
} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
(!blk_queue_nomerges(rq->q) &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_mq_flush_plug_list(plug, false);
trace_block_plug(rq->q);
}
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
plug->rq_count++;
}
static void __blk_execute_rq_nowait(struct request *rq, bool at_head,
rq_end_io_fn *done, bool use_plug)
{
WARN_ON(irqs_disabled());
WARN_ON(!blk_rq_is_passthrough(rq));
rq->end_io = done;
blk_account_io_start(rq);
if (use_plug && current->plug) {
blk_add_rq_to_plug(current->plug, rq);
return;
}
/*
* don't check dying flag for MQ because the request won't
* be reused after dying flag is set
*/
blk_mq_sched_insert_request(rq, at_head, true, false);
}
/**
* blk_execute_rq_nowait - insert a request to I/O scheduler for execution
* @rq: request to insert
@ -1184,18 +1240,8 @@ static void blk_end_sync_rq(struct request *rq, blk_status_t error)
*/
void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done)
{
WARN_ON(irqs_disabled());
WARN_ON(!blk_rq_is_passthrough(rq));
__blk_execute_rq_nowait(rq, at_head, done, true);
rq->end_io = done;
blk_account_io_start(rq);
/*
* don't check dying flag for MQ because the request won't
* be reused after dying flag is set
*/
blk_mq_sched_insert_request(rq, at_head, true, false);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
@ -1233,8 +1279,13 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
DECLARE_COMPLETION_ONSTACK(wait);
unsigned long hang_check;
/*
* iopoll requires request to be submitted to driver, so can't
* use plug
*/
rq->end_io_data = &wait;
blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq);
__blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq,
!blk_rq_is_poll(rq));
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
@ -2676,40 +2727,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
hctx->queue->mq_ops->commit_rqs(hctx);
}
/*
* Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
* queues. This is important for md arrays to benefit from merging
* requests.
*/
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
if (plug->multiple_queues)
return BLK_MAX_REQUEST_COUNT * 2;
return BLK_MAX_REQUEST_COUNT;
}
static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
struct request *last = rq_list_peek(&plug->mq_list);
if (!plug->rq_count) {
trace_block_plug(rq->q);
} else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
(!blk_queue_nomerges(rq->q) &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_mq_flush_plug_list(plug, false);
trace_block_plug(rq->q);
}
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
plug->rq_count++;
}
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
struct bio *bio, unsigned int nr_segs)
{

View file

@ -3146,6 +3146,7 @@ static const struct file_operations nvme_dev_fops = {
.release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = nvme_dev_uring_cmd,
};
static ssize_t nvme_sysfs_reset(struct device *dev,
@ -3699,6 +3700,7 @@ static const struct file_operations nvme_ns_chr_fops = {
.release = nvme_ns_chr_release,
.unlocked_ioctl = nvme_ns_chr_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = nvme_ns_chr_uring_cmd,
};
static int nvme_add_ns_cdev(struct nvme_ns *ns)

View file

@ -5,6 +5,7 @@
*/
#include <linux/ptrace.h> /* for force_successful_syscall_return */
#include <linux/nvme_ioctl.h>
#include <linux/io_uring.h>
#include "nvme.h"
/*
@ -53,10 +54,21 @@ static void *nvme_add_user_metadata(struct bio *bio, void __user *ubuf,
return ERR_PTR(ret);
}
static int nvme_submit_user_cmd(struct request_queue *q,
static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
void *meta, unsigned len, int ret)
{
if (!ret && req_op(req) == REQ_OP_DRV_IN &&
copy_to_user(ubuf, meta, len))
ret = -EFAULT;
kfree(meta);
return ret;
}
static struct request *nvme_alloc_user_request(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u64 *result, unsigned timeout, bool vec)
u32 meta_seed, void **metap, unsigned timeout, bool vec,
unsigned int rq_flags, blk_mq_req_flags_t blk_flags)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
@ -66,9 +78,9 @@ static int nvme_submit_user_cmd(struct request_queue *q,
void *meta = NULL;
int ret;
req = blk_mq_alloc_request(q, nvme_req_op(cmd), 0);
req = blk_mq_alloc_request(q, nvme_req_op(cmd) | rq_flags, blk_flags);
if (IS_ERR(req))
return PTR_ERR(req);
return req;
nvme_init_request(req, cmd);
if (timeout)
@ -105,26 +117,50 @@ static int nvme_submit_user_cmd(struct request_queue *q,
goto out_unmap;
}
req->cmd_flags |= REQ_INTEGRITY;
*metap = meta;
}
}
ret = nvme_execute_passthru_rq(req);
if (result)
*result = le64_to_cpu(nvme_req(req)->result.u64);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
}
kfree(meta);
out_unmap:
return req;
out_unmap:
if (bio)
blk_rq_unmap_user(bio);
out:
blk_mq_free_request(req);
return ERR_PTR(ret);
}
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u64 *result, unsigned timeout, bool vec)
{
struct request *req;
void *meta = NULL;
struct bio *bio;
int ret;
req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer,
meta_len, meta_seed, &meta, timeout, vec, 0, 0);
if (IS_ERR(req))
return PTR_ERR(req);
bio = req->bio;
ret = nvme_execute_passthru_rq(req);
if (result)
*result = le64_to_cpu(nvme_req(req)->result.u64);
if (meta)
ret = nvme_finish_user_metadata(req, meta_buffer, meta,
meta_len, ret);
if (bio)
blk_rq_unmap_user(bio);
out:
blk_mq_free_request(req);
return ret;
}
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@ -296,6 +332,139 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return status;
}
struct nvme_uring_data {
__u64 metadata;
__u64 addr;
__u32 data_len;
__u32 metadata_len;
__u32 timeout_ms;
};
/*
* This overlays struct io_uring_cmd pdu.
* Expect build errors if this grows larger than that.
*/
struct nvme_uring_cmd_pdu {
union {
struct bio *bio;
struct request *req;
};
void *meta; /* kernel-resident buffer */
void __user *meta_buffer;
u32 meta_len;
};
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
}
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd)
{
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
struct request *req = pdu->req;
struct bio *bio = req->bio;
int status;
u64 result;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
status = -EINTR;
else
status = nvme_req(req)->status;
result = le64_to_cpu(nvme_req(req)->result.u64);
if (pdu->meta)
status = nvme_finish_user_metadata(req, pdu->meta_buffer,
pdu->meta, pdu->meta_len, status);
if (bio)
blk_rq_unmap_user(bio);
blk_mq_free_request(req);
io_uring_cmd_done(ioucmd, status, result);
}
static void nvme_uring_cmd_end_io(struct request *req, blk_status_t err)
{
struct io_uring_cmd *ioucmd = req->end_io_data;
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
/* extract bio before reusing the same field for request */
struct bio *bio = pdu->bio;
pdu->req = req;
req->bio = bio;
/* this takes care of moving rest of completion-work to task context */
io_uring_cmd_complete_in_task(ioucmd, nvme_uring_task_cb);
}
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
const struct nvme_uring_cmd *cmd = ioucmd->cmd;
struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
struct nvme_uring_data d;
struct nvme_command c;
struct request *req;
unsigned int rq_flags = 0;
blk_mq_req_flags_t blk_flags = 0;
void *meta = NULL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
c.common.opcode = READ_ONCE(cmd->opcode);
c.common.flags = READ_ONCE(cmd->flags);
if (c.common.flags)
return -EINVAL;
c.common.command_id = 0;
c.common.nsid = cpu_to_le32(cmd->nsid);
if (!nvme_validate_passthru_nsid(ctrl, ns, le32_to_cpu(c.common.nsid)))
return -EINVAL;
c.common.cdw2[0] = cpu_to_le32(READ_ONCE(cmd->cdw2));
c.common.cdw2[1] = cpu_to_le32(READ_ONCE(cmd->cdw3));
c.common.metadata = 0;
c.common.dptr.prp1 = c.common.dptr.prp2 = 0;
c.common.cdw10 = cpu_to_le32(READ_ONCE(cmd->cdw10));
c.common.cdw11 = cpu_to_le32(READ_ONCE(cmd->cdw11));
c.common.cdw12 = cpu_to_le32(READ_ONCE(cmd->cdw12));
c.common.cdw13 = cpu_to_le32(READ_ONCE(cmd->cdw13));
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
d.metadata_len = READ_ONCE(cmd->metadata_len);
d.timeout_ms = READ_ONCE(cmd->timeout_ms);
if (issue_flags & IO_URING_F_NONBLOCK) {
rq_flags = REQ_NOWAIT;
blk_flags = BLK_MQ_REQ_NOWAIT;
}
req = nvme_alloc_user_request(q, &c, nvme_to_user_ptr(d.addr),
d.data_len, nvme_to_user_ptr(d.metadata),
d.metadata_len, 0, &meta, d.timeout_ms ?
msecs_to_jiffies(d.timeout_ms) : 0, vec, rq_flags,
blk_flags);
if (IS_ERR(req))
return PTR_ERR(req);
req->end_io_data = ioucmd;
/* to free bio on completion, as req->bio will be null at that time */
pdu->bio = req->bio;
pdu->meta = meta;
pdu->meta_buffer = nvme_to_user_ptr(d.metadata);
pdu->meta_len = d.metadata_len;
blk_execute_rq_nowait(req, 0, nvme_uring_cmd_end_io);
return -EIOCBQUEUED;
}
static bool is_ctrl_ioctl(unsigned int cmd)
{
if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
@ -387,6 +556,53 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return __nvme_ioctl(ns, cmd, (void __user *)arg);
}
static int nvme_uring_cmd_checks(unsigned int issue_flags)
{
/* IOPOLL not supported yet */
if (issue_flags & IO_URING_F_IOPOLL)
return -EOPNOTSUPP;
/* NVMe passthrough requires big SQE/CQE support */
if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
(IO_URING_F_SQE128|IO_URING_F_CQE32))
return -EOPNOTSUPP;
return 0;
}
static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
unsigned int issue_flags)
{
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
BUILD_BUG_ON(sizeof(struct nvme_uring_cmd_pdu) > sizeof(ioucmd->pdu));
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;
switch (ioucmd->cmd_op) {
case NVME_URING_CMD_IO:
ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, false);
break;
case NVME_URING_CMD_IO_VEC:
ret = nvme_uring_cmd_io(ctrl, ns, ioucmd, issue_flags, true);
break;
default:
ret = -ENOTTY;
}
return ret;
}
int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
struct nvme_ns, cdev);
return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp, struct nvme_ns_head *head, int srcu_idx)
@ -453,8 +669,46 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags)
{
struct cdev *cdev = file_inode(ioucmd->file)->i_cdev;
struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev);
int srcu_idx = srcu_read_lock(&head->srcu);
struct nvme_ns *ns = nvme_find_path(head);
int ret = -EINVAL;
if (ns)
ret = nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
int ret;
ret = nvme_uring_cmd_checks(issue_flags);
if (ret)
return ret;
switch (ioucmd->cmd_op) {
case NVME_URING_CMD_ADMIN:
ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, false);
break;
case NVME_URING_CMD_ADMIN_VEC:
ret = nvme_uring_cmd_io(ctrl, NULL, ioucmd, issue_flags, true);
break;
default:
ret = -ENOTTY;
}
return ret;
}
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;

View file

@ -437,6 +437,7 @@ static const struct file_operations nvme_ns_head_chr_fops = {
.release = nvme_ns_head_chr_release,
.unlocked_ioctl = nvme_ns_head_chr_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = nvme_ns_head_chr_uring_cmd,
};
static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)

View file

@ -782,7 +782,12 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
long nvme_dev_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
unsigned int issue_flags);
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct pr_ops nvme_pr_ops;

View file

@ -204,13 +204,6 @@ struct io_rings {
struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp;
};
enum io_uring_cmd_flags {
IO_URING_F_COMPLETE_DEFER = 1,
IO_URING_F_UNLOCKED = 2,
/* int's last bit, sign checks are usually faster than a bit test */
IO_URING_F_NONBLOCK = INT_MIN,
};
struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
@ -222,8 +215,8 @@ struct io_mapped_ubuf {
struct io_ring_ctx;
struct io_overflow_cqe {
struct io_uring_cqe cqe;
struct list_head list;
struct io_uring_cqe cqe;
};
/*
@ -551,7 +544,7 @@ struct io_uring_task {
spinlock_t task_lock;
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct io_wq_work_list prio_task_list;
struct callback_head task_work;
struct file **registered_rings;
bool task_running;
@ -788,6 +781,12 @@ struct io_msg {
u32 len;
};
struct io_nop {
struct file *file;
u64 extra1;
u64 extra2;
};
struct io_async_connect {
struct sockaddr_storage address;
};
@ -992,6 +991,8 @@ struct io_kiocb {
struct io_msg msg;
struct io_xattr xattr;
struct io_socket sock;
struct io_nop nop;
struct io_uring_cmd uring_cmd;
};
u8 opcode;
@ -1036,7 +1037,13 @@ struct io_kiocb {
atomic_t poll_refs;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
union {
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
@ -1070,6 +1077,14 @@ struct io_cancel_data {
int seq;
};
/*
* The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
* the following sqe if SQE128 is used.
*/
#define uring_cmd_pdu_size(is_sqe128) \
((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
offsetof(struct io_uring_sqe, cmd))
struct io_op_def {
/* needs req->file assigned */
unsigned needs_file : 1;
@ -1311,6 +1326,12 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_SOCKET] = {
.audit_skip = 1,
},
[IORING_OP_URING_CMD] = {
.needs_file = 1,
.plug = 1,
.needs_async_setup = 1,
.async_size = uring_cmd_pdu_size(1),
},
};
/* requests with any of those set should undergo io_disarm_next() */
@ -1450,6 +1471,8 @@ const char *io_uring_get_opcode(u8 opcode)
return "GETXATTR";
case IORING_OP_SOCKET:
return "SOCKET";
case IORING_OP_URING_CMD:
return "URING_CMD";
case IORING_OP_LAST:
return "INVALID";
}
@ -2119,8 +2142,12 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
unsigned int shift = 0;
unsigned int free, queued, len;
if (ctx->flags & IORING_SETUP_CQE32)
shift = 1;
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
free = ctx->cq_entries - queued;
@ -2132,15 +2159,26 @@ static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
ctx->cached_cq_tail++;
ctx->cqe_cached = &rings->cqes[off];
ctx->cqe_sentinel = ctx->cqe_cached + len;
return ctx->cqe_cached++;
ctx->cqe_cached++;
return &rings->cqes[off << shift];
}
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
{
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached;
if (ctx->flags & IORING_SETUP_CQE32) {
unsigned int off = ctx->cqe_cached - ctx->rings->cqes;
cqe += off;
}
ctx->cached_cq_tail++;
return ctx->cqe_cached++;
ctx->cqe_cached++;
return cqe;
}
return __io_get_cqe(ctx);
}
@ -2212,10 +2250,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
{
bool all_flushed, posted;
size_t cqe_size = sizeof(struct io_uring_cqe);
if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
return false;
if (ctx->flags & IORING_SETUP_CQE32)
cqe_size <<= 1;
posted = false;
spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->cq_overflow_list)) {
@ -2227,7 +2269,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
if (cqe)
memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
memcpy(cqe, &ocqe->cqe, cqe_size);
else
io_account_cq_overflow(ctx);
@ -2315,11 +2357,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
s32 res, u32 cflags, u64 extra1,
u64 extra2)
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
if (is_cqe32)
ocq_size += sizeof(struct io_uring_cqe);
ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe);
if (!ocqe) {
/*
@ -2339,6 +2387,10 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
if (is_cqe32) {
ocqe->cqe.big_cqe[0] = extra1;
ocqe->cqe.big_cqe[1] = extra2;
}
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
@ -2360,7 +2412,7 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
WRITE_ONCE(cqe->flags, cflags);
return true;
}
return io_cqring_event_overflow(ctx, user_data, res, cflags);
return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
}
static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
@ -2369,7 +2421,7 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
struct io_uring_cqe *cqe;
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags);
req->cqe.res, req->cqe.flags, 0, 0);
/*
* If we can't get a cq entry, userspace overflowed the
@ -2382,35 +2434,91 @@ static inline bool __io_fill_cqe_req_filled(struct io_ring_ctx *ctx,
return true;
}
return io_cqring_event_overflow(ctx, req->cqe.user_data,
req->cqe.res, req->cqe.flags);
req->cqe.res, req->cqe.flags, 0, 0);
}
static inline bool __io_fill_cqe32_req_filled(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
u64 extra1 = req->extra1;
u64 extra2 = req->extra2;
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags, extra1, extra2);
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (likely(cqe)) {
memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe));
cqe->big_cqe[0] = extra1;
cqe->big_cqe[1] = extra2;
return true;
}
return io_cqring_event_overflow(ctx, req->cqe.user_data, req->cqe.res,
req->cqe.flags, extra1, extra2);
}
static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags);
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, res, cflags, 0, 0);
return __io_fill_cqe(req->ctx, req->cqe.user_data, res, cflags);
}
static inline void __io_fill_cqe32_req(struct io_kiocb *req, s32 res, u32 cflags,
u64 extra1, u64 extra2)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_cqe *cqe;
if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
return;
if (req->flags & REQ_F_CQE_SKIP)
return;
trace_io_uring_complete(ctx, req, req->cqe.user_data, res, cflags,
extra1, extra2);
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (likely(cqe)) {
WRITE_ONCE(cqe->user_data, req->cqe.user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
return;
}
io_cqring_event_overflow(ctx, req->cqe.user_data, res, cflags, extra1, extra2);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
u32 cflags)
static void __io_req_complete_put(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CQE_SKIP))
__io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
*/
if (req_ref_put_and_test(req)) {
struct io_ring_ctx *ctx = req->ctx;
if (req->flags & IO_REQ_LINK_FLAGS) {
if (req->flags & IO_DISARM_MASK)
io_disarm_next(req);
@ -2433,8 +2541,23 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
}
}
static void io_req_complete_post(struct io_kiocb *req, s32 res,
u32 cflags)
static void __io_req_complete_post(struct io_kiocb *req, s32 res,
u32 cflags)
{
if (!(req->flags & REQ_F_CQE_SKIP))
__io_fill_cqe_req(req, res, cflags);
__io_req_complete_put(req);
}
static void __io_req_complete_post32(struct io_kiocb *req, s32 res,
u32 cflags, u64 extra1, u64 extra2)
{
if (!(req->flags & REQ_F_CQE_SKIP))
__io_fill_cqe32_req(req, res, cflags, extra1, extra2);
__io_req_complete_put(req);
}
static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags)
{
struct io_ring_ctx *ctx = req->ctx;
@ -2445,6 +2568,18 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res,
io_cqring_ev_posted(ctx);
}
static void io_req_complete_post32(struct io_kiocb *req, s32 res,
u32 cflags, u64 extra1, u64 extra2)
{
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
__io_req_complete_post32(req, res, cflags, extra1, extra2);
io_commit_cqring(ctx);
spin_unlock(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
}
static inline void io_req_complete_state(struct io_kiocb *req, s32 res,
u32 cflags)
{
@ -2462,6 +2597,19 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags,
io_req_complete_post(req, res, cflags);
}
static inline void __io_req_complete32(struct io_kiocb *req,
unsigned int issue_flags, s32 res,
u32 cflags, u64 extra1, u64 extra2)
{
if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
io_req_complete_state(req, res, cflags);
req->extra1 = extra1;
req->extra2 = extra2;
} else {
io_req_complete_post32(req, res, cflags, extra1, extra2);
}
}
static inline void io_req_complete(struct io_kiocb *req, s32 res)
{
if (res < 0)
@ -2803,10 +2951,10 @@ static void tctx_task_work(struct callback_head *cb)
struct io_wq_work_node *node1, *node2;
spin_lock_irq(&tctx->task_lock);
node1 = tctx->prior_task_list.first;
node1 = tctx->prio_task_list.first;
node2 = tctx->task_list.first;
INIT_WQ_LIST(&tctx->task_list);
INIT_WQ_LIST(&tctx->prior_task_list);
INIT_WQ_LIST(&tctx->prio_task_list);
if (!node2 && !node1)
tctx->task_running = false;
spin_unlock_irq(&tctx->task_lock);
@ -2820,7 +2968,7 @@ static void tctx_task_work(struct callback_head *cb)
cond_resched();
if (data_race(!tctx->task_list.first) &&
data_race(!tctx->prior_task_list.first) && uring_locked)
data_race(!tctx->prio_task_list.first) && uring_locked)
io_submit_flush_completions(ctx);
}
@ -2831,24 +2979,19 @@ static void tctx_task_work(struct callback_head *cb)
io_uring_drop_tctx_refs(current);
}
static void io_req_task_work_add(struct io_kiocb *req, bool priority)
static void __io_req_task_work_add(struct io_kiocb *req,
struct io_uring_task *tctx,
struct io_wq_work_list *list)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_task *tctx = tsk->io_uring;
struct io_wq_work_node *node;
unsigned long flags;
bool running;
WARN_ON_ONCE(!tctx);
io_drop_inflight_file(req);
spin_lock_irqsave(&tctx->task_lock, flags);
if (priority)
wq_list_add_tail(&req->io_task_work.node, &tctx->prior_task_list);
else
wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
wq_list_add_tail(&req->io_task_work.node, list);
running = tctx->task_running;
if (!running)
tctx->task_running = true;
@ -2861,12 +3004,12 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
if (likely(!task_work_add(tsk, &tctx->task_work, ctx->notify_method)))
if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
return;
spin_lock_irqsave(&tctx->task_lock, flags);
tctx->task_running = false;
node = wq_list_merge(&tctx->prior_task_list, &tctx->task_list);
node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list);
spin_unlock_irqrestore(&tctx->task_lock, flags);
while (node) {
@ -2878,6 +3021,23 @@ static void io_req_task_work_add(struct io_kiocb *req, bool priority)
}
}
static void io_req_task_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
__io_req_task_work_add(req, tctx, &tctx->task_list);
}
static void io_req_task_prio_work_add(struct io_kiocb *req)
{
struct io_uring_task *tctx = req->task->io_uring;
if (req->ctx->flags & IORING_SETUP_SQPOLL)
__io_req_task_work_add(req, tctx, &tctx->prio_task_list);
else
__io_req_task_work_add(req, tctx, &tctx->task_list);
}
static void io_req_tw_post(struct io_kiocb *req, bool *locked)
{
io_req_complete_post(req, req->cqe.res, req->cqe.flags);
@ -2888,7 +3048,7 @@ static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags)
req->cqe.res = res;
req->cqe.flags = cflags;
req->io_task_work.func = io_req_tw_post;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
}
static void io_req_task_cancel(struct io_kiocb *req, bool *locked)
@ -2912,19 +3072,19 @@ static void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{
req->cqe.res = ret;
req->io_task_work.func = io_req_task_cancel;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
}
static void io_req_task_queue(struct io_kiocb *req)
{
req->io_task_work.func = io_req_task_submit;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
}
static void io_req_task_queue_reissue(struct io_kiocb *req)
{
req->io_task_work.func = io_queue_iowq;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
}
static void io_queue_next(struct io_kiocb *req)
@ -2998,8 +3158,12 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP))
__io_fill_cqe_req_filled(ctx, req);
if (!(req->flags & REQ_F_CQE_SKIP)) {
if (!(ctx->flags & IORING_SETUP_CQE32))
__io_fill_cqe_req_filled(ctx, req);
else
__io_fill_cqe32_req_filled(ctx, req);
}
}
io_commit_cqring(ctx);
@ -3328,7 +3492,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
return;
req->cqe.res = res;
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
io_req_task_prio_work_add(req);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@ -4462,10 +4626,6 @@ static int __io_getxattr_prep(struct io_kiocb *req,
const char __user *name;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->ioprio))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@ -4575,10 +4735,6 @@ static int __io_setxattr_prep(struct io_kiocb *req,
const char __user *name;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->ioprio))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@ -4857,6 +5013,96 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
{
req->uring_cmd.task_work_cb(&req->uring_cmd);
}
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *))
{
struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
req->uring_cmd.task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
io_req_task_prio_work_add(req);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
/*
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
*/
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
{
struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
if (ret < 0)
req_set_fail(req);
if (req->ctx->flags & IORING_SETUP_CQE32)
__io_req_complete32(req, 0, ret, 0, res2, 0);
else
io_req_complete(req, ret);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
static int io_uring_cmd_prep_async(struct io_kiocb *req)
{
size_t cmd_size;
cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
memcpy(req->async_data, req->uring_cmd.cmd, cmd_size);
return 0;
}
static int io_uring_cmd_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
struct io_uring_cmd *ioucmd = &req->uring_cmd;
if (sqe->rw_flags)
return -EINVAL;
ioucmd->cmd = sqe->cmd;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
return 0;
}
static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_uring_cmd *ioucmd = &req->uring_cmd;
struct io_ring_ctx *ctx = req->ctx;
struct file *file = req->file;
int ret;
if (!req->file->f_op->uring_cmd)
return -EOPNOTSUPP;
if (ctx->flags & IORING_SETUP_SQE128)
issue_flags |= IO_URING_F_SQE128;
if (ctx->flags & IORING_SETUP_CQE32)
issue_flags |= IO_URING_F_CQE32;
if (ctx->flags & IORING_SETUP_IOPOLL)
issue_flags |= IO_URING_F_IOPOLL;
if (req_has_async_data(req))
ioucmd->cmd = req->async_data;
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
if (ret == -EAGAIN) {
if (!req_has_async_data(req)) {
if (io_alloc_async_data(req))
return -ENOMEM;
io_uring_cmd_prep_async(req);
}
return -EAGAIN;
}
if (ret != -EIOCBQUEUED)
io_uring_cmd_done(ioucmd, ret, 0);
return 0;
}
static int io_shutdown_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
@ -4992,11 +5238,25 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
/*
* If the ring is setup with CQE32, relay back addr/addr
*/
if (req->ctx->flags & IORING_SETUP_CQE32) {
req->nop.extra1 = READ_ONCE(sqe->addr);
req->nop.extra2 = READ_ONCE(sqe->addr2);
}
return 0;
}
/*
* IORING_OP_NOP just posts a completion event, nothing else.
*/
static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
{
unsigned int cflags;
void __user *buf;
if (req->flags & REQ_F_BUFFER_SELECT) {
@ -5007,7 +5267,12 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return -ENOBUFS;
}
__io_req_complete(req, issue_flags, 0, io_put_kbuf(req, issue_flags));
cflags = io_put_kbuf(req, issue_flags);
if (!(req->ctx->flags & IORING_SETUP_CQE32))
__io_req_complete(req, issue_flags, 0, cflags);
else
__io_req_complete32(req, issue_flags, 0, cflags,
req->nop.extra1, req->nop.extra2);
return 0;
}
@ -6366,9 +6631,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_socket *sock = &req->sock;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->addr || sqe->rw_flags || sqe->buf_index)
if (sqe->addr || sqe->rw_flags || sqe->buf_index)
return -EINVAL;
sock->domain = READ_ONCE(sqe->fd);
@ -6750,7 +7013,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t events)
req->io_task_work.func = io_apoll_task_func;
trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask);
io_req_task_work_add(req, false);
io_req_task_work_add(req);
}
static inline void io_poll_execute(struct io_kiocb *req, int res,
@ -7255,7 +7518,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
req->cqe.res = -ETIME;
req->io_task_work.func = io_req_task_complete;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@ -7751,7 +8014,7 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
switch (req->opcode) {
case IORING_OP_NOP:
return 0;
return io_nop_prep(req, sqe);
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
@ -7835,6 +8098,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_getxattr_prep(req, sqe);
case IORING_OP_SOCKET:
return io_socket_prep(req, sqe);
case IORING_OP_URING_CMD:
return io_uring_cmd_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@ -7867,6 +8132,8 @@ static int io_req_prep_async(struct io_kiocb *req)
return io_recvmsg_prep_async(req);
case IORING_OP_CONNECT:
return io_connect_prep_async(req);
case IORING_OP_URING_CMD:
return io_uring_cmd_prep_async(req);
}
printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
req->opcode);
@ -8161,6 +8428,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_SOCKET:
ret = io_socket(req, issue_flags);
break;
case IORING_OP_URING_CMD:
ret = io_uring_cmd(req, issue_flags);
break;
default:
ret = -EINVAL;
break;
@ -8371,7 +8641,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
spin_unlock_irqrestore(&ctx->timeout_lock, flags);
req->io_task_work.func = io_req_task_link_timeout;
io_req_task_work_add(req, false);
io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@ -8761,8 +9031,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
* though the application is the one updating it.
*/
head = READ_ONCE(ctx->sq_array[sq_idx]);
if (likely(head < ctx->sq_entries))
if (likely(head < ctx->sq_entries)) {
/* double index for 128-byte SQEs, twice as long */
if (ctx->flags & IORING_SETUP_SQE128)
head <<= 1;
return &ctx->sq_sqes[head];
}
/* drop invalid entries */
ctx->cq_extra--;
@ -10080,7 +10354,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
task->io_uring = tctx;
spin_lock_init(&tctx->task_lock);
INIT_WQ_LIST(&tctx->task_list);
INIT_WQ_LIST(&tctx->prior_task_list);
INIT_WQ_LIST(&tctx->prio_task_list);
init_task_work(&tctx->task_work, tctx_task_work);
return 0;
}
@ -10258,8 +10532,8 @@ static void *io_mem_alloc(size_t size)
return (void *) __get_free_pages(gfp, get_order(size));
}
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
size_t *sq_offset)
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{
struct io_rings *rings;
size_t off, sq_array_size;
@ -10267,6 +10541,10 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
if (ctx->flags & IORING_SETUP_CQE32) {
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
@ -11833,10 +12111,15 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int sq_tail = READ_ONCE(r->sq.tail);
unsigned int cq_head = READ_ONCE(r->cq.head);
unsigned int cq_tail = READ_ONCE(r->cq.tail);
unsigned int cq_shift = 0;
unsigned int sq_entries, cq_entries;
bool has_lock;
bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
unsigned int i;
if (is_cqe32)
cq_shift = 1;
/*
* we may get imprecise sqe and cqe info if uring is actively running
* since we get cached_sq_head and cached_cq_tail without uring_lock
@ -11869,11 +12152,18 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
for (i = 0; i < cq_entries; i++) {
unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask];
struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift];
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
if (!is_cqe32) {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags);
} else {
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, "
"extra1:%llu, extra2:%llu\n",
entry & cq_mask, cqe->user_data, cqe->res,
cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]);
}
}
/*
@ -11976,7 +12266,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@ -11991,7 +12281,10 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->sq_ring_entries = p->sq_entries;
rings->cq_ring_entries = p->cq_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (p->flags & IORING_SETUP_SQE128)
size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
else
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {
io_mem_free(ctx->rings);
ctx->rings = NULL;
@ -12235,7 +12528,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG))
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32))
return -EINVAL;
return io_uring_create(entries, &p, params);
@ -12924,6 +13218,8 @@ static int __init io_uring_init(void)
BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));
BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64);
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT);
return 0;

View file

@ -1953,6 +1953,7 @@ struct dir_context {
#define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN)
struct iov_iter;
struct io_uring_cmd;
struct file_operations {
struct module *owner;
@ -1995,6 +1996,7 @@ struct file_operations {
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
} __randomize_layout;
struct inode_operations {

View file

@ -5,7 +5,32 @@
#include <linux/sched.h>
#include <linux/xarray.h>
enum io_uring_cmd_flags {
IO_URING_F_COMPLETE_DEFER = 1,
IO_URING_F_UNLOCKED = 2,
/* int's last bit, sign checks are usually faster than a bit test */
IO_URING_F_NONBLOCK = INT_MIN,
/* ctx state flags, for URING_CMD */
IO_URING_F_SQE128 = 4,
IO_URING_F_CQE32 = 8,
IO_URING_F_IOPOLL = 16,
};
struct io_uring_cmd {
struct file *file;
const void *cmd;
/* callback to defer completions to task context */
void (*task_work_cb)(struct io_uring_cmd *cmd);
u32 cmd_op;
u32 pad;
u8 pdu[32]; /* available inline for free use */
};
#if defined(CONFIG_IO_URING)
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *));
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
@ -30,6 +55,14 @@ static inline void io_uring_free(struct task_struct *tsk)
__io_uring_free(tsk);
}
#else
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
ssize_t ret2)
{
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *))
{
}
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;

View file

@ -321,13 +321,16 @@ TRACE_EVENT(io_uring_fail_link,
* @user_data: user data associated with the request
* @res: result of the request
* @cflags: completion flags
* @extra1: extra 64-bit data for CQE32
* @extra2: extra 64-bit data for CQE32
*
*/
TRACE_EVENT(io_uring_complete,
TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags),
TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
u64 extra1, u64 extra2),
TP_ARGS(ctx, req, user_data, res, cflags),
TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
TP_STRUCT__entry (
__field( void *, ctx )
@ -335,6 +338,8 @@ TRACE_EVENT(io_uring_complete,
__field( u64, user_data )
__field( int, res )
__field( unsigned, cflags )
__field( u64, extra1 )
__field( u64, extra2 )
),
TP_fast_assign(
@ -343,12 +348,17 @@ TRACE_EVENT(io_uring_complete,
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
__entry->extra1 = extra1;
__entry->extra2 = extra2;
),
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x",
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
"extra1 %llu extra2 %llu ",
__entry->ctx, __entry->req,
__entry->user_data,
__entry->res, __entry->cflags)
__entry->res, __entry->cflags,
(unsigned long long) __entry->extra1,
(unsigned long long) __entry->extra2)
);
/**

View file

@ -22,6 +22,7 @@ struct io_uring_sqe {
union {
__u64 off; /* offset into file */
__u64 addr2;
__u32 cmd_op;
};
union {
__u64 addr; /* pointer to buffer or iovecs */
@ -61,8 +62,17 @@ struct io_uring_sqe {
__s32 splice_fd_in;
__u32 file_index;
};
__u64 addr3;
__u64 __pad2[1];
union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
/*
* If the ring is initialized with IORING_SETUP_SQE128, then
* this field is used for 80 bytes of arbitrary command data
*/
__u8 cmd[0];
};
};
/*
@ -128,6 +138,9 @@ enum {
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@ -175,6 +188,7 @@ enum io_uring_op {
IORING_OP_FGETXATTR,
IORING_OP_GETXATTR,
IORING_OP_SOCKET,
IORING_OP_URING_CMD,
/* this goes last, obviously */
IORING_OP_LAST,
@ -251,6 +265,12 @@ struct io_uring_cqe {
__u64 user_data; /* sqe->data submission passed back */
__s32 res; /* result code for this event */
__u32 flags;
/*
* If the ring is initialized with IORING_SETUP_CQE32, then this field
* contains 16-bytes of padding, doubling the size of the CQE.
*/
__u64 big_cqe[];
};
/*

View file

@ -70,6 +70,28 @@ struct nvme_passthru_cmd64 {
__u64 result;
};
/* same as struct nvme_passthru_cmd64, minus the 8b result field */
struct nvme_uring_cmd {
__u8 opcode;
__u8 flags;
__u16 rsvd1;
__u32 nsid;
__u32 cdw2;
__u32 cdw3;
__u64 metadata;
__u64 addr;
__u32 metadata_len;
__u32 data_len;
__u32 cdw10;
__u32 cdw11;
__u32 cdw12;
__u32 cdw13;
__u32 cdw14;
__u32 cdw15;
__u32 timeout_ms;
__u32 rsvd2;
};
#define nvme_admin_cmd nvme_passthru_cmd
#define NVME_IOCTL_ID _IO('N', 0x40)
@ -83,4 +105,10 @@ struct nvme_passthru_cmd64 {
#define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64)
#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64)
/* io_uring async commands: */
#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd)
#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd)
#define NVME_URING_CMD_ADMIN _IOWR('N', 0x82, struct nvme_uring_cmd)
#define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd)
#endif /* _UAPI_LINUX_NVME_IOCTL_H */