linux/io_uring/opdef.c
Pavel Begunkov b48c312be0 io_uring/net: simplify zerocopy send user API
Following user feedback, this patch simplifies zerocopy send API. One of
the main complaints is that the current API is difficult with the
userspace managing notification slots, and then send retries with error
handling make it even worse.

Instead of keeping notification slots change it to the per-request
notifications model, which posts both completion and notification CQEs
for each request when any data has been sent, and only one CQE if it
fails. All notification CQEs will have IORING_CQE_F_NOTIF set and
IORING_CQE_F_MORE in completion CQEs indicates whether to wait a
notification or not.

IOSQE_CQE_SKIP_SUCCESS is disallowed with zerocopy sends for now.

This is less flexible, but greatly simplifies the user API and also the
kernel implementation. We reuse notif helpers in this patch, but in the
future there won't be need for keeping two requests.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/95287640ab98fc9417370afb16e310677c63e6ce.1662027856.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2022-09-01 09:13:33 -06:00

513 lines
11 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* io_uring opcode handling table
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/io_uring.h>
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "xattr.h"
#include "nop.h"
#include "fs.h"
#include "splice.h"
#include "sync.h"
#include "advise.h"
#include "openclose.h"
#include "uring_cmd.h"
#include "epoll.h"
#include "statx.h"
#include "net.h"
#include "msg_ring.h"
#include "timeout.h"
#include "poll.h"
#include "cancel.h"
#include "rw.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
WARN_ON_ONCE(1);
return -ECANCELED;
}
static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
const struct io_uring_sqe *sqe)
{
return -EOPNOTSUPP;
}
const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {
.audit_skip = 1,
.iopoll = 1,
.name = "NOP",
.prep = io_nop_prep,
.issue = io_nop,
},
[IORING_OP_READV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep = io_prep_rw,
.issue = io_read,
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep = io_prep_rw,
.issue = io_write,
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FSYNC",
.prep = io_fsync_prep,
.issue = io_fsync,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.prep = io_prep_rw,
.issue = io_read,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.prep = io_prep_rw,
.issue = io_write,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "POLL_ADD",
.prep = io_poll_add_prep,
.issue = io_poll_add,
},
[IORING_OP_POLL_REMOVE] = {
.audit_skip = 1,
.name = "POLL_REMOVE",
.prep = io_poll_remove_prep,
.issue = io_poll_remove,
},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "SYNC_FILE_RANGE",
.prep = io_sfr_prep,
.issue = io_sync_file_range,
},
[IORING_OP_SENDMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.ioprio = 1,
.name = "SENDMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_sendmsg,
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RECVMSG] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.ioprio = 1,
.name = "RECVMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep,
.issue = io_recvmsg,
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
.prep = io_timeout_prep,
.issue = io_timeout,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
.audit_skip = 1,
.name = "TIMEOUT_REMOVE",
.prep = io_timeout_remove_prep,
.issue = io_timeout_remove,
},
[IORING_OP_ACCEPT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.poll_exclusive = 1,
.ioprio = 1, /* used for flags */
.name = "ACCEPT",
#if defined(CONFIG_NET)
.prep = io_accept_prep,
.issue = io_accept,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
.name = "ASYNC_CANCEL",
.prep = io_async_cancel_prep,
.issue = io_async_cancel,
},
[IORING_OP_LINK_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
.prep = io_link_timeout_prep,
.issue = io_no_issue,
},
[IORING_OP_CONNECT] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.name = "CONNECT",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep = io_connect_prep,
.issue = io_connect,
.prep_async = io_connect_prep_async,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.name = "FALLOCATE",
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
[IORING_OP_OPENAT] = {
.name = "OPENAT",
.prep = io_openat_prep,
.issue = io_openat,
.cleanup = io_open_cleanup,
},
[IORING_OP_CLOSE] = {
.name = "CLOSE",
.prep = io_close_prep,
.issue = io_close,
},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
.iopoll = 1,
.name = "FILES_UPDATE",
.prep = io_files_update_prep,
.issue = io_files_update,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
.name = "STATX",
.prep = io_statx_prep,
.issue = io_statx,
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.prep = io_prep_rw,
.issue = io_read,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.plug = 1,
.audit_skip = 1,
.ioprio = 1,
.iopoll = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.prep = io_prep_rw,
.issue = io_write,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FADVISE",
.prep = io_fadvise_prep,
.issue = io_fadvise,
},
[IORING_OP_MADVISE] = {
.name = "MADVISE",
.prep = io_madvise_prep,
.issue = io_madvise,
},
[IORING_OP_SEND] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.audit_skip = 1,
.ioprio = 1,
.name = "SEND",
#if defined(CONFIG_NET)
.prep = io_sendmsg_prep,
.issue = io_send,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
.audit_skip = 1,
.ioprio = 1,
.name = "RECV",
#if defined(CONFIG_NET)
.prep = io_recvmsg_prep,
.issue = io_recv,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.prep = io_openat2_prep,
.issue = io_openat2,
.cleanup = io_open_cleanup,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "EPOLL",
#if defined(CONFIG_EPOLL)
.prep = io_epoll_ctl_prep,
.issue = io_epoll_ctl,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_SPLICE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "SPLICE",
.prep = io_splice_prep,
.issue = io_splice,
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "PROVIDE_BUFFERS",
.prep = io_provide_buffers_prep,
.issue = io_provide_buffers,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "REMOVE_BUFFERS",
.prep = io_remove_buffers_prep,
.issue = io_remove_buffers,
},
[IORING_OP_TEE] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "TEE",
.prep = io_tee_prep,
.issue = io_tee,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
.name = "SHUTDOWN",
#if defined(CONFIG_NET)
.prep = io_shutdown_prep,
.issue = io_shutdown,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.prep = io_renameat_prep,
.issue = io_renameat,
.cleanup = io_renameat_cleanup,
},
[IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.prep = io_unlinkat_prep,
.issue = io_unlinkat,
.cleanup = io_unlinkat_cleanup,
},
[IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.prep = io_mkdirat_prep,
.issue = io_mkdirat,
.cleanup = io_mkdirat_cleanup,
},
[IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.prep = io_symlinkat_prep,
.issue = io_symlinkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_LINKAT] = {
.name = "LINKAT",
.prep = io_linkat_prep,
.issue = io_linkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
.iopoll = 1,
.name = "MSG_RING",
.prep = io_msg_ring_prep,
.issue = io_msg_ring,
},
[IORING_OP_FSETXATTR] = {
.needs_file = 1,
.name = "FSETXATTR",
.prep = io_fsetxattr_prep,
.issue = io_fsetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.prep = io_setxattr_prep,
.issue = io_setxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_FGETXATTR] = {
.needs_file = 1,
.name = "FGETXATTR",
.prep = io_fgetxattr_prep,
.issue = io_fgetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.prep = io_getxattr_prep,
.issue = io_getxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SOCKET] = {
.audit_skip = 1,
.name = "SOCKET",
#if defined(CONFIG_NET)
.prep = io_socket_prep,
.issue = io_socket,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_URING_CMD] = {
.needs_file = 1,
.plug = 1,
.name = "URING_CMD",
.async_size = uring_cmd_pdu_size(1),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
.prep_async = io_uring_cmd_prep_async,
},
[IORING_OP_SEND_ZC] = {
.name = "SENDZC_NOTIF",
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.audit_skip = 1,
.ioprio = 1,
.manual_alloc = 1,
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendzc_prep,
.issue = io_sendzc,
.prep_async = io_sendzc_prep_async,
.cleanup = io_sendzc_cleanup,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
const char *io_uring_get_opcode(u8 opcode)
{
if (opcode < IORING_OP_LAST)
return io_op_defs[opcode].name;
return "INVALID";
}
void __init io_uring_optable_init(void)
{
int i;
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
BUG_ON(!io_op_defs[i].prep);
if (io_op_defs[i].prep != io_eopnotsupp_prep)
BUG_ON(!io_op_defs[i].issue);
WARN_ON_ONCE(!io_op_defs[i].name);
}
}