From 3ab1db3c6039e02a9deb9d5091d28d559917a645 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2024 09:56:14 -0600 Subject: [PATCH] io_uring: get rid of remap_pfn_range() for mapping rings/sqes Rather than use remap_pfn_range() for this and manually free later, switch to using vm_insert_pages() and have it Just Work. If possible, allocate a single compound page that covers the range that is needed. If that works, then we can just use page_address() on that page. If we fail to get a compound page, allocate single pages and use vmap() to map them into the kernel virtual address space. This just covers the rings/sqes, the other remaining user of the mmap remap_pfn_range() user will be converted separately. Once that is done, we can kill the old alloc/free code. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++--- io_uring/io_uring.h | 2 + 2 files changed, 133 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2d091f72ba6c..fba68c37a77d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2599,6 +2599,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } +static void io_pages_unmap(void *ptr, struct page ***pages, + unsigned short *npages) +{ + bool do_vunmap = false; + + if (!ptr) + return; + + if (*npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. + */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + void io_mem_free(void *ptr) { if (!ptr) @@ -2699,8 +2729,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; @@ -2712,6 +2742,80 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); +} + +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) +{ + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } + + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +static void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } + + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; +} + void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; @@ -3298,14 +3402,12 @@ static void *io_uring_validate_mmap_request(struct file *file, /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); - ptr = ctx->rings; - break; + return ctx->rings; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); - ptr = ctx->sq_sqes; - break; + return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; @@ -3329,11 +3431,22 @@ static void *io_uring_validate_mmap_request(struct file *file, return ptr; } +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + #ifdef CONFIG_MMU static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { + struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned long pfn; void *ptr; @@ -3341,6 +3454,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) if (IS_ERR(ptr)) return PTR_ERR(ptr); + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, + ctx->n_ring_pages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + } + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } @@ -3630,7 +3753,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_mem_alloc(size); + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); @@ -3655,7 +3778,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_mem_alloc(size); + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dbd9a2b870eb..75230d914007 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -70,6 +70,8 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd,