userfaultfd: protect mmap_changing with rw_sem in userfaulfd_ctx

Increments and loads to mmap_changing are always in mmap_lock critical
section.  This ensures that if userspace requests event notification for
non-cooperative operations (e.g.  mremap), userfaultfd operations don't
occur concurrently.

This can be achieved by using a separate read-write semaphore in
userfaultfd_ctx such that increments are done in write-mode and loads in
read-mode, thereby eliminating the dependency on mmap_lock for this
purpose.

This is a preparatory step before we replace mmap_lock usage with per-vma
locks in fill/move ioctls.

Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Lokesh Gidra 2024-02-15 10:27:54 -08:00 committed by Andrew Morton
parent f91e6b41dd
commit 5e4c24a57b
3 changed files with 75 additions and 58 deletions

View file

@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);
msg_init(&ewq.msg);
@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, &ctx->mmap_changing,
flags);
ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len,
&ctx->mmap_changing);
ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;
if (mmget_not_zero(ctx->mm)) {
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp,
&ctx->mmap_changing);
ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len,
&ctx->mmap_changing, flags);
ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
uffdio_continue.range.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
uffdio_poison.range.len,
&ctx->mmap_changing, 0);
ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
uffdio_poison.range.len, 0);
mmput(ctx->mm);
} else {
return -ESRCH;
@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(mm)) {
mmap_read_lock(mm);
/* Re-check after taking mmap_lock */
/* Re-check after taking map_changing_lock */
down_read(&ctx->map_changing_lock);
if (likely(!atomic_read(&ctx->mmap_changing)))
ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
uffdio_move.len, uffdio_move.mode);
else
ret = -EAGAIN;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(mm);
mmput(mm);
} else {
@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */

View file

@ -69,6 +69,13 @@ struct userfaultfd_ctx {
unsigned int features;
/* released */
bool released;
/*
* Prevents userfaultfd operations (fill/move/wp) from happening while
* some non-cooperative event(s) is taking place. Increments are done
* in write-mode. Whereas, userfaultfd operations, which includes
* reading mmap_changing, is done under read-mode.
*/
struct rw_semaphore map_changing_lock;
/* memory mappings are changing because of non-cooperative event */
atomic_t mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, uffd_flags_t flags);
extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
atomic_t *mmap_changing, uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long len,
atomic_t *mmap_changing);
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
unsigned long len);
extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long len, uffd_flags_t flags);
extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, uffd_flags_t flags);
extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);

View file

@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
* called with mmap_lock held, it will release mmap_lock before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* feature is not supported.
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@ -463,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
cond_resched();
if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!folio);
@ -473,12 +475,13 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out;
}
mmap_read_lock(dst_mm);
down_read(&ctx->map_changing_lock);
/*
* If memory mappings are changing because of non-cooperative
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
if (mmap_changing && atomic_read(mmap_changing)) {
if (atomic_read(&ctx->mmap_changing)) {
err = -EAGAIN;
break;
}
@ -501,6 +504,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
}
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (folio)
@ -512,11 +516,11 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
return err;
}
static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@ -600,8 +604,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
down_read(&ctx->map_changing_lock);
err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing))
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
/*
@ -633,8 +638,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
len, mmap_changing, flags);
return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@ -693,6 +698,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
if (unlikely(err == -ENOENT)) {
void *kaddr;
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!folio);
@ -723,6 +729,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
}
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (folio)
@ -733,34 +740,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
return copied ? copied : err;
}
ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
atomic_t *mmap_changing, uffd_flags_t flags)
uffd_flags_t flags)
{
return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
return mfill_atomic(ctx, dst_start, src_start, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}
ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing)
ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long start,
unsigned long len)
{
return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}
ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags)
ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, uffd_flags_t flags)
{
return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}
ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing,
uffd_flags_t flags)
ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, uffd_flags_t flags)
{
return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}
@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, bool enable_wp,
atomic_t *mmap_changing)
int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp)
{
struct mm_struct *dst_mm = ctx->mm;
unsigned long end = start + len;
unsigned long _start, _end;
struct vm_area_struct *dst_vma;
@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
down_read(&ctx->map_changing_lock);
err = -EAGAIN;
if (mmap_changing && atomic_read(mmap_changing))
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;
err = -ENOENT;
@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
}
out_unlock:
up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}