mirror of
https://github.com/torvalds/linux
synced 2024-10-15 07:47:34 +00:00
userfaultfd: handle zeropage moves by UFFDIO_MOVE
Current implementation of UFFDIO_MOVE fails to move zeropages and returns EBUSY when it encounters one. We can handle them by mapping a zeropage at the destination and clearing the mapping at the source. This is done both for ordinary and for huge zeropages. Link: https://lkml.kernel.org/r/20240131175618.2417291-1-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Reported-by: kernel test robot <lkp@intel.com> Reported-by: Dan Carpenter <dan.carpenter@linaro.org> Closes: https://lore.kernel.org/r/202401300107.U8iMAkTl-lkp@intel.com/ Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Axel Rasmussen <axelrasmussen@google.com> Cc: Brian Geffon <bgeffon@google.com> Cc: Christian Brauner <brauner@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jann Horn <jannh@google.com> Cc: Kalesh Singh <kaleshsingh@google.com> Cc: Liam R. Howlett <Liam.Howlett@oracle.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Rapoport (IBM) <rppt@kernel.org> Cc: Nicolas Geoffray <ngeoffray@google.com> Cc: Peter Xu <peterx@redhat.com> Cc: Ryan Roberts <ryan.roberts@arm.com> Cc: Shuah Khan <shuah@kernel.org> Cc: ZhangPeng <zhangpeng362@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
e777ae44e3
commit
eb1521dad8
111
mm/huge_memory.c
111
mm/huge_memory.c
|
@ -2200,13 +2200,18 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
|
||||||
}
|
}
|
||||||
|
|
||||||
src_page = pmd_page(src_pmdval);
|
src_page = pmd_page(src_pmdval);
|
||||||
if (unlikely(!PageAnonExclusive(src_page))) {
|
|
||||||
spin_unlock(src_ptl);
|
|
||||||
return -EBUSY;
|
|
||||||
}
|
|
||||||
|
|
||||||
src_folio = page_folio(src_page);
|
if (!is_huge_zero_pmd(src_pmdval)) {
|
||||||
folio_get(src_folio);
|
if (unlikely(!PageAnonExclusive(src_page))) {
|
||||||
|
spin_unlock(src_ptl);
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
|
src_folio = page_folio(src_page);
|
||||||
|
folio_get(src_folio);
|
||||||
|
} else
|
||||||
|
src_folio = NULL;
|
||||||
|
|
||||||
spin_unlock(src_ptl);
|
spin_unlock(src_ptl);
|
||||||
|
|
||||||
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
|
flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
|
||||||
|
@ -2214,19 +2219,22 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
|
||||||
src_addr + HPAGE_PMD_SIZE);
|
src_addr + HPAGE_PMD_SIZE);
|
||||||
mmu_notifier_invalidate_range_start(&range);
|
mmu_notifier_invalidate_range_start(&range);
|
||||||
|
|
||||||
folio_lock(src_folio);
|
if (src_folio) {
|
||||||
|
folio_lock(src_folio);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* split_huge_page walks the anon_vma chain without the page
|
* split_huge_page walks the anon_vma chain without the page
|
||||||
* lock. Serialize against it with the anon_vma lock, the page
|
* lock. Serialize against it with the anon_vma lock, the page
|
||||||
* lock is not enough.
|
* lock is not enough.
|
||||||
*/
|
*/
|
||||||
src_anon_vma = folio_get_anon_vma(src_folio);
|
src_anon_vma = folio_get_anon_vma(src_folio);
|
||||||
if (!src_anon_vma) {
|
if (!src_anon_vma) {
|
||||||
err = -EAGAIN;
|
err = -EAGAIN;
|
||||||
goto unlock_folio;
|
goto unlock_folio;
|
||||||
}
|
}
|
||||||
anon_vma_lock_write(src_anon_vma);
|
anon_vma_lock_write(src_anon_vma);
|
||||||
|
} else
|
||||||
|
src_anon_vma = NULL;
|
||||||
|
|
||||||
dst_ptl = pmd_lockptr(mm, dst_pmd);
|
dst_ptl = pmd_lockptr(mm, dst_pmd);
|
||||||
double_pt_lock(src_ptl, dst_ptl);
|
double_pt_lock(src_ptl, dst_ptl);
|
||||||
|
@ -2235,45 +2243,54 @@ int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pm
|
||||||
err = -EAGAIN;
|
err = -EAGAIN;
|
||||||
goto unlock_ptls;
|
goto unlock_ptls;
|
||||||
}
|
}
|
||||||
if (folio_maybe_dma_pinned(src_folio) ||
|
if (src_folio) {
|
||||||
!PageAnonExclusive(&src_folio->page)) {
|
if (folio_maybe_dma_pinned(src_folio) ||
|
||||||
err = -EBUSY;
|
!PageAnonExclusive(&src_folio->page)) {
|
||||||
goto unlock_ptls;
|
err = -EBUSY;
|
||||||
|
goto unlock_ptls;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
|
||||||
|
WARN_ON_ONCE(!folio_test_anon(src_folio))) {
|
||||||
|
err = -EBUSY;
|
||||||
|
goto unlock_ptls;
|
||||||
|
}
|
||||||
|
|
||||||
|
folio_move_anon_rmap(src_folio, dst_vma);
|
||||||
|
WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
|
||||||
|
|
||||||
|
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
|
||||||
|
/* Folio got pinned from under us. Put it back and fail the move. */
|
||||||
|
if (folio_maybe_dma_pinned(src_folio)) {
|
||||||
|
set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
|
||||||
|
err = -EBUSY;
|
||||||
|
goto unlock_ptls;
|
||||||
|
}
|
||||||
|
|
||||||
|
_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
|
||||||
|
/* Follow mremap() behavior and treat the entry dirty after the move */
|
||||||
|
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
|
||||||
|
} else {
|
||||||
|
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
|
||||||
|
_dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
|
|
||||||
WARN_ON_ONCE(!folio_test_anon(src_folio))) {
|
|
||||||
err = -EBUSY;
|
|
||||||
goto unlock_ptls;
|
|
||||||
}
|
|
||||||
|
|
||||||
folio_move_anon_rmap(src_folio, dst_vma);
|
|
||||||
WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
|
|
||||||
|
|
||||||
src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
|
|
||||||
/* Folio got pinned from under us. Put it back and fail the move. */
|
|
||||||
if (folio_maybe_dma_pinned(src_folio)) {
|
|
||||||
set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
|
|
||||||
err = -EBUSY;
|
|
||||||
goto unlock_ptls;
|
|
||||||
}
|
|
||||||
|
|
||||||
_dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
|
|
||||||
/* Follow mremap() behavior and treat the entry dirty after the move */
|
|
||||||
_dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
|
|
||||||
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
|
set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);
|
||||||
|
|
||||||
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
|
src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
|
||||||
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
|
pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
|
||||||
unlock_ptls:
|
unlock_ptls:
|
||||||
double_pt_unlock(src_ptl, dst_ptl);
|
double_pt_unlock(src_ptl, dst_ptl);
|
||||||
anon_vma_unlock_write(src_anon_vma);
|
if (src_anon_vma) {
|
||||||
put_anon_vma(src_anon_vma);
|
anon_vma_unlock_write(src_anon_vma);
|
||||||
|
put_anon_vma(src_anon_vma);
|
||||||
|
}
|
||||||
unlock_folio:
|
unlock_folio:
|
||||||
/* unblock rmap walks */
|
/* unblock rmap walks */
|
||||||
folio_unlock(src_folio);
|
if (src_folio)
|
||||||
|
folio_unlock(src_folio);
|
||||||
mmu_notifier_invalidate_range_end(&range);
|
mmu_notifier_invalidate_range_end(&range);
|
||||||
folio_put(src_folio);
|
if (src_folio)
|
||||||
|
folio_put(src_folio);
|
||||||
return err;
|
return err;
|
||||||
}
|
}
|
||||||
#endif /* CONFIG_USERFAULTFD */
|
#endif /* CONFIG_USERFAULTFD */
|
||||||
|
|
|
@ -959,6 +959,33 @@ static int move_swap_pte(struct mm_struct *mm,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Move a zeropage mapping from src_addr to dst_addr.
 *
 * Takes both page-table locks, then rechecks that *src_pte and *dst_pte
 * still equal orig_src_pte and orig_dst_pte. If either entry changed, the
 * locks are dropped and -EAGAIN is returned without touching the page tables.
 * Otherwise a special PTE for the zero pfn of dst_addr is built with
 * dst_vma->vm_page_prot, the source PTE is cleared and flushed, the new PTE
 * is installed at the destination, both locks are released and 0 is returned.
 */
static int move_zeropage_pte(struct mm_struct *mm,
|
||||||
|
struct vm_area_struct *dst_vma,
|
||||||
|
struct vm_area_struct *src_vma,
|
||||||
|
unsigned long dst_addr, unsigned long src_addr,
|
||||||
|
pte_t *dst_pte, pte_t *src_pte,
|
||||||
|
pte_t orig_dst_pte, pte_t orig_src_pte,
|
||||||
|
spinlock_t *dst_ptl, spinlock_t *src_ptl)
|
||||||
|
{
|
||||||
|
pte_t zero_pte;
|
||||||
|

||||||
|
double_pt_lock(dst_ptl, src_ptl);
|
||||||
|
/* Revalidate both entries now that the locks are held; bail out if either changed. */
if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
|
||||||
|
!pte_same(ptep_get(dst_pte), orig_dst_pte)) {
|
||||||
|
double_pt_unlock(dst_ptl, src_ptl);
|
||||||
|
return -EAGAIN;
|
||||||
|
}
|
||||||
|

||||||
|
/* The destination entry is built from the zero pfn for dst_addr, not copied from the source PTE. */
zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
|
||||||
|
dst_vma->vm_page_prot));
|
||||||
|
/* Clear and flush the source entry before the destination entry is installed. */
ptep_clear_flush(src_vma, src_addr, src_pte);
|
||||||
|
set_pte_at(mm, dst_addr, dst_pte, zero_pte);
|
||||||
|
double_pt_unlock(dst_ptl, src_ptl);
|
||||||
|

||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The mmap_lock for reading is held by the caller. Just move the page
|
* The mmap_lock for reading is held by the caller. Just move the page
|
||||||
* from src_pmd to dst_pmd if possible, and return true if succeeded
|
* from src_pmd to dst_pmd if possible, and return true if succeeded
|
||||||
|
@ -1041,6 +1068,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pte_present(orig_src_pte)) {
|
if (pte_present(orig_src_pte)) {
|
||||||
|
if (is_zero_pfn(pte_pfn(orig_src_pte))) {
|
||||||
|
err = move_zeropage_pte(mm, dst_vma, src_vma,
|
||||||
|
dst_addr, src_addr, dst_pte, src_pte,
|
||||||
|
orig_dst_pte, orig_src_pte,
|
||||||
|
dst_ptl, src_ptl);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Pin and lock both source folio and anon_vma. Since we are in
|
* Pin and lock both source folio and anon_vma. Since we are in
|
||||||
* RCU read section, we can't block, so on contention have to
|
* RCU read section, we can't block, so on contention have to
|
||||||
|
@ -1404,19 +1439,14 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
|
||||||
err = -ENOENT;
|
err = -ENOENT;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* Avoid moving zeropages for now */
|
|
||||||
if (is_huge_zero_pmd(*src_pmd)) {
|
|
||||||
spin_unlock(ptl);
|
|
||||||
err = -EBUSY;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Check if we can move the pmd without splitting it. */
|
/* Check if we can move the pmd without splitting it. */
|
||||||
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
|
if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
|
||||||
!pmd_none(dst_pmdval)) {
|
!pmd_none(dst_pmdval)) {
|
||||||
struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
|
struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
|
||||||
|
|
||||||
if (!folio || !PageAnonExclusive(&folio->page)) {
|
if (!folio || (!is_huge_zero_page(&folio->page) &&
|
||||||
|
!PageAnonExclusive(&folio->page))) {
|
||||||
spin_unlock(ptl);
|
spin_unlock(ptl);
|
||||||
err = -EBUSY;
|
err = -EBUSY;
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Reference in a new issue