Merge branch kvm-arm64/mte-map-shared into kvmarm-master/next

* kvm-arm64/mte-map-shared:
  : .
  : Update the MTE support to allow the VMM to use shared mappings
  : to back the memslots exposed to MTE-enabled guests.
  :
  : Patches courtesy of Catalin Marinas and Peter Collingbourne.
  : .
  : Fix a number of issues with MTE, such as races on the tags
  : being initialised vs the PG_mte_tagged flag as well as the
  : lack of support for VM_SHARED when KVM is involved.
  :
  : Patches from Catalin Marinas and Peter Collingbourne.
  : .
  Documentation: document the ABI changes for KVM_CAP_ARM_MTE
  KVM: arm64: permit all VM_MTE_ALLOWED mappings with MTE enabled
  KVM: arm64: unify the tests for VMAs in memslots when MTE is enabled
  arm64: mte: Lock a page for MTE tag initialisation
  mm: Add PG_arch_3 page flag
  KVM: arm64: Simplify the sanitise_mte_tags() logic
  arm64: mte: Fix/clarify the PG_mte_tagged semantics
  mm: Do not enable PG_arch_2 for all 64-bit architectures

Signed-off-by: Marc Zyngier <maz@kernel.org>
This commit is contained in:
Marc Zyngier 2022-12-05 14:38:24 +00:00
commit 382b5b87a9
19 changed files with 152 additions and 79 deletions

View file

@ -7385,8 +7385,9 @@ hibernation of the host; however the VMM needs to manually save/restore the
tags as appropriate if the VM is migrated.
When this capability is enabled all memory in memslots must be mapped as
not-shareable (no MAP_SHARED), attempts to create a memslot with a
MAP_SHARED mmap will result in an -EINVAL return.
``MAP_ANONYMOUS`` or with a RAM-based file mapping (``tmpfs``, ``memfd``).
Attempts to create a memslot with an invalid mmap will result in an
-EINVAL return.
When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to
perform a bulk copy of tags to/from the guest.

View file

@ -1965,6 +1965,7 @@ config ARM64_MTE
depends on ARM64_PAN
select ARCH_HAS_SUBPAGE_FAULTS
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_USES_PG_ARCH_X
help
Memory Tagging (part of the ARMv8.5 Extensions) provides
architectural support for run-time, always-on detection of

View file

@ -25,7 +25,7 @@ unsigned long mte_copy_tags_to_user(void __user *to, void *from,
unsigned long n);
int mte_save_tags(struct page *page);
void mte_save_page_tags(const void *page_addr, void *tag_storage);
bool mte_restore_tags(swp_entry_t entry, struct page *page);
void mte_restore_tags(swp_entry_t entry, struct page *page);
void mte_restore_page_tags(void *page_addr, const void *tag_storage);
void mte_invalidate_tags(int type, pgoff_t offset);
void mte_invalidate_tags_area(int type);
@ -36,6 +36,58 @@ void mte_free_tag_storage(char *storage);
/* track which pages have valid allocation tags */
#define PG_mte_tagged PG_arch_2
/* simple lock to avoid multiple threads tagging the same page */
#define PG_mte_lock PG_arch_3
/*
 * Mark @page as having valid allocation tags by setting PG_mte_tagged in
 * page->flags. Callers initialise the tags first (e.g. with
 * mte_clear_page_tags()) and call this afterwards.
 */
static inline void set_page_mte_tagged(struct page *page)
{
/*
* Ensure that the tags written prior to this function are visible
* before the page flags update.
*/
smp_wmb();
set_bit(PG_mte_tagged, &page->flags);
}
static inline bool page_mte_tagged(struct page *page)
{
bool ret = test_bit(PG_mte_tagged, &page->flags);
/*
* If the page is tagged, ensure ordering with a likely subsequent
* read of the tags.
*/
if (ret)
smp_rmb();
return ret;
}
/*
 * Try to take the per-page MTE tagging lock.
 *
 * Returns 'true' if the caller now owns tag initialisation for @page and
 * 'false' if the page is already tagged. PG_mte_tagged is never cleared, so
 * the lock is only ever taken once per page, for its initialisation.
 *
 * Lock state as encoded in page->flags:
 *
 *   Locked:	PG_mte_lock && !PG_mte_tagged
 *   Unlocked:	!PG_mte_lock || PG_mte_tagged
 *
 * Acquire semantics are provided only on the 'false' (already tagged) path.
 */
static inline bool try_page_mte_tagging(struct page *page)
{
	if (test_and_set_bit(PG_mte_lock, &page->flags)) {
		/*
		 * Somebody else holds or held the lock: the tags are either
		 * being initialised or already have been. Wait until
		 * PG_mte_tagged shows up in the flags.
		 */
		smp_cond_load_acquire(&page->flags,
				      VAL & (1UL << PG_mte_tagged));
		return false;
	}

	return true;
}
void mte_zero_clear_page_tags(void *addr);
void mte_sync_tags(pte_t old_pte, pte_t pte);
@ -56,6 +108,17 @@ size_t mte_probe_user_range(const char __user *uaddr, size_t size);
/* unused if !CONFIG_ARM64_MTE, silence the compiler */
#define PG_mte_tagged 0
/* !CONFIG_ARM64_MTE stub: there is no tagged state to record, so do nothing. */
static inline void set_page_mte_tagged(struct page *page)
{
}
/* !CONFIG_ARM64_MTE stub: no page is ever reported as tagged. */
static inline bool page_mte_tagged(struct page *page)
{
return false;
}
/*
 * !CONFIG_ARM64_MTE stub: always returns false, so callers that initialise
 * tags only when this returns true never do so.
 */
static inline bool try_page_mte_tagging(struct page *page)
{
return false;
}
/* !CONFIG_ARM64_MTE stub: nothing to zero or clear, @addr is ignored. */
static inline void mte_zero_clear_page_tags(void *addr)
{
}

View file

@ -1049,8 +1049,8 @@ static inline void arch_swap_invalidate_area(int type)
#define __HAVE_ARCH_SWAP_RESTORE
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
if (system_supports_mte() && mte_restore_tags(entry, &folio->page))
set_bit(PG_mte_tagged, &folio->flags);
if (system_supports_mte())
mte_restore_tags(entry, &folio->page);
}
#endif /* CONFIG_ARM64_MTE */

View file

@ -2074,8 +2074,10 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
* Clear the tags in the zero page. This needs to be done via the
* linear map which has the Tagged attribute.
*/
if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags))
if (try_page_mte_tagging(ZERO_PAGE(0))) {
mte_clear_page_tags(lm_alias(empty_zero_page));
set_page_mte_tagged(ZERO_PAGE(0));
}
kasan_init_hw_tags_cpu();
}

View file

@ -47,7 +47,7 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
* Pages mapped in user space as !pte_access_permitted() (e.g.
* PROT_EXEC only) may not have the PG_mte_tagged flag set.
*/
if (!test_bit(PG_mte_tagged, &page->flags)) {
if (!page_mte_tagged(page)) {
put_page(page);
dump_skip(cprm, MTE_PAGE_TAG_STORAGE);
continue;

View file

@ -271,7 +271,7 @@ static int swsusp_mte_save_tags(void)
if (!page)
continue;
if (!test_bit(PG_mte_tagged, &page->flags))
if (!page_mte_tagged(page))
continue;
ret = save_tags(page, pfn);

View file

@ -41,19 +41,17 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte,
if (check_swap && is_swap_pte(old_pte)) {
swp_entry_t entry = pte_to_swp_entry(old_pte);
if (!non_swap_entry(entry) && mte_restore_tags(entry, page))
return;
if (!non_swap_entry(entry))
mte_restore_tags(entry, page);
}
if (!pte_is_tagged)
return;
/*
* Test PG_mte_tagged again in case it was racing with another
* set_pte_at().
*/
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
set_page_mte_tagged(page);
}
}
void mte_sync_tags(pte_t old_pte, pte_t pte)
@ -69,9 +67,11 @@ void mte_sync_tags(pte_t old_pte, pte_t pte)
/* if PG_mte_tagged is set, tags have already been initialised */
for (i = 0; i < nr_pages; i++, page++) {
if (!test_bit(PG_mte_tagged, &page->flags))
if (!page_mte_tagged(page)) {
	/*
	 * mte_sync_page_tags() sets PG_mte_tagged itself whenever it
	 * restores or clears the tags. Do not set the flag here as
	 * well: the helper returns without touching the tags when the
	 * pte is not tagged and nothing was restored from swap, and
	 * flagging such a page would mark uninitialised tags as valid.
	 */
	mte_sync_page_tags(page, old_pte, check_swap,
			   pte_is_tagged);
}
}
/* ensure the tags are visible before the PTE is set */
@ -96,8 +96,7 @@ int memcmp_pages(struct page *page1, struct page *page2)
* pages is tagged, set_pte_at() may zero or change the tags of the
* other page via mte_sync_tags().
*/
if (test_bit(PG_mte_tagged, &page1->flags) ||
test_bit(PG_mte_tagged, &page2->flags))
if (page_mte_tagged(page1) || page_mte_tagged(page2))
return addr1 != addr2;
return ret;
@ -454,7 +453,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
put_page(page);
break;
}
WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags));
WARN_ON_ONCE(!page_mte_tagged(page));
/* limit access to the end of the page */
offset = offset_in_page(addr);

View file

@ -1059,7 +1059,7 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
maddr = page_address(page);
if (!write) {
if (test_bit(PG_mte_tagged, &page->flags))
if (page_mte_tagged(page))
num_tags = mte_copy_tags_to_user(tags, maddr,
MTE_GRANULES_PER_PAGE);
else
@ -1068,15 +1068,19 @@ long kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
clear_user(tags, MTE_GRANULES_PER_PAGE);
kvm_release_pfn_clean(pfn);
} else {
/*
* Only locking to serialise with a concurrent
* set_pte_at() in the VMM but still overriding the
* tags, hence ignoring the return value.
*/
try_page_mte_tagging(page);
num_tags = mte_copy_tags_from_user(maddr, tags,
MTE_GRANULES_PER_PAGE);
/*
* Set the flag after checking the write
* completed fully
*/
if (num_tags == MTE_GRANULES_PER_PAGE)
set_bit(PG_mte_tagged, &page->flags);
/*
 * uaccess failed, don't leave stale tags. mte_clear_page_tags()
 * operates on the page's kernel address (maddr), not on its
 * struct page.
 */
if (num_tags != MTE_GRANULES_PER_PAGE)
	mte_clear_page_tags(maddr);
set_page_mte_tagged(page);
kvm_release_pfn_dirty(pfn);
}

View file

@ -1164,32 +1164,26 @@ static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
* - mmap_lock protects between a VM faulting a page in and the VMM performing
* an mprotect() to add VM_MTE
*/
static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
unsigned long size)
{
unsigned long i, nr_pages = size >> PAGE_SHIFT;
struct page *page;
struct page *page = pfn_to_page(pfn);
if (!kvm_has_mte(kvm))
return 0;
/*
* pfn_to_online_page() is used to reject ZONE_DEVICE pages
* that may not support tags.
*/
page = pfn_to_online_page(pfn);
if (!page)
return -EFAULT;
return;
for (i = 0; i < nr_pages; i++, page++) {
if (!test_bit(PG_mte_tagged, &page->flags)) {
if (try_page_mte_tagging(page)) {
mte_clear_page_tags(page_address(page));
set_bit(PG_mte_tagged, &page->flags);
set_page_mte_tagged(page);
}
}
}
return 0;
/*
 * Tell whether @vma may back an MTE-enabled guest's memory, i.e. whether
 * VM_MTE_ALLOWED is set in its vm_flags.
 */
static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_MTE_ALLOWED) != 0;
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@ -1200,7 +1194,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
bool write_fault, writable, force_pte = false;
bool exec_fault;
bool device = false;
bool shared;
unsigned long mmu_seq;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
@ -1247,8 +1240,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = get_vma_page_shift(vma, hva);
}
shared = (vma->vm_flags & VM_SHARED);
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
@ -1360,13 +1351,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new VM_SHARED VMA */
if (!shared)
ret = sanitise_mte_tags(kvm, pfn, vma_pagesize);
else
/* Check the VMM hasn't introduced a new disallowed VMA */
if (kvm_vma_mte_allowed(vma)) {
sanitise_mte_tags(kvm, pfn, vma_pagesize);
} else {
ret = -EFAULT;
if (ret)
goto out_unlock;
}
}
if (writable)
@ -1582,15 +1573,18 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_pfn_t pfn = pte_pfn(range->pte);
int ret;
if (!kvm->arch.mmu.pgt)
return false;
WARN_ON(range->end - range->start != 1);
ret = sanitise_mte_tags(kvm, pfn, PAGE_SIZE);
if (ret)
/*
* If the page isn't tagged, defer to user_mem_abort() for sanitising
* the MTE tags. The S2 pte should have been unmapped by
* mmu_notifier_invalidate_range_end().
*/
if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
return false;
/*
@ -1822,12 +1816,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
if (!vma)
break;
/*
* VM_SHARED mappings are not allowed with MTE to avoid races
* when updating the PG_mte_tagged page flag, see
* sanitise_mte_tags for more details.
*/
if (kvm_has_mte(kvm) && vma->vm_flags & VM_SHARED) {
if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
ret = -EINVAL;
break;
}

View file

@ -21,9 +21,12 @@ void copy_highpage(struct page *to, struct page *from)
copy_page(kto, kfrom);
if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
set_bit(PG_mte_tagged, &to->flags);
if (system_supports_mte() && page_mte_tagged(from)) {
page_kasan_tag_reset(to);
/* It's a new page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(to));
mte_copy_page_tags(kto, kfrom);
set_page_mte_tagged(to);
}
}
EXPORT_SYMBOL(copy_highpage);

View file

@ -937,6 +937,8 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
void tag_clear_highpage(struct page *page)
{
/* Newly allocated page, shouldn't have been tagged yet */
WARN_ON_ONCE(!try_page_mte_tagging(page));
mte_zero_clear_page_tags(page_address(page));
set_bit(PG_mte_tagged, &page->flags);
set_page_mte_tagged(page);
}

View file

@ -24,7 +24,7 @@ int mte_save_tags(struct page *page)
{
void *tag_storage, *ret;
if (!test_bit(PG_mte_tagged, &page->flags))
if (!page_mte_tagged(page))
return 0;
tag_storage = mte_allocate_tag_storage();
@ -46,21 +46,17 @@ int mte_save_tags(struct page *page)
return 0;
}
bool mte_restore_tags(swp_entry_t entry, struct page *page)
void mte_restore_tags(swp_entry_t entry, struct page *page)
{
void *tags = xa_load(&mte_pages, entry.val);
if (!tags)
return false;
return;
/*
* Test PG_mte_tagged again in case it was racing with another
* set_pte_at().
*/
if (!test_and_set_bit(PG_mte_tagged, &page->flags))
if (try_page_mte_tagging(page)) {
mte_restore_page_tags(page_address(page), tags);
return true;
set_page_mte_tagged(page);
}
}
void mte_invalidate_tags(int type, pgoff_t offset)

View file

@ -219,8 +219,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
#ifdef CONFIG_64BIT
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;

View file

@ -18,5 +18,6 @@
#define KPF_UNCACHED 39
#define KPF_SOFTDIRTY 40
#define KPF_ARCH_2 41
#define KPF_ARCH_3 42
#endif /* LINUX_KERNEL_PAGE_FLAGS_H */

View file

@ -132,8 +132,9 @@ enum pageflags {
PG_young,
PG_idle,
#endif
#ifdef CONFIG_64BIT
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
PG_arch_2,
PG_arch_3,
#endif
#ifdef CONFIG_KASAN_HW_TAGS
PG_skip_kasan_poison,

View file

@ -91,10 +91,10 @@
#define IF_HAVE_PG_IDLE(flag,string)
#endif
#ifdef CONFIG_64BIT
#define IF_HAVE_PG_ARCH_2(flag,string) ,{1UL << flag, string}
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
#define IF_HAVE_PG_ARCH_X(flag,string) ,{1UL << flag, string}
#else
#define IF_HAVE_PG_ARCH_2(flag,string)
#define IF_HAVE_PG_ARCH_X(flag,string)
#endif
#ifdef CONFIG_KASAN_HW_TAGS
@ -130,7 +130,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \
IF_HAVE_PG_ARCH_X(PG_arch_2, "arch_2" ) \
IF_HAVE_PG_ARCH_X(PG_arch_3, "arch_3" ) \
IF_HAVE_PG_SKIP_KASAN_POISON(PG_skip_kasan_poison, "skip_kasan_poison")
#define show_page_flags(flags) \

View file

@ -1005,6 +1005,14 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
config ARCH_USES_PG_ARCH_X
bool
help
Enable the definition of PG_arch_x page flags with x > 1. Only
suitable for 64-bit architectures with CONFIG_FLATMEM or
CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
enough room for additional bits in page->flags.
config VM_EVENT_COUNTERS
default y
bool "Enable VM event counters for /proc/vmstat" if EXPERT

View file

@ -2444,8 +2444,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
#ifdef CONFIG_64BIT
#ifdef CONFIG_ARCH_USES_PG_ARCH_X
(1L << PG_arch_2) |
(1L << PG_arch_3) |
#endif
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));