powerpc/book3s64/radix: add support for vmemmap optimization for radix

With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap
page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)).  Hence
with 64K page size, we don't use vmemmap deduplication for PMD-level
mapping.

[aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n]
  Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com
Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Aneesh Kumar K.V 2023-07-25 00:37:57 +05:30 committed by Andrew Morton
parent 368a0590d9
commit f2b79c0d79
6 changed files with 318 additions and 0 deletions

View file

@ -210,6 +210,7 @@ the device (altmap).
The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
For the powerpc equivalent, see Documentation/powerpc/vmemmap_dedup.rst.
The differences with HugeTLB are relatively minor.

View file

@ -36,6 +36,7 @@ powerpc
ultravisor
vas-api
vcpudispatch_stats
vmemmap_dedup
features

View file

@ -0,0 +1,101 @@
.. SPDX-License-Identifier: GPL-2.0
==========
Device DAX
==========
The device-dax interface uses the tail deduplication technique explained in
Documentation/mm/vmemmap_dedup.rst.
On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
deduplication.
With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no
vmemmap deduplication possible.
With 1G PUD level mapping, we require 16384 struct pages and a single 64K
vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
require 16 64K pages in vmemmap to map the struct pages for 1G PUD level mapping.
Here's how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    PUD    |                     +-----------+                       | | |
 |   level   |                     |     .     | ----------------------+ | |
 |  mapping  |                     +-----------+                         | |
 |           |                     |     .     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     15    | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
4K vmemmap page contains 64 struct pages (4K/sizeof(struct page)). Hence we
require 8 4K pages in vmemmap to map the struct pages for 2M PMD level mapping.
Here's how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    PMD    |                     +-----------+                       | | |
 |   level   |                     |     5     | ----------------------+ | |
 |  mapping  |                     +-----------+                         | |
 |           |                     |     6     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     7     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

With 1G PUD level mapping, we require 262144 struct pages and a single 4K
vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we
require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level
mapping.
Here's how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    PUD    |                     +-----------+                       | | |
 |   level   |                     |     .     | ----------------------+ | |
 |  mapping  |                     +-----------+                         | |
 |           |                     |     .     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |   4095    | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

View file

@ -174,6 +174,7 @@ config PPC
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU
select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx
select ARCH_WEAK_RELEASE_ACQUIRE
select BINFMT_ELF

View file

@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(pud_t pud)
}
/*
 * Forward declarations: the prototypes visible in this header take only
 * pointers to these types, so the full definitions are not needed here.
 */
struct vmem_altmap;
struct dev_pagemap;
/*
 * NOTE(review): the implementation is not fully visible from here; presumably
 * this maps the vmemmap range beginning at @start to physical address @phys
 * using mappings of @page_size -- confirm against the definition.
 */
extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys);
@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end);
void radix__kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/*
 * Arch-provided vmemmap_can_optimize(). The same-name define presumably lets
 * generic code detect (via #ifdef/#ifndef) that powerpc supplies its own
 * version -- confirm against the generic header.
 *
 * Returns false when the radix MMU is not enabled; otherwise returns the
 * result of __vmemmap_can_optimize(altmap, pgmap).
 */
#define vmemmap_can_optimize vmemmap_can_optimize
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
#endif
/*
 * Arch-provided vmemmap_populate_compound_pages(). The same-name define
 * presumably lets generic code detect the override -- confirm.
 *
 * @start_pfn: pfn whose struct page lives at @start
 * @start:     first vmemmap virtual address to populate
 * @end:       end of the vmemmap virtual range (exclusive)
 * @node:      NUMA node used for page-table and vmemmap allocations
 * @pgmap:     device pagemap; supplies the compound page size
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 *
 * NOTE(review): unlike vmemmap_can_optimize, this declaration is not guarded
 * by CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP -- confirm that is intended.
 */
#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
unsigned long start,
unsigned long end, int node,
struct dev_pagemap *pgmap);
#endif /* __ASSEMBLY__ */
#endif

View file

@ -986,6 +986,15 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
return 0;
}
/*
 * vmemmap_can_optimize - powerpc hook deciding whether the vmemmap of a
 * device pagemap may be deduplicated.
 *
 * Without the radix MMU the answer is always false; with it, the decision is
 * delegated to __vmemmap_can_optimize().
 */
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
	if (!radix_enabled())
		return false;

	return __vmemmap_can_optimize(altmap, pgmap);
}
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
unsigned long addr, unsigned long next)
{
@ -1193,6 +1202,200 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
return 0;
}
/*
 * radix__vmemmap_populate_address - populate the base-page vmemmap mapping
 * covering @addr, allocating intermediate page-table levels as needed.
 *
 * @addr:   vmemmap virtual address to map
 * @node:   NUMA node used for allocations
 * @altmap: currently not forwarded to radix__vmemmap_pte_populate()
 * @reuse:  currently not forwarded to radix__vmemmap_pte_populate()
 *
 * NOTE(review): @altmap and @reuse are accepted but NULL is always passed
 * down; all callers visible here pass NULL for both. Confirm whether they
 * should be forwarded or dropped from the signature.
 *
 * Return: the PTE mapping @addr, or NULL if a page-table level could not be
 * allocated, @addr is already covered by a PMD leaf (huge) mapping, or the
 * PTE could not be populated.
 */
static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
							 struct vmem_altmap *altmap,
							 struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd)) {
		/*
		 * The address is already mapped by a hugepage due to a nearby
		 * request, so there is no PTE to hand back. Callers must fall
		 * back to a page-size mapping without deduplication.
		 */
		return NULL;
	}
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;
	/*
	 * Propagate a populate failure instead of verifying and returning a
	 * PTE that was never filled in; every other call site checks this.
	 */
	pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}
/*
 * vmemmap_compound_tail_page - find (or create) the PTE of the second vmemmap
 * page of the compound page that @addr belongs to. That page is the one
 * reused for deduplicating the remaining tail vmemmap pages.
 *
 * @addr:       vmemmap virtual address currently being populated
 * @pfn_offset: offset, in pfns, of @addr's pfn from the compound page head
 * @node:       NUMA node used for allocations
 *
 * Return: the PTE of the second vmemmap page, or NULL when it cannot be
 * provided (allocation failure, or the range is covered by a PMD leaf), in
 * which case the caller maps @addr without deduplication.
 */
static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
						    unsigned long pfn_offset, int node)
{
	unsigned long tail_addr;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	/* Step back to the head struct page, then forward one vmemmap page. */
	tail_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;

	pgdp = pgd_offset_k(tail_addr);
	p4dp = p4d_offset(pgdp, tail_addr);

	pudp = vmemmap_pud_alloc(p4dp, node, tail_addr);
	if (!pudp)
		return NULL;

	pmdp = vmemmap_pmd_alloc(pudp, node, tail_addr);
	if (!pmdp)
		return NULL;

	if (pmd_leaf(*pmdp)) {
		/*
		 * The second page is mapped as a hugepage due to a nearby
		 * request. Force our mapping to page size without
		 * deduplication.
		 */
		return NULL;
	}

	ptep = vmemmap_pte_alloc(pmdp, node, tail_addr);
	if (!ptep)
		return NULL;

	/* A mapping to the left already exists: hand it back as is. */
	if (!pte_none(*ptep))
		return ptep;

	/*
	 * Nothing mapped yet. Populate the head vmemmap page first; it can
	 * fall in a different pmd, hence the full walk done by
	 * radix__vmemmap_populate_address().
	 */
	if (!radix__vmemmap_populate_address(tail_addr - PAGE_SIZE, node, NULL, NULL))
		return NULL;

	/* Then populate the tail vmemmap page itself. */
	ptep = radix__vmemmap_pte_populate(pmdp, tail_addr, node, NULL, NULL);
	if (!ptep)
		return NULL;

	vmemmap_verify(ptep, node, tail_addr, tail_addr + PAGE_SIZE);
	return ptep;
}
/*
 * vmemmap_populate_compound_pages - populate the vmemmap range [@start, @end)
 * for a device pagemap using base-page mappings, pointing the tail vmemmap
 * pages of each compound page at a single shared page to save memory.
 *
 * @start_pfn: pfn whose struct page lives at @start
 * @start:     first vmemmap virtual address to populate
 * @end:       end of the vmemmap virtual range (exclusive)
 * @node:      NUMA node used for allocations
 * @pgmap:     device pagemap; pgmap_vmemmap_nr() gives the pfns per compound
 *             page
 *
 * Return: 0 on success, -ENOMEM if a page table or vmemmap page could not be
 * allocated.
 */
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap)
{
	/*
	 * we want to map things as base page size mapping so that
	 * we can save space in vmemmap. We could have huge mapping
	 * covering out both edges.
	 */
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {
		/*
		 * pfn whose struct page sits at @addr. @addr advances through
		 * the vmemmap in PAGE_SIZE (or PMD) steps and each vmemmap page
		 * holds PAGE_SIZE / sizeof(struct page) struct pages, so the
		 * pfn is derived from the address rather than being bumped by
		 * one per vmemmap page. vmemmap_compound_tail_page() relies on
		 * pfn_offset being in pfn units.
		 */
		unsigned long addr_pfn = start_pfn +
					 (addr - start) / sizeof(struct page);
		unsigned long nr_pages;
		unsigned long pfn_offset;
		pte_t *tail_page_pte;

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_leaf(READ_ONCE(*pmd))) {
			/* existing huge mapping. Skip the range */
			next = pmd_addr_end(addr, end);
			continue;
		}

		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		if (!pte_none(*pte)) {
			/*
			 * This could be because we already have a compound
			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
			 * this request fall in those pages.
			 */
			next = addr + PAGE_SIZE;
			continue;
		}

		nr_pages = pgmap_vmemmap_nr(pgmap);
		pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);

		/*
		 * if the address is aligned to huge page size it is the
		 * head mapping.
		 */
		if (pfn_offset == 0) {
			/* Populate the head page vmemmap page */
			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			/*
			 * Populate the tail pages vmemmap page
			 * It can fall in different pmd, hence
			 * vmemmap_populate_address()
			 */
			pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
			if (!pte)
				return -ENOMEM;

			next = addr + 2 * PAGE_SIZE;
			continue;
		}

		/*
		 * get the 2nd mapping details
		 * Also create it if that doesn't exist
		 */
		tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
		if (!tail_page_pte) {
			/* No shared tail page available: map without dedup. */
			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			next = addr + PAGE_SIZE;
			continue;
		}

		/* Reuse the page backing the compound page's 2nd vmemmap page. */
		pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
		if (!pte)
			return -ENOMEM;
		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

		next = addr + PAGE_SIZE;
	}
	return 0;
}
#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{