powerpc/book3s64/radix: add support for vmemmap optimization for radix
With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap
page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence
with 64K page size, we don't use vmemmap deduplication for PMD-level mapping.

[aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n]
Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com
Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit f2b79c0d79 (parent 368a0590d9)
@@ -210,6 +210,7 @@ the device (altmap).

The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst

The differences with HugeTLB are relatively minor.
@@ -36,6 +36,7 @@ powerpc

    ultravisor
    vas-api
    vcpudispatch_stats
    vmemmap_dedup

    features
Documentation/powerpc/vmemmap_dedup.rst (new file, 101 lines)

@@ -0,0 +1,101 @@
.. SPDX-License-Identifier: GPL-2.0

==========
Device DAX
==========

The device-dax interface uses the tail deduplication technique explained in
Documentation/mm/vmemmap_dedup.rst

On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
deduplication.

With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no
vmemmap deduplication possible.

With 1G PUD level mapping, we require 16384 struct pages and a single 64K
vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping.
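
The arithmetic above can be cross-checked with a small userspace C program
(an illustration only; it assumes sizeof(struct page) is 64 bytes, which is
what the 64K/sizeof(struct page) == 1024 figure implies)::

   #include <assert.h>
   #include <stdio.h>

   #define SZ_64K          (64UL << 10)
   #define SZ_2M           (2UL << 20)
   #define SZ_1G           (1UL << 30)
   #define STRUCT_PAGE_SZ  64UL   /* assumed sizeof(struct page) */

   int main(void)
   {
           unsigned long per_vmemmap_page = SZ_64K / STRUCT_PAGE_SZ;  /* 1024 */

           assert(per_vmemmap_page == 1024);
           assert(SZ_2M / SZ_64K == 32);      /* 2M PMD: fits in one vmemmap page */
           assert(SZ_1G / SZ_64K == 16384);   /* 1G PUD */
           assert((SZ_1G / SZ_64K) / per_vmemmap_page == 16);

           printf("1G PUD mapping needs %lu 64K vmemmap pages\n",
                  (SZ_1G / SZ_64K) / per_vmemmap_page);
           return 0;
   }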

Here is how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 | PUD       |                     +-----------+                       | | |
 | level     |                     |     .     | ----------------------+ | |
 | mapping   |                     +-----------+                         | |
 |           |                     |     .     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |    15     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
4K vmemmap page contains 64 struct pages (4K/sizeof(struct page)). Hence we
require 8 4K pages in vmemmap to map the struct pages for 2M PMD level mapping.

Here is how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 | PMD       |                     +-----------+                       | | |
 | level     |                     |     5     | ----------------------+ | |
 | mapping   |                     +-----------+                         | |
 |           |                     |     6     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     7     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+

With 1G PUD level mapping, we require 262144 struct pages and a single 4K
vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we
require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level
mapping.

Here is how things look on device-dax after the sections are populated::

 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 | PUD       |                     +-----------+                       | | |
 | level     |                     |     .     | ----------------------+ | |
 | mapping   |                     +-----------+                         | |
 |           |                     |     .     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |   4095    | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+
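As a rough cross-check of the savings the diagrams illustrate, the following
userspace sketch compares the vmemmap page counts with and without
deduplication (it assumes sizeof(struct page) is 64 bytes, and takes the
head-page-plus-one-shared-tail-page outcome from the diagrams above)::

   #include <stdio.h>

   /* vmemmap pages needed to describe one huge device page */
   static void report(const char *name, unsigned long huge_sz, unsigned long page_sz)
   {
           unsigned long struct_pages  = huge_sz / page_sz;
           unsigned long vmemmap_pages = struct_pages * 64 / page_sz;      /* no dedup */
           unsigned long deduped = vmemmap_pages > 2 ? 2 : vmemmap_pages;  /* head + shared tail */

           printf("%-22s %5lu vmemmap pages without dedup, %lu with\n",
                  name, vmemmap_pages, deduped);
   }

   int main(void)
   {
           report("2M PMD, 4K base page",  2UL << 20,  4UL << 10);  /*    8 -> 2 */
           report("1G PUD, 4K base page",  1UL << 30,  4UL << 10);  /* 4096 -> 2 */
           report("1G PUD, 64K base page", 1UL << 30, 64UL << 10);  /*   16 -> 2 */
           return 0;
   }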
@@ -174,6 +174,7 @@ config PPC
        select ARCH_WANT_IPC_PARSE_VERSION
        select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
        select ARCH_WANT_LD_ORPHAN_WARN
        select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU
        select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx
        select ARCH_WEAK_RELEASE_ACQUIRE
        select BINFMT_ELF
@@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(pud_t pud)
}

struct vmem_altmap;
struct dev_pagemap;
extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
                unsigned long page_size,
                unsigned long phys);

@@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end);

void radix__kernel_map_pages(struct page *page, int numpages, int enable);

#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
#define vmemmap_can_optimize vmemmap_can_optimize
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap);
#endif

#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                unsigned long start,
                unsigned long end, int node,
                struct dev_pagemap *pgmap);
#endif /* __ASSEMBLY__ */
#endif
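The #define lines that define vmemmap_can_optimize and
vmemmap_populate_compound_pages to their own names follow the usual kernel
idiom for signalling an architecture override: generic code can test the macro
with #ifndef and compile its fallback only when no override exists. A minimal
userspace sketch of that idiom (the names below are made up for illustration
and are not the actual mm headers):

   #include <stdbool.h>
   #include <stdio.h>

   /* "arch" side: provide an override and define the macro to its own name */
   static bool can_optimize(void) { return true; }
   #define can_optimize can_optimize

   /* "generic" side: the fallback is compiled only when no override was declared */
   #ifndef can_optimize
   static bool can_optimize(void) { return false; }
   #endif

   int main(void)
   {
           printf("can_optimize() = %d\n", can_optimize());   /* prints 1 */
           return 0;
   }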
@@ -986,6 +986,15 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start,
        return 0;
}


bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
        if (radix_enabled())
                return __vmemmap_can_optimize(altmap, pgmap);

        return false;
}

int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
                unsigned long addr, unsigned long next)
{
@@ -1193,6 +1202,200 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
        return 0;
}

static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
                struct vmem_altmap *altmap,
                struct page *reuse)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = pgd_offset_k(addr);
        p4d = p4d_offset(pgd, addr);
        pud = vmemmap_pud_alloc(p4d, node, addr);
        if (!pud)
                return NULL;
        pmd = vmemmap_pmd_alloc(pud, node, addr);
        if (!pmd)
                return NULL;
        if (pmd_leaf(*pmd))
                /*
                 * The second page is mapped as a hugepage due to a nearby request.
                 * Force our mapping to page size without deduplication
                 */
                return NULL;
        pte = vmemmap_pte_alloc(pmd, node, addr);
        if (!pte)
                return NULL;
        radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

        return pte;
}

static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
                unsigned long pfn_offset, int node)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        unsigned long map_addr;

        /* the second vmemmap page which we use for duplication */
        map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
        pgd = pgd_offset_k(map_addr);
        p4d = p4d_offset(pgd, map_addr);
        pud = vmemmap_pud_alloc(p4d, node, map_addr);
        if (!pud)
                return NULL;
        pmd = vmemmap_pmd_alloc(pud, node, map_addr);
        if (!pmd)
                return NULL;
        if (pmd_leaf(*pmd))
                /*
                 * The second page is mapped as a hugepage due to a nearby request.
                 * Force our mapping to page size without deduplication
                 */
                return NULL;
        pte = vmemmap_pte_alloc(pmd, node, map_addr);
        if (!pte)
                return NULL;
        /*
         * Check if there exists a mapping to the left
         */
        if (pte_none(*pte)) {
                /*
                 * Populate the head page vmemmap page.
                 * It can fall in different pmd, hence
                 * vmemmap_populate_address()
                 */
                pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
                if (!pte)
                        return NULL;
                /*
                 * Populate the tail pages vmemmap page
                 */
                pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
                if (!pte)
                        return NULL;
                vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
                return pte;
        }
        return pte;
}

int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
                unsigned long start,
                unsigned long end, int node,
                struct dev_pagemap *pgmap)
{
        /*
         * we want to map things as base page size mapping so that
         * we can save space in vmemmap. We could have huge mapping
         * covering both edges.
         */
        unsigned long addr;
        unsigned long addr_pfn = start_pfn;
        unsigned long next;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        for (addr = start; addr < end; addr = next) {

                pgd = pgd_offset_k(addr);
                p4d = p4d_offset(pgd, addr);
                pud = vmemmap_pud_alloc(p4d, node, addr);
                if (!pud)
                        return -ENOMEM;
                pmd = vmemmap_pmd_alloc(pud, node, addr);
                if (!pmd)
                        return -ENOMEM;

                if (pmd_leaf(READ_ONCE(*pmd))) {
                        /* existing huge mapping. Skip the range */
                        addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
                        next = pmd_addr_end(addr, end);
                        continue;
                }
                pte = vmemmap_pte_alloc(pmd, node, addr);
                if (!pte)
                        return -ENOMEM;
                if (!pte_none(*pte)) {
                        /*
                         * This could be because we already have a compound
                         * page whose VMEMMAP_RESERVE_NR pages were mapped and
                         * this request falls in those pages.
                         */
                        addr_pfn += 1;
                        next = addr + PAGE_SIZE;
                        continue;
                } else {
                        unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
                        unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
                        pte_t *tail_page_pte;

                        /*
                         * if the address is aligned to huge page size it is the
                         * head mapping.
                         */
                        if (pfn_offset == 0) {
                                /* Populate the head page vmemmap page */
                                pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
                                if (!pte)
                                        return -ENOMEM;
                                vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

                                /*
                                 * Populate the tail pages vmemmap page
                                 * It can fall in different pmd, hence
                                 * vmemmap_populate_address()
                                 */
                                pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
                                if (!pte)
                                        return -ENOMEM;

                                addr_pfn += 2;
                                next = addr + 2 * PAGE_SIZE;
                                continue;
                        }
                        /*
                         * get the 2nd mapping details
                         * Also create it if that doesn't exist
                         */
                        tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
                        if (!tail_page_pte) {

                                pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
                                if (!pte)
                                        return -ENOMEM;
                                vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

                                addr_pfn += 1;
                                next = addr + PAGE_SIZE;
                                continue;
                        }

                        pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
                        if (!pte)
                                return -ENOMEM;
                        vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

                        addr_pfn += 1;
                        next = addr + PAGE_SIZE;
                        continue;
                }
        }
        return 0;
}
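
The address arithmetic that makes the reuse work is the map_addr computation in
vmemmap_compound_tail_page(): subtracting pfn_offset * sizeof(struct page) from
the vmemmap address of a tail pfn lands on the head pfn's vmemmap address, and
adding PAGE_SIZE then selects the single shared tail vmemmap page. A userspace
illustration of that formula (a 64K base page and a 64-byte struct page are
assumed; VMEMMAP_START is an arbitrary value chosen for the example):

   #include <assert.h>
   #include <stdio.h>

   #define PAGE_SIZE        (64UL << 10)            /* assumed 64K base page */
   #define STRUCT_PAGE_SIZE 64UL                    /* assumed sizeof(struct page) */
   #define VMEMMAP_START    0xc00c000000000000UL    /* arbitrary base for the example */

   /* vmemmap virtual address of the struct page describing a given device pfn */
   static unsigned long vmemmap_addr(unsigned long pfn)
   {
           return VMEMMAP_START + pfn * STRUCT_PAGE_SIZE;
   }

   int main(void)
   {
           unsigned long nr_pages = 16384;   /* pfns per 1G compound page at 64K */
           unsigned long head_pfn = 0, pfn;

           for (pfn = head_pfn; pfn < head_pfn + nr_pages; pfn++) {
                   unsigned long addr       = vmemmap_addr(pfn);
                   unsigned long pfn_offset = pfn - head_pfn;
                   /* same computation as in vmemmap_compound_tail_page() */
                   unsigned long map_addr   = addr - pfn_offset * STRUCT_PAGE_SIZE + PAGE_SIZE;

                   /* every entry of the compound page resolves to the same vmemmap page */
                   assert(map_addr == vmemmap_addr(head_pfn) + PAGE_SIZE);
           }
           printf("all %lu entries share the vmemmap page at 0x%lx\n",
                  nr_pages, vmemmap_addr(head_pfn) + PAGE_SIZE);
           return 0;
   }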

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{