linux/mm/sparse-vmemmap.c
Muchun Song f41f2ed43c mm: hugetlb: free the vmemmap pages associated with each HugeTLB page
Every HugeTLB has more than one struct page structure.  We __know__ that
we only use the first 4 (__NR_USED_SUBPAGE) struct page structures to
store metadata associated with each HugeTLB.

There are a lot of struct page structures associated with each HugeTLB
page.  For tail pages, the value of compound_head is the same.  So we can
reuse first page of tail page structures.  We map the virtual addresses of
the remaining pages of tail page structures to the first tail page struct,
and then free these page frames.  Therefore, we need to reserve two pages
as vmemmap areas.

When we allocate a HugeTLB page from the buddy, we can free some vmemmap
pages associated with each HugeTLB page.  It is more appropriate to do it
in the prep_new_huge_page().

The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap pages
associated with a HugeTLB page can be freed, returns zero for now, which
means the feature is disabled.  We will enable it once all the
infrastructure is there.

[willy@infradead.org: fix documentation warning]
  Link: https://lkml.kernel.org/r/20210615200242.1716568-5-willy@infradead.org

Link: https://lkml.kernel.org/r/20210510030027.56044-5-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Tested-by: Chen Huang <chenhuang5@huawei.com>
Tested-by: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: HORIGUCHI NAOYA <naoya.horiguchi@nec.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Joao Martins <joao.m.martins@oracle.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Oliver Neukum <oneukum@suse.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2021-06-30 20:47:25 -07:00

460 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Virtual Memory Map support
*
* (C) 2007 sgi. Christoph Lameter.
*
* Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
* virt_to_page, page_address() to be implemented as a base offset
* calculation without memory access.
*
* However, virtual mappings need a page table and TLBs. Many Linux
* architectures already map their physical space using 1-1 mappings
* via TLBs. For those arches the virtual memory map is essentially
* for free if we use the same page size as the 1-1 mappings. In that
* case the overhead consists of a few additional pages that are
* allocated to create a view of memory for vmemmap.
*
* The architecture is expected to provide a vmemmap_populate() function
* to instantiate the mapping.
*/
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
/**
* struct vmemmap_remap_walk - walk vmemmap page table
*
* @remap_pte: called for each lowest-level entry (PTE).
* @reuse_page: the page which is reused for the tail vmemmap pages.
* @reuse_addr: the virtual address of the @reuse_page page.
* @vmemmap_pages: the list head of the vmemmap pages that can be freed.
*/
struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk);
struct page *reuse_page;
unsigned long reuse_addr;
struct list_head *vmemmap_pages;
};
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
/*
* The reuse_page is found 'first' in table walk before we start
* remapping (which is calling @walk->remap_pte).
*/
if (!walk->reuse_page) {
walk->reuse_page = pte_page(*pte);
/*
* Because the reuse address is part of the range that we are
* walking, skip the reuse address range.
*/
addr += PAGE_SIZE;
pte++;
}
for (; addr != end; addr += PAGE_SIZE, pte++)
walk->remap_pte(pte, addr, walk);
}
static void vmemmap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
{
pmd_t *pmd;
unsigned long next;
pmd = pmd_offset(pud, addr);
do {
BUG_ON(pmd_leaf(*pmd));
next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
}
static void vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
{
pud_t *pud;
unsigned long next;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
vmemmap_pmd_range(pud, addr, next, walk);
} while (pud++, addr = next, addr != end);
}
static void vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
{
p4d_t *p4d;
unsigned long next;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
vmemmap_pud_range(p4d, addr, next, walk);
} while (p4d++, addr = next, addr != end);
}
static void vmemmap_remap_range(unsigned long start, unsigned long end,
struct vmemmap_remap_walk *walk)
{
unsigned long addr = start;
unsigned long next;
pgd_t *pgd;
VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
vmemmap_p4d_range(pgd, addr, next, walk);
} while (pgd++, addr = next, addr != end);
/*
* We only change the mapping of the vmemmap virtual address range
* [@start + PAGE_SIZE, end), so we only need to flush the TLB which
* belongs to the range.
*/
flush_tlb_kernel_range(start + PAGE_SIZE, end);
}
/*
* Free a vmemmap page. A vmemmap page can be allocated from the memblock
* allocator or buddy allocator. If the PG_reserved flag is set, it means
* that it allocated from the memblock allocator, just free it via the
* free_bootmem_page(). Otherwise, use __free_page().
*/
static inline void free_vmemmap_page(struct page *page)
{
if (PageReserved(page))
free_bootmem_page(page);
else
__free_page(page);
}
/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
struct page *page, *next;
list_for_each_entry_safe(page, next, list, lru) {
list_del(&page->lru);
free_vmemmap_page(page);
}
}
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk)
{
/*
* Remap the tail pages as read-only to catch illegal write operation
* to the tail pages.
*/
pgprot_t pgprot = PAGE_KERNEL_RO;
pte_t entry = mk_pte(walk->reuse_page, pgprot);
struct page *page = pte_page(*pte);
list_add(&page->lru, walk->vmemmap_pages);
set_pte_at(&init_mm, addr, pte, entry);
}
/**
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
* to the page which @reuse is mapped to, then free vmemmap
* which the range are mapped to.
* @start: start address of the vmemmap virtual address range that we want
* to remap.
* @end: end address of the vmemmap virtual address range that we want to
* remap.
* @reuse: reuse address.
*
* Note: This function depends on vmemmap being base page mapped. Please make
* sure that we disable PMD mapping of vmemmap pages when calling this function.
*/
void vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse)
{
LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte,
.reuse_addr = reuse,
.vmemmap_pages = &vmemmap_pages,
};
/*
* In order to make remapping routine most efficient for the huge pages,
* the routine of vmemmap page table walking has the following rules
* (see more details from the vmemmap_pte_range()):
*
* - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
* should be continuous.
* - The @reuse address is part of the range [@reuse, @end) that we are
* walking which is passed to vmemmap_remap_range().
* - The @reuse address is the first in the complete range.
*
* So we need to make sure that @start and @reuse meet the above rules.
*/
BUG_ON(start - reuse != PAGE_SIZE);
vmemmap_remap_range(reuse, end, &walk);
free_vmemmap_page_list(&vmemmap_pages);
}
/*
* Allocate a block of memory to be used to back the virtual memory map
* or to back the page tables that are used to create the mapping.
* Uses the main allocators if they are available, else bootmem.
*/
static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long size,
unsigned long align,
unsigned long goal)
{
return memblock_alloc_try_nid_raw(size, align, goal,
MEMBLOCK_ALLOC_ACCESSIBLE, node);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
if (slab_is_available()) {
gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
int order = get_order(size);
static bool warned;
struct page *page;
page = alloc_pages_node(node, gfp_mask, order);
if (page)
return page_address(page);
if (!warned) {
warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
"vmemmap alloc failure: order:%u", order);
warned = true;
}
return NULL;
} else
return __earlyonly_bootmem_alloc(node, size, size,
__pa(MAX_DMA_ADDRESS));
}
static void * __meminit altmap_alloc_block_buf(unsigned long size,
struct vmem_altmap *altmap);
/* need to make sure size is all the same during early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
struct vmem_altmap *altmap)
{
void *ptr;
if (altmap)
return altmap_alloc_block_buf(size, altmap);
ptr = sparse_buffer_alloc(size);
if (!ptr)
ptr = vmemmap_alloc_block(size, node);
return ptr;
}
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
return altmap->base_pfn + altmap->reserve + altmap->alloc
+ altmap->align;
}
static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
unsigned long allocated = altmap->alloc + altmap->align;
if (altmap->free > allocated)
return altmap->free - allocated;
return 0;
}
static void * __meminit altmap_alloc_block_buf(unsigned long size,
struct vmem_altmap *altmap)
{
unsigned long pfn, nr_pfns, nr_align;
if (size & ~PAGE_MASK) {
pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
__func__, size);
return NULL;
}
pfn = vmem_altmap_next_pfn(altmap);
nr_pfns = size >> PAGE_SHIFT;
nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
nr_align = ALIGN(pfn, nr_align) - pfn;
if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
return NULL;
altmap->alloc += nr_pfns;
altmap->align += nr_align;
pfn += nr_align;
pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
__func__, pfn, altmap->alloc, altmap->align, nr_pfns);
return __va(__pfn_to_phys(pfn));
}
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
unsigned long pfn = pte_pfn(*pte);
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
pr_warn("[%lx-%lx] potential offnode page_structs\n",
start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
struct vmem_altmap *altmap)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
void *p;
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
set_pte_at(&init_mm, addr, pte, entry);
}
return pte;
}
static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
void *p = vmemmap_alloc_block(size, node);
if (!p)
return NULL;
memset(p, 0, size);
return p;
}
pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pmd_populate_kernel(&init_mm, pmd, p);
}
return pmd;
}
pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pud_populate(&init_mm, pud, p);
}
return pud;
}
p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
p4d_populate(&init_mm, p4d, p);
}
return p4d;
}
pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
if (pgd_none(*pgd)) {
void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pgd_populate(&init_mm, pgd, p);
}
return pgd;
}
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
int node, struct vmem_altmap *altmap)
{
unsigned long addr = start;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
for (; addr < end; addr += PAGE_SIZE) {
pgd = vmemmap_pgd_populate(addr, node);
if (!pgd)
return -ENOMEM;
p4d = vmemmap_p4d_populate(pgd, addr, node);
if (!p4d)
return -ENOMEM;
pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return -ENOMEM;
pte = vmemmap_pte_populate(pmd, addr, node, altmap);
if (!pte)
return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
}
return 0;
}
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long start = (unsigned long) pfn_to_page(pfn);
unsigned long end = start + nr_pages * sizeof(struct page);
if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
return NULL;
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
return pfn_to_page(pfn);
}