vm_page: Implement lazy page initialization

FreeBSD's boot times have decreased to the point where vm_page array
initialization represents a significant fraction of the total boot time.
For example, when booting FreeBSD in Firecracker (a VMM designed to
support lightweight VMs) with 128MB and 1GB of RAM, vm_page
initialization consumes 9% (3ms) and 37% (21.5ms) of the kernel boot
time, respectively.  This is particularly relevant in cloud environments,
where VMs are expected to spin up as quickly as possible.

This patch implements lazy initialization of (most) page structures,
following a suggestion from cperciva@.  The idea is to introduce a new
free pool, VM_FREEPOOL_LAZYINIT, into which all vm_page structures are
initially placed.  For this to work, we need only initialize the first
free page of each chunk placed into the buddy allocator.  Then, early
page allocations draw from the lazy init pool and initialize vm_page
chunks (up to 16MB, 4096 pages) on demand.  Once APs are started, an
idle-priority thread drains the lazy init pool in the background to
avoid introducing extra latency in the allocator.  With this scheme,
almost all of the initialization work is moved out of the critical path.
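
To make the idea concrete, here is a minimal, self-contained userspace C
sketch of the scheme.  It is illustrative only: the identifiers
(page_meta, boot_init, alloc_page, CHUNK_PAGES) are hypothetical, and the
buddy queues, locking, and background draining thread are omitted; the
real implementation is in the diff below.

/*
 * Simplified userspace model of lazy page array initialization.
 * All identifiers here are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define	CHUNK_PAGES	4096	/* pages initialized per on-demand step */

enum { POOL_LAZYINIT, POOL_DEFAULT };

struct page_meta {
	size_t	pfn;		/* page frame number */
	int	pool;
	bool	initialized;
};

static struct page_meta *pages;
static size_t npages;

/*
 * "Boot" path: touch only the first page of each chunk.  Because
 * POOL_LAZYINIT is 0, the calloc'ed array starts out lazy by construction.
 */
static void
boot_init(size_t total)
{

	npages = total;
	pages = calloc(npages, sizeof(*pages));
	if (pages == NULL)
		abort();
	for (size_t pfn = 0; pfn < npages; pfn += CHUNK_PAGES) {
		pages[pfn].pfn = pfn;
		pages[pfn].pool = POOL_LAZYINIT;
		pages[pfn].initialized = true;
	}
}

/*
 * Allocation path: the first time a chunk is drawn from the lazy pool,
 * initialize the rest of its page structures and move the whole chunk to
 * the default pool.
 */
static struct page_meta *
alloc_page(size_t pfn)
{
	size_t base = pfn - (pfn % CHUNK_PAGES);

	if (pages[base].pool == POOL_LAZYINIT) {
		for (size_t i = base; i < base + CHUNK_PAGES && i < npages;
		    i++) {
			pages[i].pfn = i;
			pages[i].pool = POOL_DEFAULT;
			pages[i].initialized = true;
		}
	}
	return (&pages[pfn]);
}

int
main(void)
{
	struct page_meta *m;

	boot_init(1 << 20);	/* 1M pages; only 256 touched at "boot" */
	m = alloc_page(12345);
	printf("pfn %zu initialized: %d\n", m->pfn, (int)m->initialized);
	free(pages);
	return (0);
}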

A couple of vm_phys operations require the pool to be drained before
they can run: vm_phys_find_range() and vm_phys_unfree_page().  However,
these are rare operations.  I believe that
vm_phys_find_freelist_contig() does not require any special treatment,
as it only ever accesses the first page in a power-of-2-sized free page
chunk, which is always initialized.
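
In the diff below, both vm_phys_find_range() and vm_phys_unfree_page()
call the new vm_phys_lazy_init_domain() before scanning; that function
short-circuits on a per-domain "initdone" flag once the pool has been
drained, so the extra check is cheap in the common case.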

For now the new pool is only used on amd64 and arm64, since that's where
I can easily test and those platforms would get the most benefit.
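
The change also adds a debug.vm.lazy_page_init loader tunable (default 1),
fetched in vm_page_startup(); setting it to 0, for example from
/boot/loader.conf, falls back to eager initialization of the whole page
array, which is useful for comparing boot times or ruling out the lazy
path when debugging.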

Reviewed by:	alc, kib
Differential Revision:	https://reviews.freebsd.org/D40403
Mark Johnston 2024-06-13 20:11:47 -04:00
parent 69cbb18746
commit b16b4c22d2
5 changed files with 262 additions and 36 deletions


@ -94,14 +94,16 @@
#define VM_PHYSSEG_MAX 63
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for page tables and small UMA
- * objects are allocated.
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
* which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
* which physical pages for page tables and small UMA objects are allocated.
* VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
* boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_LAZYINIT 0
#define VM_FREEPOOL_DEFAULT 1
#define VM_FREEPOOL_DIRECT 2
/*
* Create up to three free page lists: VM_FREELIST_DMA32 is for physical pages


@ -73,14 +73,16 @@
#define VM_PHYSSEG_MAX 64
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for small UMA objects are
- * allocated.
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
* which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
* which physical pages for page tables and small UMA objects are allocated.
* VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
* boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_LAZYINIT 0
#define VM_FREEPOOL_DEFAULT 1
#define VM_FREEPOOL_DIRECT 2
/*
* Create two free page lists: VM_FREELIST_DMA32 is for physical pages that have


@ -333,9 +333,9 @@ vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
if (m == NULL)
return (true); /* page does not exist, no failure */
-vmd = vm_pagequeue_domain(m);
vmd = VM_DOMAIN(vm_phys_domain(pa));
vm_domain_free_lock(vmd);
-found = vm_phys_unfree_page(m);
found = vm_phys_unfree_page(pa);
vm_domain_free_unlock(vmd);
if (found) {
vm_domain_freecnt_inc(vmd, -1);
@ -568,6 +568,9 @@ vm_page_startup(vm_offset_t vaddr)
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
long ii;
#endif
#ifdef VM_FREEPOOL_LAZYINIT
int lazyinit;
#endif
vaddr = round_page(vaddr);
@ -748,6 +751,11 @@ vm_page_startup(vm_offset_t vaddr)
*/
vm_phys_init();
#ifdef VM_FREEPOOL_LAZYINIT
lazyinit = 1;
TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
#endif
/*
* Initialize the page structures and add every available page to the
* physical memory allocator's free lists.
@ -763,9 +771,50 @@ vm_page_startup(vm_offset_t vaddr)
vm_cnt.v_page_count = 0;
for (segind = 0; segind < vm_phys_nsegs; segind++) {
seg = &vm_phys_segs[segind];
-for (m = seg->first_page, pa = seg->start; pa < seg->end;
-m++, pa += PAGE_SIZE)
-vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT);
/*
* If lazy vm_page initialization is not enabled, simply
* initialize all of the pages in the segment. Otherwise, we
* only initialize:
* 1. Pages not covered by phys_avail[], since they might be
* freed to the allocator at some future point, e.g., by
* kmem_bootstrap_free().
* 2. The first page of each run of free pages handed to the
* vm_phys allocator, which in turn defers initialization
* of pages until they are needed.
* This avoids blocking the boot process for long periods, which
* may be relevant for VMs (which ought to boot as quickly as
* possible) and/or systems with large amounts of physical
* memory.
*/
#ifdef VM_FREEPOOL_LAZYINIT
if (lazyinit) {
startp = seg->start;
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
if (startp >= seg->end)
break;
if (phys_avail[i + 1] < startp)
continue;
if (phys_avail[i] <= startp) {
startp = phys_avail[i + 1];
continue;
}
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
for (endp = MIN(phys_avail[i], seg->end);
startp < endp; startp += PAGE_SIZE, m++) {
vm_page_init_page(m, startp, segind,
VM_FREEPOOL_DEFAULT);
}
}
} else
#endif
for (m = seg->first_page, pa = seg->start;
pa < seg->end; m++, pa += PAGE_SIZE) {
vm_page_init_page(m, pa, segind,
VM_FREEPOOL_DEFAULT);
}
/*
* Add the segment's pages that are covered by one of
@ -783,6 +832,12 @@ vm_page_startup(vm_offset_t vaddr)
continue;
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
#ifdef VM_FREEPOOL_LAZYINIT
if (lazyinit) {
vm_page_init_page(m, startp, segind,
VM_FREEPOOL_LAZYINIT);
}
#endif
vmd = VM_DOMAIN(seg->domain);
vm_domain_free_lock(vmd);
vm_phys_enqueue_contig(m, pagecount);


@ -47,14 +47,18 @@
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <ddb/ddb.h>
@ -141,6 +145,7 @@ vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
* Provides the mapping from VM_FREELIST_* to free list indices (flind).
*/
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;
CTASSERT(VM_FREELIST_DEFAULT == 0);
@ -184,6 +189,16 @@ static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order, int tail);
static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
if (pool == VM_FREEPOOL_LAZYINIT)
return (false);
#endif
return (pool >= 0 && pool < VM_NFREEPOOL);
}
/*
* Red-black tree helpers for vm fictitious range management.
*/
@ -621,6 +636,12 @@ vm_phys_init(void)
}
}
#ifdef VM_FREEPOOL_LAZYINIT
vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif
rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
@ -687,6 +708,17 @@ vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
("%s: invalid order %d", __func__, order));
vm_freelist_add(fl, m, order, tail);
#ifdef VM_FREEPOOL_LAZYINIT
if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
vm_page_t m_next;
int npages;
npages = 1 << order;
m_next = m + npages;
vm_page_init_page(m_next, m->phys_addr + ptoa(npages), m->segind,
VM_FREEPOOL_LAZYINIT);
}
#endif
}
/*
@ -760,15 +792,33 @@ vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
}
/*
* Set the pool for a contiguous, power of two-sized set of physical pages.
*
* If the pages currently belong to the lazy init pool, then the corresponding
* page structures must be initialized. In this case it is assumed that the
* first page in the run has already been initialized.
*/
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
-vm_page_t m_tmp;
#ifdef VM_FREEPOOL_LAZYINIT
if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
vm_paddr_t pa;
int segind;
-for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
-m_tmp->pool = pool;
m->pool = pool;
TSENTER();
pa = m->phys_addr + PAGE_SIZE;
segind = m->segind;
for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
m_tmp++, pa += PAGE_SIZE)
vm_page_init_page(m_tmp, pa, segind, pool);
TSEXIT();
} else
#endif
for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
m_tmp->pool = pool;
}
/*
@ -792,7 +842,7 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
KASSERT(domain >= 0 && domain < vm_ndomains,
("vm_phys_alloc_npages: domain %d is out of range", domain));
-KASSERT(pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_npages: pool %d is out of range", pool));
KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
("vm_phys_alloc_npages: npages %d is out of range", npages));
@ -821,7 +871,8 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
}
}
for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
pind++) {
alt = vm_phys_free_queues[domain][flind][pind];
while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
NULL) {
@ -888,7 +939,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
KASSERT(freelist < VM_NFREELIST,
("vm_phys_alloc_freelist_pages: freelist %d is out of range",
freelist));
-KASSERT(pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
@ -917,7 +968,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
* use them to satisfy the allocation.
*/
for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
alt = &vm_phys_free_queues[domain][flind][pind][0];
m = TAILQ_FIRST(&alt[oind].pl);
if (m != NULL) {
@ -1157,7 +1208,7 @@ vm_phys_free_pages(vm_page_t m, int order)
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_free_pages: page %p has unexpected order %d",
m, m->order));
-KASSERT(m->pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(m->pool),
("vm_phys_free_pages: page %p has unexpected pool %d",
m, m->pool));
KASSERT(order < VM_NFREEORDER,
@ -1186,6 +1237,107 @@ vm_phys_free_pages(vm_page_t m, int order)
vm_freelist_add(fl, m, order, 1);
}
#ifdef VM_FREEPOOL_LAZYINIT
/*
* Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
* them to the default pool. This is a prerequisite for some rare operations
* which need to scan the page array and thus depend on all pages being
* initialized.
*/
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
static bool initdone[MAXMEMDOM];
struct vm_domain *vmd;
struct vm_freelist *fl;
vm_page_t m;
int pind;
bool unlocked;
if (__predict_true(atomic_load_bool(&initdone[domain])))
return;
vmd = VM_DOMAIN(domain);
if (locked)
vm_domain_free_assert_locked(vmd);
else
vm_domain_free_lock(vmd);
if (atomic_load_bool(&initdone[domain]))
goto out;
pind = VM_FREEPOOL_LAZYINIT;
for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
int flind;
flind = vm_freelist_to_flind[freelist];
if (flind < 0)
continue;
fl = vm_phys_free_queues[domain][flind][pind];
for (int oind = 0; oind < VM_NFREEORDER; oind++) {
if (atomic_load_int(&fl[oind].lcnt) == 0)
continue;
while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
/*
* Avoid holding the lock across the
* initialization unless there's a free page
* shortage.
*/
vm_freelist_rem(fl, m, oind);
unlocked = vm_domain_allocate(vmd,
VM_ALLOC_NORMAL, 1 << oind);
if (unlocked)
vm_domain_free_unlock(vmd);
vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
if (unlocked) {
vm_domain_freecnt_inc(vmd, 1 << oind);
vm_domain_free_lock(vmd);
}
vm_phys_free_pages(m, oind);
}
}
}
atomic_store_bool(&initdone[domain], true);
out:
if (!locked)
vm_domain_free_unlock(vmd);
}
static void
vm_phys_lazy_init(void)
{
for (int domain = 0; domain < vm_ndomains; domain++)
vm_phys_lazy_init_domain(domain, false);
atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}
static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
vm_phys_lazy_init();
kthread_exit();
}
static void
vm_phys_lazy_sysinit(void *arg __unused)
{
struct thread *td;
int error;
error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
RFSTOPPED, 0, "vmlazyinit");
if (error == 0) {
thread_lock(td);
sched_prio(td, PRI_MIN_IDLE);
sched_add(td, SRQ_BORING);
} else {
printf("%s: could not create lazy init thread: %d\n",
__func__, error);
vm_phys_lazy_init();
}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
NULL);
#endif /* VM_FREEPOOL_LAZYINIT */
/*
* Free a contiguous, arbitrarily sized set of physical pages, without
* merging across set boundaries.
@ -1291,6 +1443,12 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
pa_end = MIN(high, seg->end);
if (pa_end - pa_start < ptoa(npages))
continue;
#ifdef VM_FREEPOOL_LAZYINIT
/*
* The pages on the free lists must be initialized.
*/
vm_phys_lazy_init_domain(domain, false);
#endif
bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end);
return (seg - vm_phys_segs);
@ -1306,21 +1464,30 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
* The free page queues must be locked.
*/
bool
-vm_phys_unfree_page(vm_page_t m)
vm_phys_unfree_page(vm_paddr_t pa)
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
-vm_paddr_t pa, pa_half;
-vm_page_t m_set, m_tmp;
vm_paddr_t pa_half;
vm_page_t m, m_set, m_tmp;
int order;
seg = vm_phys_paddr_to_seg(pa);
vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
/*
* The pages on the free lists must be initialized.
*/
#ifdef VM_FREEPOOL_LAZYINIT
vm_phys_lazy_init_domain(seg->domain, true);
#endif
/*
* First, find the contiguous, power of two-sized set of free
* physical pages containing the given physical page "m" and
* assign it to "m_set".
*/
-seg = &vm_phys_segs[m->segind];
-vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
m = vm_phys_paddr_to_vm_page(pa);
for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
order < VM_NFREEORDER - 1; ) {
order++;
@ -1459,7 +1626,7 @@ vm_phys_find_queues_contig(
/* Search for a large enough free block. */
size = npages << PAGE_SHIFT;
for (oind = order; oind < VM_NFREEORDER; oind++) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
@ -1479,7 +1646,7 @@ vm_phys_find_queues_contig(
if (order < VM_NFREEORDER)
return (NULL);
/* Search for a long-enough sequence of max-order blocks. */
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
m_ret = vm_phys_find_freelist_contig(fl, npages,
low, high, alignment, boundary);


@ -80,7 +80,7 @@ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
vm_page_t vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa);
void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
int *locality);
-bool vm_phys_unfree_page(vm_page_t m);
bool vm_phys_unfree_page(vm_paddr_t pa);
int vm_phys_mem_affinity(int f, int t);
void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end);
vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size);