vm_page: Implement lazy page initialization

FreeBSD's boot times have decreased to the point where vm_page array
initialization represents a significant fraction of the total boot time.
For example, when booting FreeBSD in Firecracker (a VMM designed to
support lightweight VMs) with 128MB and 1GB of RAM, vm_page
initialization consumes 9% (3ms) and 37% (21.5ms) of the kernel boot
time, respectively.  This is particularly relevant in cloud environments,
where VMs are expected to spin up as quickly as possible.

This patch implements lazy initialization of (most) page structures,
following a suggestion from cperciva@.  The idea is to introduce a new
free pool, VM_FREEPOOL_LAZYINIT, into which all vm_page structures are
initially placed.  For this to work, we need only initialize the first
free page of each chunk placed into the buddy allocator.  Then, early
page allocations draw from the lazy init pool and initialize vm_page
chunks (up to 16MB, 4096 pages) on demand.  Once APs are started, an
idle-priority thread drains the lazy init pool in the background to
avoid introducing extra latency in the allocator.  With this scheme,
almost all of the initialization work is moved out of the critical path.
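
To make the idea concrete, here is a minimal, self-contained userspace C
sketch of the scheme.  It is illustrative only: the identifiers
(page_meta, boot_init, alloc_page, CHUNK_PAGES) are hypothetical, and the
buddy queues, locking, and background draining thread are omitted; the
real implementation is in the diff below.

/*
 * Simplified userspace model of lazy page array initialization.
 * All identifiers here are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define	CHUNK_PAGES	4096	/* pages initialized per on-demand step */

enum { POOL_LAZYINIT, POOL_DEFAULT };

struct page_meta {
	size_t	pfn;		/* page frame number */
	int	pool;
	bool	initialized;
};

static struct page_meta *pages;
static size_t npages;

/*
 * "Boot" path: touch only the first page of each chunk.  Because
 * POOL_LAZYINIT is 0, the calloc'ed array starts out lazy by construction.
 */
static void
boot_init(size_t total)
{

	npages = total;
	pages = calloc(npages, sizeof(*pages));
	if (pages == NULL)
		abort();
	for (size_t pfn = 0; pfn < npages; pfn += CHUNK_PAGES) {
		pages[pfn].pfn = pfn;
		pages[pfn].pool = POOL_LAZYINIT;
		pages[pfn].initialized = true;
	}
}

/*
 * Allocation path: the first time a chunk is drawn from the lazy pool,
 * initialize the rest of its page structures and move the whole chunk to
 * the default pool.
 */
static struct page_meta *
alloc_page(size_t pfn)
{
	size_t base = pfn - (pfn % CHUNK_PAGES);

	if (pages[base].pool == POOL_LAZYINIT) {
		for (size_t i = base; i < base + CHUNK_PAGES && i < npages;
		    i++) {
			pages[i].pfn = i;
			pages[i].pool = POOL_DEFAULT;
			pages[i].initialized = true;
		}
	}
	return (&pages[pfn]);
}

int
main(void)
{
	struct page_meta *m;

	boot_init(1 << 20);	/* 1M pages; only 256 touched at "boot" */
	m = alloc_page(12345);
	printf("pfn %zu initialized: %d\n", m->pfn, (int)m->initialized);
	free(pages);
	return (0);
}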

A couple of vm_phys operations require the pool to be drained before
they can run: vm_phys_find_range() and vm_phys_unfree_page().  However,
these are rare operations.  I believe that
vm_phys_find_freelist_contig() does not require any special treatment,
as it only ever accesses the first page in a power-of-2-sized free page
chunk, which is always initialized.
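
In the diff below, both vm_phys_find_range() and vm_phys_unfree_page()
call the new vm_phys_lazy_init_domain() before scanning; that function
short-circuits on a per-domain "initdone" flag once the pool has been
drained, so the extra check is cheap in the common case.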

For now the new pool is only used on amd64 and arm64, since that's where
I can easily test and those platforms would get the most benefit.
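
The change also adds a debug.vm.lazy_page_init loader tunable (default 1),
fetched in vm_page_startup(); setting it to 0, for example from
/boot/loader.conf, falls back to eager initialization of the whole page
array, which is useful for comparing boot times or ruling out the lazy
path when debugging.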

Reviewed by:	alc, kib
Differential Revision:	https://reviews.freebsd.org/D40403
Mark Johnston 2024-06-13 20:11:47 -04:00
parent 69cbb18746
commit b16b4c22d2
5 changed files with 262 additions and 36 deletions


@ -94,14 +94,16 @@
#define VM_PHYSSEG_MAX 63
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for page tables and small UMA
- * objects are allocated.
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
* which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
* which physical pages for page tables and small UMA objects are allocated.
* VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
* boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_LAZYINIT 0
#define VM_FREEPOOL_DEFAULT 1
#define VM_FREEPOOL_DIRECT 2
/*
* Create up to three free page lists: VM_FREELIST_DMA32 is for physical pages


@ -73,14 +73,16 @@
#define VM_PHYSSEG_MAX 64
/*
- * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool
- * from which physical pages are allocated and VM_FREEPOOL_DIRECT is
- * the pool from which physical pages for small UMA objects are
- * allocated.
* Create three free page pools: VM_FREEPOOL_DEFAULT is the default pool from
* which physical pages are allocated and VM_FREEPOOL_DIRECT is the pool from
* which physical pages for page tables and small UMA objects are allocated.
* VM_FREEPOOL_LAZYINIT is a special-purpose pool that is populated only during
* boot and is used to implement deferred initialization of page structures.
*/
-#define VM_NFREEPOOL 2
-#define VM_FREEPOOL_DEFAULT 0
-#define VM_FREEPOOL_DIRECT 1
#define VM_NFREEPOOL 3
#define VM_FREEPOOL_LAZYINIT 0
#define VM_FREEPOOL_DEFAULT 1
#define VM_FREEPOOL_DIRECT 2
/*
* Create two free page lists: VM_FREELIST_DMA32 is for physical pages that have


@ -333,9 +333,9 @@ vm_page_blacklist_add(vm_paddr_t pa, bool verbose)
if (m == NULL)
return (true); /* page does not exist, no failure */
-vmd = vm_pagequeue_domain(m);
vmd = VM_DOMAIN(vm_phys_domain(pa));
vm_domain_free_lock(vmd);
-found = vm_phys_unfree_page(m);
found = vm_phys_unfree_page(pa);
vm_domain_free_unlock(vmd);
if (found) {
vm_domain_freecnt_inc(vmd, -1);
@ -568,6 +568,9 @@ vm_page_startup(vm_offset_t vaddr)
#if defined(__i386__) && defined(VM_PHYSSEG_DENSE)
long ii;
#endif
#ifdef VM_FREEPOOL_LAZYINIT
int lazyinit;
#endif
vaddr = round_page(vaddr);
@ -748,6 +751,11 @@ vm_page_startup(vm_offset_t vaddr)
*/
vm_phys_init();
#ifdef VM_FREEPOOL_LAZYINIT
lazyinit = 1;
TUNABLE_INT_FETCH("debug.vm.lazy_page_init", &lazyinit);
#endif
/*
* Initialize the page structures and add every available page to the
* physical memory allocator's free lists.
@ -763,9 +771,50 @@ vm_page_startup(vm_offset_t vaddr)
vm_cnt.v_page_count = 0;
for (segind = 0; segind < vm_phys_nsegs; segind++) {
seg = &vm_phys_segs[segind];
-for (m = seg->first_page, pa = seg->start; pa < seg->end;
-m++, pa += PAGE_SIZE)
-vm_page_init_page(m, pa, segind, VM_FREEPOOL_DEFAULT);
/*
* If lazy vm_page initialization is not enabled, simply
* initialize all of the pages in the segment. Otherwise, we
* only initialize:
* 1. Pages not covered by phys_avail[], since they might be
* freed to the allocator at some future point, e.g., by
* kmem_bootstrap_free().
* 2. The first page of each run of free pages handed to the
* vm_phys allocator, which in turn defers initialization
* of pages until they are needed.
* This avoids blocking the boot process for long periods, which
* may be relevant for VMs (which ought to boot as quickly as
* possible) and/or systems with large amounts of physical
* memory.
*/
#ifdef VM_FREEPOOL_LAZYINIT
if (lazyinit) {
startp = seg->start;
for (i = 0; phys_avail[i + 1] != 0; i += 2) {
if (startp >= seg->end)
break;
if (phys_avail[i + 1] < startp)
continue;
if (phys_avail[i] <= startp) {
startp = phys_avail[i + 1];
continue;
}
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
for (endp = MIN(phys_avail[i], seg->end);
startp < endp; startp += PAGE_SIZE, m++) {
vm_page_init_page(m, startp, segind,
VM_FREEPOOL_DEFAULT);
}
}
} else
#endif
for (m = seg->first_page, pa = seg->start;
pa < seg->end; m++, pa += PAGE_SIZE) {
vm_page_init_page(m, pa, segind,
VM_FREEPOOL_DEFAULT);
}
/*
* Add the segment's pages that are covered by one of
@ -783,6 +832,12 @@ vm_page_startup(vm_offset_t vaddr)
continue;
m = vm_phys_seg_paddr_to_vm_page(seg, startp);
#ifdef VM_FREEPOOL_LAZYINIT
if (lazyinit) {
vm_page_init_page(m, startp, segind,
VM_FREEPOOL_LAZYINIT);
}
#endif
vmd = VM_DOMAIN(seg->domain);
vm_domain_free_lock(vmd);
vm_phys_enqueue_contig(m, pagecount);


@ -47,14 +47,18 @@
#include <sys/domainset.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
#include <sys/tslog.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <ddb/ddb.h>
@ -141,6 +145,7 @@ vm_paddr_t dump_avail[PHYS_AVAIL_COUNT];
* Provides the mapping from VM_FREELIST_* to free list indices (flind).
*/
static int __read_mostly vm_freelist_to_flind[VM_NFREELIST];
static int __read_mostly vm_default_freepool;
CTASSERT(VM_FREELIST_DEFAULT == 0);
@ -184,6 +189,16 @@ static void vm_phys_create_seg(vm_paddr_t start, vm_paddr_t end);
static void vm_phys_split_pages(vm_page_t m, int oind, struct vm_freelist *fl,
int order, int tail);
static bool __diagused
vm_phys_pool_valid(int pool)
{
#ifdef VM_FREEPOOL_LAZYINIT
if (pool == VM_FREEPOOL_LAZYINIT)
return (false);
#endif
return (pool >= 0 && pool < VM_NFREEPOOL);
}
/*
* Red-black tree helpers for vm fictitious range management.
*/
@ -621,6 +636,12 @@ vm_phys_init(void)
}
}
#ifdef VM_FREEPOOL_LAZYINIT
vm_default_freepool = VM_FREEPOOL_LAZYINIT;
#else
vm_default_freepool = VM_FREEPOOL_DEFAULT;
#endif
rw_init(&vm_phys_fictitious_reg_lock, "vmfctr");
}
@ -687,6 +708,17 @@ vm_phys_enq_chunk(struct vm_freelist *fl, vm_page_t m, int order, int tail)
("%s: invalid order %d", __func__, order));
vm_freelist_add(fl, m, order, tail);
#ifdef VM_FREEPOOL_LAZYINIT
if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
vm_page_t m_next;
int npages;
npages = 1 << order;
m_next = m + npages;
vm_page_init_page(m_next, m->phys_addr + ptoa(npages), m->segind,
VM_FREEPOOL_LAZYINIT);
}
#endif
}
/*
@ -760,15 +792,33 @@ vm_phys_enq_range(vm_page_t m, u_int npages, struct vm_freelist *fl, int tail)
}
/*
* Set the pool for a contiguous, power of two-sized set of physical pages.
*
* If the pages currently belong to the lazy init pool, then the corresponding
* page structures must be initialized. In this case it is assumed that the
* first page in the run has already been initialized.
*/
static void
vm_phys_set_pool(int pool, vm_page_t m, int order)
{
-vm_page_t m_tmp;
#ifdef VM_FREEPOOL_LAZYINIT
if (__predict_false(m->pool == VM_FREEPOOL_LAZYINIT)) {
vm_paddr_t pa;
int segind;
-for (m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
-m_tmp->pool = pool;
m->pool = pool;
TSENTER();
pa = m->phys_addr + PAGE_SIZE;
segind = m->segind;
for (vm_page_t m_tmp = m + 1; m_tmp < &m[1 << order];
m_tmp++, pa += PAGE_SIZE)
vm_page_init_page(m_tmp, pa, segind, pool);
TSEXIT();
} else
#endif
for (vm_page_t m_tmp = m; m_tmp < &m[1 << order]; m_tmp++)
m_tmp->pool = pool;
}
/*
@ -792,7 +842,7 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
KASSERT(domain >= 0 && domain < vm_ndomains,
("vm_phys_alloc_npages: domain %d is out of range", domain));
-KASSERT(pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_npages: pool %d is out of range", pool));
KASSERT(npages <= 1 << (VM_NFREEORDER - 1),
("vm_phys_alloc_npages: npages %d is out of range", npages));
@ -821,7 +871,8 @@ vm_phys_alloc_npages(int domain, int pool, int npages, vm_page_t ma[])
}
}
for (oind = VM_NFREEORDER - 1; oind >= 0; oind--) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL;
pind++) {
alt = vm_phys_free_queues[domain][flind][pind];
while ((m = TAILQ_FIRST(&alt[oind].pl)) !=
NULL) {
@ -888,7 +939,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
KASSERT(freelist < VM_NFREELIST,
("vm_phys_alloc_freelist_pages: freelist %d is out of range",
freelist));
-KASSERT(pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(pool),
("vm_phys_alloc_freelist_pages: pool %d is out of range", pool));
KASSERT(order < VM_NFREEORDER,
("vm_phys_alloc_freelist_pages: order %d is out of range", order));
@ -917,7 +968,7 @@ vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order)
* use them to satisfy the allocation.
*/
for (oind = VM_NFREEORDER - 1; oind >= order; oind--) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
alt = &vm_phys_free_queues[domain][flind][pind][0];
m = TAILQ_FIRST(&alt[oind].pl);
if (m != NULL) {
@ -1157,7 +1208,7 @@ vm_phys_free_pages(vm_page_t m, int order)
KASSERT(m->order == VM_NFREEORDER,
("vm_phys_free_pages: page %p has unexpected order %d",
m, m->order));
-KASSERT(m->pool < VM_NFREEPOOL,
KASSERT(vm_phys_pool_valid(m->pool),
("vm_phys_free_pages: page %p has unexpected pool %d",
m, m->pool));
KASSERT(order < VM_NFREEORDER,
@ -1186,6 +1237,107 @@ vm_phys_free_pages(vm_page_t m, int order)
vm_freelist_add(fl, m, order, 1);
}
#ifdef VM_FREEPOOL_LAZYINIT
/*
* Initialize all pages lingering in the lazy init pool of a NUMA domain, moving
* them to the default pool. This is a prerequisite for some rare operations
* which need to scan the page array and thus depend on all pages being
* initialized.
*/
static void
vm_phys_lazy_init_domain(int domain, bool locked)
{
static bool initdone[MAXMEMDOM];
struct vm_domain *vmd;
struct vm_freelist *fl;
vm_page_t m;
int pind;
bool unlocked;
if (__predict_true(atomic_load_bool(&initdone[domain])))
return;
vmd = VM_DOMAIN(domain);
if (locked)
vm_domain_free_assert_locked(vmd);
else
vm_domain_free_lock(vmd);
if (atomic_load_bool(&initdone[domain]))
goto out;
pind = VM_FREEPOOL_LAZYINIT;
for (int freelist = 0; freelist < VM_NFREELIST; freelist++) {
int flind;
flind = vm_freelist_to_flind[freelist];
if (flind < 0)
continue;
fl = vm_phys_free_queues[domain][flind][pind];
for (int oind = 0; oind < VM_NFREEORDER; oind++) {
if (atomic_load_int(&fl[oind].lcnt) == 0)
continue;
while ((m = TAILQ_FIRST(&fl[oind].pl)) != NULL) {
/*
* Avoid holding the lock across the
* initialization unless there's a free page
* shortage.
*/
vm_freelist_rem(fl, m, oind);
unlocked = vm_domain_allocate(vmd,
VM_ALLOC_NORMAL, 1 << oind);
if (unlocked)
vm_domain_free_unlock(vmd);
vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, oind);
if (unlocked) {
vm_domain_freecnt_inc(vmd, 1 << oind);
vm_domain_free_lock(vmd);
}
vm_phys_free_pages(m, oind);
}
}
}
atomic_store_bool(&initdone[domain], true);
out:
if (!locked)
vm_domain_free_unlock(vmd);
}
static void
vm_phys_lazy_init(void)
{
for (int domain = 0; domain < vm_ndomains; domain++)
vm_phys_lazy_init_domain(domain, false);
atomic_store_int(&vm_default_freepool, VM_FREEPOOL_DEFAULT);
}
static void
vm_phys_lazy_init_kthr(void *arg __unused)
{
vm_phys_lazy_init();
kthread_exit();
}
static void
vm_phys_lazy_sysinit(void *arg __unused)
{
struct thread *td;
int error;
error = kthread_add(vm_phys_lazy_init_kthr, NULL, curproc, &td,
RFSTOPPED, 0, "vmlazyinit");
if (error == 0) {
thread_lock(td);
sched_prio(td, PRI_MIN_IDLE);
sched_add(td, SRQ_BORING);
} else {
printf("%s: could not create lazy init thread: %d\n",
__func__, error);
vm_phys_lazy_init();
}
}
SYSINIT(vm_phys_lazy_init, SI_SUB_SMP, SI_ORDER_ANY, vm_phys_lazy_sysinit,
NULL);
#endif /* VM_FREEPOOL_LAZYINIT */
/*
* Free a contiguous, arbitrarily sized set of physical pages, without
* merging across set boundaries.
@ -1291,6 +1443,12 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
pa_end = MIN(high, seg->end);
if (pa_end - pa_start < ptoa(npages))
continue;
#ifdef VM_FREEPOOL_LAZYINIT
/*
* The pages on the free lists must be initialized.
*/
vm_phys_lazy_init_domain(domain, false);
#endif
bounds[0] = vm_phys_seg_paddr_to_vm_page(seg, pa_start);
bounds[1] = vm_phys_seg_paddr_to_vm_page(seg, pa_end);
return (seg - vm_phys_segs);
@ -1306,21 +1464,30 @@ vm_phys_find_range(vm_page_t bounds[], int segind, int domain,
* The free page queues must be locked.
*/
bool
-vm_phys_unfree_page(vm_page_t m)
vm_phys_unfree_page(vm_paddr_t pa)
{
struct vm_freelist *fl;
struct vm_phys_seg *seg;
-vm_paddr_t pa, pa_half;
-vm_page_t m_set, m_tmp;
vm_paddr_t pa_half;
vm_page_t m, m_set, m_tmp;
int order;
seg = vm_phys_paddr_to_seg(pa);
vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
/*
* The pages on the free lists must be initialized.
*/
#ifdef VM_FREEPOOL_LAZYINIT
vm_phys_lazy_init_domain(seg->domain, true);
#endif
/*
* First, find the contiguous, power of two-sized set of free
* physical pages containing the given physical page "m" and
* assign it to "m_set".
*/
-seg = &vm_phys_segs[m->segind];
-vm_domain_free_assert_locked(VM_DOMAIN(seg->domain));
m = vm_phys_paddr_to_vm_page(pa);
for (m_set = m, order = 0; m_set->order == VM_NFREEORDER &&
order < VM_NFREEORDER - 1; ) {
order++;
@ -1459,7 +1626,7 @@ vm_phys_find_queues_contig(
/* Search for a large enough free block. */
size = npages << PAGE_SHIFT;
for (oind = order; oind < VM_NFREEORDER; oind++) {
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
TAILQ_FOREACH(m_ret, &fl[oind].pl, listq) {
/*
@ -1479,7 +1646,7 @@ vm_phys_find_queues_contig(
if (order < VM_NFREEORDER)
return (NULL);
/* Search for a long-enough sequence of max-order blocks. */
-for (pind = 0; pind < VM_NFREEPOOL; pind++) {
for (pind = vm_default_freepool; pind < VM_NFREEPOOL; pind++) {
fl = (*queues)[pind];
m_ret = vm_phys_find_freelist_contig(fl, npages,
low, high, alignment, boundary);


@ -80,7 +80,7 @@ vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa);
vm_page_t vm_phys_seg_paddr_to_vm_page(struct vm_phys_seg *seg, vm_paddr_t pa);
void vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
int *locality);
-bool vm_phys_unfree_page(vm_page_t m);
bool vm_phys_unfree_page(vm_paddr_t pa);
int vm_phys_mem_affinity(int f, int t);
void vm_phys_early_add_seg(vm_paddr_t start, vm_paddr_t end);
vm_paddr_t vm_phys_early_alloc(int domain, size_t alloc_size);