Implement NUMA support in uma(9) and malloc(9).  Allocations from specific
domains can be requested through the new _domain() API variants.  UMA also
supports a best-effort first-touch policy via the UMA_ZONE_NUMA zone flag.

The slab layer is now segregated by VM domain and tracks precisely which
domain backs each slab; it also handles round-robin iteration over domains
directly.  The per-CPU cache layer remains a mix of domains, according to
where memory is allocated and freed.  Well-behaved clients can achieve
perfect locality with no performance penalty.

The direct domain allocation functions must visit the slab layer and therefore
require per-zone locks, which come at some expense.
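
For kernel consumers the new malloc(9) entry points look roughly as follows.
This is a minimal sketch, not part of the commit; the alloc_scratch() and
free_scratch() helpers and the choice of M_TEMP are illustrative assumptions.

#include <sys/param.h>
#include <sys/malloc.h>

/* Hypothetical helper: allocate a zeroed scratch page backed by 'domain'. */
static void *
alloc_scratch(int domain)
{
	/*
	 * With M_WAITOK the underlying uma_zalloc_domain() path allocates
	 * from the requested domain (see the uma.h comment below).
	 */
	return (malloc_domain(PAGE_SIZE, M_TEMP, domain, M_WAITOK | M_ZERO));
}

/* Hypothetical helper: release memory obtained from alloc_scratch(). */
static void
free_scratch(void *p)
{
	free_domain(p, M_TEMP);
}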

Reviewed by:	Attilio (a slightly older version)
Tested by:	pho
Sponsored by:	Netflix, Dell/EMC Isilon
Author:  Jeff Roberson
Date:    2018-01-12 23:25:05 +00:00
Parent:  7a469c8ef3
Commit:  ab3185d15e
Notes:   svn2git 2020-12-20 02:59:44 +00:00
         svn path=/head/; revision=327900
19 changed files with 729 additions and 269 deletions


@ -55,6 +55,8 @@ static struct nlist namelist[] = {
{ .n_name = "_mp_maxid" },
#define X_ALL_CPUS 2
{ .n_name = "_all_cpus" },
#define X_VM_NDOMAINS 3
{ .n_name = "_vm_ndomains" },
{ .n_name = "" },
};
@ -297,11 +299,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
{
LIST_HEAD(, uma_keg) uma_kegs;
struct memory_type *mtp;
struct uma_zone_domain uzd;
struct uma_bucket *ubp, ub;
struct uma_cache *ucp, *ucp_array;
struct uma_zone *uzp, uz;
struct uma_keg *kzp, kz;
int hint_dontsearch, i, mp_maxid, ret;
int hint_dontsearch, i, mp_maxid, ndomains, ret;
char name[MEMTYPE_MAXNAME];
cpuset_t all_cpus;
long cpusetsize;
@ -323,6 +326,12 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
list->mtl_error = ret;
return (-1);
}
ret = kread_symbol(kvm, X_VM_NDOMAINS, &ndomains,
sizeof(ndomains), 0);
if (ret != 0) {
list->mtl_error = ret;
return (-1);
}
ret = kread_symbol(kvm, X_UMA_KEGS, &uma_kegs, sizeof(uma_kegs), 0);
if (ret != 0) {
list->mtl_error = ret;
@ -447,10 +456,17 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
kz.uk_ipers;
mtp->mt_byteslimit = mtp->mt_countlimit * mtp->mt_size;
mtp->mt_count = mtp->mt_numallocs - mtp->mt_numfrees;
for (ubp = LIST_FIRST(&uz.uz_buckets); ubp !=
NULL; ubp = LIST_NEXT(&ub, ub_link)) {
ret = kread(kvm, ubp, &ub, sizeof(ub), 0);
mtp->mt_zonefree += ub.ub_cnt;
for (i = 0; i < ndomains; i++) {
ret = kread(kvm, &uz.uz_domain[i], &uzd,
sizeof(uzd), 0);
for (ubp =
LIST_FIRST(&uzd.uzd_buckets);
ubp != NULL;
ubp = LIST_NEXT(&ub, ub_link)) {
ret = kread(kvm, ubp, &ub,
sizeof(ub), 0);
mtp->mt_zonefree += ub.ub_cnt;
}
}
if (!((kz.uk_flags & UMA_ZONE_SECONDARY) &&
LIST_FIRST(&kz.uk_zones) != uzp)) {


@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
vm_page_t m;
vm_paddr_t pa;
void *va;
*flags = UMA_SLAB_PRIV;
m = vm_page_alloc(NULL, 0,
m = vm_page_alloc_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (m == NULL)
return (NULL);


@ -42,14 +42,15 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
vm_page_t m;
vm_paddr_t pa;
void *va;
*flags = UMA_SLAB_PRIV;
m = vm_page_alloc(NULL, 0,
m = vm_page_alloc_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
if (m == NULL)
return (NULL);


@ -338,8 +338,8 @@ static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
static void pmap_pte_release(pt_entry_t *pte);
static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
#if defined(PAE) || defined(PAE_TABLES)
static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
int wait);
static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *flags, int wait);
#endif
static void pmap_set_pg(void);
@ -697,12 +697,13 @@ pmap_page_init(vm_page_t m)
#if defined(PAE) || defined(PAE_TABLES)
static void *
pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
int wait)
{
/* Inform UMA that this allocator uses kernel_map/object. */
*flags = UMA_SLAB_KERNEL;
return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL,
return ((void *)kmem_alloc_contig_domain(domain, bytes, wait, 0x0ULL,
0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT));
}
#endif


@ -96,6 +96,11 @@ __FBSDID("$FreeBSD$");
dtrace_malloc_probe_func_t dtrace_malloc_probe;
#endif
#if defined(INVARIANTS) || defined(MALLOC_MAKE_FAILURES) || \
defined(DEBUG_MEMGUARD) || defined(DEBUG_REDZONE)
#define MALLOC_DEBUG 1
#endif
/*
* When realloc() is called, if the new size is sufficiently smaller than
* the old size, realloc() will allocate a new, smaller block to avoid
@ -417,6 +422,20 @@ contigmalloc(unsigned long size, struct malloc_type *type, int flags,
return (ret);
}
void *
contigmalloc_domain(unsigned long size, struct malloc_type *type,
int domain, int flags, vm_paddr_t low, vm_paddr_t high,
unsigned long alignment, vm_paddr_t boundary)
{
void *ret;
ret = (void *)kmem_alloc_contig_domain(domain, size, flags, low, high,
alignment, boundary, VM_MEMATTR_DEFAULT);
if (ret != NULL)
malloc_type_allocated(type, round_page(size));
return (ret);
}
/*
* contigfree:
*
@ -432,26 +451,14 @@ contigfree(void *addr, unsigned long size, struct malloc_type *type)
malloc_type_freed(type, round_page(size));
}
/*
* malloc:
*
* Allocate a block of memory.
*
* If M_NOWAIT is set, this routine will not block and return NULL if
* the allocation fails.
*/
void *
malloc(unsigned long size, struct malloc_type *mtp, int flags)
#ifdef MALLOC_DEBUG
static int
malloc_dbg(caddr_t *vap, unsigned long *sizep, struct malloc_type *mtp,
int flags)
{
int indx;
struct malloc_type_internal *mtip;
caddr_t va;
uma_zone_t zone;
#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
unsigned long osize = size;
#endif
#ifdef INVARIANTS
int indx;
KASSERT(mtp->ks_magic == M_MAGIC, ("malloc: bad malloc type magic"));
/*
* Check that exactly one of M_WAITOK or M_NOWAIT is specified.
@ -474,7 +481,8 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags)
if ((malloc_nowait_count % malloc_failure_rate) == 0) {
atomic_add_int(&malloc_failure_count, 1);
t_malloc_fail = time_uptime;
return (NULL);
*vap = NULL;
return (EJUSTRETURN);
}
}
#endif
@ -485,16 +493,44 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags)
("malloc: called with spinlock or critical section held"));
#ifdef DEBUG_MEMGUARD
if (memguard_cmp_mtp(mtp, size)) {
va = memguard_alloc(size, flags);
if (va != NULL)
return (va);
if (memguard_cmp_mtp(mtp, *sizep)) {
*vap = memguard_alloc(*sizep, flags);
if (*vap != NULL)
return (EJUSTRETURN);
/* This is unfortunate but should not be fatal. */
}
#endif
#ifdef DEBUG_REDZONE
size = redzone_size_ntor(size);
*sizep = redzone_size_ntor(*sizep);
#endif
return (0);
}
#endif
/*
* malloc:
*
* Allocate a block of memory.
*
* If M_NOWAIT is set, this routine will not block and return NULL if
* the allocation fails.
*/
void *
malloc(unsigned long size, struct malloc_type *mtp, int flags)
{
int indx;
struct malloc_type_internal *mtip;
caddr_t va;
uma_zone_t zone;
#if defined(DEBUG_REDZONE)
unsigned long osize = size;
#endif
#ifdef MALLOC_DEBUG
if (malloc_dbg(&va, &size, mtp, flags) != 0)
return (va);
#endif
if (size <= kmem_zmax) {
@ -523,11 +559,55 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags)
KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
else if (va == NULL)
t_malloc_fail = time_uptime;
#ifdef DIAGNOSTIC
if (va != NULL && !(flags & M_ZERO)) {
memset(va, 0x70, osize);
}
#endif
#ifdef DEBUG_REDZONE
if (va != NULL)
va = redzone_setup(va, osize);
#endif
return ((void *) va);
}
void *
malloc_domain(unsigned long size, struct malloc_type *mtp, int domain,
int flags)
{
int indx;
struct malloc_type_internal *mtip;
caddr_t va;
uma_zone_t zone;
#if defined(DEBUG_REDZONE)
unsigned long osize = size;
#endif
#ifdef MALLOC_DEBUG
if (malloc_dbg(&va, &size, mtp, flags) != 0)
return (va);
#endif
if (size <= kmem_zmax) {
mtip = mtp->ks_handle;
if (size & KMEM_ZMASK)
size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
indx = kmemsize[size >> KMEM_ZSHIFT];
KASSERT(mtip->mti_zone < numzones,
("mti_zone %u out of range %d",
mtip->mti_zone, numzones));
zone = kmemzones[indx].kz_zone[mtip->mti_zone];
#ifdef MALLOC_PROFILE
krequests[size >> KMEM_ZSHIFT]++;
#endif
va = uma_zalloc_domain(zone, NULL, domain, flags);
if (va != NULL)
size = zone->uz_size;
malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx);
} else {
size = roundup(size, PAGE_SIZE);
zone = NULL;
va = uma_large_malloc_domain(size, domain, flags);
malloc_type_allocated(mtp, va == NULL ? 0 : size);
}
if (flags & M_WAITOK)
KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
else if (va == NULL)
t_malloc_fail = time_uptime;
#ifdef DEBUG_REDZONE
if (va != NULL)
va = redzone_setup(va, osize);
@ -545,6 +625,58 @@ mallocarray(size_t nmemb, size_t size, struct malloc_type *type, int flags)
return (malloc(size * nmemb, type, flags));
}
#ifdef INVARIANTS
static void
free_save_type(void *addr, struct malloc_type *mtp, u_long size)
{
struct malloc_type **mtpp = addr;
/*
* Cache a pointer to the malloc_type that most recently freed
* this memory here. This way we know who is most likely to
* have stepped on it later.
*
* This code assumes that size is a multiple of 8 bytes for
* 64 bit machines
*/
mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR);
mtpp += (size - sizeof(struct malloc_type *)) /
sizeof(struct malloc_type *);
*mtpp = mtp;
}
#endif
#ifdef MALLOC_DEBUG
static int
free_dbg(void **addrp, struct malloc_type *mtp)
{
void *addr;
addr = *addrp;
KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
("free: called with spinlock or critical section held"));
/* free(NULL, ...) does nothing */
if (addr == NULL)
return (EJUSTRETURN);
#ifdef DEBUG_MEMGUARD
if (is_memguard_addr(addr)) {
memguard_free(addr);
return (EJUSTRETURN);
}
#endif
#ifdef DEBUG_REDZONE
redzone_check(addr);
*addrp = redzone_addr_ntor(addr);
#endif
return (0);
}
#endif
/*
* free:
*
@ -558,51 +690,23 @@ free(void *addr, struct malloc_type *mtp)
uma_slab_t slab;
u_long size;
KASSERT(mtp->ks_magic == M_MAGIC, ("free: bad malloc type magic"));
KASSERT(curthread->td_critnest == 0 || SCHEDULER_STOPPED(),
("free: called with spinlock or critical section held"));
#ifdef MALLOC_DEBUG
if (free_dbg(&addr, mtp) != 0)
return;
#endif
/* free(NULL, ...) does nothing */
if (addr == NULL)
return;
#ifdef DEBUG_MEMGUARD
if (is_memguard_addr(addr)) {
memguard_free(addr);
return;
}
#endif
#ifdef DEBUG_REDZONE
redzone_check(addr);
addr = redzone_addr_ntor(addr);
#endif
slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
if (slab == NULL)
panic("free: address %p(%p) has not been allocated.\n",
addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
#ifdef INVARIANTS
struct malloc_type **mtpp = addr;
#endif
size = slab->us_keg->uk_size;
#ifdef INVARIANTS
/*
* Cache a pointer to the malloc_type that most recently freed
* this memory here. This way we know who is most likely to
* have stepped on it later.
*
* This code assumes that size is a multiple of 8 bytes for
* 64 bit machines
*/
mtpp = (struct malloc_type **)
((unsigned long)mtpp & ~UMA_ALIGN_PTR);
mtpp += (size - sizeof(struct malloc_type *)) /
sizeof(struct malloc_type *);
*mtpp = mtp;
free_save_type(addr, mtp, size);
#endif
uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab);
} else {
@ -612,6 +716,40 @@ free(void *addr, struct malloc_type *mtp)
malloc_type_freed(mtp, size);
}
void
free_domain(void *addr, struct malloc_type *mtp)
{
uma_slab_t slab;
u_long size;
#ifdef MALLOC_DEBUG
if (free_dbg(&addr, mtp) != 0)
return;
#endif
/* free(NULL, ...) does nothing */
if (addr == NULL)
return;
slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
if (slab == NULL)
panic("free_domain: address %p(%p) has not been allocated.\n",
addr, (void *)((u_long)addr & (~UMA_SLAB_MASK)));
if (!(slab->us_flags & UMA_SLAB_MALLOC)) {
size = slab->us_keg->uk_size;
#ifdef INVARIANTS
free_save_type(addr, mtp, size);
#endif
uma_zfree_domain(LIST_FIRST(&slab->us_keg->uk_zones),
addr, slab);
} else {
size = slab->us_size;
uma_large_free(slab);
}
malloc_type_freed(mtp, size);
}
/*
* realloc: change the size of a memory block
*/
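
The new contigmalloc_domain() mirrors contigmalloc() with an added domain
argument.  A hedged usage sketch (the size, domain number, address range, and
alloc_dma_buffer() wrapper are assumptions for illustration):

#include <sys/param.h>
#include <sys/malloc.h>

/*
 * Hypothetically carve out 64 KB of physically contiguous, page-aligned
 * memory from domain 1; released later with contigfree(buf, 65536, M_DEVBUF).
 */
static void *
alloc_dma_buffer(void)
{
	return (contigmalloc_domain(65536, M_DEVBUF, 1, M_WAITOK,
	    0, ~(vm_paddr_t)0, PAGE_SIZE, 0));
}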


@ -283,7 +283,7 @@ static void mb_dtor_pack(void *, int, void *);
static int mb_zinit_pack(void *, int, int);
static void mb_zfini_pack(void *, int);
static void mb_reclaim(uma_zone_t, int);
static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, uint8_t *, int);
static void *mbuf_jumbo_alloc(uma_zone_t, vm_size_t, int, uint8_t *, int);
/* Ensure that MSIZE is a power of 2. */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
@ -386,12 +386,13 @@ SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL);
* pages.
*/
static void *
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait)
mbuf_jumbo_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
int wait)
{
/* Inform UMA that this allocator uses kernel_map/object. */
*flags = UMA_SLAB_KERNEL;
return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait,
return ((void *)kmem_alloc_contig_domain(domain, bytes, wait,
(vm_paddr_t)0, ~(vm_paddr_t)0, 1, 0, VM_MEMATTR_DEFAULT));
}


@ -149,7 +149,7 @@ busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
}
void *
busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size, int domain,
uint8_t *pflag, int wait)
{
#ifdef VM_MEMATTR_UNCACHEABLE
@ -157,7 +157,7 @@ busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
/* Inform UMA that this allocator uses kernel_arena/object. */
*pflag = UMA_SLAB_KERNEL;
return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0,
return ((void *)kmem_alloc_attr_domain(domain, size, wait, 0,
BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
#else


@ -500,7 +500,7 @@ bt_insfree(vmem_t *vm, bt_t *bt)
* Import from the arena into the quantum cache in UMA.
*/
static int
qc_import(void *arg, void **store, int cnt, int flags)
qc_import(void *arg, void **store, int cnt, int domain, int flags)
{
qcache_t *qc;
vmem_addr_t addr;
@ -614,13 +614,12 @@ static struct mtx_padalign __exclusive_cache_line vmem_bt_lock;
* we are really out of KVA.
*/
static void *
vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag, int wait)
vmem_bt_alloc(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *pflag,
int wait)
{
vmem_addr_t addr;
int domain;
*pflag = UMA_SLAB_KERNEL;
domain = 0; /* XXX Temporary. */
/*
* Single thread boundary tag allocation so that the address space
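
qc_import() above is one consumer of the revised uma_import callback, which
now receives the preferred domain along with the count and flags.  A minimal
sketch of such a callback for a hypothetical cache zone (my_import(),
MY_ITEM_SIZE, and backing the items with malloc_domain() are assumptions, not
code from this commit):

#include <sys/param.h>
#include <sys/malloc.h>

#define	MY_ITEM_SIZE	128		/* hypothetical item size */

/* Fill 'store' with up to 'cnt' items backed by domain-local memory. */
static int
my_import(void *arg __unused, void **store, int cnt, int domain, int flags)
{
	int i;

	for (i = 0; i < cnt; i++) {
		store[i] = malloc_domain(MY_ITEM_SIZE, M_TEMP, domain, flags);
		if (store[i] == NULL)
			break;
	}
	return (i);		/* number of items actually imported */
}

Such a callback would be registered through uma_zcache_create() together with
a matching release function.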


@ -133,7 +133,7 @@ static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
static void bufkva_reclaim(vmem_t *, int);
static void bufkva_free(struct buf *);
static int buf_import(void *, void **, int, int);
static int buf_import(void *, void **, int, int, int);
static void buf_release(void *, void **, int);
static void maxbcachebuf_adjust(void);
@ -1419,7 +1419,7 @@ buf_free(struct buf *bp)
* only as a per-cpu cache of bufs still maintained on a global list.
*/
static int
buf_import(void *arg, void **store, int cnt, int flags)
buf_import(void *arg, void **store, int cnt, int domain, int flags)
{
struct buf *bp;
int i;


@ -44,7 +44,8 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
vm_paddr_t pa;
vm_page_t m;
@ -59,7 +60,8 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
#endif
for (;;) {
m = vm_page_alloc_freelist(VM_FREELIST_DIRECT, pflags);
m = vm_page_alloc_freelist_domain(domain, VM_FREELIST_DIRECT,
pflags);
#ifndef __mips_n64
if (m == NULL && vm_page_reclaim_contig(pflags, 1,
0, MIPS_KSEG0_LARGEST_PHYS, PAGE_SIZE, 0))


@ -1504,8 +1504,8 @@ moea64_extract_and_hold(mmu_t mmu, pmap_t pmap, vm_offset_t va, vm_prot_t prot)
static mmu_t installed_mmu;
static void *
moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
int wait)
moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *flags, int wait)
{
struct pvo_entry *pvo;
vm_offset_t va;
@ -1522,7 +1522,7 @@ moea64_uma_page_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *flags,
*flags = UMA_SLAB_PRIV;
needed_lock = !PMAP_LOCKED(kernel_pmap);
m = vm_page_alloc(NULL, 0,
m = vm_page_alloc_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
if (m == NULL)
return (NULL);


@ -480,7 +480,8 @@ slb_insert_user(pmap_t pm, struct slb *slb)
}
static void *
slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
u_int8_t *flags, int wait)
{
static vm_offset_t realmax = 0;
void *va;
@ -490,7 +491,7 @@ slb_uma_real_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
realmax = platform_real_maxaddr();
*flags = UMA_SLAB_PRIV;
m = vm_page_alloc_contig(NULL, 0,
m = vm_page_alloc_contig_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED,
1, 0, realmax, PAGE_SIZE, PAGE_SIZE, VM_MEMATTR_DEFAULT);
if (m == NULL)


@ -51,7 +51,8 @@ SYSCTL_INT(_hw, OID_AUTO, uma_mdpages, CTLFLAG_RD, &hw_uma_mdpages, 0,
"UMA MD pages in use");
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
void *va;
vm_paddr_t pa;
@ -59,7 +60,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
*flags = UMA_SLAB_PRIV;
m = vm_page_alloc(NULL, 0,
m = vm_page_alloc_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
if (m == NULL)
return (NULL);


@ -41,7 +41,8 @@ __FBSDID("$FreeBSD$");
#include <machine/vmparam.h>
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
panic("uma_small_alloc");


@ -392,7 +392,8 @@ swi_vm(void *v)
}
void *
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain, u_int8_t *flags,
int wait)
{
vm_paddr_t pa;
vm_page_t m;
@ -402,7 +403,7 @@ uma_small_alloc(uma_zone_t zone, vm_size_t bytes, u_int8_t *flags, int wait)
*flags = UMA_SLAB_PRIV;
m = vm_page_alloc(NULL, 0,
m = vm_page_alloc_domain(NULL, 0, domain,
malloc2vm_flags(wait) | VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
if (m == NULL)
return (NULL);


@ -175,9 +175,17 @@ void *contigmalloc(unsigned long size, struct malloc_type *type, int flags,
vm_paddr_t low, vm_paddr_t high, unsigned long alignment,
vm_paddr_t boundary) __malloc_like __result_use_check
__alloc_size(1) __alloc_align(6);
void *contigmalloc_domain(unsigned long size, struct malloc_type *type,
int domain, int flags, vm_paddr_t low, vm_paddr_t high,
unsigned long alignment, vm_paddr_t boundary)
__malloc_like __result_use_check __alloc_size(1) __alloc_align(6);
void free(void *addr, struct malloc_type *type);
void free_domain(void *addr, struct malloc_type *type);
void *malloc(unsigned long size, struct malloc_type *type, int flags)
__malloc_like __result_use_check __alloc_size(1);
void *malloc_domain(unsigned long size, struct malloc_type *type,
int domain, int flags)
__malloc_like __result_use_check __alloc_size(1);
void *mallocarray(size_t nmemb, size_t size, struct malloc_type *type,
int flags) __malloc_like __result_use_check
__alloc_size(1) __alloc_size(2);


@ -128,7 +128,8 @@ typedef void (*uma_fini)(void *mem, int size);
/*
* Import new memory into a cache zone.
*/
typedef int (*uma_import)(void *arg, void **store, int count, int flags);
typedef int (*uma_import)(void *arg, void **store, int count, int domain,
int flags);
/*
* Free memory from a cache zone.
@ -281,6 +282,10 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
* Allocates mp_maxid + 1 slabs sized to
* sizeof(struct pcpu).
*/
#define UMA_ZONE_NUMA 0x10000 /*
* NUMA aware Zone. Implements a best
* effort first-touch policy.
*/
/*
* These flags are shared between the keg and zone. In zones wishing to add
@ -325,6 +330,19 @@ void uma_zdestroy(uma_zone_t zone);
void *uma_zalloc_arg(uma_zone_t zone, void *arg, int flags);
/*
* Allocate an item from a specific NUMA domain. This uses a slow path in
* the allocator but is guaranteed to allocate memory from the requested
* domain if M_WAITOK is set.
*
* Arguments:
* zone The zone we are allocating from
* arg This data is passed to the ctor function
* domain The domain to allocate from.
* flags See sys/malloc.h for available flags.
*/
void *uma_zalloc_domain(uma_zone_t zone, void *arg, int domain, int flags);
/*
* Allocates an item out of a zone without supplying an argument
*
@ -353,6 +371,16 @@ uma_zalloc(uma_zone_t zone, int flags)
void uma_zfree_arg(uma_zone_t zone, void *item, void *arg);
/*
* Frees an item back to the specified zone's domain specific pool.
*
* Arguments:
* zone The zone the item was originally allocated out of.
* item The memory to be freed.
* arg Argument passed to the destructor
*/
void uma_zfree_domain(uma_zone_t zone, void *item, void *arg);
/*
* Frees an item back to a zone without supplying an argument
*
@ -372,11 +400,6 @@ uma_zfree(uma_zone_t zone, void *item)
*/
void uma_zwait(uma_zone_t zone);
/*
* XXX The rest of the prototypes in this header are h0h0 magic for the VM.
* If you think you need to use it for a normal zone you're probably incorrect.
*/
/*
* Backend page supplier routines
*
@ -384,14 +407,15 @@ void uma_zwait(uma_zone_t zone);
* zone The zone that is requesting pages.
* size The number of bytes being requested.
* pflag Flags for these memory pages, see below.
* domain The NUMA domain that we prefer for this allocation.
* wait Indicates our willingness to block.
*
* Returns:
* A pointer to the allocated memory or NULL on failure.
*/
typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
int wait);
typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, int domain,
uint8_t *pflag, int wait);
/*
* Backend page free routines
@ -406,8 +430,6 @@ typedef void *(*uma_alloc)(uma_zone_t zone, vm_size_t size, uint8_t *pflag,
*/
typedef void (*uma_free)(void *item, vm_size_t size, uint8_t pflag);
/*
* Sets up the uma allocator. (Called by vm_mem_init)
*
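
Taken together, the additions documented above (UMA_ZONE_NUMA,
uma_zalloc_domain(), uma_zfree_domain()) can be exercised roughly as follows.
This is a hedged sketch, not code from the commit; the zone name, item size,
and my_zone_example() wrapper are made up.

#include <sys/param.h>
#include <vm/uma.h>

static uma_zone_t my_zone;		/* hypothetical zone */

static void
my_zone_example(void)
{
	void *item;

	/*
	 * First-touch zone: allocations are preferentially backed by the
	 * allocating CPU's domain (best effort, per UMA_ZONE_NUMA above).
	 */
	my_zone = uma_zcreate("my_items", 256, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, UMA_ZONE_NUMA);

	/*
	 * Explicitly request an item backed by domain 1; this takes the
	 * slower slab path but honors the domain when M_WAITOK is set.
	 */
	item = uma_zalloc_domain(my_zone, NULL, 1, M_WAITOK);

	/* Return the item to the zone's per-domain pool. */
	uma_zfree_domain(my_zone, item, NULL);
}

Backend page allocators (the uma_alloc hooks converted throughout this diff)
likewise receive the preferred domain and can pass it straight to
kmem_alloc_contig_domain() and related routines, as the mbuf and busdma
changes above show.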

File diff suppressed because it is too large.


@ -39,7 +39,22 @@
*/
/*
* Here's a quick description of the relationship between the objects:
* The brief summary: Zones describe unique allocation types. Zones are
* organized into per-CPU caches which are filled by buckets. Buckets are
* organized according to memory domains. Buckets are filled from kegs which
* are also organized according to memory domains. Kegs describe a unique
* allocation type, backend memory provider, and layout. Kegs are associated
* with one or more zones and zones reference one or more kegs. Kegs provide
* slabs which are virtually contiguous collections of pages. Each slab is
* broken down into one or more items that will satisfy an individual allocation.
*
* Allocation is satisfied in the following order:
* 1) Per-CPU cache
* 2) Per-domain cache of buckets
* 3) Slab from any of N kegs
* 4) Backend page provider
*
* More detail on individual objects is contained below:
*
* Kegs contain lists of slabs which are stored in either the full bin, empty
* bin, or partially allocated bin, to reduce fragmentation. They also contain
@ -47,6 +62,13 @@
* and rsize is the result of that. The Keg also stores information for
* managing a hash of page addresses that maps pages to uma_slab_t structures
* for pages that don't have embedded uma_slab_t's.
*
* Keg slab lists are organized by memory domain to support NUMA allocation
* policies. By default allocations are spread across domains to reduce the
* potential for hotspots. Special keg creation flags may be specified to
* prefer local allocation. However, there is no strict enforcement, as frees
* may happen on any CPU and these are returned to the CPU-local cache
* regardless of the originating domain.
*
* The uma_slab_t may be embedded in a UMA_SLAB_SIZE chunk of memory or it may
* be allocated off the page from a special slab zone. The free list within a
@ -181,6 +203,17 @@ struct uma_cache {
typedef struct uma_cache * uma_cache_t;
/*
* Per-domain memory list. Embedded in the kegs.
*/
struct uma_domain {
LIST_HEAD(,uma_slab) ud_part_slab; /* partially allocated slabs */
LIST_HEAD(,uma_slab) ud_free_slab; /* empty slab list */
LIST_HEAD(,uma_slab) ud_full_slab; /* full slabs */
};
typedef struct uma_domain * uma_domain_t;
/*
* Keg management structure
*
@ -192,10 +225,8 @@ struct uma_keg {
struct uma_hash uk_hash;
LIST_HEAD(,uma_zone) uk_zones; /* Keg's zones */
LIST_HEAD(,uma_slab) uk_part_slab; /* partially allocated slabs */
LIST_HEAD(,uma_slab) uk_free_slab; /* empty slab list */
LIST_HEAD(,uma_slab) uk_full_slab; /* full slabs */
uint32_t uk_cursor; /* Domain alloc cursor. */
uint32_t uk_align; /* Alignment mask */
uint32_t uk_pages; /* Total page count */
uint32_t uk_free; /* Count of items free in slabs */
@ -221,6 +252,9 @@ struct uma_keg {
/* Least used fields go to the last cache line. */
const char *uk_name; /* Name of creating zone. */
LIST_ENTRY(uma_keg) uk_link; /* List of all kegs */
/* Must be last, variable sized. */
struct uma_domain uk_domain[]; /* Keg's slab lists. */
};
typedef struct uma_keg * uma_keg_t;
@ -248,14 +282,18 @@ struct uma_slab {
#endif
uint16_t us_freecount; /* How many are free? */
uint8_t us_flags; /* Page flags see uma.h */
uint8_t us_pad; /* Pad to 32bits, unused. */
uint8_t us_domain; /* Backing NUMA domain. */
};
#define us_link us_type._us_link
#define us_size us_type._us_size
#if MAXMEMDOM >= 255
#error "Slab domain type insufficient"
#endif
typedef struct uma_slab * uma_slab_t;
typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int);
typedef uma_slab_t (*uma_slaballoc)(uma_zone_t, uma_keg_t, int, int);
struct uma_klink {
LIST_ENTRY(uma_klink) kl_link;
@ -263,6 +301,12 @@ struct uma_klink {
};
typedef struct uma_klink *uma_klink_t;
struct uma_zone_domain {
LIST_HEAD(,uma_bucket) uzd_buckets; /* full buckets */
};
typedef struct uma_zone_domain * uma_zone_domain_t;
/*
* Zone management structure
*
@ -275,7 +319,7 @@ struct uma_zone {
const char *uz_name; /* Text name of the zone */
LIST_ENTRY(uma_zone) uz_link; /* List of all zones in keg */
LIST_HEAD(,uma_bucket) uz_buckets; /* full buckets */
struct uma_zone_domain *uz_domain; /* per-domain buckets */
LIST_HEAD(,uma_klink) uz_kegs; /* List of kegs. */
struct uma_klink uz_klink; /* klink for first keg. */
@ -309,7 +353,9 @@ struct uma_zone {
* This HAS to be the last item because we adjust the zone size
* based on NCPU and then allocate the space for the zones.
*/
struct uma_cache uz_cpu[1]; /* Per cpu caches */
struct uma_cache uz_cpu[]; /* Per cpu caches */
/* uz_domain follows here. */
};
/*
@ -340,6 +386,7 @@ zone_first_keg(uma_zone_t zone)
/* Internal prototypes */
static __inline uma_slab_t hash_sfind(struct uma_hash *hash, uint8_t *data);
void *uma_large_malloc(vm_size_t size, int wait);
void *uma_large_malloc_domain(vm_size_t size, int domain, int wait);
void uma_large_free(uma_slab_t slab);
/* Lock Macros */
@ -422,8 +469,8 @@ vsetslab(vm_offset_t va, uma_slab_t slab)
* if they can provide more efficient allocation functions. This is useful
* for using direct mapped addresses.
*/
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, uint8_t *pflag,
int wait);
void *uma_small_alloc(uma_zone_t zone, vm_size_t bytes, int domain,
uint8_t *pflag, int wait);
void uma_small_free(void *mem, vm_size_t size, uint8_t flags);
/* Set a global soft limit on UMA managed memory. */
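
The keg now carries per-domain slab lists (uk_domain[]) and a uk_cursor used
to spread allocations across domains, as described in the comment block at
the top of this header.  A rough sketch of how such a cursor could be
advanced (a hypothetical helper under stated assumptions, not the committed
slab-allocation code):

#include <sys/param.h>
#include <vm/uma.h>
#include <vm/uma_int.h>

/*
 * Pick the next domain to try for a keg that spreads allocations
 * round-robin.  Assumes the keg lock is held and that vm_ndomains is
 * the system-wide domain count.
 */
static int
keg_next_domain(uma_keg_t keg)
{
	int domain;

	domain = keg->uk_cursor;
	keg->uk_cursor = (keg->uk_cursor + 1) % vm_ndomains;
	return (domain);
}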