Add two new kernel options to control memory locality on NUMA hardware.

- UMA_XDOMAIN enables an additional per-cpu bucket for freed memory that
   was freed on a different domain from where it was allocated.  This is
   only used for UMA_ZONE_NUMA (first-touch) zones.
 - UMA_FIRSTTOUCH sets the default UMA policy to be first-touch for all
   zones.  This tries to maintain locality for kernel memory.

Reviewed by:	gallatin, alc, kib
Tested by:	pho, gallatin
Sponsored by:	Netflix
Differential Revision:	https://reviews.freebsd.org/D20929
This commit is contained in:
Jeff Roberson 2019-08-06 21:50:34 +00:00
parent cb71f1fa01
commit c168508655
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=350659
11 changed files with 181 additions and 48 deletions

View file

@ -326,6 +326,13 @@ memstat_get_sleeps(const struct memory_type *mtp)
return (mtp->mt_sleeps);
}
uint64_t
memstat_get_xdomain(const struct memory_type *mtp)
{
return (mtp->mt_xdomain);
}
void *
memstat_get_caller_pointer(const struct memory_type *mtp, int index)
{

View file

@ -136,6 +136,7 @@ uint64_t memstat_get_count(const struct memory_type *mtp);
uint64_t memstat_get_free(const struct memory_type *mtp);
uint64_t memstat_get_failures(const struct memory_type *mtp);
uint64_t memstat_get_sleeps(const struct memory_type *mtp);
uint64_t memstat_get_xdomain(const struct memory_type *mtp);
void *memstat_get_caller_pointer(const struct memory_type *mtp,
int index);
void memstat_set_caller_pointer(struct memory_type *mtp,

View file

@ -69,6 +69,7 @@ struct memory_type {
uint64_t mt_free; /* Number of cached free items. */
uint64_t mt_failures; /* Number of allocation failures. */
uint64_t mt_sleeps; /* Number of allocation sleeps. */
uint64_t mt_xdomain; /* Number of cross domain sleeps. */
/*
* Caller-owned memory.

View file

@ -201,6 +201,7 @@ memstat_sysctl_uma(struct memory_type_list *list, int flags)
mtp->mt_numfrees = uthp->uth_frees;
mtp->mt_failures = uthp->uth_fails;
mtp->mt_sleeps = uthp->uth_sleeps;
mtp->mt_xdomain = uthp->uth_xdomain;
for (j = 0; j < maxcpus; j++) {
upsp = (struct uma_percpu_stat *)p;
@ -423,11 +424,11 @@ memstat_kvm_uma(struct memory_type_list *list, void *kvm_handle)
mtp->mt_failures = kvm_counter_u64_fetch(kvm,
(unsigned long )uz.uz_fails);
mtp->mt_sleeps = uz.uz_sleeps;
/* See comment above in memstat_sysctl_uma(). */
if (mtp->mt_numallocs < mtp->mt_numfrees)
mtp->mt_numallocs = mtp->mt_numfrees;
mtp->mt_xdomain = uz.uz_xdomain;
if (kz.uk_flags & UMA_ZFLAG_INTERNAL)
goto skip_percpu;
for (i = 0; i < mp_maxid + 1; i++) {

View file

@ -618,6 +618,8 @@ NO_SWAPPING opt_vm.h
MALLOC_MAKE_FAILURES opt_vm.h
MALLOC_PROFILE opt_vm.h
MALLOC_DEBUG_MAXZONES opt_vm.h
UMA_XDOMAIN opt_vm.h
UMA_FIRSTTOUCH opt_vm.h
# The MemGuard replacement allocator used for tamper-after-free detection
DEBUG_MEMGUARD opt_vm.h

View file

@ -282,7 +282,7 @@ uma_zone_t uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
*/
#define UMA_ZONE_INHERIT \
(UMA_ZONE_OFFPAGE | UMA_ZONE_MALLOC | UMA_ZONE_NOFREE | \
UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU)
UMA_ZONE_HASH | UMA_ZONE_VTOSLAB | UMA_ZONE_PCPU | UMA_ZONE_NUMA)
/* Definitions for align */
#define UMA_ALIGN_PTR (sizeof(void *) - 1) /* Alignment fit for ptr */
@ -689,7 +689,8 @@ struct uma_type_header {
uint64_t uth_frees; /* Zone: number of frees. */
uint64_t uth_fails; /* Zone: number of alloc failures. */
uint64_t uth_sleeps; /* Zone: number of alloc sleeps. */
uint64_t _uth_reserved1[2]; /* Reserved. */
uint64_t uth_xdomain; /* Zone: Number of cross domain frees. */
uint64_t _uth_reserved1[1]; /* Reserved. */
};
struct uma_percpu_stat {

View file

@ -785,12 +785,17 @@ cache_drain(uma_zone_t zone)
CPU_FOREACH(cpu) {
cache = &zone->uz_cpu[cpu];
bucket_drain(zone, cache->uc_allocbucket);
bucket_drain(zone, cache->uc_freebucket);
if (cache->uc_allocbucket != NULL)
bucket_free(zone, cache->uc_allocbucket, NULL);
cache->uc_allocbucket = NULL;
bucket_drain(zone, cache->uc_freebucket);
if (cache->uc_freebucket != NULL)
bucket_free(zone, cache->uc_freebucket, NULL);
cache->uc_allocbucket = cache->uc_freebucket = NULL;
cache->uc_freebucket = NULL;
bucket_drain(zone, cache->uc_crossbucket);
if (cache->uc_crossbucket != NULL)
bucket_free(zone, cache->uc_crossbucket, NULL);
cache->uc_crossbucket = NULL;
}
ZONE_LOCK(zone);
bucket_cache_drain(zone);
@ -813,13 +818,13 @@ static void
cache_drain_safe_cpu(uma_zone_t zone)
{
uma_cache_t cache;
uma_bucket_t b1, b2;
uma_bucket_t b1, b2, b3;
int domain;
if (zone->uz_flags & UMA_ZFLAG_INTERNAL)
return;
b1 = b2 = NULL;
b1 = b2 = b3 = NULL;
ZONE_LOCK(zone);
critical_enter();
if (zone->uz_flags & UMA_ZONE_NUMA)
@ -843,12 +848,18 @@ cache_drain_safe_cpu(uma_zone_t zone)
b2 = cache->uc_freebucket;
cache->uc_freebucket = NULL;
}
b3 = cache->uc_crossbucket;
cache->uc_crossbucket = NULL;
critical_exit();
ZONE_UNLOCK(zone);
if (b1)
bucket_free(zone, b1, NULL);
if (b2)
bucket_free(zone, b2, NULL);
if (b3) {
bucket_drain(zone, b3);
bucket_free(zone, b3, NULL);
}
}
/*
@ -1754,6 +1765,7 @@ zone_ctor(void *mem, int size, void *udata, int flags)
zone->uz_init = NULL;
zone->uz_fini = NULL;
zone->uz_sleeps = 0;
zone->uz_xdomain = 0;
zone->uz_count = 0;
zone->uz_count_min = 0;
zone->uz_count_max = BUCKET_MAX;
@ -2185,6 +2197,11 @@ uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
KASSERT(powerof2(align + 1), ("invalid zone alignment %d for \"%s\"",
align, name));
/* Sets all zones to a first-touch domain policy. */
#ifdef UMA_FIRSTTOUCH
flags |= UMA_ZONE_NUMA;
#endif
/* This stuff is essential for the zone ctor */
memset(&args, 0, sizeof(args));
args.name = name;
@ -2456,16 +2473,13 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
if (bucket != NULL)
bucket_free(zone, bucket, udata);
if (zone->uz_flags & UMA_ZONE_NUMA) {
domain = PCPU_GET(domain);
if (VM_DOMAIN_EMPTY(domain))
domain = UMA_ANYDOMAIN;
} else
domain = UMA_ANYDOMAIN;
/* Short-circuit for zones without buckets and low memory. */
if (zone->uz_count == 0 || bucketdisable) {
ZONE_LOCK(zone);
if (zone->uz_flags & UMA_ZONE_NUMA)
domain = PCPU_GET(domain);
else
domain = UMA_ANYDOMAIN;
goto zalloc_item;
}
@ -2497,10 +2511,14 @@ uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
/*
* Check the zone's cache of buckets.
*/
if (domain == UMA_ANYDOMAIN)
zdom = &zone->uz_domain[0];
else
if (zone->uz_flags & UMA_ZONE_NUMA) {
domain = PCPU_GET(domain);
zdom = &zone->uz_domain[domain];
} else {
domain = UMA_ANYDOMAIN;
zdom = &zone->uz_domain[0];
}
if ((bucket = zone_try_fetch_bucket(zone, zdom, true)) != NULL) {
KASSERT(bucket->ub_cnt != 0,
("uma_zalloc_arg: Returning an empty bucket."));
@ -2842,6 +2860,10 @@ zone_alloc_bucket(uma_zone_t zone, void *udata, int domain, int flags, int max)
CTR1(KTR_UMA, "zone_alloc:_bucket domain %d)", domain);
/* Avoid allocs targeting empty domains. */
if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
domain = UMA_ANYDOMAIN;
/* Don't wait for buckets, preserve caller's NOVM setting. */
bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
if (bucket == NULL)
@ -2941,11 +2963,10 @@ zone_alloc_item_locked(uma_zone_t zone, void *udata, int domain, int flags)
}
ZONE_UNLOCK(zone);
if (domain != UMA_ANYDOMAIN) {
/* avoid allocs targeting empty domains */
if (VM_DOMAIN_EMPTY(domain))
domain = UMA_ANYDOMAIN;
}
/* Avoid allocs targeting empty domains. */
if (domain != UMA_ANYDOMAIN && VM_DOMAIN_EMPTY(domain))
domain = UMA_ANYDOMAIN;
if (zone->uz_import(zone->uz_arg, &item, 1, domain, flags) != 1)
goto fail;
@ -3006,6 +3027,9 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
uma_bucket_t bucket;
uma_zone_domain_t zdom;
int cpu, domain;
#ifdef UMA_XDOMAIN
int itemdomain;
#endif
bool lockfail;
#ifdef INVARIANTS
bool skipdbg;
@ -3055,6 +3079,11 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
if (zone->uz_sleepers > 0)
goto zfree_item;
#ifdef UMA_XDOMAIN
if ((zone->uz_flags & UMA_ZONE_NUMA) != 0)
itemdomain = _vm_phys_domain(pmap_kextract((vm_offset_t)item));
#endif
/*
* If possible, free to the per-CPU cache. There are two
* requirements for safe access to the per-CPU cache: (1) the thread
@ -3072,14 +3101,26 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
cache = &zone->uz_cpu[cpu];
zfree_start:
domain = PCPU_GET(domain);
#ifdef UMA_XDOMAIN
if ((zone->uz_flags & UMA_ZONE_NUMA) == 0)
itemdomain = domain;
#endif
/*
* Try to free into the allocbucket first to give LIFO ordering
* for cache-hot datastructures. Spill over into the freebucket
* if necessary. Alloc will swap them if one runs dry.
*/
bucket = cache->uc_allocbucket;
if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
bucket = cache->uc_freebucket;
#ifdef UMA_XDOMAIN
if (domain != itemdomain) {
bucket = cache->uc_crossbucket;
} else
#endif
{
bucket = cache->uc_allocbucket;
if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
bucket = cache->uc_freebucket;
}
if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
("uma_zfree: Freeing to non free bucket index."));
@ -3110,24 +3151,66 @@ uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
}
critical_enter();
cpu = curcpu;
domain = PCPU_GET(domain);
cache = &zone->uz_cpu[cpu];
bucket = cache->uc_freebucket;
#ifdef UMA_XDOMAIN
if (domain != itemdomain)
bucket = cache->uc_crossbucket;
else
#endif
bucket = cache->uc_freebucket;
if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
ZONE_UNLOCK(zone);
goto zfree_start;
}
cache->uc_freebucket = NULL;
#ifdef UMA_XDOMAIN
if (domain != itemdomain)
cache->uc_crossbucket = NULL;
else
#endif
cache->uc_freebucket = NULL;
/* We are no longer associated with this CPU. */
critical_exit();
#ifdef UMA_XDOMAIN
if (domain != itemdomain) {
if (bucket != NULL) {
zone->uz_xdomain += bucket->ub_cnt;
if (vm_ndomains > 2 ||
zone->uz_bkt_count >= zone->uz_bkt_max) {
ZONE_UNLOCK(zone);
bucket_drain(zone, bucket);
bucket_free(zone, bucket, udata);
} else {
zdom = &zone->uz_domain[itemdomain];
zone_put_bucket(zone, zdom, bucket, true);
ZONE_UNLOCK(zone);
}
} else
ZONE_UNLOCK(zone);
bucket = bucket_alloc(zone, udata, M_NOWAIT);
if (bucket == NULL)
goto zfree_item;
critical_enter();
cpu = curcpu;
cache = &zone->uz_cpu[cpu];
if (cache->uc_crossbucket == NULL) {
cache->uc_crossbucket = bucket;
goto zfree_start;
}
critical_exit();
bucket_free(zone, bucket, udata);
goto zfree_restart;
}
#endif
if ((zone->uz_flags & UMA_ZONE_NUMA) != 0) {
domain = PCPU_GET(domain);
if (VM_DOMAIN_EMPTY(domain))
domain = UMA_ANYDOMAIN;
} else
zdom = &zone->uz_domain[domain];
} else {
domain = 0;
zdom = &zone->uz_domain[0];
zdom = &zone->uz_domain[0];
}
/* Can we throw this on the zone full list? */
if (bucket != NULL) {
@ -3768,11 +3851,13 @@ slab_print(uma_slab_t slab)
static void
cache_print(uma_cache_t cache)
{
printf("alloc: %p(%d), free: %p(%d)\n",
printf("alloc: %p(%d), free: %p(%d), cross: %p(%d)j\n",
cache->uc_allocbucket,
cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0,
cache->uc_freebucket,
cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0);
cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0,
cache->uc_crossbucket,
cache->uc_crossbucket?cache->uc_crossbucket->ub_cnt:0);
}
static void
@ -3834,13 +3919,13 @@ uma_print_zone(uma_zone_t zone)
*/
static void
uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
uint64_t *freesp, uint64_t *sleepsp)
uint64_t *freesp, uint64_t *sleepsp, uint64_t *xdomainp)
{
uma_cache_t cache;
uint64_t allocs, frees, sleeps;
uint64_t allocs, frees, sleeps, xdomain;
int cachefree, cpu;
allocs = frees = sleeps = 0;
allocs = frees = sleeps = xdomain = 0;
cachefree = 0;
CPU_FOREACH(cpu) {
cache = &z->uz_cpu[cpu];
@ -3848,12 +3933,17 @@ uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
cachefree += cache->uc_allocbucket->ub_cnt;
if (cache->uc_freebucket != NULL)
cachefree += cache->uc_freebucket->ub_cnt;
if (cache->uc_crossbucket != NULL) {
xdomain += cache->uc_crossbucket->ub_cnt;
cachefree += cache->uc_crossbucket->ub_cnt;
}
allocs += cache->uc_allocs;
frees += cache->uc_frees;
}
allocs += counter_u64_fetch(z->uz_allocs);
frees += counter_u64_fetch(z->uz_frees);
sleeps += z->uz_sleeps;
xdomain += z->uz_xdomain;
if (cachefreep != NULL)
*cachefreep = cachefree;
if (allocsp != NULL)
@ -3862,6 +3952,8 @@ uma_zone_sumstat(uma_zone_t z, long *cachefreep, uint64_t *allocsp,
*freesp = frees;
if (sleepsp != NULL)
*sleepsp = sleeps;
if (xdomainp != NULL)
*xdomainp = xdomain;
}
#endif /* DDB */
@ -3902,6 +3994,7 @@ uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
uth->uth_frees = counter_u64_fetch(z->uz_frees);
uth->uth_fails = counter_u64_fetch(z->uz_fails);
uth->uth_sleeps = z->uz_sleeps;
uth->uth_xdomain = z->uz_xdomain;
/*
* While it is not normally safe to access the cache
* bucket pointers while not on the CPU that owns the
@ -3921,6 +4014,9 @@ uma_vm_zone_stats(struct uma_type_header *uth, uma_zone_t z, struct sbuf *sbuf,
if (cache->uc_freebucket != NULL)
ups[i].ups_cache_free +=
cache->uc_freebucket->ub_cnt;
if (cache->uc_crossbucket != NULL)
ups[i].ups_cache_free +=
cache->uc_crossbucket->ub_cnt;
ups[i].ups_allocs = cache->uc_allocs;
ups[i].ups_frees = cache->uc_frees;
}
@ -4176,12 +4272,12 @@ DB_SHOW_COMMAND(uma, db_show_uma)
{
uma_keg_t kz;
uma_zone_t z;
uint64_t allocs, frees, sleeps;
uint64_t allocs, frees, sleeps, xdomain;
long cachefree;
int i;
db_printf("%18s %8s %8s %8s %12s %8s %8s\n", "Zone", "Size", "Used",
"Free", "Requests", "Sleeps", "Bucket");
db_printf("%18s %8s %8s %8s %12s %8s %8s %8s\n", "Zone", "Size", "Used",
"Free", "Requests", "Sleeps", "Bucket", "XFree");
LIST_FOREACH(kz, &uma_kegs, uk_link) {
LIST_FOREACH(z, &kz->uk_zones, uz_link) {
if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
@ -4191,17 +4287,17 @@ DB_SHOW_COMMAND(uma, db_show_uma)
cachefree = 0;
} else
uma_zone_sumstat(z, &cachefree, &allocs,
&frees, &sleeps);
&frees, &sleeps, &xdomain);
if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
(LIST_FIRST(&kz->uk_zones) != z)))
cachefree += kz->uk_free;
for (i = 0; i < vm_ndomains; i++)
cachefree += z->uz_domain[i].uzd_nitems;
db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u\n",
db_printf("%18s %8ju %8jd %8ld %12ju %8ju %8u %8ju\n",
z->uz_name, (uintmax_t)kz->uk_size,
(intmax_t)(allocs - frees), cachefree,
(uintmax_t)allocs, sleeps, z->uz_count);
(uintmax_t)allocs, sleeps, z->uz_count, xdomain);
if (db_pager_quit)
return;
}
@ -4218,7 +4314,7 @@ DB_SHOW_COMMAND(umacache, db_show_umacache)
db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
"Requests", "Bucket");
LIST_FOREACH(z, &uma_cachezones, uz_link) {
uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL);
uma_zone_sumstat(z, &cachefree, &allocs, &frees, NULL, NULL);
for (i = 0; i < vm_ndomains; i++)
cachefree += z->uz_domain[i].uzd_nitems;
db_printf("%18s %8ju %8jd %8ld %12ju %8u\n",

View file

@ -208,6 +208,7 @@ typedef struct uma_bucket * uma_bucket_t;
struct uma_cache {
uma_bucket_t uc_freebucket; /* Bucket we're freeing to */
uma_bucket_t uc_allocbucket; /* Bucket to allocate from */
uma_bucket_t uc_crossbucket; /* cross domain bucket */
uint64_t uc_allocs; /* Count of allocations */
uint64_t uc_frees; /* Count of frees */
} UMA_ALIGN;
@ -368,6 +369,7 @@ struct uma_zone {
counter_u64_t uz_frees; /* Total number of frees */
counter_u64_t uz_fails; /* Total number of alloc failures */
uint64_t uz_sleeps; /* Total number of alloc sleeps */
uint64_t uz_xdomain; /* Total number of cross-domain frees */
/*
* This HAS to be the last item because we adjust the zone size

View file

@ -623,6 +623,26 @@ vm_phys_register_domains(int ndomains, struct mem_affinity *affinity,
#endif
}
int
_vm_phys_domain(vm_paddr_t pa)
{
#ifdef NUMA
int i;
if (vm_ndomains == 1 || mem_affinity == NULL)
return (0);
/*
* Check for any memory that overlaps.
*/
for (i = 0; mem_affinity[i].end != 0; i++)
if (mem_affinity[i].start <= pa &&
mem_affinity[i].end >= pa)
return (mem_affinity[i].domain);
#endif
return (0);
}
/*
* Split a contiguous, power of two-sized set of physical pages.
*

View file

@ -123,6 +123,7 @@ vm_phys_domain(vm_page_t m)
return (0);
#endif
}
int _vm_phys_domain(vm_paddr_t pa);
#endif /* _KERNEL */
#endif /* !_VM_PHYS_H_ */

View file

@ -1483,9 +1483,9 @@ domemstat_zone(void)
}
}
xo_open_container("memory-zone-statistics");
xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} "
xo_emit("{T:/%-20s} {T:/%6s} {T:/%6s} {T:/%8s} {T:/%8s} {T:/%8s} {T:/%8s}"
"{T:/%4s} {T:/%4s}\n\n", "ITEM", "SIZE",
"LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP");
"LIMIT", "USED", "FREE", "REQ", "FAIL", "SLEEP", "XDOMAIN");
xo_open_list("zone");
for (mtp = memstat_mtl_first(mtlp); mtp != NULL;
mtp = memstat_mtl_next(mtp)) {
@ -1495,7 +1495,7 @@ domemstat_zone(void)
xo_emit("{d:name/%-20s}{ke:name/%s} {:size/%6ju}, "
"{:limit/%6ju},{:used/%8ju},"
"{:free/%8ju},{:requests/%8ju},"
"{:fail/%4ju},{:sleep/%4ju}\n", name,
"{:fail/%4ju},{:sleep/%4ju},{:xdomain/%4ju}\n", name,
memstat_get_name(mtp),
(uintmax_t)memstat_get_size(mtp),
(uintmax_t)memstat_get_countlimit(mtp),
@ -1503,7 +1503,8 @@ domemstat_zone(void)
(uintmax_t)memstat_get_free(mtp),
(uintmax_t)memstat_get_numallocs(mtp),
(uintmax_t)memstat_get_failures(mtp),
(uintmax_t)memstat_get_sleeps(mtp));
(uintmax_t)memstat_get_sleeps(mtp),
(uintmax_t)memstat_get_xdomain(mtp));
xo_close_instance("zone");
}
memstat_mtl_free(mtlp);