runtime: combine small NoScan allocations
Combine NoScan allocations of less than 16 bytes into a single memory block.
This reduces the number of allocations on the json and garbage benchmarks by more than 10%.
json-1:
                old         new         delta
allocated       8039872     7949194     -1.13%
allocs          105774      93776       -11.34%
cputime         156200000   100700000   -35.53%
gc-pause-one    4908873     3814853     -22.29%
gc-pause-total  2748969     2899288     +5.47%
rss             52674560    43560960    -17.30%
sys-gc          3796976     3256304     -14.24%
sys-heap        43843584    35192832    -19.73%
sys-other       5589312     5310784     -4.98%
sys-stack       393216      393216      +0.00%
sys-total       53623088    44153136    -17.66%
time            156193436   100886714   -35.41%
virtual-mem     256548864   256540672   -0.00%

garbage-1:
                old         new         delta
allocated       2996885     2932982     -2.13%
allocs          62904       55200       -12.25%
cputime         17470000    17400000    -0.40%
gc-pause-one    932757485   925806143   -0.75%
gc-pause-total  4663787     4629030     -0.75%
rss             1151074304  1133670400  -1.51%
sys-gc          66068352    65085312    -1.49%
sys-heap        1039728640  1024065536  -1.51%
sys-other       38038208    37485248    -1.45%
sys-stack       8650752     8781824     +1.52%
sys-total       1152485952  1135417920  -1.48%
time            17478088    17418005    -0.34%
virtual-mem     1343709184  1324204032  -1.45%
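For a sense of the workload this targets: the tiny allocator kicks in for heap objects under 16 bytes that contain no pointers (FlagNoScan). A minimal Go sketch, not part of this CL (names and data invented), of the kind of code whose allocation count drops:

	package main

	import "fmt"

	// Each string(b) conversion heap-allocates a small, pointer-free
	// backing array. Before this CL every such object occupied its own
	// size-class slot; with the tiny allocator several of them can
	// share one 16-byte block.
	func smallStrings(bs [][]byte) []string {
		out := make([]string, 0, len(bs))
		for _, b := range bs {
			out = append(out, string(b)) // tiny NoScan allocation when len(b) < 16
		}
		return out
	}

	func main() {
		fmt.Println(smallStrings([][]byte{[]byte("json"), []byte("gc")}))
	}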
LGTM=iant, bradfitz
R=golang-codereviews, dave, iant, rsc, bradfitz
CC=golang-codereviews, khr
https://golang.org/cl/38750047
Parent: f8e0057bb7
Commit: 1fa7029425
@@ -5,6 +5,8 @@
 // +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
 
 #include "runtime.h"
+#include "arch_GOARCH.h"
+#include "malloc.h"
 
 Slice syscall·envs;
 
@@ -44,15 +46,24 @@ void
 syscall·setenv_c(String k, String v)
 {
 	byte *arg[2];
+	uintptr len;
 
 	if(_cgo_setenv == nil)
 		return;
 
-	arg[0] = runtime·malloc(k.len + 1);
+	// Objects that are explicitly freed must be at least 16 bytes in size,
+	// so that they are not allocated using tiny alloc.
+	len = k.len + 1;
+	if(len < TinySize)
+		len = TinySize;
+	arg[0] = runtime·malloc(len);
 	runtime·memmove(arg[0], k.str, k.len);
 	arg[0][k.len] = 0;
 
-	arg[1] = runtime·malloc(v.len + 1);
+	len = v.len + 1;
+	if(len < TinySize)
+		len = TinySize;
+	arg[1] = runtime·malloc(len);
 	runtime·memmove(arg[1], v.str, v.len);
 	arg[1][v.len] = 0;
 
@@ -26,6 +26,8 @@ extern MStats mstats;	// defined in zruntime_def_$GOOS_$GOARCH.go
 
 extern volatile intgo runtime·MemProfileRate;
 
+static void* largealloc(uint32, uintptr*);
+
 // Allocate an object of at least size bytes.
 // Small objects are allocated from the per-thread cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
@@ -34,12 +36,13 @@ void*
 runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 {
 	int32 sizeclass;
+	uintptr tinysize, size1;
 	intgo rate;
 	MCache *c;
 	MCacheList *l;
-	uintptr npages;
-	MSpan *s;
 	MLink *v;
+	byte *tiny;
+	P *p;
 
 	if(size == 0) {
 		// All 0-length allocations use this pointer.
@@ -59,6 +62,79 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 
 	c = m->mcache;
 	if(!runtime·debug.efence && size <= MaxSmallSize) {
+		if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
+			// Tiny allocator.
+			//
+			// Tiny allocator combines several tiny allocation requests
+			// into a single memory block. The resulting memory block
+			// is freed when all subobjects are unreachable. The subobjects
+			// must be FlagNoScan (don't have pointers), this ensures that
+			// the amount of potentially wasted memory is bounded.
+			//
+			// Size of the memory block used for combining (TinySize) is tunable.
+			// Current setting is 16 bytes, which relates to 2x worst case memory
+			// wastage (when all but one subobjects are unreachable).
+			// 8 bytes would result in no wastage at all, but provides less
+			// opportunities for combining.
+			// 32 bytes provides more opportunities for combining,
+			// but can lead to 4x worst case wastage.
+			// The best case winning is 8x regardless of block size.
+			//
+			// Objects obtained from tiny allocator must not be freed explicitly.
+			// So when an object will be freed explicitly, we ensure that
+			// its size >= TinySize.
+			//
+			// SetFinalizer has a special case for objects potentially coming
+			// from tiny allocator, in such case it allows to set finalizers
+			// for an inner byte of a memory block.
+			//
+			// The main targets of tiny allocator are small strings and
+			// standalone escaping variables. On a json benchmark
+			// the allocator reduces number of allocations by ~12% and
+			// reduces heap size by ~20%.
+
+			p = m->p;
+			tinysize = p->tinysize;
+			if(size <= tinysize) {
+				tiny = p->tiny;
+				// Align tiny pointer for required (conservative) alignment.
+				if((size&7) == 0)
+					tiny = (byte*)ROUND((uintptr)tiny, 8);
+				else if((size&3) == 0)
+					tiny = (byte*)ROUND((uintptr)tiny, 4);
+				else if((size&1) == 0)
+					tiny = (byte*)ROUND((uintptr)tiny, 2);
+				size1 = size + (tiny - p->tiny);
+				if(size1 <= tinysize) {
+					// The object fits into existing tiny block.
+					v = (MLink*)tiny;
+					p->tiny += size1;
+					p->tinysize -= size1;
+					m->mallocing = 0;
+					m->locks--;
+					if(m->locks == 0 && g->preempt)  // restore the preemption request in case we've cleared it in newstack
+						g->stackguard0 = StackPreempt;
+					return v;
+				}
+			}
+			// Allocate a new TinySize block.
+			l = &c->list[TinySizeClass];
+			if(l->list == nil)
+				runtime·MCache_Refill(c, TinySizeClass);
+			v = l->list;
+			l->list = v->next;
+			l->nlist--;
+			((uint64*)v)[0] = 0;
+			((uint64*)v)[1] = 0;
+			// See if we need to replace the existing tiny block with the new one
+			// based on amount of remaining free space.
+			if(TinySize-size > tinysize) {
+				p->tiny = (byte*)v + size;
+				p->tinysize = TinySize - size;
+			}
+			size = TinySize;
+			goto done;
+		}
 		// Allocate from mcache free lists.
 		// Inlined version of SizeToClass().
 		if(size <= 1024-8)
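To make the bump-pointer policy above concrete, here is a self-contained Go sketch of the same bookkeeping. It is illustrative only and all names are invented; the runtime manipulates raw per-P pointers (p->tiny, p->tinysize) rather than block indices:

	package main

	import "fmt"

	const tinySize = 16

	type tinyHeap struct {
		blocks int // 16-byte blocks handed out so far
		cur    int // index of the current block
		off    int // bump offset within the current block
	}

	func alignUp(x, a int) int { return (x + a - 1) &^ (a - 1) }

	// conservativeAlign mirrors the alignment ladder in mallocgc:
	// 8 for multiples of 8, then 4, then 2.
	func conservativeAlign(size int) int {
		switch {
		case size&7 == 0:
			return 8
		case size&3 == 0:
			return 4
		case size&1 == 0:
			return 2
		}
		return 1
	}

	// alloc places a pointer-free object of the given size (< tinySize)
	// and reports which block and offset it landed at.
	func (h *tinyHeap) alloc(size int) (block, offset int) {
		if h.blocks > 0 {
			off := alignUp(h.off, conservativeAlign(size))
			if off+size <= tinySize {
				h.off = off + size // fits into the existing tiny block
				return h.cur, off
			}
		}
		// Take a fresh tinySize block (the runtime refills it from the
		// TinySizeClass free list and zeroes its two words).
		h.blocks++
		n := h.blocks - 1
		// Keep whichever block has more free space left, mirroring
		// "if(TinySize-size > tinysize)" above.
		if h.blocks == 1 || tinySize-size > tinySize-h.off {
			h.cur, h.off = n, size
		}
		return n, 0
	}

	func main() {
		var h tinyHeap
		for _, size := range []int{4, 4, 8, 2, 1, 8} {
			b, off := h.alloc(size)
			fmt.Printf("size %d -> block %d, offset %d\n", size, b, off)
		}
	}

Running it shows two 4-byte objects and one 8-byte object packing block 0 exactly before a fresh block is started for the next request.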
@@ -78,23 +154,11 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 			if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
 				runtime·memclr((byte*)v, size);
 		}
+	done:
 		c->local_cachealloc += size;
 	} else {
 		// TODO(rsc): Report tracebacks for very large allocations.
-
-		// Allocate directly from heap.
-		npages = size >> PageShift;
-		if((size & PageMask) != 0)
-			npages++;
-		s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
-		if(s == nil)
-			runtime·throw("out of memory");
-		s->limit = (byte*)(s->start<<PageShift) + size;
-		size = npages<<PageShift;
-		v = (void*)(s->start << PageShift);
-
-		// setup for mark sweep
-		runtime·markspan(v, 0, 0, true);
+		v = largealloc(flag, &size);
 	}
 
 	if(flag & FlagNoGC)
@@ -151,6 +215,29 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 	return v;
 }
 
+static void*
+largealloc(uint32 flag, uintptr *sizep)
+{
+	uintptr npages, size;
+	MSpan *s;
+	void *v;
+
+	// Allocate directly from heap.
+	size = *sizep;
+	npages = size >> PageShift;
+	if((size & PageMask) != 0)
+		npages++;
+	s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
+	if(s == nil)
+		runtime·throw("out of memory");
+	s->limit = (byte*)(s->start<<PageShift) + size;
+	*sizep = npages<<PageShift;
+	v = (void*)(s->start << PageShift);
+	// setup for mark sweep
+	runtime·markspan(v, 0, 0, true);
+	return v;
+}
+
 void*
 runtime·malloc(uintptr size)
 {
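The page rounding that moved into largealloc is ordinary round-up division. A quick sketch (constants are mine; PageShift was 12, i.e. 4 KB pages, in the runtime of this period):

	package main

	import "fmt"

	const (
		pageShift = 12 // assumed runtime PageShift (4 KB pages)
		pageMask  = 1<<pageShift - 1
	)

	// pagesFor mirrors "npages = size >> PageShift; if(size & PageMask) npages++".
	func pagesFor(size uintptr) uintptr {
		npages := size >> pageShift
		if size&pageMask != 0 {
			npages++
		}
		return npages
	}

	func main() {
		for _, size := range []uintptr{4096, 4097, 32<<10 + 1} {
			fmt.Println(size, "bytes ->", pagesFor(size), "pages")
		}
	}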
@@ -182,6 +269,10 @@ runtime·free(void *v)
 	}
 	size = s->elemsize;
 	sizeclass = s->sizeclass;
+	// Objects that are smaller than TinySize can be allocated using tiny alloc;
+	// if such an object were then combined with an object that has a finalizer, we would crash.
+	if(size < TinySize)
+		runtime·throw("freeing too small block");
 
 	if(raceenabled)
 		runtime·racefree(v);
@@ -347,6 +438,9 @@ runtime·mallocinit(void)
 
 	runtime·InitSizes();
 
+	if(runtime·class_to_size[TinySizeClass] != TinySize)
+		runtime·throw("bad TinySizeClass");
+
 	// limit = runtime·memlimit();
 	// See https://code.google.com/p/go/issues/detail?id=5049
 	// TODO(rsc): Fix after 1.1.
@@ -450,7 +544,7 @@ runtime·mallocinit(void)
 	m->mcache = runtime·allocmcache();
 
 	// See if it works.
-	runtime·free(runtime·malloc(1));
+	runtime·free(runtime·malloc(TinySize));
 }
 
 void*
@@ -760,12 +854,17 @@ func SetFinalizer(obj Eface, finalizer Eface) {
 		goto throw;
 	}
 	ot = (PtrType*)obj.type;
-	if(ot->elem != nil && ot->elem->size == 0) {
+	// As an implementation detail we do not run finalizers for zero-sized objects,
+	// because we use &runtime·zerobase for all such allocations.
+	if(ot->elem != nil && ot->elem->size == 0)
 		return;
-	}
 	if(!runtime·mlookup(obj.data, &base, &size, nil) || obj.data != base) {
-		runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
-		goto throw;
+		// As an implementation detail we allow to set finalizers for an inner byte
+		// of an object if it could come from tiny alloc (see mallocgc for details).
+		if(ot->elem == nil || (ot->elem->kind&KindNoPointers) == 0 || ot->elem->size >= TinySize) {
+			runtime·printf("runtime.SetFinalizer: pointer not at beginning of allocated block\n");
+			goto throw;
+		}
 	}
 	if(finalizer.type != nil) {
 		if(finalizer.type->kind != KindFunc)
@@ -108,6 +108,10 @@ enum
 	// Tunable constants.
 	MaxSmallSize = 32<<10,
 
+	// Tiny allocator parameters, see "Tiny allocator" comment in malloc.goc.
+	TinySize = 16,
+	TinySizeClass = 2,
+
 	FixAllocChunk = 16<<10,		// Chunk size for FixAlloc
 	MaxMHeapList = 1<<(20 - PageShift),	// Maximum page length for fixed-size list in MHeap.
 	HeapAllocChunk = 1<<20,		// Chunk size for heap growth
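A quick check of the wastage bounds quoted in the "Tiny allocator" comment, assuming the runtime's 8-byte minimum size class: worst case, a 16-byte block kept alive by a single 8-byte survivor costs 16/8 = 2x its live data, and a 32-byte block in the same state would cost 32/8 = 4x. Best case, sixteen 1-byte objects that would otherwise occupy sixteen 8-byte slots (128 bytes) share one 16-byte block, 128/16 = 8x; that ratio is independent of block size because both sides scale with it.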
@@ -84,8 +84,11 @@ clearpools(void)
 	}
 	pools.head = nil;
 
-	// clear defer pools
 	for(pp=runtime·allp; p=*pp; pp++) {
+		// clear tinyalloc pool
+		p->tiny = nil;
+		p->tinysize = 0;
+		// clear defer pools
 		for(i=0; i<nelem(p->deferpool); i++)
 			p->deferpool[i] = nil;
 	}
 
@@ -1202,6 +1205,7 @@ markroot(ParFor *desc, uint32 i)
 	MSpan **allspans, *s;
 	uint32 spanidx;
 	G *gp;
+	void *p;
 
 	USED(&desc);
 	wbuf = getempty(nil);
 
@@ -1241,7 +1245,9 @@ markroot(ParFor *desc, uint32 i)
 				// don't mark finalized object, but scan it so we
 				// retain everything it points to.
 				spf = (SpecialFinalizer*)sp;
-				enqueue1(&wbuf, (Obj){(void*)((s->start << PageShift) + spf->offset), s->elemsize, 0});
+				// A finalizer can be set for an inner byte of an object, find object beginning.
+				p = (void*)((s->start << PageShift) + spf->offset/s->elemsize*s->elemsize);
+				enqueue1(&wbuf, (Obj){p, s->elemsize, 0});
 				enqueue1(&wbuf, (Obj){(void*)&spf->fn, PtrSize, 0});
 				enqueue1(&wbuf, (Obj){(void*)&spf->fint, PtrSize, 0});
 				enqueue1(&wbuf, (Obj){(void*)&spf->ot, PtrSize, 0});
 
@@ -1663,12 +1669,16 @@ sweepspan(ParFor *desc, uint32 idx)
 		specialp = &s->specials;
 		special = *specialp;
 		while(special != nil) {
-			p = (byte*)(s->start << PageShift) + special->offset;
+			// A finalizer can be set for an inner byte of an object, find object beginning.
+			p = (byte*)(s->start << PageShift) + special->offset/size*size;
 			off = (uintptr*)p - (uintptr*)arena_start;
 			bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
 			shift = off % wordsPerBitmapWord;
 			bits = *bitp>>shift;
 			if((bits & (bitAllocated|bitMarked)) == bitAllocated) {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p = (byte*)(s->start << PageShift) + special->offset;
 				// about to free object: splice out special record
 				y = special;
 				special = special->next;
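Both GC-side changes above use the same integer-truncation idiom to map a special's interior offset back to the start of its object. A tiny standalone illustration (values invented):

	package main

	import "fmt"

	// objectBase mirrors "special->offset/size*size": integer division
	// truncates, so any interior offset maps to its object's start.
	func objectBase(offset, elemsize uintptr) uintptr {
		return offset / elemsize * elemsize
	}

	func main() {
		const elemsize = 16 // a span of 16-byte objects
		for _, off := range []uintptr{0, 4, 15, 16, 35} {
			fmt.Printf("offset %d -> object base %d\n", off, objectBase(off, elemsize))
		}
	}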
@@ -605,6 +605,8 @@ removespecial(void *p, byte kind)
 	runtime·lock(&span->specialLock);
 	t = &span->specials;
 	while((s = *t) != nil) {
+		// This function is used for finalizers only, so we don't check for
+		// "interior" specials (p must be exactly equal to s->offset).
 		if(offset == s->offset && kind == s->kind) {
 			*t = s->next;
 			runtime·unlock(&span->specialLock);
 
@@ -713,9 +715,9 @@ runtime·freeallspecials(MSpan *span, void *p, uintptr size)
 	runtime·lock(&span->specialLock);
 	t = &span->specials;
 	while((s = *t) != nil) {
-		if(offset < s->offset)
+		if(offset + size <= s->offset)
 			break;
-		if(offset == s->offset) {
+		if(offset <= s->offset) {
 			*t = s->next;
 			s->next = list;
 			list = s;
@@ -385,6 +385,11 @@ struct P
 	MCache*	mcache;
 	Defer*	deferpool[5];	// pool of available Defer structs of different sizes (see panic.c)
 
+	// Allocator cache for tiny objects w/o pointers.
+	// See "Tiny allocator" comment in malloc.goc.
+	byte*	tiny;
+	uintptr	tinysize;
+
 	// Cache of goroutine ids, amortizes accesses to runtime·sched.goidgen.
 	uint64	goidcache;
 	uint64	goidcacheend;
@@ -73,8 +73,8 @@ func TestPoolGC(t *testing.T) {
 	var fin uint32
 	const N = 100
 	for i := 0; i < N; i++ {
-		v := new(int)
-		runtime.SetFinalizer(v, func(vv *int) {
+		v := new(string)
+		runtime.SetFinalizer(v, func(vv *string) {
 			atomic.AddUint32(&fin, 1)
 		})
 		p.Put(v)
@@ -34,17 +34,17 @@ func main() {
 	for i := 0; i < N; i++ {
 		go func() {
 			defer wg.Done()
-			v := new(int)
+			v := new(string)
 			f := func() {
-				if *v != 0 {
+				if *v != "" {
 					panic("oops")
 				}
 			}
-			if *v != 0 {
+			if *v != "" {
 				// let the compiler think f escapes
 				sink = f
 			}
-			runtime.SetFinalizer(v, func(p *int) {
+			runtime.SetFinalizer(v, func(p *string) {
 				atomic.AddInt32(&count, -1)
 			})
 			defer f()
@@ -30,7 +30,7 @@ func G() {
 func main() {
 	nf := testing.AllocsPerRun(100, F)
 	ng := testing.AllocsPerRun(100, G)
-	if int(nf) != 1 {
+	if int(nf) > 1 {
 		fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
 		os.Exit(1)
 	}
@@ -26,11 +26,11 @@ func F() {
 func main() {
 	nf := testing.AllocsPerRun(100, F)
 	ng := testing.AllocsPerRun(100, G)
-	if int(nf) != 1 {
+	if int(nf) > 1 {
 		fmt.Printf("AllocsPerRun(100, F) = %v, want 1\n", nf)
 		os.Exit(1)
 	}
-	if int(ng) != 1 {
+	if int(ng) > 1 {
 		fmt.Printf("AllocsPerRun(100, G) = %v, want 1\n", ng)
 		os.Exit(1)
 	}
test/tinyfin.go (new file, 62 lines)

@@ -0,0 +1,62 @@
+// run
+
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test finalizers work for tiny (combined) allocations.
+
+package main
+
+import (
+	"runtime"
+	"sync/atomic"
+	"time"
+)
+
+func main() {
+	// Does not work on 32-bits due to partially conservative GC.
+	// Try to enable when we have fully precise GC.
+	if runtime.GOARCH != "amd64" {
+		return
+	}
+	// Likewise for gccgo.
+	if runtime.Compiler == "gccgo" {
+		return
+	}
+	N := int32(100)
+	count := N
+	done := make([]bool, N)
+	for i := int32(0); i < N; i++ {
+		x := i // subject to tiny alloc
+		// the closure must be big enough to be combined
+		runtime.SetFinalizer(&x, func(p *int32) {
+			// Check that p points to the correct subobject of the tiny allocation.
+			// It's a bit tricky, because we can't capture another variable
+			// with the expected value (it would be combined as well).
+			if *p < 0 || *p >= N {
+				println("got", *p)
+				panic("corrupted")
+			}
+			if done[*p] {
+				println("got", *p)
+				panic("already finalized")
+			}
+			done[*p] = true
+			atomic.AddInt32(&count, -1)
+		})
+	}
+	for i := 0; i < 4; i++ {
+		runtime.GC()
+		time.Sleep(10 * time.Millisecond)
+	}
+	// Some of the finalizers may not be executed,
+	// if the outermost allocations are combined with something persistent.
+	// Currently 4 int32's are combined into a 16-byte block,
+	// ensure that most of them are finalized.
+	if count >= N/4 {
+		println(count, "out of", N, "finalizers are not called")
+		panic("not all finalizers are called")
+	}
+}