runtime: faster GC mark phase

Also bump MaxGcproc to 8.

benchmark                old ns/op     new ns/op    delta
Parser                 3796323000    3763880000    -0.85%
Parser-2               3591752500    3518560250    -2.04%
Parser-4               3423825250    3334955250    -2.60%
Parser-8               3304585500    3267014750    -1.14%
Parser-16              3313615750    3286160500    -0.83%
Tree                    984128500     942501166    -4.23%
Tree-2                  932564444     883266222    -5.29%
Tree-4                  835831000     799912777    -4.30%
Tree-8                  819238500     789717333    -3.73%
Tree-16                 880837833     837840055    -5.13%
Tree2                   604698100     579716900    -4.13%
Tree2-2                 372414500     356765200    -4.20%
Tree2-4                 187488100     177455900    -5.56%
Tree2-8                 136315300     102086700   -25.11%
Tree2-16                 93725900      76705800   -22.18%
ParserPause             157441210     166202783    +5.56%
ParserPause-2            93842650      85199900    -9.21%
ParserPause-4            56844404      53535684    -5.82%
ParserPause-8            35739446      30767613   -16.15%
ParserPause-16           32718255      27212441   -16.83%
TreePause                29610557      29787725    +0.60%
TreePause-2              24001659      20674421   -13.86%
TreePause-4              15114887      12842781   -15.03%
TreePause-8              13128725      10741747   -22.22%
TreePause-16             16131360      12506901   -22.47%
Tree2Pause             2673350920    2651045280    -0.83%
Tree2Pause-2           1796999200    1709350040    -4.88%
Tree2Pause-4           1163553320    1090706480    -6.67%
Tree2Pause-8            987032520     858916360   -25.11%
Tree2Pause-16           864758560     809567480    -6.81%
ParserLastPause         280537000     289047000    +3.03%
ParserLastPause-2       183030000     166748000    -8.90%
ParserLastPause-4       105817000      91552000   -13.48%
ParserLastPause-8        65127000      53288000   -18.18%
ParserLastPause-16       45258000      38334000   -15.30%
TreeLastPause            45072000      51449000   +12.39%
TreeLastPause-2          39269000      37866000    -3.57%
TreeLastPause-4          23564000      20649000   -12.37%
TreeLastPause-8          20881000      15807000   -24.30%
TreeLastPause-16         23297000      17309000   -25.70%
Tree2LastPause         6046912000    5797120000    -4.13%
Tree2LastPause-2       3724034000    3567592000    -4.20%
Tree2LastPause-4       1874831000    1774524000    -5.65%
Tree2LastPause-8       1363108000    1020809000   -12.79%
Tree2LastPause-16       937208000     767019000   -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050
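For reference, the delta column is the relative change (new - old) / old. A worked example from the biggest throughput win: Tree2-8 gives (102086700 - 136315300) / 136315300 = -0.2511, i.e. -25.11%. The multi-proc Tree2 and Pause rows improve most, which is where the lock-free work lists and the parallelized root scan below remove contention.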
commit b0702bd0db
parent 82e2272566

2 changed files with 113 additions and 155 deletions
src/pkg/runtime/malloc.h

@@ -124,8 +124,8 @@ enum
 	// Max number of threads to run garbage collection.
 	// 2, 3, and 4 are all plausible maximums depending
 	// on the hardware details of the machine. The garbage
-	// collector scales well to 4 cpus.
-	MaxGcproc = 4,
+	// collector scales well to 8 cpus.
+	MaxGcproc = 8,
 };
 
 // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
src/pkg/runtime/mgc0.c

@@ -13,6 +13,7 @@ enum {
 	Debug = 0,
 	PtrSize = sizeof(void*),
 	DebugMark = 0,  // run second pass to check mark
+	DataBlock = 8*1024,
 
 	// Four bits per word (see #defines below).
 	wordsPerBitmapWord = sizeof(void*)*8/4,
@@ -72,9 +73,9 @@ static int32 gctrace;
 typedef struct Workbuf Workbuf;
 struct Workbuf
 {
-	Workbuf *next;
+	LFNode node;  // must be first
 	uintptr nobj;
-	byte *obj[512-2];
+	byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
 };
 
 typedef struct Finalizer Finalizer;
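The new array bound keeps sizeof(Workbuf) pinned at the same 512-pointer-word budget while making room for the LFNode header that the lock-free lists link through. A minimal standalone sketch of that sizing trick; the LFNode layout here is an assumption for illustration, not the runtime's definition:

#include <assert.h>
#include <stdint.h>

/* Stand-in for the runtime's LFNode (assumed layout, illustration only). */
typedef struct LFNode LFNode;
struct LFNode {
	LFNode *next;
	uintptr_t pushcnt;
};

enum { BudgetWords = 512 };  /* total Workbuf size, in pointer-sized words */

typedef struct Workbuf {
	LFNode node;     /* must be first: the lock-free list links through it */
	uintptr_t nobj;  /* number of valid entries in obj[] */
	/* The array bound is derived from the header size, so the struct
	   stays exactly BudgetWords words however LFNode changes. */
	void *obj[BudgetWords - (sizeof(LFNode)+sizeof(uintptr_t))/sizeof(void*)];
} Workbuf;

int main(void) {
	assert(sizeof(Workbuf) == BudgetWords * sizeof(void*));
	return 0;
}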
@@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*);
 static void putempty(Workbuf*);
 static Workbuf* handoff(Workbuf*);
 
+typedef struct GcRoot GcRoot;
+struct GcRoot
+{
+	byte *p;
+	uintptr n;
+};
+
 static struct {
-	Lock fmu;
-	Workbuf *full;
-	Lock emu;
-	Workbuf *empty;
+	uint64 full;   // lock-free list of full blocks
+	uint64 empty;  // lock-free list of empty blocks
+	byte pad0[CacheLineSize];  // prevents false-sharing between full/empty and nproc/nwait
 	uint32 nproc;
 	volatile uint32 nwait;
 	volatile uint32 ndone;
 	volatile uint32 debugmarkdone;
 	Note alldone;
+	ParFor *markfor;
 	ParFor *sweepfor;
 
 	Lock;
 	byte *chunk;
 	uintptr nchunk;
+
+	GcRoot *roots;
+	uint32 nroot;
+	uint32 rootcap;
 } work;
 
 // scanblock scans a block of n bytes starting at pointer b for references
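Two things happen in this struct. The list heads become plain uint64 words so they can be manipulated with atomic compare-and-swap instead of the fmu/emu locks, and pad0 pushes those frequently-CASed heads onto their own cache line, so a CAS on full/empty no longer invalidates the line holding nproc/nwait on other processors. A self-contained illustration of the padding idiom; the CacheLineSize value is an assumption (the runtime defines it per architecture):

#include <stdint.h>

enum { CacheLineSize = 64 };  /* assumed; per-arch in the real runtime */

struct gcwork_sketch {
	uint64_t full;    /* CAS target: every proc hammers on this word */
	uint64_t empty;   /* CAS target */
	unsigned char pad0[CacheLineSize];  /* without this separator, a CAS on
	                     full/empty would bounce the cache line that also
	                     holds the fields below between processors */
	uint32_t nproc;
	volatile uint32_t nwait;
};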
@@ -162,7 +174,7 @@ scanblock(byte *b, int64 n)
 	nobj = 0;  // number of queued objects
 
 	// Scanblock helpers pass b==nil.
-	// The main proc needs to return to make more
+	// Procs needs to return to make more
 	// calls to scanblock. But if work.nproc==1 then
 	// might as well process blocks as soon as we
 	// have them.
@@ -246,6 +258,14 @@ scanblock(byte *b, int64 n)
 		bits = xbits >> shift;
 
 	found:
+		// If another proc wants a pointer, give it some.
+		if(work.nwait > 0 && nobj > 4 && work.full == 0) {
+			wbuf->nobj = nobj;
+			wbuf = handoff(wbuf);
+			nobj = wbuf->nobj;
+			wp = wbuf->obj + nobj;
+		}
+
 		// Now we have bits, bitp, and shift correct for
 		// obj pointing at the base of the object.
 		// Only care about allocated and not marked.
@@ -269,14 +289,6 @@ scanblock(byte *b, int64 n)
 
 		PREFETCH(obj);
 
-		// If another proc wants a pointer, give it some.
-		if(nobj > 4 && work.nwait > 0 && work.full == nil) {
-			wbuf->nobj = nobj;
-			wbuf = handoff(wbuf);
-			nobj = wbuf->nobj;
-			wp = wbuf->obj + nobj;
-		}
-
 		// If buffer is full, get a new one.
 		if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
 			if(wbuf != nil)
@@ -296,7 +308,8 @@ scanblock(byte *b, int64 n)
 		// Fetch b from the work buffer.
 		if(nobj == 0) {
 			if(!keepworking) {
-				putempty(wbuf);
+				if(wbuf)
+					putempty(wbuf);
 				return;
 			}
 			// Emptied our buffer: refill.
@@ -401,53 +414,33 @@ debug_scanblock(byte *b, int64 n)
 	}
 }
 
+static void
+markroot(ParFor *desc, uint32 i)
+{
+	USED(&desc);
+	scanblock(work.roots[i].p, work.roots[i].n);
+}
+
 // Get an empty work buffer off the work.empty list,
 // allocating new buffers as needed.
 static Workbuf*
 getempty(Workbuf *b)
 {
-	if(work.nproc == 1) {
-		// Put b on full list.
-		if(b != nil) {
-			b->next = work.full;
-			work.full = b;
-		}
-		// Grab from empty list if possible.
-		b = work.empty;
-		if(b != nil) {
-			work.empty = b->next;
-			goto haveb;
-		}
-	} else {
-		// Put b on full list.
-		if(b != nil) {
-			runtime·lock(&work.fmu);
-			b->next = work.full;
-			work.full = b;
-			runtime·unlock(&work.fmu);
-		}
-		// Grab from empty list if possible.
-		runtime·lock(&work.emu);
-		b = work.empty;
-		if(b != nil)
-			work.empty = b->next;
-		runtime·unlock(&work.emu);
-		if(b != nil)
-			goto haveb;
-	}
-
-	// Need to allocate.
-	runtime·lock(&work);
-	if(work.nchunk < sizeof *b) {
-		work.nchunk = 1<<20;
-		work.chunk = runtime·SysAlloc(work.nchunk);
-	}
-	b = (Workbuf*)work.chunk;
-	work.chunk += sizeof *b;
-	work.nchunk -= sizeof *b;
-	runtime·unlock(&work);
-
-haveb:
+	if(b != nil)
+		runtime·lfstackpush(&work.full, &b->node);
+	b = (Workbuf*)runtime·lfstackpop(&work.empty);
+	if(b == nil) {
+		// Need to allocate.
+		runtime·lock(&work);
+		if(work.nchunk < sizeof *b) {
+			work.nchunk = 1<<20;
+			work.chunk = runtime·SysAlloc(work.nchunk);
+		}
+		b = (Workbuf*)work.chunk;
+		work.chunk += sizeof *b;
+		work.nchunk -= sizeof *b;
+		runtime·unlock(&work);
+	}
 	b->nobj = 0;
 	return b;
 }
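getempty no longer needs the nproc==1 fast path or the fmu/emu locks: both lists are now lock-free stacks. The runtime's lfstackpush/lfstackpop pack a pointer and a push counter into a single uint64 to defeat the ABA problem; the sketch below shows only the core compare-and-swap loop of such a stack in C11 atomics, without ABA protection, as a simplified model rather than the runtime's implementation. (Note also that putempty, just below, shrinks to one unconditional push; that is why scanblock above gained the if(wbuf) guard before calling it.)

#include <stdatomic.h>
#include <stddef.h>

typedef struct LFNode { struct LFNode *next; } LFNode;

void lfpush(_Atomic(LFNode*) *head, LFNode *node) {
	LFNode *old = atomic_load(head);
	do {
		node->next = old;                 /* link on top of current head */
	} while (!atomic_compare_exchange_weak(head, &old, node));
}

LFNode *lfpop(_Atomic(LFNode*) *head) {
	LFNode *old = atomic_load(head);
	/* Racy read of old->next is exactly the ABA hazard the runtime's
	   pointer+counter packing exists to defeat; fine for a sketch. */
	while (old != NULL &&
	       !atomic_compare_exchange_weak(head, &old, old->next))
		;                                 /* retry: another proc won */
	return old;
}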
@@ -455,19 +448,7 @@ haveb:
 static void
 putempty(Workbuf *b)
 {
-	if(b == nil)
-		return;
-
-	if(work.nproc == 1) {
-		b->next = work.empty;
-		work.empty = b;
-		return;
-	}
-
-	runtime·lock(&work.emu);
-	b->next = work.empty;
-	work.empty = b;
-	runtime·unlock(&work.emu);
+	runtime·lfstackpush(&work.empty, &b->node);
 }
 
 // Get a full work buffer off the work.full list, or return nil.
@@ -475,54 +456,21 @@ static Workbuf*
 getfull(Workbuf *b)
 {
 	int32 i;
-	Workbuf *b1;
 
-	if(work.nproc == 1) {
-		// Put b on empty list.
-		if(b != nil) {
-			b->next = work.empty;
-			work.empty = b;
-		}
-		// Grab from full list if possible.
-		// Since work.nproc==1, no one else is
-		// going to give us work.
-		b = work.full;
-		if(b != nil)
-			work.full = b->next;
-		return b;
-	}
-
-	putempty(b);
-
-	// Grab buffer from full list if possible.
-	for(;;) {
-		b1 = work.full;
-		if(b1 == nil)
-			break;
-		runtime·lock(&work.fmu);
-		if(work.full != nil) {
-			b1 = work.full;
-			work.full = b1->next;
-			runtime·unlock(&work.fmu);
-			return b1;
-		}
-		runtime·unlock(&work.fmu);
-	}
+	if(b != nil)
+		runtime·lfstackpush(&work.empty, &b->node);
+	b = (Workbuf*)runtime·lfstackpop(&work.full);
+	if(b != nil || work.nproc == 1)
+		return b;
 
 	runtime·xadd(&work.nwait, +1);
 	for(i=0;; i++) {
-		b1 = work.full;
-		if(b1 != nil) {
-			runtime·lock(&work.fmu);
-			if(work.full != nil) {
-				runtime·xadd(&work.nwait, -1);
-				b1 = work.full;
-				work.full = b1->next;
-				runtime·unlock(&work.fmu);
-				return b1;
-			}
-			runtime·unlock(&work.fmu);
-			continue;
-		}
+		if(work.full != 0) {
+			runtime·xadd(&work.nwait, -1);
+			b = (Workbuf*)runtime·lfstackpop(&work.full);
+			if(b != nil)
+				return b;
+			runtime·xadd(&work.nwait, +1);
+		}
 		if(work.nwait == work.nproc)
 			return nil;
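The blocking path is now built on two atomic counters instead of the fmu lock. A proc that runs out of work increments work.nwait before waiting; when it sees work.full != 0 it decrements nwait, tries to pop, and re-increments on failure. Marking terminates exactly when work.nwait == work.nproc while the full list is empty: every proc is idle and none can create more work. A compact model of that protocol; the names and the try_pop_full helper are illustrative, not the runtime's:

#include <stdatomic.h>
#include <stddef.h>

typedef struct Workbuf Workbuf;

extern _Atomic(Workbuf*) work_full;   /* stand-in for work.full */
extern atomic_uint       work_nwait;  /* procs currently idle */
extern unsigned          work_nproc;  /* procs participating in GC */
Workbuf *try_pop_full(void);          /* hypothetical lock-free pop */

Workbuf *wait_for_work(void) {
	atomic_fetch_add(&work_nwait, 1);           /* advertise: I am idle */
	for (;;) {
		if (atomic_load(&work_full) != NULL) {
			atomic_fetch_sub(&work_nwait, 1);   /* optimistic: not idle */
			Workbuf *b = try_pop_full();
			if (b != NULL)
				return b;                       /* won the race */
			atomic_fetch_add(&work_nwait, 1);   /* lost it: idle again */
		}
		/* All procs idle and the list empty: no one can create more
		   work, so the mark phase is complete. */
		if (atomic_load(&work_nwait) == work_nproc)
			return NULL;
		/* the real loop spins, then yields, then sleeps briefly */
	}
}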
@@ -555,17 +503,35 @@ handoff(Workbuf *b)
 	m->gcstats.nhandoffcnt += n;
 
 	// Put b on full list - let first half of b get stolen.
-	runtime·lock(&work.fmu);
-	b->next = work.full;
-	work.full = b;
-	runtime·unlock(&work.fmu);
-
+	runtime·lfstackpush(&work.full, &b->node);
 	return b1;
 }
 
-// Scanstack calls scanblock on each of gp's stack segments.
 static void
-scanstack(void (*scanblock)(byte*, int64), G *gp)
+addroot(byte *p, uintptr n)
+{
+	uint32 cap;
+	GcRoot *new;
+
+	if(work.nroot >= work.rootcap) {
+		cap = PageSize/sizeof(GcRoot);
+		if(cap < 2*work.rootcap)
+			cap = 2*work.rootcap;
+		new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
+		if(work.roots != nil) {
+			runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
+			runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
+		}
+		work.roots = new;
+		work.rootcap = cap;
+	}
+	work.roots[work.nroot].p = p;
+	work.roots[work.nroot].n = n;
+	work.nroot++;
+}
+
+static void
+addstackroots(G *gp)
 {
 	M *mp;
 	int32 n;
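Two ideas meet in this hunk. handoff (only its tail is shown above) answers a starving proc by splitting a full buffer: the caller keeps one half in a fresh buffer and publishes the original, now half-full, on the lock-free full list for an idle proc to steal. A hedged sketch of the split; all types and helper names here are schematic stand-ins, not the runtime's code:

#include <stddef.h>
#include <string.h>

/* Illustrative buffer shape; 509 matches the Workbuf arithmetic above. */
typedef struct Buf { size_t nobj; void *obj[509]; } Buf;

Buf *fresh(void);    /* stand-in for getempty(nil) */
void publish(Buf*);  /* stand-in for runtime·lfstackpush(&work.full, ...) */

Buf *handoff_sketch(Buf *b) {
	Buf *b1 = fresh();                /* new buffer for the half we keep */
	size_t n = b->nobj / 2;
	b->nobj -= n;                     /* first half stays in b */
	b1->nobj = n;
	memcpy(b1->obj, b->obj + b->nobj, n * sizeof b1->obj[0]);
	publish(b);                       /* let an idle proc steal b's half */
	return b1;                        /* caller keeps scanning from b1 */
}

addroot, meanwhile, grows the root array geometrically: each doubling copies the old capacity, so n appends copy fewer than n elements in total and appending stays amortized O(1) even though SysAlloc/SysFree move the whole array on every growth.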
@@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
 		}
 	}
 
-	if(Debug > 1)
-		runtime·printf("scanstack %d %p\n", gp->goid, sp);
 	n = 0;
 	while(stk) {
 		if(sp < guard-StackGuard || (byte*)stk < sp) {
 			runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
 			runtime·throw("scanstack");
 		}
-		scanblock(sp, (byte*)stk - sp);
+		addroot(sp, (byte*)stk - sp);
 		sp = stk->gobuf.sp;
 		guard = stk->stackguard;
 		stk = (Stktop*)stk->stackbase;
@@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
 	}
 }
 
-// Markfin calls scanblock on the blocks that have finalizers:
-// the things pointed at cannot be freed until the finalizers have run.
 static void
-markfin(void *v)
+addfinroots(void *v)
 {
 	uintptr size;
 
@@ -626,30 +588,22 @@ markfin(void *v)
 		runtime·throw("mark - finalizer inconsistency");
 
 	// do not mark the finalizer block itself. just mark the things it points at.
-	scanblock(v, size);
+	addroot(v, size);
 }
 
-static void
-debug_markfin(void *v)
-{
-	uintptr size;
-
-	if(!runtime·mlookup(v, &v, &size, nil))
-		runtime·throw("debug_mark - finalizer inconsistency");
-	debug_scanblock(v, size);
-}
-
-// Mark
 static void
-mark(void (*scan)(byte*, int64))
+addroots(void)
 {
 	G *gp;
 	FinBlock *fb;
+	byte *p;
+
+	work.nroot = 0;
 
 	// mark data+bss.
-	scan(data, ebss - data);
+	for(p=data; p<ebss; p+=DataBlock)
+		addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
 
 	// mark stacks
 	for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
 		switch(gp->status){
 		default:
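The chunking matters here: data/bss used to be scanned with a single scanblock call, one indivisible unit of mark work. Splitting it into DataBlock (8K) slices turns it into many independent roots that the parallel-for below can spread across procs. The boundary arithmetic, pulled out as a standalone helper with illustrative names:

#include <stddef.h>

enum { DataBlock = 8*1024 };

/* Split [lo, hi) into DataBlock-sized roots; the last slice may be short.
   add_root is a stand-in for the runtime's addroot. */
static void add_range_roots(unsigned char *lo, unsigned char *hi,
                            void (*add_root)(unsigned char *p, size_t n))
{
	unsigned char *p;

	for(p = lo; p < hi; p += DataBlock)
		add_root(p, (size_t)(p + DataBlock < hi ? DataBlock : hi - p));
}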
@@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64))
 		case Grunning:
 			if(gp != g)
 				runtime·throw("mark - world not stopped");
-			scanstack(scan, gp);
+			addstackroots(gp);
 			break;
 		case Grunnable:
 		case Gsyscall:
 		case Gwaiting:
-			scanstack(scan, gp);
+			addstackroots(gp);
 			break;
 		}
 	}
 
 	// mark things pointed at by objects with finalizers
-	if(scan == debug_scanblock)
-		runtime·walkfintab(debug_markfin);
-	else
-		runtime·walkfintab(markfin);
+	runtime·walkfintab(addfinroots);
 
 	for(fb=allfin; fb; fb=fb->alllink)
-		scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
-
-	// in multiproc mode, join in the queued work.
-	scan(nil, 0);
+		addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
 }
 
 static bool
@@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx)
 void
 runtime·gchelper(void)
 {
+	// parallel mark for over gc roots
+	runtime·parfordo(work.markfor);
+
 	// help other threads scan secondary blocks
 	scanblock(nil, 0);
 
 	if(DebugMark) {
@@ -902,6 +852,7 @@ runtime·gc(int32 force)
 	uint64 heap0, heap1, obj0, obj1;
 	byte *p;
 	GCStats stats;
+	uint32 i;
 
 	// The gc is turned off (via enablegc) until
 	// the bootstrap has completed.
@@ -953,6 +904,10 @@ runtime·gc(int32 force)
 	work.ndone = 0;
 	work.debugmarkdone = 0;
 	work.nproc = runtime·gcprocs();
+	addroots();
+	if(work.markfor == nil)
+		work.markfor = runtime·parforalloc(MaxGcproc);
+	runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
 	if(work.sweepfor == nil)
 		work.sweepfor = runtime·parforalloc(MaxGcproc);
 	runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
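Root marking now reuses the ParFor machinery already used for parallel sweep: parforsetup(desc, nproc, n, arg, wait, body) arranges for body(desc, i) to run once for every i in [0, n), split across the nproc procs that each call parfordo, with idle procs stealing iterations from busy ones. The sketch below models only that contract with a naive shared cursor, not the runtime's range-stealing scheme:

#include <stdatomic.h>
#include <stdint.h>

typedef struct ParFor ParFor;
struct ParFor {
	void (*body)(ParFor*, uint32_t);  /* e.g. markroot: scan work.roots[i] */
	uint32_t n;                       /* iteration count (work.nroot) */
	atomic_uint next;                 /* naive shared cursor */
};

/* Every participating proc calls this on the same descriptor; together
   they cover i = 0..n-1 exactly once. */
static void parfordo_sketch(ParFor *f) {
	uint32_t i;
	while ((i = atomic_fetch_add(&f->next, 1)) < f->n)
		f->body(f, i);
}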
@@ -961,9 +916,12 @@ runtime·gc(int32 force)
 		runtime·helpgc(work.nproc);
 	}
 
-	mark(scanblock);
+	runtime·parfordo(work.markfor);
+	scanblock(nil, 0);
+
 	if(DebugMark) {
-		mark(debug_scanblock);
+		for(i=0; i<work.nroot; i++)
+			debug_scanblock(work.roots[i].p, work.roots[i].n);
 		runtime·atomicstore(&work.debugmarkdone, 1);
 	}
 	t1 = runtime·nanotime();