runtime: faster GC mark phase

Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, ⚛
CC=golang-dev
https://golang.org/cl/6223050
commit b0702bd0db
parent 82e2272566
Author: Dmitriy Vyukov
Date:   2012-05-24 10:55:50 +04:00

2 changed files with 113 additions and 155 deletions

src/pkg/runtime/malloc.h

@@ -124,8 +124,8 @@ enum
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The garbage
// collector scales well to 4 cpus.
MaxGcproc = 4,
// collector scales well to 8 cpus.
MaxGcproc = 8,
};
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)

src/pkg/runtime/mgc0.c

@@ -13,6 +13,7 @@ enum {
Debug = 0,
PtrSize = sizeof(void*),
DebugMark = 0, // run second pass to check mark
DataBlock = 8*1024,
// Four bits per word (see #defines below).
wordsPerBitmapWord = sizeof(void*)*8/4,
@@ -72,9 +73,9 @@ static int32 gctrace;
typedef struct Workbuf Workbuf;
struct Workbuf
{
Workbuf *next;
LFNode node; // must be first
uintptr nobj;
byte *obj[512-2];
byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
};
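
The LFNode must sit first so the runtime can cast between a Workbuf* and its node address: pushes take &b->node and the result of a pop is cast straight back to Workbuf*, as in getempty/getfull/putempty below. For reference, a minimal Treiber-stack sketch in portable C11 — an approximation only, since the real runtime·lfstackpush/runtime·lfstackpop pack the node pointer and an ABA counter into a single uint64 and CAS that; here a double-width CAS on a tagged head stands in:

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>

typedef struct LFNode LFNode;
struct LFNode {
	LFNode	*next;
};

/* The head carries a version counter so pop's CAS fails if the same
 * node was concurrently popped and re-pushed (the ABA problem). */
typedef struct {
	LFNode		*node;
	uintptr_t	ver;
} LFHead;

static _Atomic LFHead top;	/* 16-byte CAS; gcc may need -latomic */

static void
lfpush(LFNode *n)
{
	LFHead old = atomic_load(&top), neu;

	do {
		n->next = old.node;
		neu.node = n;
		neu.ver = old.ver + 1;
	} while(!atomic_compare_exchange_weak(&top, &old, neu));
}

static LFNode*
lfpop(void)
{
	LFHead old = atomic_load(&top), neu;

	do {
		if(old.node == NULL)
			return NULL;
		/* Reading old.node->next is safe only because workbufs
		 * are never freed for the life of the process. */
		neu.node = old.node->next;
		neu.ver = old.ver + 1;
	} while(!atomic_compare_exchange_weak(&top, &old, neu));
	return old.node;
}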
typedef struct Finalizer Finalizer;
@@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*);
static void putempty(Workbuf*);
static Workbuf* handoff(Workbuf*);
typedef struct GcRoot GcRoot;
struct GcRoot
{
byte *p;
uintptr n;
};
static struct {
Lock fmu;
Workbuf *full;
Lock emu;
Workbuf *empty;
uint64 full; // lock-free list of full blocks
uint64 empty; // lock-free list of empty blocks
byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
uint32 nproc;
volatile uint32 nwait;
volatile uint32 ndone;
volatile uint32 debugmarkdone;
Note alldone;
ParFor *markfor;
ParFor *sweepfor;
Lock;
byte *chunk;
uintptr nchunk;
GcRoot *roots;
uint32 nroot;
uint32 rootcap;
} work;
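
pad0 matters because full and empty are CAS targets hammered by every proc, while nwait/ndone are xadd targets; without padding they could share a cache line and every atomic on one would stall spins on the other. A minimal layout sketch (the 64-byte CacheLineSize is an assumption here; the runtime defines it per architecture):

#include <stdint.h>

enum { CacheLineSize = 64 };	/* assumed; per-arch in the runtime */

struct WorkLayout {
	uint64_t	full;	/* lock-free list head: CAS'd by all procs */
	uint64_t	empty;	/* lock-free list head: CAS'd by all procs */
	uint8_t		pad0[CacheLineSize];	/* keeps the counters below
						 * off the lists' cache line */
	uint32_t	nproc;
	volatile uint32_t nwait;	/* xadd'd in getfull/handoff */
};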
// scanblock scans a block of n bytes starting at pointer b for references
@@ -162,7 +174,7 @@ scanblock(byte *b, int64 n)
nobj = 0; // number of queued objects
// Scanblock helpers pass b==nil.
// The main proc needs to return to make more
// Procs need to return to make more
// calls to scanblock. But if work.nproc==1 then
// might as well process blocks as soon as we
// have them.
@@ -246,6 +258,14 @@ scanblock(byte *b, int64 n)
bits = xbits >> shift;
found:
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > 4 && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object.
// Only care about allocated and not marked.
@@ -269,14 +289,6 @@ scanblock(byte *b, int64 n)
PREFETCH(obj);
// If another proc wants a pointer, give it some.
if(nobj > 4 && work.nwait > 0 && work.full == nil) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil)
@@ -296,7 +308,8 @@ scanblock(byte *b, int64 n)
// Fetch b from the work buffer.
if(nobj == 0) {
if(!keepworking) {
putempty(wbuf);
if(wbuf)
putempty(wbuf);
return;
}
// Emptied our buffer: refill.
@@ -401,53 +414,33 @@ debug_scanblock(byte *b, int64 n)
}
}
static void
markroot(ParFor *desc, uint32 i)
{
USED(&desc);
scanblock(work.roots[i].p, work.roots[i].n);
}
// Get an empty work buffer off the work.empty list,
// allocating new buffers as needed.
static Workbuf*
getempty(Workbuf *b)
{
if(work.nproc == 1) {
// Put b on full list.
if(b != nil) {
b->next = work.full;
work.full = b;
if(b != nil)
runtime·lfstackpush(&work.full, &b->node);
b = (Workbuf*)runtime·lfstackpop(&work.empty);
if(b == nil) {
// Need to allocate.
runtime·lock(&work);
if(work.nchunk < sizeof *b) {
work.nchunk = 1<<20;
work.chunk = runtime·SysAlloc(work.nchunk);
}
// Grab from empty list if possible.
b = work.empty;
if(b != nil) {
work.empty = b->next;
goto haveb;
}
} else {
// Put b on full list.
if(b != nil) {
runtime·lock(&work.fmu);
b->next = work.full;
work.full = b;
runtime·unlock(&work.fmu);
}
// Grab from empty list if possible.
runtime·lock(&work.emu);
b = work.empty;
if(b != nil)
work.empty = b->next;
runtime·unlock(&work.emu);
if(b != nil)
goto haveb;
b = (Workbuf*)work.chunk;
work.chunk += sizeof *b;
work.nchunk -= sizeof *b;
runtime·unlock(&work);
}
// Need to allocate.
runtime·lock(&work);
if(work.nchunk < sizeof *b) {
work.nchunk = 1<<20;
work.chunk = runtime·SysAlloc(work.nchunk);
}
b = (Workbuf*)work.chunk;
work.chunk += sizeof *b;
work.nchunk -= sizeof *b;
runtime·unlock(&work);
haveb:
b->nobj = 0;
return b;
}
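
On the slow path above, getempty still takes the (single, anonymous) work lock to carve a Workbuf out of a 1MB SysAlloc'd chunk; any unused tail of the previous chunk is abandoned, which is fine because workbufs are never freed. A standalone sketch of that fallback, with a pthread mutex and malloc standing in for the runtime lock and runtime·SysAlloc:

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

enum { ChunkSize = 1<<20 };	/* matches the 1MB chunk in getempty */

static pthread_mutex_t worklock = PTHREAD_MUTEX_INITIALIZER;
static uint8_t *chunk;		/* cursor into the current chunk */
static uintptr_t nchunk;	/* bytes left in the current chunk */

static void*
bumpalloc(uintptr_t size)
{
	void *p;

	pthread_mutex_lock(&worklock);
	if(nchunk < size) {
		/* The old remainder is abandoned, as in the runtime:
		 * these buffers live for the life of the process.
		 * (Allocation-failure handling elided.) */
		chunk = malloc(ChunkSize);
		nchunk = ChunkSize;
	}
	p = chunk;
	chunk += size;
	nchunk -= size;
	pthread_mutex_unlock(&worklock);
	return p;
}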
@@ -455,19 +448,7 @@ haveb:
static void
putempty(Workbuf *b)
{
if(b == nil)
return;
if(work.nproc == 1) {
b->next = work.empty;
work.empty = b;
return;
}
runtime·lock(&work.emu);
b->next = work.empty;
work.empty = b;
runtime·unlock(&work.emu);
runtime·lfstackpush(&work.empty, &b->node);
}
// Get a full work buffer off the work.full list, or return nil.
@@ -475,54 +456,21 @@ static Workbuf*
getfull(Workbuf *b)
{
int32 i;
Workbuf *b1;
if(work.nproc == 1) {
// Put b on empty list.
if(b != nil) {
b->next = work.empty;
work.empty = b;
}
// Grab from full list if possible.
// Since work.nproc==1, no one else is
// going to give us work.
b = work.full;
if(b != nil)
work.full = b->next;
if(b != nil)
runtime·lfstackpush(&work.empty, &b->node);
b = (Workbuf*)runtime·lfstackpop(&work.full);
if(b != nil || work.nproc == 1)
return b;
}
putempty(b);
// Grab buffer from full list if possible.
for(;;) {
b1 = work.full;
if(b1 == nil)
break;
runtime·lock(&work.fmu);
if(work.full != nil) {
b1 = work.full;
work.full = b1->next;
runtime·unlock(&work.fmu);
return b1;
}
runtime·unlock(&work.fmu);
}
runtime·xadd(&work.nwait, +1);
for(i=0;; i++) {
b1 = work.full;
if(b1 != nil) {
runtime·lock(&work.fmu);
if(work.full != nil) {
runtime·xadd(&work.nwait, -1);
b1 = work.full;
work.full = b1->next;
runtime·unlock(&work.fmu);
return b1;
}
runtime·unlock(&work.fmu);
continue;
if(work.full != 0) {
runtime·xadd(&work.nwait, -1);
b = (Workbuf*)runtime·lfstackpop(&work.full);
if(b != nil)
return b;
runtime·xadd(&work.nwait, +1);
}
if(work.nwait == work.nproc)
return nil;
@@ -555,17 +503,35 @@ handoff(Workbuf *b)
m->gcstats.nhandoffcnt += n;
// Put b on full list - let first half of b get stolen.
runtime·lock(&work.fmu);
b->next = work.full;
work.full = b;
runtime·unlock(&work.fmu);
runtime·lfstackpush(&work.full, &b->node);
return b1;
}
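
Only handoff's tail appears in this hunk. Per the comment, the scheme is: keep half of the private buffer, publish the other half on work.full for an idle proc to steal. A self-contained sketch of the split step (split_for_handoff and MaxObj are illustrative names, not the runtime's):

#include <stdint.h>
#include <string.h>

enum { MaxObj = 510 };

typedef struct Workbuf Workbuf;
struct Workbuf {
	uintptr_t	nobj;
	uint8_t		*obj[MaxObj];
};

/* Move the top half of b's pointers into fresh buffer b1 for the
 * donor to keep; the bottom half stays in b, which the caller then
 * pushes onto the shared full list for an idle proc. */
static Workbuf*
split_for_handoff(Workbuf *b, Workbuf *b1)
{
	uintptr_t n = b->nobj/2;

	b->nobj -= n;
	b1->nobj = n;
	memcpy(b1->obj, b->obj + b->nobj, n * sizeof b1->obj[0]);
	return b1;
}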
// Scanstack calls scanblock on each of gp's stack segments.
static void
scanstack(void (*scanblock)(byte*, int64), G *gp)
addroot(byte *p, uintptr n)
{
uint32 cap;
GcRoot *new;
if(work.nroot >= work.rootcap) {
cap = PageSize/sizeof(GcRoot);
if(cap < 2*work.rootcap)
cap = 2*work.rootcap;
new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
if(work.roots != nil) {
runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
}
work.roots = new;
work.rootcap = cap;
}
work.roots[work.nroot].p = p;
work.roots[work.nroot].n = n;
work.nroot++;
}
static void
addstackroots(G *gp)
{
M *mp;
int32 n;
@@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
}
}
if(Debug > 1)
runtime·printf("scanstack %d %p\n", gp->goid, sp);
n = 0;
while(stk) {
if(sp < guard-StackGuard || (byte*)stk < sp) {
runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
runtime·throw("scanstack");
}
scanblock(sp, (byte*)stk - sp);
addroot(sp, (byte*)stk - sp);
sp = stk->gobuf.sp;
guard = stk->stackguard;
stk = (Stktop*)stk->stackbase;
@@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
}
}
// Markfin calls scanblock on the blocks that have finalizers:
// the things pointed at cannot be freed until the finalizers have run.
static void
markfin(void *v)
addfinroots(void *v)
{
uintptr size;
@@ -626,30 +588,22 @@ markfin(void *v)
runtime·throw("mark - finalizer inconsistency");
// do not mark the finalizer block itself. just mark the things it points at.
scanblock(v, size);
addroot(v, size);
}
static void
debug_markfin(void *v)
{
uintptr size;
if(!runtime·mlookup(v, &v, &size, nil))
runtime·throw("debug_mark - finalizer inconsistency");
debug_scanblock(v, size);
}
// Mark
static void
mark(void (*scan)(byte*, int64))
addroots(void)
{
G *gp;
FinBlock *fb;
byte *p;
work.nroot = 0;
// mark data+bss.
scan(data, ebss - data);
for(p=data; p<ebss; p+=DataBlock)
addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
// mark stacks
for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
switch(gp->status){
default:
@@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64))
case Grunning:
if(gp != g)
runtime·throw("mark - world not stopped");
scanstack(scan, gp);
addstackroots(gp);
break;
case Grunnable:
case Gsyscall:
case Gwaiting:
scanstack(scan, gp);
addstackroots(gp);
break;
}
}
// mark things pointed at by objects with finalizers
if(scan == debug_scanblock)
runtime·walkfintab(debug_markfin);
else
runtime·walkfintab(markfin);
runtime·walkfintab(addfinroots);
for(fb=allfin; fb; fb=fb->alllink)
scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
// in multiproc mode, join in the queued work.
scan(nil, 0);
addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
}
static bool
@@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx)
void
runtime·gchelper(void)
{
// run the parallel mark over the gc roots
runtime·parfordo(work.markfor);
// help other threads scan secondary blocks
scanblock(nil, 0);
if(DebugMark) {
@@ -902,6 +852,7 @@ runtime·gc(int32 force)
uint64 heap0, heap1, obj0, obj1;
byte *p;
GCStats stats;
uint32 i;
// The gc is turned off (via enablegc) until
// the bootstrap has completed.
@@ -953,6 +904,10 @@ runtime·gc(int32 force)
work.ndone = 0;
work.debugmarkdone = 0;
work.nproc = runtime·gcprocs();
addroots();
if(work.markfor == nil)
work.markfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
if(work.sweepfor == nil)
work.sweepfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
@@ -961,9 +916,12 @@ runtime·gc(int32 force)
runtime·helpgc(work.nproc);
}
mark(scanblock);
runtime·parfordo(work.markfor);
scanblock(nil, 0);
if(DebugMark) {
mark(debug_scanblock);
for(i=0; i<work.nroot; i++)
debug_scanblock(work.roots[i].p, work.roots[i].n);
runtime·atomicstore(&work.debugmarkdone, 1);
}
t1 = runtime·nanotime();
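
Put together, the new mark phase is: addroots() flattens the data/bss chunks, goroutine stacks, and finalizer blocks into work.roots; parfordo(work.markfor) fans markroot out across work.nproc threads (gchelper runs the same pair on the helper procs); scanblock(nil, 0) then drains the shared full list until every proc is idle. A toy stand-in for the per-thread loop, using a single atomic counter where the real parfor deals out index ranges and steals between threads:

#include <stdatomic.h>
#include <stdint.h>

typedef struct {
	uint8_t		*p;
	uintptr_t	n;
} GcRoot;

/* Provided elsewhere: the flat root array built by addroots, and the
 * scanner (scanblock(NULL, 0) drains shared work buffers). */
extern GcRoot	roots[];
extern uint32_t	nroot;
extern void	scanblock(uint8_t *b, int64_t n);

static _Atomic uint32_t pos;	/* next unclaimed root index */

/* Run by each of the nproc marker threads. */
static void
markworker(void)
{
	uint32_t i;

	while((i = atomic_fetch_add(&pos, 1)) < nroot)
		scanblock(roots[i].p, roots[i].n);	/* as in markroot */
	scanblock(NULL, 0);	/* then help drain the shared full list */
}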