diff --git a/src/pkg/runtime/malloc.h b/src/pkg/runtime/malloc.h index bc186981cd3..065f86a42a4 100644 --- a/src/pkg/runtime/malloc.h +++ b/src/pkg/runtime/malloc.h @@ -124,8 +124,8 @@ enum // Max number of threads to run garbage collection. // 2, 3, and 4 are all plausible maximums depending // on the hardware details of the machine. The garbage - // collector scales well to 4 cpus. - MaxGcproc = 4, + // collector scales well to 8 cpus. + MaxGcproc = 8, }; // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) diff --git a/src/pkg/runtime/mgc0.c b/src/pkg/runtime/mgc0.c index 4064f6916b4..d718b5aea96 100644 --- a/src/pkg/runtime/mgc0.c +++ b/src/pkg/runtime/mgc0.c @@ -13,6 +13,7 @@ enum { Debug = 0, PtrSize = sizeof(void*), DebugMark = 0, // run second pass to check mark + DataBlock = 8*1024, // Four bits per word (see #defines below). wordsPerBitmapWord = sizeof(void*)*8/4, @@ -72,9 +73,9 @@ static int32 gctrace; typedef struct Workbuf Workbuf; struct Workbuf { - Workbuf *next; + LFNode node; // must be first uintptr nobj; - byte *obj[512-2]; + byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)]; }; typedef struct Finalizer Finalizer; @@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*); static void putempty(Workbuf*); static Workbuf* handoff(Workbuf*); +typedef struct GcRoot GcRoot; +struct GcRoot +{ + byte *p; + uintptr n; +}; + static struct { - Lock fmu; - Workbuf *full; - Lock emu; - Workbuf *empty; + uint64 full; // lock-free list of full blocks + uint64 empty; // lock-free list of empty blocks + byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait uint32 nproc; volatile uint32 nwait; volatile uint32 ndone; volatile uint32 debugmarkdone; Note alldone; + ParFor *markfor; ParFor *sweepfor; Lock; byte *chunk; uintptr nchunk; + + GcRoot *roots; + uint32 nroot; + uint32 rootcap; } work; // scanblock scans a block of n bytes starting at pointer b for references @@ -162,7 +174,7 @@ scanblock(byte *b, int64 n) nobj = 0; // number of queued objects // Scanblock helpers pass b==nil. - // The main proc needs to return to make more + // Procs needs to return to make more // calls to scanblock. But if work.nproc==1 then // might as well process blocks as soon as we // have them. @@ -246,6 +258,14 @@ scanblock(byte *b, int64 n) bits = xbits >> shift; found: + // If another proc wants a pointer, give it some. + if(work.nwait > 0 && nobj > 4 && work.full == 0) { + wbuf->nobj = nobj; + wbuf = handoff(wbuf); + nobj = wbuf->nobj; + wp = wbuf->obj + nobj; + } + // Now we have bits, bitp, and shift correct for // obj pointing at the base of the object. // Only care about allocated and not marked. @@ -269,14 +289,6 @@ scanblock(byte *b, int64 n) PREFETCH(obj); - // If another proc wants a pointer, give it some. - if(nobj > 4 && work.nwait > 0 && work.full == nil) { - wbuf->nobj = nobj; - wbuf = handoff(wbuf); - nobj = wbuf->nobj; - wp = wbuf->obj + nobj; - } - // If buffer is full, get a new one. if(wbuf == nil || nobj >= nelem(wbuf->obj)) { if(wbuf != nil) @@ -296,7 +308,8 @@ scanblock(byte *b, int64 n) // Fetch b from the work buffer. if(nobj == 0) { if(!keepworking) { - putempty(wbuf); + if(wbuf) + putempty(wbuf); return; } // Emptied our buffer: refill. @@ -401,53 +414,33 @@ debug_scanblock(byte *b, int64 n) } } +static void +markroot(ParFor *desc, uint32 i) +{ + USED(&desc); + scanblock(work.roots[i].p, work.roots[i].n); +} + // Get an empty work buffer off the work.empty list, // allocating new buffers as needed. static Workbuf* getempty(Workbuf *b) { - if(work.nproc == 1) { - // Put b on full list. - if(b != nil) { - b->next = work.full; - work.full = b; + if(b != nil) + runtime·lfstackpush(&work.full, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.empty); + if(b == nil) { + // Need to allocate. + runtime·lock(&work); + if(work.nchunk < sizeof *b) { + work.nchunk = 1<<20; + work.chunk = runtime·SysAlloc(work.nchunk); } - // Grab from empty list if possible. - b = work.empty; - if(b != nil) { - work.empty = b->next; - goto haveb; - } - } else { - // Put b on full list. - if(b != nil) { - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - } - // Grab from empty list if possible. - runtime·lock(&work.emu); - b = work.empty; - if(b != nil) - work.empty = b->next; - runtime·unlock(&work.emu); - if(b != nil) - goto haveb; + b = (Workbuf*)work.chunk; + work.chunk += sizeof *b; + work.nchunk -= sizeof *b; + runtime·unlock(&work); } - - // Need to allocate. - runtime·lock(&work); - if(work.nchunk < sizeof *b) { - work.nchunk = 1<<20; - work.chunk = runtime·SysAlloc(work.nchunk); - } - b = (Workbuf*)work.chunk; - work.chunk += sizeof *b; - work.nchunk -= sizeof *b; - runtime·unlock(&work); - -haveb: b->nobj = 0; return b; } @@ -455,19 +448,7 @@ haveb: static void putempty(Workbuf *b) { - if(b == nil) - return; - - if(work.nproc == 1) { - b->next = work.empty; - work.empty = b; - return; - } - - runtime·lock(&work.emu); - b->next = work.empty; - work.empty = b; - runtime·unlock(&work.emu); + runtime·lfstackpush(&work.empty, &b->node); } // Get a full work buffer off the work.full list, or return nil. @@ -475,54 +456,21 @@ static Workbuf* getfull(Workbuf *b) { int32 i; - Workbuf *b1; - if(work.nproc == 1) { - // Put b on empty list. - if(b != nil) { - b->next = work.empty; - work.empty = b; - } - // Grab from full list if possible. - // Since work.nproc==1, no one else is - // going to give us work. - b = work.full; - if(b != nil) - work.full = b->next; + if(b != nil) + runtime·lfstackpush(&work.empty, &b->node); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil || work.nproc == 1) return b; - } - - putempty(b); - - // Grab buffer from full list if possible. - for(;;) { - b1 = work.full; - if(b1 == nil) - break; - runtime·lock(&work.fmu); - if(work.full != nil) { - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - } runtime·xadd(&work.nwait, +1); for(i=0;; i++) { - b1 = work.full; - if(b1 != nil) { - runtime·lock(&work.fmu); - if(work.full != nil) { - runtime·xadd(&work.nwait, -1); - b1 = work.full; - work.full = b1->next; - runtime·unlock(&work.fmu); - return b1; - } - runtime·unlock(&work.fmu); - continue; + if(work.full != 0) { + runtime·xadd(&work.nwait, -1); + b = (Workbuf*)runtime·lfstackpop(&work.full); + if(b != nil) + return b; + runtime·xadd(&work.nwait, +1); } if(work.nwait == work.nproc) return nil; @@ -555,17 +503,35 @@ handoff(Workbuf *b) m->gcstats.nhandoffcnt += n; // Put b on full list - let first half of b get stolen. - runtime·lock(&work.fmu); - b->next = work.full; - work.full = b; - runtime·unlock(&work.fmu); - + runtime·lfstackpush(&work.full, &b->node); return b1; } -// Scanstack calls scanblock on each of gp's stack segments. static void -scanstack(void (*scanblock)(byte*, int64), G *gp) +addroot(byte *p, uintptr n) +{ + uint32 cap; + GcRoot *new; + + if(work.nroot >= work.rootcap) { + cap = PageSize/sizeof(GcRoot); + if(cap < 2*work.rootcap) + cap = 2*work.rootcap; + new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot)); + if(work.roots != nil) { + runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot)); + runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot)); + } + work.roots = new; + work.rootcap = cap; + } + work.roots[work.nroot].p = p; + work.roots[work.nroot].n = n; + work.nroot++; +} + +static void +addstackroots(G *gp) { M *mp; int32 n; @@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) } } - if(Debug > 1) - runtime·printf("scanstack %d %p\n", gp->goid, sp); n = 0; while(stk) { if(sp < guard-StackGuard || (byte*)stk < sp) { runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); runtime·throw("scanstack"); } - scanblock(sp, (byte*)stk - sp); + addroot(sp, (byte*)stk - sp); sp = stk->gobuf.sp; guard = stk->stackguard; stk = (Stktop*)stk->stackbase; @@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) } } -// Markfin calls scanblock on the blocks that have finalizers: -// the things pointed at cannot be freed until the finalizers have run. static void -markfin(void *v) +addfinroots(void *v) { uintptr size; @@ -626,30 +588,22 @@ markfin(void *v) runtime·throw("mark - finalizer inconsistency"); // do not mark the finalizer block itself. just mark the things it points at. - scanblock(v, size); + addroot(v, size); } static void -debug_markfin(void *v) -{ - uintptr size; - - if(!runtime·mlookup(v, &v, &size, nil)) - runtime·throw("debug_mark - finalizer inconsistency"); - debug_scanblock(v, size); -} - -// Mark -static void -mark(void (*scan)(byte*, int64)) +addroots(void) { G *gp; FinBlock *fb; + byte *p; + + work.nroot = 0; // mark data+bss. - scan(data, ebss - data); + for(p=data; palllink) { switch(gp->status){ default: @@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64)) case Grunning: if(gp != g) runtime·throw("mark - world not stopped"); - scanstack(scan, gp); + addstackroots(gp); break; case Grunnable: case Gsyscall: case Gwaiting: - scanstack(scan, gp); + addstackroots(gp); break; } } - // mark things pointed at by objects with finalizers - if(scan == debug_scanblock) - runtime·walkfintab(debug_markfin); - else - runtime·walkfintab(markfin); + runtime·walkfintab(addfinroots); for(fb=allfin; fb; fb=fb->alllink) - scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); - - // in multiproc mode, join in the queued work. - scan(nil, 0); + addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); } static bool @@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx) void runtime·gchelper(void) { + // parallel mark for over gc roots + runtime·parfordo(work.markfor); + // help other threads scan secondary blocks scanblock(nil, 0); if(DebugMark) { @@ -902,6 +852,7 @@ runtime·gc(int32 force) uint64 heap0, heap1, obj0, obj1; byte *p; GCStats stats; + uint32 i; // The gc is turned off (via enablegc) until // the bootstrap has completed. @@ -953,6 +904,10 @@ runtime·gc(int32 force) work.ndone = 0; work.debugmarkdone = 0; work.nproc = runtime·gcprocs(); + addroots(); + if(work.markfor == nil) + work.markfor = runtime·parforalloc(MaxGcproc); + runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot); if(work.sweepfor == nil) work.sweepfor = runtime·parforalloc(MaxGcproc); runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan); @@ -961,9 +916,12 @@ runtime·gc(int32 force) runtime·helpgc(work.nproc); } - mark(scanblock); + runtime·parfordo(work.markfor); + scanblock(nil, 0); + if(DebugMark) { - mark(debug_scanblock); + for(i=0; i