runtime: rewrite malloc in Go.

This change introduces gomallocgc, a Go clone of mallocgc.
Only a few uses have been moved over, so there are still
lots of uses from C. Many of these C uses will be moved
over to Go (e.g. in slice.goc), but probably not all.
What should remain of C's mallocgc is an open question.

LGTM=rsc, dvyukov
R=rsc, khr, dave, bradfitz, dvyukov
CC=golang-codereviews
https://golang.org/cl/108840046
Keith Randall 2014-07-30 09:01:52 -07:00
parent fe4fc94b04
commit 4aa50434e1
26 changed files with 992 additions and 408 deletions


@ -378,7 +378,7 @@ func (w *Walker) parseFile(dir, file string) (*ast.File, error) {
}
if w.context != nil && file == fmt.Sprintf("zruntime_defs_%s_%s.go", w.context.GOOS, w.context.GOARCH) {
// Just enough to keep the api checker happy.
src := "package runtime; type maptype struct{}; type _type struct{}; type alg struct{}"
src := "package runtime; type maptype struct{}; type _type struct{}; type alg struct{}; type mspan struct{}; type m struct{}; type lock struct{}"
f, err = parser.ParseFile(fset, filename, src, 0)
if err != nil {
log.Fatalf("incorrect generated file: %s", err)


@ -206,16 +206,36 @@ printtypename(Type *t)
Bprint(&outbuf, "uint16");
break;
case TLONG:
Bprint(&outbuf, "int32");
// The 32/64-bit ambiguous types (int,uint,uintptr)
// are assigned a TLONG/TULONG to distinguish them
// from always 32-bit types which get a TINT/TUINT.
// (See int_x/uint_x in pkg/runtime/runtime.h.)
// For LONG and VLONG types, we generate the
// unqualified Go type when appropriate.
// This makes it easier to write Go code that
// modifies objects with autogenerated-from-C types.
if(ewidth[TIND] == 4)
Bprint(&outbuf, "int");
else
Bprint(&outbuf, "int32");
break;
case TULONG:
Bprint(&outbuf, "uint32");
if(ewidth[TIND] == 4)
Bprint(&outbuf, "uint");
else
Bprint(&outbuf, "uint32");
break;
case TVLONG:
Bprint(&outbuf, "int64");
if(ewidth[TIND] == 8)
Bprint(&outbuf, "int");
else
Bprint(&outbuf, "int64");
break;
case TUVLONG:
Bprint(&outbuf, "uint64");
if(ewidth[TIND] == 8)
Bprint(&outbuf, "uint");
else
Bprint(&outbuf, "uint64");
break;
case TFLOAT:
Bprint(&outbuf, "float32");
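
The comment above is the substance of this hunk: the 32/64-bit-ambiguous C types (TLONG/TULONG, and the 64-bit TVLONG/TUVLONG) now print as the unqualified Go int/uint when their width matches the pointer width. A minimal standalone sketch of the resulting mapping, with ptrWidth standing in for ewidth[TIND] (illustrative only, not code from this CL):

package main

import "fmt"

// goTypeFor mirrors the printtypename logic above: the 32/64-bit-ambiguous
// C types become the unqualified Go int/uint when their width equals the
// pointer width; everything else keeps its fixed-width Go name.
func goTypeFor(ctype string, ptrWidth int) string {
	switch ctype {
	case "TLONG":
		if ptrWidth == 4 {
			return "int"
		}
		return "int32"
	case "TULONG":
		if ptrWidth == 4 {
			return "uint"
		}
		return "uint32"
	case "TVLONG":
		if ptrWidth == 8 {
			return "int"
		}
		return "int64"
	case "TUVLONG":
		if ptrWidth == 8 {
			return "uint"
		}
		return "uint64"
	}
	return "?"
}

func main() {
	for _, w := range []int{4, 8} {
		for _, t := range []string{"TLONG", "TULONG", "TVLONG", "TUVLONG"} {
			fmt.Printf("ewidth[TIND]=%d  %-7s -> %s\n", w, t, goTypeFor(t, w))
		}
	}
}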


@ -2,7 +2,7 @@
char *runtimeimport =
"package runtime\n"
"import runtime \"runtime\"\n"
"func @\"\".new (@\"\".typ·2 *byte) (? *any)\n"
"func @\"\".newobject (@\"\".typ·2 *byte) (? *any)\n"
"func @\"\".panicindex ()\n"
"func @\"\".panicslice ()\n"
"func @\"\".panicdivide ()\n"


@ -12,7 +12,7 @@ package PACKAGE
// emitted by compiler, not referred to by go programs
func new(typ *byte) *any
func newobject(typ *byte) *any
func panicindex()
func panicslice()
func panicdivide()


@ -1915,7 +1915,7 @@ callnew(Type *t)
Node *fn;
dowidth(t);
fn = syslook("new", 1);
fn = syslook("newobject", 1);
argtype(fn, t);
return mkcall1(fn, ptrto(t), nil, typename(t));
}


@ -195,6 +195,56 @@ TEXT runtime·mcall(SB), NOSPLIT, $0-4
JMP AX
RET
// switchtoM is a dummy routine that onM leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the M stack because the one at the top of
// the M stack terminates the stack walk (see topofstack()).
TEXT runtime·switchtoM(SB), NOSPLIT, $0-4
RET
// void onM(void (*fn)())
// calls fn() on the M stack.
// switches to the M stack if not already on it, and
// switches back when fn() returns.
TEXT runtime·onM(SB), NOSPLIT, $0-4
MOVL fn+0(FP), DI // DI = fn
get_tls(CX)
MOVL g(CX), AX // AX = g
MOVL g_m(AX), BX // BX = m
MOVL m_g0(BX), DX // DX = g0
CMPL AX, DX
JEQ onm
// save our state in g->sched. Pretend to
// be switchtoM if the G stack is scanned.
MOVL $runtime·switchtoM(SB), (g_sched+gobuf_pc)(AX)
MOVL SP, (g_sched+gobuf_sp)(AX)
MOVL AX, (g_sched+gobuf_g)(AX)
// switch to g0
MOVL DX, g(CX)
MOVL (g_sched+gobuf_sp)(DX), SP
// call target function
ARGSIZE(0)
CALL DI
// switch back to g
get_tls(CX)
MOVL g(CX), AX
MOVL g_m(AX), BX
MOVL m_curg(BX), AX
MOVL AX, g(CX)
MOVL (g_sched+gobuf_sp)(AX), SP
MOVL $0, (g_sched+gobuf_sp)(AX)
RET
onm:
// already on m stack, just call directly
CALL DI
RET
/*
* support for morestack
*/


@ -186,6 +186,56 @@ TEXT runtime·mcall(SB), NOSPLIT, $0-8
JMP AX
RET
// switchtoM is a dummy routine that onM leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the M stack because the one at the top of
// the M stack terminates the stack walk (see topofstack()).
TEXT runtime·switchtoM(SB), NOSPLIT, $0-8
RET
// void onM(void (*fn)())
// calls fn() on the M stack.
// switches to the M stack if not already on it, and
// switches back when fn() returns.
TEXT runtime·onM(SB), NOSPLIT, $0-8
MOVQ fn+0(FP), DI // DI = fn
get_tls(CX)
MOVQ g(CX), AX // AX = g
MOVQ g_m(AX), BX // BX = m
MOVQ m_g0(BX), DX // DX = g0
CMPQ AX, DX
JEQ onm
// save our state in g->sched. Pretend to
// be switchtoM if the G stack is scanned.
MOVQ $runtime·switchtoM(SB), (g_sched+gobuf_pc)(AX)
MOVQ SP, (g_sched+gobuf_sp)(AX)
MOVQ AX, (g_sched+gobuf_g)(AX)
// switch to g0
MOVQ DX, g(CX)
MOVQ (g_sched+gobuf_sp)(DX), SP
// call target function
ARGSIZE(0)
CALL DI
// switch back to g
get_tls(CX)
MOVQ g(CX), AX
MOVQ g_m(AX), BX
MOVQ m_curg(BX), AX
MOVQ AX, g(CX)
MOVQ (g_sched+gobuf_sp)(AX), SP
MOVQ $0, (g_sched+gobuf_sp)(AX)
RET
onm:
// already on m stack, just call directly
CALL DI
RET
/*
* support for morestack
*/


@ -165,6 +165,57 @@ TEXT runtime·mcall(SB), NOSPLIT, $0-4
JMP AX
RET
// switchtoM is a dummy routine that onM leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the M stack because the one at the top of
// the M stack terminates the stack walk (see topofstack()).
TEXT runtime·switchtoM(SB), NOSPLIT, $0-4
RET
// void onM(void (*fn)())
// calls fn() on the M stack.
// switches to the M stack if not already on it, and
// switches back when fn() returns.
TEXT runtime·onM(SB), NOSPLIT, $0-4
MOVL fn+0(FP), DI // DI = fn
get_tls(CX)
MOVL g(CX), AX // AX = g
MOVL g_m(AX), BX // BX = m
MOVL m_g0(BX), DX // DX = g0
CMPL AX, DX
JEQ onm
// save our state in g->sched. Pretend to
// be switchtoM if the G stack is scanned.
MOVL $runtime·switchtoM(SB), SI
MOVL SI, (g_sched+gobuf_pc)(AX)
MOVL SP, (g_sched+gobuf_sp)(AX)
MOVL AX, (g_sched+gobuf_g)(AX)
// switch to g0
MOVL DX, g(CX)
MOVL (g_sched+gobuf_sp)(DX), SP
// call target function
ARGSIZE(0)
CALL DI
// switch back to g
get_tls(CX)
MOVL g(CX), AX
MOVL g_m(AX), BX
MOVL m_curg(BX), AX
MOVL AX, g(CX)
MOVL (g_sched+gobuf_sp)(AX), SP
MOVL $0, (g_sched+gobuf_sp)(AX)
RET
onm:
// already on m stack, just call directly
CALL DI
RET
/*
* support for morestack
*/


@ -178,6 +178,56 @@ TEXT runtime·mcall(SB), NOSPLIT, $-4-4
B runtime·badmcall2(SB)
RET
// switchtoM is a dummy routine that onM leaves at the bottom
// of the G stack. We need to distinguish the routine that
// lives at the bottom of the G stack from the one that lives
// at the top of the M stack because the one at the top of
// the M stack terminates the stack walk (see topofstack()).
TEXT runtime·switchtoM(SB), NOSPLIT, $0-4
MOVW $0, R0
BL (R0) // clobber lr to ensure push {lr} is kept
RET
// void onM(void (*fn)())
// calls fn() on the M stack.
// switches to the M stack if not already on it, and
// switches back when fn() returns.
TEXT runtime·onM(SB), NOSPLIT, $0-4
MOVW fn+0(FP), R0 // R0 = fn
MOVW g_m(g), R1 // R1 = m
MOVW m_g0(R1), R2 // R2 = g0
CMP g, R2
B.EQ onm
// save our state in g->sched. Pretend to
// be switchtoM if the G stack is scanned.
MOVW $runtime·switchtoM(SB), R3
ADD $4, R3, R3 // get past push {lr}
MOVW R3, (g_sched+gobuf_pc)(g)
MOVW SP, (g_sched+gobuf_sp)(g)
MOVW LR, (g_sched+gobuf_lr)(g)
MOVW g, (g_sched+gobuf_g)(g)
// switch to g0
MOVW R2, g
MOVW (g_sched+gobuf_sp)(R2), SP
// call target function
ARGSIZE(0)
BL (R0)
// switch back to g
MOVW g_m(g), R1
MOVW m_curg(R1), g
MOVW (g_sched+gobuf_sp)(g), SP
MOVW $0, R3
MOVW R3, (g_sched+gobuf_sp)(g)
RET
onm:
BL (R0)
RET
/*
* support for morestack
*/
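
The switchtoM/onM comments above, repeated for each architecture, describe the bridge the rest of this CL depends on: Go code stores its arguments in fixed slots on the current M, calls onM to run a C helper on the M (g0) stack, and reads any results back out of the same slots. A minimal standalone simulation of that calling convention follows; the types are stand-ins, the real callers pin the M with acquirem/releasem, and the real onM(&mcacheRefill) takes the address of a C symbol rather than a Go func:

package main

import (
	"fmt"
	"unsafe"
)

// Stand-in for the runtime's M: arguments and results travel through fixed
// scalar and pointer slots while a helper runs on the M stack.
type m struct {
	scalararg [4]uintptr
	ptrarg    [4]unsafe.Pointer
}

var thisM m // the real code pins the M with acquirem/releasem instead

// onM stands in for runtime·onM: the real routine switches to the g0 stack,
// calls fn, and switches back; here it simply calls fn.
func onM(fn func()) { fn() }

// mcacheRefill plays the role of the C helper of the same name added in this
// CL: it reads its size-class argument out of the M's scalararg[0] slot.
func mcacheRefill() {
	sizeclass := thisM.scalararg[0]
	fmt.Println("refill mcache for size class", sizeclass)
}

func main() {
	// The Go-side caller; compare gomallocgc in malloc.go later in this CL.
	thisM.scalararg[0] = 2 // tinySizeClass
	onM(mcacheRefill)
}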


@ -132,50 +132,6 @@ func funcline_go(*Func, uintptr) (string, int)
func funcname_go(*Func) string
func funcentry_go(*Func) uintptr
// SetFinalizer sets the finalizer associated with x to f.
// When the garbage collector finds an unreachable block
// with an associated finalizer, it clears the association and runs
// f(x) in a separate goroutine. This makes x reachable again, but
// now without an associated finalizer. Assuming that SetFinalizer
// is not called again, the next time the garbage collector sees
// that x is unreachable, it will free x.
//
// SetFinalizer(x, nil) clears any finalizer associated with x.
//
// The argument x must be a pointer to an object allocated by
// calling new or by taking the address of a composite literal.
// The argument f must be a function that takes a single argument
// to which x's type can be assigned, and can have arbitrary ignored return
// values. If either of these is not true, SetFinalizer aborts the
// program.
//
// Finalizers are run in dependency order: if A points at B, both have
// finalizers, and they are otherwise unreachable, only the finalizer
// for A runs; once A is freed, the finalizer for B can run.
// If a cyclic structure includes a block with a finalizer, that
// cycle is not guaranteed to be garbage collected and the finalizer
// is not guaranteed to run, because there is no ordering that
// respects the dependencies.
//
// The finalizer for x is scheduled to run at some arbitrary time after
// x becomes unreachable.
// There is no guarantee that finalizers will run before a program exits,
// so typically they are useful only for releasing non-memory resources
// associated with an object during a long-running program.
// For example, an os.File object could use a finalizer to close the
// associated operating system file descriptor when a program discards
// an os.File without calling Close, but it would be a mistake
// to depend on a finalizer to flush an in-memory I/O buffer such as a
// bufio.Writer, because the buffer would not be flushed at program exit.
//
// It is not guaranteed that a finalizer will run if the size of *x is
// zero bytes.
//
// A single goroutine runs all finalizers for a program, sequentially.
// If a finalizer must run for a long time, it should do so by starting
// a new goroutine.
func SetFinalizer(x, f interface{})
func getgoroot() string
// GOROOT returns the root of the Go tree.


@ -221,14 +221,14 @@ func makemap(t *maptype, hint int64) *hmap {
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
buckets = unsafe_NewArray(t.bucket, uintptr(1)<<B)
buckets = newarray(t.bucket, uintptr(1)<<B)
}
// initialize Hmap
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
h := (*hmap)(unsafe_New(t.hmap))
h := (*hmap)(newobject(t.hmap))
h.count = 0
h.B = B
h.flags = flags
@ -405,7 +405,7 @@ func mapassign1(t *maptype, h *hmap, key unsafe.Pointer, val unsafe.Pointer) {
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
h.buckets = unsafe_NewArray(t.bucket, 1)
h.buckets = newarray(t.bucket, 1)
}
again:
@ -467,7 +467,7 @@ again:
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
newb := (*bmap)(unsafe_New(t.bucket))
newb := (*bmap)(newobject(t.bucket))
b.overflow = newb
inserti = &newb.tophash[0]
insertk = add(unsafe.Pointer(newb), dataOffset)
@ -479,7 +479,7 @@ again:
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
kmem := unsafe_New(t.key)
kmem := newobject(t.key)
*(*unsafe.Pointer)(insertk) = kmem
insertk = kmem
}
@ -487,7 +487,7 @@ again:
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
vmem := unsafe_New(t.elem)
vmem := newobject(t.elem)
*(*unsafe.Pointer)(insertv) = vmem
insertv = vmem
}
@ -742,7 +742,7 @@ func hashGrow(t *maptype, h *hmap) {
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
newbuckets := unsafe_NewArray(t.bucket, uintptr(1)<<(h.B+1))
newbuckets := newarray(t.bucket, uintptr(1)<<(h.B+1))
flags := h.flags &^ (iterator | oldIterator)
if h.flags&iterator != 0 {
flags |= oldIterator
@ -835,7 +835,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
newx := (*bmap)(unsafe_New(t.bucket))
newx := (*bmap)(newobject(t.bucket))
x.overflow = newx
x = newx
xi = 0
@ -862,7 +862,7 @@ func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
if checkgc {
memstats.next_gc = memstats.heap_alloc
}
newy := (*bmap)(unsafe_New(t.bucket))
newy := (*bmap)(newobject(t.bucket))
y.overflow = newy
y = newy
yi = 0


@ -6,7 +6,6 @@
//
// TODO(rsc): double-check stats.
package runtime
#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
@ -20,229 +19,20 @@ package runtime
#pragma dataflag NOPTR
MHeap runtime·mheap;
#pragma dataflag NOPTR
MStats mstats;
MStats runtime·memstats;
extern MStats mstats; // defined in zruntime_def_$GOOS_$GOARCH.go
void* runtime·cmallocgc(uintptr size, Type *typ, uint32 flag, void **ret);
extern volatile intgo runtime·MemProfileRate;
static MSpan* largealloc(uint32, uintptr*);
static void profilealloc(void *v, uintptr size);
static void settype(MSpan *s, void *v, uintptr typ);
// Allocate an object of at least size bytes.
// Small objects are allocated from the per-thread cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
// If the block will be freed with runtime·free(), typ must be 0.
void*
runtime·mallocgc(uintptr size, Type *typ, uint32 flag)
{
int32 sizeclass;
uintptr tinysize, size0, size1;
intgo rate;
MCache *c;
MSpan *s;
MLink *v, *next;
byte *tiny;
void *ret;
if(size == 0) {
// All 0-length allocations use this pointer.
// The language does not require the allocations to
// have distinct values.
return &runtime·zerobase;
}
if(g->m->mallocing)
runtime·throw("malloc/free - deadlock");
// Disable preemption during settype.
// We can not use m->mallocing for this, because settype calls mallocgc.
g->m->locks++;
g->m->mallocing = 1;
size0 = size;
c = g->m->mcache;
if(!runtime·debug.efence && size <= MaxSmallSize) {
if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
// Tiny allocator.
//
// Tiny allocator combines several tiny allocation requests
// into a single memory block. The resulting memory block
// is freed when all subobjects are unreachable. The subobjects
// must be FlagNoScan (don't have pointers), this ensures that
// the amount of potentially wasted memory is bounded.
//
// Size of the memory block used for combining (TinySize) is tunable.
// Current setting is 16 bytes, which relates to 2x worst case memory
// wastage (when all but one subobjects are unreachable).
// 8 bytes would result in no wastage at all, but provides less
// opportunities for combining.
// 32 bytes provides more opportunities for combining,
// but can lead to 4x worst case wastage.
// The best case winning is 8x regardless of block size.
//
// Objects obtained from tiny allocator must not be freed explicitly.
// So when an object will be freed explicitly, we ensure that
// its size >= TinySize.
//
// SetFinalizer has a special case for objects potentially coming
// from tiny allocator; in such a case it allows setting finalizers
// for an inner byte of a memory block.
//
// The main targets of tiny allocator are small strings and
// standalone escaping variables. On a json benchmark
// the allocator reduces number of allocations by ~12% and
// reduces heap size by ~20%.
tinysize = c->tinysize;
if(size <= tinysize) {
tiny = c->tiny;
// Align tiny pointer for required (conservative) alignment.
if((size&7) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 8);
else if((size&3) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 4);
else if((size&1) == 0)
tiny = (byte*)ROUND((uintptr)tiny, 2);
size1 = size + (tiny - c->tiny);
if(size1 <= tinysize) {
// The object fits into existing tiny block.
v = (MLink*)tiny;
c->tiny += size1;
c->tinysize -= size1;
g->m->mallocing = 0;
g->m->locks--;
if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack
g->stackguard0 = StackPreempt;
return v;
}
}
// Allocate a new TinySize block.
s = c->alloc[TinySizeClass];
if(s->freelist == nil)
s = runtime·MCache_Refill(c, TinySizeClass);
v = s->freelist;
next = v->next;
s->freelist = next;
s->ref++;
if(next != nil) // prefetching nil leads to a DTLB miss
PREFETCH(next);
((uint64*)v)[0] = 0;
((uint64*)v)[1] = 0;
// See if we need to replace the existing tiny block with the new one
// based on amount of remaining free space.
if(TinySize-size > tinysize) {
c->tiny = (byte*)v + size;
c->tinysize = TinySize - size;
}
size = TinySize;
goto done;
}
// Allocate from mcache free lists.
// Inlined version of SizeToClass().
if(size <= 1024-8)
sizeclass = runtime·size_to_class8[(size+7)>>3];
else
sizeclass = runtime·size_to_class128[(size-1024+127) >> 7];
size = runtime·class_to_size[sizeclass];
s = c->alloc[sizeclass];
if(s->freelist == nil)
s = runtime·MCache_Refill(c, sizeclass);
v = s->freelist;
next = v->next;
s->freelist = next;
s->ref++;
if(next != nil) // prefetching nil leads to a DTLB miss
PREFETCH(next);
if(!(flag & FlagNoZero)) {
v->next = nil;
// block is zeroed iff second word is zero ...
if(size > 2*sizeof(uintptr) && ((uintptr*)v)[1] != 0)
runtime·memclr((byte*)v, size);
}
done:
c->local_cachealloc += size;
} else {
// Allocate directly from heap.
s = largealloc(flag, &size);
v = (void*)(s->start << PageShift);
}
if(!(flag & FlagNoGC))
runtime·markallocated(v, size, size0, typ, !(flag&FlagNoScan));
g->m->mallocing = 0;
if(raceenabled)
runtime·racemalloc(v, size);
if(runtime·debug.allocfreetrace)
runtime·tracealloc(v, size, typ);
if(!(flag & FlagNoProfiling) && (rate = runtime·MemProfileRate) > 0) {
if(size < rate && size < c->next_sample)
c->next_sample -= size;
else
profilealloc(v, size);
}
g->m->locks--;
if(g->m->locks == 0 && g->preempt) // restore the preemption request in case we've cleared it in newstack
g->stackguard0 = StackPreempt;
if(!(flag & FlagNoInvokeGC) && mstats.heap_alloc >= mstats.next_gc)
runtime·gc(0);
return v;
}
static MSpan*
largealloc(uint32 flag, uintptr *sizep)
{
uintptr npages, size;
MSpan *s;
void *v;
// Allocate directly from heap.
size = *sizep;
if(size + PageSize < size)
runtime·throw("out of memory");
npages = size >> PageShift;
if((size & PageMask) != 0)
npages++;
s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
if(s == nil)
runtime·throw("out of memory");
s->limit = (byte*)(s->start<<PageShift) + size;
*sizep = npages<<PageShift;
v = (void*)(s->start << PageShift);
// setup for mark sweep
runtime·markspan(v, 0, 0, true);
return s;
}
static void
profilealloc(void *v, uintptr size)
{
uintptr rate;
int32 next;
MCache *c;
c = g->m->mcache;
rate = runtime·MemProfileRate;
if(size < rate) {
// pick next profile time
// If you change this, also change allocmcache.
if(rate > 0x3fffffff) // make 2*rate not overflow
rate = 0x3fffffff;
next = runtime·fastrand1() % (2*rate);
// Subtract the "remainder" of the current allocation.
// Otherwise objects that are close in size to sampling rate
// will be under-sampled, because we consistently discard this remainder.
next -= (size - c->next_sample);
if(next < 0)
next = 0;
c->next_sample = next;
}
runtime·MProf_Malloc(v, size);
// Call into the Go version of mallocgc.
// TODO: maybe someday we can get rid of this. It is
// probably the only location where we run Go code on the M stack.
runtime·cmallocgc(size, typ, flag, &ret);
return ret;
}
void*
@ -421,6 +211,10 @@ uintptr runtime·sizeof_C_MStats = sizeof(MStats) - (NumSizeClasses - 61) * size
#define MaxArena32 (2U<<30)
// For use by Go. It can't be a constant in Go, unfortunately,
// because it depends on the OS.
uintptr runtime·maxMem = MaxMem;
void
runtime·mallocinit(void)
{
@ -708,11 +502,6 @@ runtime·mal(uintptr n)
return runtime·mallocgc(n, nil, 0);
}
#pragma textflag NOSPLIT
func new(typ *Type) (ret *uint8) {
ret = runtime·mallocgc(typ->size, typ, typ->kind&KindNoPointers ? FlagNoScan : 0);
}
static void*
cnew(Type *typ, intgo n)
{
@ -734,11 +523,9 @@ runtime·cnewarray(Type *typ, intgo n)
return cnew(typ, n);
}
func GC() {
runtime·gc(2); // force GC and do eager sweep
}
func SetFinalizer(obj Eface, finalizer Eface) {
static void
setFinalizer(Eface obj, Eface finalizer)
{
byte *base;
uintptr size;
FuncType *ft;
@ -823,8 +610,52 @@ throw:
runtime·throw("runtime.SetFinalizer");
}
// For testing.
func GCMask(x Eface) (mask Slice) {
runtime·getgcmask(x.data, x.type, &mask.array, &mask.len);
mask.cap = mask.len;
void
runtime·setFinalizer(void)
{
Eface obj, finalizer;
obj.type = g->m->ptrarg[0];
obj.data = g->m->ptrarg[1];
finalizer.type = g->m->ptrarg[2];
finalizer.data = g->m->ptrarg[3];
g->m->ptrarg[0] = nil;
g->m->ptrarg[1] = nil;
g->m->ptrarg[2] = nil;
g->m->ptrarg[3] = nil;
setFinalizer(obj, finalizer);
}
// mcallable cache refill
void
runtime·mcacheRefill(void)
{
runtime·MCache_Refill(g->m->mcache, (int32)g->m->scalararg[0]);
}
void
runtime·largeAlloc(void)
{
uintptr npages, size;
MSpan *s;
void *v;
int32 flag;
//runtime·printf("largeAlloc size=%D\n", g->m->scalararg[0]);
// Allocate directly from heap.
size = g->m->scalararg[0];
flag = (int32)g->m->scalararg[1];
if(size + PageSize < size)
runtime·throw("out of memory");
npages = size >> PageShift;
if((size & PageMask) != 0)
npages++;
s = runtime·MHeap_Alloc(&runtime·mheap, npages, 0, 1, !(flag & FlagNoZero));
if(s == nil)
runtime·throw("out of memory");
s->limit = (byte*)(s->start<<PageShift) + size;
v = (void*)(s->start << PageShift);
// setup for mark sweep
runtime·markspan(v, 0, 0, true);
g->m->ptrarg[0] = s;
}

src/pkg/runtime/malloc.go (new file, 426 lines)

@ -0,0 +1,426 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import (
"unsafe"
)
const (
flagNoScan = 1 << 0 // GC doesn't have to scan object
flagNoProfiling = 1 << 1 // must not profile
flagNoZero = 1 << 3 // don't zero memory
flagNoInvokeGC = 1 << 4 // don't invoke GC
kindArray = 17
kindFunc = 19
kindInterface = 20
kindPtr = 22
kindStruct = 25
kindMask = 1<<6 - 1
kindGCProg = 1 << 6
kindNoPointers = 1 << 7
maxTinySize = 16
tinySizeClass = 2
maxSmallSize = 32 << 10
pageShift = 13
pageSize = 1 << pageShift
pageMask = pageSize - 1
)
// All zero-sized allocations return a pointer to this byte.
var zeroObject byte
// Maximum possible heap size.
var maxMem uintptr
// Allocate an object of at least size bytes.
// Small objects are allocated from the per-thread cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap.
// If the block will be freed with runtime·free(), typ must be nil.
func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
if size == 0 {
return unsafe.Pointer(&zeroObject)
}
mp := acquirem()
if mp.mallocing != 0 {
gothrow("malloc/free - deadlock")
}
mp.mallocing = 1
size0 := size
c := mp.mcache
var s *mspan
var x unsafe.Pointer
if size <= maxSmallSize {
if flags&flagNoScan != 0 && size < maxTinySize {
// Tiny allocator.
//
// Tiny allocator combines several tiny allocation requests
// into a single memory block. The resulting memory block
// is freed when all subobjects are unreachable. The subobjects
// must be FlagNoScan (don't have pointers), this ensures that
// the amount of potentially wasted memory is bounded.
//
// Size of the memory block used for combining (maxTinySize) is tunable.
// Current setting is 16 bytes, which relates to 2x worst case memory
// wastage (when all but one subobjects are unreachable).
// 8 bytes would result in no wastage at all, but provides less
// opportunities for combining.
// 32 bytes provides more opportunities for combining,
// but can lead to 4x worst case wastage.
// The best case winning is 8x regardless of block size.
//
// Objects obtained from tiny allocator must not be freed explicitly.
// So when an object will be freed explicitly, we ensure that
// its size >= maxTinySize.
//
// SetFinalizer has a special case for objects potentially coming
// from tiny allocator; in such a case it allows setting finalizers
// for an inner byte of a memory block.
//
// The main targets of tiny allocator are small strings and
// standalone escaping variables. On a json benchmark
// the allocator reduces number of allocations by ~12% and
// reduces heap size by ~20%.
tinysize := uintptr(c.tinysize)
if size <= tinysize {
tiny := unsafe.Pointer(c.tiny)
// Align tiny pointer for required (conservative) alignment.
if size&7 == 0 {
tiny = roundup(tiny, 8)
} else if size&3 == 0 {
tiny = roundup(tiny, 4)
} else if size&1 == 0 {
tiny = roundup(tiny, 2)
}
size1 := size + (uintptr(tiny) - uintptr(unsafe.Pointer(c.tiny)))
if size1 <= tinysize {
// The object fits into existing tiny block.
x = tiny
c.tiny = (*byte)(add(x, size))
c.tinysize -= uint(size1)
mp.mallocing = 0
releasem(mp)
return x
}
}
// Allocate a new maxTinySize block.
s = c.alloc[tinySizeClass]
v := s.freelist
if v == nil {
mp.scalararg[0] = tinySizeClass
onM(&mcacheRefill)
s = c.alloc[tinySizeClass]
v = s.freelist
}
s.freelist = v.next
s.ref++
//TODO: prefetch v.next
x = unsafe.Pointer(v)
(*[2]uint64)(x)[0] = 0
(*[2]uint64)(x)[1] = 0
// See if we need to replace the existing tiny block with the new one
// based on amount of remaining free space.
if maxTinySize-size > tinysize {
c.tiny = (*byte)(add(x, size))
c.tinysize = uint(maxTinySize - size)
}
size = maxTinySize
} else {
var sizeclass int8
if size <= 1024-8 {
sizeclass = size_to_class8[(size+7)>>3]
} else {
sizeclass = size_to_class128[(size-1024+127)>>7]
}
size = uintptr(class_to_size[sizeclass])
s = c.alloc[sizeclass]
v := s.freelist
if v == nil {
mp.scalararg[0] = uint(sizeclass)
onM(&mcacheRefill)
s = c.alloc[sizeclass]
v = s.freelist
}
s.freelist = v.next
s.ref++
//TODO: prefetch
x = unsafe.Pointer(v)
if flags&flagNoZero == 0 {
v.next = nil
if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 {
memclr(unsafe.Pointer(v), size)
}
}
}
c.local_cachealloc += int(size)
} else {
mp.scalararg[0] = uint(size)
mp.scalararg[1] = uint(flags)
onM(&largeAlloc)
s = (*mspan)(mp.ptrarg[0])
mp.ptrarg[0] = nil
x = unsafe.Pointer(uintptr(s.start << pageShift))
size = uintptr(s.elemsize)
}
// TODO: write markallocated in Go
mp.ptrarg[0] = x
mp.scalararg[0] = uint(size)
mp.scalararg[1] = uint(size0)
mp.ptrarg[1] = unsafe.Pointer(typ)
mp.scalararg[2] = uint(flags & flagNoScan)
onM(&markallocated_m)
mp.mallocing = 0
if raceenabled {
racemalloc(x, size)
}
if debug.allocfreetrace != 0 {
tracealloc(x, size, typ)
}
if flags&flagNoProfiling == 0 {
rate := MemProfileRate
if rate > 0 {
if size < uintptr(rate) && int32(size) < c.next_sample {
c.next_sample -= int32(size)
} else {
profilealloc(mp, x, size)
}
}
}
releasem(mp)
if flags&flagNoInvokeGC == 0 && memstats.heap_alloc >= memstats.next_gc {
gogc(0)
}
return x
}
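
The tiny-allocator comment above carries most of the reasoning in gomallocgc. A minimal standalone model of just the bump-and-align bookkeeping, to make the packing and the 2x/4x wastage discussion concrete; the sizes are arbitrary, and taking a fresh 16-byte block is reduced to a counter (in the real code it comes from size class tinySizeClass):

package main

import "fmt"

const maxTinySize = 16

// tinyCache models only the bookkeeping of gomallocgc's tiny path:
// the offset of the next free byte in the cached 16-byte block and
// how many bytes of that block remain.
type tinyCache struct {
	off      uintptr // next free byte within the cached tiny block
	tinysize uintptr // bytes remaining in the cached tiny block
	blocks   int     // fresh 16-byte blocks taken so far
}

// align rounds the offset up to the conservative alignment used above:
// 8 for multiples of 8, 4 for multiples of 4, 2 for multiples of 2.
func align(off, size uintptr) uintptr {
	switch {
	case size&7 == 0:
		return (off + 7) &^ 7
	case size&3 == 0:
		return (off + 3) &^ 3
	case size&1 == 0:
		return (off + 1) &^ 1
	}
	return off
}

func (c *tinyCache) alloc(size uintptr) {
	aligned := align(c.off, size)
	used := size + (aligned - c.off) // size1 in the code above
	if used <= c.tinysize {
		// Fits in the cached block: bump the offset, shrink the remainder.
		c.off += used
		c.tinysize -= used
		fmt.Printf("size %2d: packed into cached block, %2d bytes left\n", size, c.tinysize)
		return
	}
	// Take a fresh 16-byte block; keep whichever block has more room left.
	c.blocks++
	if maxTinySize-size > c.tinysize {
		c.off = size
		c.tinysize = maxTinySize - size
	}
	fmt.Printf("size %2d: new 16-byte block, cached block has %2d bytes left\n", size, c.tinysize)
}

func main() {
	var c tinyCache
	for _, size := range []uintptr{1, 2, 3, 8, 5, 12} {
		c.alloc(size)
	}
	fmt.Println("blocks used:", c.blocks)
}
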
// cmallocgc is a trampoline used to call the Go malloc from C.
func cmallocgc(size uintptr, typ *_type, flags int, ret *unsafe.Pointer) {
*ret = gomallocgc(size, typ, flags)
}
// implementation of new builtin
func newobject(typ *_type) unsafe.Pointer {
flags := 0
if typ.kind&kindNoPointers != 0 {
flags |= flagNoScan
}
return gomallocgc(uintptr(typ.size), typ, flags)
}
// implementation of make builtin for slices
func newarray(typ *_type, n uintptr) unsafe.Pointer {
flags := 0
if typ.kind&kindNoPointers != 0 {
flags |= flagNoScan
}
if int(n) < 0 || (typ.size > 0 && n > maxMem/uintptr(typ.size)) {
panic("runtime: allocation size out of range")
}
return gomallocgc(uintptr(typ.size)*n, typ, flags)
}
// round size up to next size class
func goroundupsize(size uintptr) uintptr {
if size < maxSmallSize {
if size <= 1024-8 {
return uintptr(class_to_size[size_to_class8[(size+7)>>3]])
}
return uintptr(class_to_size[size_to_class128[(size-1024+127)>>7]])
}
if size+pageSize < size {
return size
}
return (size + pageSize - 1) &^ pageMask
}
func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
c := mp.mcache
rate := MemProfileRate
if size < uintptr(rate) {
// pick next profile time
// If you change this, also change allocmcache.
if rate > 0x3fffffff { // make 2*rate not overflow
rate = 0x3fffffff
}
next := int32(fastrand2()) % (2 * int32(rate))
// Subtract the "remainder" of the current allocation.
// Otherwise objects that are close in size to sampling rate
// will be under-sampled, because we consistently discard this remainder.
next -= (int32(size) - c.next_sample)
if next < 0 {
next = 0
}
c.next_sample = next
}
mp.scalararg[0] = uint(size)
mp.ptrarg[0] = x
onM(&mprofMalloc)
}
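
The remainder subtraction in profilealloc above (it mirrors the C profilealloc earlier in this CL) is the subtle part: without it, an allocation that overshoots the current next_sample budget would start the next budget from scratch, so objects close in size to the sampling rate would be systematically under-sampled. A minimal standalone model of that bookkeeping, with a made-up rate and math/rand standing in for fastrand:

package main

import (
	"fmt"
	"math/rand"
)

const rate = 512 * 1024 // stand-in for MemProfileRate

type cache struct {
	nextSample int64 // bytes left until the next profiled allocation
}

// malloc charges size bytes against the sampling budget and reports whether
// this allocation would have been profiled.
func (c *cache) malloc(size int64) bool {
	if size < rate && size < c.nextSample {
		c.nextSample -= size
		return false
	}
	// Sampled: draw the next budget, then subtract the part of this
	// allocation that ran past the old budget. Skipping that subtraction
	// would under-sample objects whose size is close to the rate.
	next := rand.Int63n(2 * rate)
	next -= size - c.nextSample
	if next < 0 {
		next = 0
	}
	c.nextSample = next
	return true
}

func main() {
	c := &cache{}
	var bytes int64
	sampled := 0
	for i := 0; i < 1000000; i++ {
		size := int64(rand.Intn(4096) + 1)
		bytes += size
		if c.malloc(size) {
			sampled++
		}
	}
	fmt.Printf("allocated %d bytes, %d samples, ~1 sample per %d bytes (rate %d)\n",
		bytes, sampled, bytes/int64(sampled), rate)
}
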
// force = 1 - do GC regardless of current heap usage
// force = 2 - do GC and eager sweep
func gogc(force int32) {
if memstats.enablegc == 0 {
return
}
// TODO: should never happen? Only C calls malloc while holding a lock?
mp := acquirem()
if mp.locks > 1 {
releasem(mp)
return
}
releasem(mp)
if panicking != 0 {
return
}
if gcpercent == gcpercentUnknown {
golock(&mheap_.lock)
if gcpercent == gcpercentUnknown {
gcpercent = goreadgogc()
}
gounlock(&mheap_.lock)
}
if gcpercent < 0 {
return
}
semacquire(&worldsema, false)
if force == 0 && memstats.heap_alloc < memstats.next_gc {
// typically threads which lost the race to grab
// worldsema exit here when gc is done.
semrelease(&worldsema)
return
}
// Ok, we're doing it! Stop everybody else
startTime := gonanotime()
mp = acquirem()
mp.gcing = 1
stoptheworld()
clearpools()
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
// the root set down a bit (g0 stacks are not scanned, and
// we don't need to scan gc's internal state). We also
// need to switch to g0 so we can shrink the stack.
n := 1
if debug.gctrace > 1 {
n = 2
}
for i := 0; i < n; i++ {
if i > 0 {
startTime = gonanotime()
}
// switch to g0, call gc, then switch back
mp.scalararg[0] = uint(startTime)
if force >= 2 {
mp.scalararg[1] = 1 // eagersweep
} else {
mp.scalararg[1] = 0
}
onM(&mgc2)
}
// all done
mp.gcing = 0
semrelease(&worldsema)
starttheworld()
releasem(mp)
// now that gc is done, kick off finalizer thread if needed
if !concurrentSweep {
// give the queued finalizers, if any, a chance to run
gosched()
}
}
// GC runs a garbage collection.
func GC() {
gogc(2)
}
// SetFinalizer sets the finalizer associated with x to f.
// When the garbage collector finds an unreachable block
// with an associated finalizer, it clears the association and runs
// f(x) in a separate goroutine. This makes x reachable again, but
// now without an associated finalizer. Assuming that SetFinalizer
// is not called again, the next time the garbage collector sees
// that x is unreachable, it will free x.
//
// SetFinalizer(x, nil) clears any finalizer associated with x.
//
// The argument x must be a pointer to an object allocated by
// calling new or by taking the address of a composite literal.
// The argument f must be a function that takes a single argument
// to which x's type can be assigned, and can have arbitrary ignored return
// values. If either of these is not true, SetFinalizer aborts the
// program.
//
// Finalizers are run in dependency order: if A points at B, both have
// finalizers, and they are otherwise unreachable, only the finalizer
// for A runs; once A is freed, the finalizer for B can run.
// If a cyclic structure includes a block with a finalizer, that
// cycle is not guaranteed to be garbage collected and the finalizer
// is not guaranteed to run, because there is no ordering that
// respects the dependencies.
//
// The finalizer for x is scheduled to run at some arbitrary time after
// x becomes unreachable.
// There is no guarantee that finalizers will run before a program exits,
// so typically they are useful only for releasing non-memory resources
// associated with an object during a long-running program.
// For example, an os.File object could use a finalizer to close the
// associated operating system file descriptor when a program discards
// an os.File without calling Close, but it would be a mistake
// to depend on a finalizer to flush an in-memory I/O buffer such as a
// bufio.Writer, because the buffer would not be flushed at program exit.
//
// It is not guaranteed that a finalizer will run if the size of *x is
// zero bytes.
//
// A single goroutine runs all finalizers for a program, sequentially.
// If a finalizer must run for a long time, it should do so by starting
// a new goroutine.
func SetFinalizer(obj interface{}, finalizer interface{}) {
// We do just enough work here to make the mcall type safe.
// The rest is done on the M stack.
e := (*eface)(unsafe.Pointer(&obj))
typ := e._type
if typ == nil {
gothrow("runtime.SetFinalizer: first argument is nil")
}
if typ.kind&kindMask != kindPtr {
gothrow("runtime.SetFinalizer: first argument is " + *typ._string + ", not pointer")
}
f := (*eface)(unsafe.Pointer(&finalizer))
ftyp := f._type
if ftyp != nil && ftyp.kind&kindMask != kindFunc {
gothrow("runtime.SetFinalizer: second argument is " + *ftyp._string + ", not a function")
}
mp := acquirem()
mp.ptrarg[0] = unsafe.Pointer(typ)
mp.ptrarg[1] = e.data
mp.ptrarg[2] = unsafe.Pointer(ftyp)
mp.ptrarg[3] = f.data
onM(&setFinalizer)
releasem(mp)
}
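
Since the SetFinalizer documentation and implementation now live in this file, a short example of ordinary use of the public API may help; the fdFile wrapper and its names are invented for illustration and are not part of this CL:

package main

import (
	"fmt"
	"os"
	"runtime"
)

// fdFile wraps an *os.File and registers a finalizer as a safety net for
// callers that forget Close. As the doc above warns, this must not replace
// an explicit Close; it only releases the descriptor eventually.
type fdFile struct {
	f *os.File
}

func open(name string) (*fdFile, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	w := &fdFile{f}
	runtime.SetFinalizer(w, func(w *fdFile) { w.f.Close() })
	return w, nil
}

func (w *fdFile) Close() error {
	// Once the caller closes explicitly, drop the finalizer.
	runtime.SetFinalizer(w, nil)
	return w.f.Close()
}

func main() {
	w, err := open(os.Args[0]) // open some existing file
	if err != nil {
		fmt.Println("open:", err)
		return
	}
	defer w.Close()
	fmt.Println("opened", w.f.Name())
}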


@ -280,7 +280,7 @@ struct MStats
} by_size[NumSizeClasses];
};
#define mstats runtime·memStats
#define mstats runtime·memstats
extern MStats mstats;
void runtime·updatememstats(GCStats *stats);
@ -500,6 +500,7 @@ struct MHeap
uint64 nlargefree; // number of frees for large objects (>MaxSmallSize)
uint64 nsmallfree[NumSizeClasses]; // number of frees for small objects (<=MaxSmallSize)
};
#define runtime·mheap runtime·mheap_
extern MHeap runtime·mheap;
void runtime·MHeap_Init(MHeap *h);
@ -531,6 +532,10 @@ void runtime·tracealloc(void*, uintptr, Type*);
void runtime·tracefree(void*, uintptr);
void runtime·tracegc(void);
int32 runtime·gcpercent;
int32 runtime·readgogc(void);
void runtime·clearpools(void);
enum
{
// flags to malloc
@ -551,6 +556,7 @@ void runtime·gchelper(void);
void runtime·createfing(void);
G* runtime·wakefing(void);
void runtime·getgcmask(byte*, Type*, byte**, uintptr*);
extern G* runtime·fing;
extern bool runtime·fingwait;
extern bool runtime·fingwake;


@ -70,6 +70,3 @@ func init() {
// ReadMemStats populates m with memory allocator statistics.
func ReadMemStats(m *MemStats)
// GC runs a garbage collection.
func GC()


@ -81,7 +81,7 @@ enum {
#define GcpercentUnknown (-2)
// Initialized from $GOGC. GOGC=off means no gc.
static int32 gcpercent = GcpercentUnknown;
extern int32 runtime·gcpercent = GcpercentUnknown;
static FuncVal* poolcleanup;
@ -91,8 +91,8 @@ sync·runtime_registerPoolCleanup(FuncVal *f)
poolcleanup = f;
}
static void
clearpools(void)
void
runtime·clearpools(void)
{
P *p, **pp;
MCache *c;
@ -174,7 +174,6 @@ bool runtime·fingwait;
bool runtime·fingwake;
static Lock gclock;
static G* fing;
static void runfinq(void);
static void bgsweep(void);
@ -670,6 +669,8 @@ scanframe(Stkframe *frame, void *unused)
// Frame is dead.
return true;
}
if(Debug > 1)
runtime·printf("scanframe %s\n", runtime·funcname(f));
if(targetpc != f->entry)
targetpc--;
pcdata = runtime·pcdatavalue(f, PCDATA_StackMapIndex, targetpc);
@ -971,7 +972,7 @@ runtime·MSpan_Sweep(MSpan *s)
runtime·MHeap_Free(&runtime·mheap, s, 1);
c->local_nlargefree++;
c->local_largefree += size;
runtime·xadd64(&mstats.next_gc, -(uint64)(size * (gcpercent + 100)/100));
runtime·xadd64(&mstats.next_gc, -(uint64)(size * (runtime·gcpercent + 100)/100));
res = true;
} else {
// Free small object.
@ -1005,7 +1006,7 @@ runtime·MSpan_Sweep(MSpan *s)
if(nfree > 0) {
c->local_nsmallfree[cl] += nfree;
c->local_cachealloc -= nfree * size;
runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (gcpercent + 100)/100));
runtime·xadd64(&mstats.next_gc, -(uint64)(nfree * size * (runtime·gcpercent + 100)/100));
res = runtime·MCentral_FreeSpan(&runtime·mheap.central[cl], s, nfree, head.next, end);
// MCentral_FreeSpan updates sweepgen
}
@ -1238,8 +1239,8 @@ struct gc_args
static void gc(struct gc_args *args);
static void mgc(G *gp);
static int32
readgogc(void)
int32
runtime·readgogc(void)
{
byte *p;
@ -1259,16 +1260,8 @@ runtime·gc(int32 force)
struct gc_args a;
int32 i;
// The atomic operations are not atomic if the uint64s
// are not aligned on uint64 boundaries. This has been
// a problem in the past.
if((((uintptr)&work.empty) & 7) != 0)
runtime·throw("runtime: gc work buffer is misaligned");
if((((uintptr)&work.full) & 7) != 0)
runtime·throw("runtime: gc work buffer is misaligned");
if(sizeof(Workbuf) != WorkbufSize)
runtime·throw("runtime: size of Workbuf is suboptimal");
// The gc is turned off (via enablegc) until
// the bootstrap has completed.
// Also, malloc gets called in the guts
@ -1280,13 +1273,13 @@ runtime·gc(int32 force)
if(!mstats.enablegc || g == g->m->g0 || g->m->locks > 0 || runtime·panicking)
return;
if(gcpercent == GcpercentUnknown) { // first time through
if(runtime·gcpercent == GcpercentUnknown) { // first time through
runtime·lock(&runtime·mheap);
if(gcpercent == GcpercentUnknown)
gcpercent = readgogc();
if(runtime·gcpercent == GcpercentUnknown)
runtime·gcpercent = runtime·readgogc();
runtime·unlock(&runtime·mheap);
}
if(gcpercent < 0)
if(runtime·gcpercent < 0)
return;
runtime·semacquire(&runtime·worldsema, false);
@ -1303,7 +1296,7 @@ runtime·gc(int32 force)
g->m->gcing = 1;
runtime·stoptheworld();
clearpools();
runtime·clearpools();
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
@ -1343,6 +1336,23 @@ mgc(G *gp)
runtime·gogo(&gp->sched);
}
void
runtime·mgc2(void)
{
struct gc_args a;
G *gp;
gp = g->m->curg;
gp->status = Gwaiting;
gp->waitreason = "garbage collection";
a.start_time = g->m->scalararg[0];
a.eagersweep = g->m->scalararg[1];
gc(&a);
gp->status = Grunning;
}
static void
gc(struct gc_args *args)
{
@ -1409,10 +1419,10 @@ gc(struct gc_args *args)
cachestats();
// next_gc calculation is tricky with concurrent sweep since we don't know size of live heap
// estimate what was live heap size after previous GC (for tracing only)
heap0 = mstats.next_gc*100/(gcpercent+100);
heap0 = mstats.next_gc*100/(runtime·gcpercent+100);
// conservatively set next_gc to high value assuming that everything is live
// concurrent/lazy sweep will reduce this number while discovering new garbage
mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*runtime·gcpercent/100;
t4 = runtime·nanotime();
mstats.last_gc = runtime·unixnanotime(); // must be Unix time to make sense to user
@ -1554,12 +1564,12 @@ runtime·setgcpercent(int32 in) {
int32 out;
runtime·lock(&runtime·mheap);
if(gcpercent == GcpercentUnknown)
gcpercent = readgogc();
out = gcpercent;
if(runtime·gcpercent == GcpercentUnknown)
runtime·gcpercent = runtime·readgogc();
out = runtime·gcpercent;
if(in < 0)
in = -1;
gcpercent = in;
runtime·gcpercent = in;
runtime·unlock(&runtime·mheap);
return out;
}
@ -1678,17 +1688,24 @@ runfinq(void)
void
runtime·createfing(void)
{
if(fing != nil)
if(runtime·fing != nil)
return;
// Here we use gclock instead of finlock,
// because newproc1 can allocate, which can cause on-demand span sweep,
// which can queue finalizers, which would deadlock.
runtime·lock(&gclock);
if(fing == nil)
fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
if(runtime·fing == nil)
runtime·fing = runtime·newproc1(&runfinqv, nil, 0, 0, runtime·gc);
runtime·unlock(&gclock);
}
void
runtime·createfingM(G *gp)
{
runtime·createfing();
runtime·gogo(&gp->sched);
}
G*
runtime·wakefing(void)
{
@ -1699,7 +1716,7 @@ runtime·wakefing(void)
if(runtime·fingwait && runtime·fingwake) {
runtime·fingwait = false;
runtime·fingwake = false;
res = fing;
res = runtime·fing;
}
runtime·unlock(&finlock);
return res;
@ -1944,6 +1961,17 @@ runtime·markallocated(void *v, uintptr size, uintptr size0, Type *typ, bool sca
}
}
void
runtime·markallocated_m(void)
{
M *mp;
mp = g->m;
runtime·markallocated(mp->ptrarg[0], mp->scalararg[0], mp->scalararg[1], mp->ptrarg[1], mp->scalararg[2] == 0);
mp->ptrarg[0] = nil;
mp->ptrarg[1] = nil;
}
// mark the block at v as freed.
void
runtime·markfreed(void *v)

View file

@ -140,6 +140,37 @@ runtime·MProf_Malloc(void *p, uintptr size)
runtime·setprofilebucket(p, b);
}
// Called by malloc to record a profiled block.
void
runtime·mprofMalloc(void)
{
uintptr stk[32];
Bucket *b;
int32 nstk;
uintptr size;
void *p;
size = g->m->scalararg[0];
p = g->m->ptrarg[0];
g->m->ptrarg[0] = nil;
if(g->m->curg == nil)
nstk = runtime·callers(1, stk, nelem(stk));
else
nstk = runtime·gcallers(g->m->curg, 1, stk, nelem(stk));
runtime·lock(&proflock);
b = stkbucket(MProf, size, stk, nstk, true);
b->recent_allocs++;
b->recent_alloc_bytes += size;
runtime·unlock(&proflock);
// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
// This reduces potential contention and chances of deadlocks.
// Since the object must be alive during call to MProf_Malloc,
// it's fine to do this non-atomically.
runtime·setprofilebucket(p, b);
}
// Called when freeing a profiled block.
void
runtime·MProf_Free(Bucket *b, uintptr size, bool freed)


@ -3136,6 +3136,7 @@ runtime·topofstack(Func *f)
return f->entry == (uintptr)runtime·goexit ||
f->entry == (uintptr)runtime·mstart ||
f->entry == (uintptr)runtime·mcall ||
f->entry == (uintptr)runtime·onM ||
f->entry == (uintptr)runtime·morestack ||
f->entry == (uintptr)runtime·lessstack ||
f->entry == (uintptr)_rt0_go ||


@ -12,13 +12,6 @@ import (
"unsafe"
)
const (
// TODO: where should these live?
kindNoPointers = 1 << 7
kindArray = 17
kindStruct = 25
)
// RaceDisable disables handling of race events in the current goroutine.
func RaceDisable()


@ -22,10 +22,17 @@ typedef int64 intptr;
typedef int64 intgo; // Go's int
typedef uint64 uintgo; // Go's uint
#else
typedef uint32 uintptr;
typedef int32 intptr;
typedef int32 intgo; // Go's int
typedef uint32 uintgo; // Go's uint
// Normally, "int" == "long int" == 32 bits.
// However, the C compiler uses this distinction
// to disambiguate true 32 bit ints (e.g. int32)
// from 32/64 bit ints (e.g. uintptr) so that it
// can generate the corresponding go type correctly.
typedef signed long int int32_x;
typedef unsigned long int uint32_x;
typedef uint32_x uintptr;
typedef int32_x intptr;
typedef int32_x intgo; // Go's int
typedef uint32_x uintgo; // Go's uint
#endif
#ifdef _64BITREG
@ -874,6 +881,7 @@ uintptr runtime·getcallersp(void*);
int32 runtime·mcount(void);
int32 runtime·gcount(void);
void runtime·mcall(void(*)(G*));
void runtime·onM(void(*)(void));
uint32 runtime·fastrand1(void);
void runtime·rewindmorestack(Gobuf*);
int32 runtime·timediv(int64, int32, int32*);
@ -916,6 +924,7 @@ void runtime·exitsyscall(void);
G* runtime·newproc1(FuncVal*, byte*, int32, int32, void*);
bool runtime·sigsend(int32 sig);
int32 runtime·callers(int32, uintptr*, int32);
int32 runtime·gcallers(G*, int32, uintptr*, int32);
int64 runtime·nanotime(void); // monotonic time
int64 runtime·unixnanotime(void); // real time, can skip
void runtime·dopanic(int32);


@ -202,3 +202,56 @@ func stringiter2(s string, k int) (int, rune) {
r, n := charntorune(s[k:])
return k + n, r
}
// rawstring allocates storage for a new string. The returned
// string and byte slice both refer to the same storage.
// The storage is not zeroed. Callers should use
// b to set the string contents and then drop b.
func rawstring(size int) (s string, b []byte) {
p := gomallocgc(uintptr(size), nil, flagNoScan|flagNoZero)
(*stringStruct)(unsafe.Pointer(&s)).str = p
(*stringStruct)(unsafe.Pointer(&s)).len = size
(*slice)(unsafe.Pointer(&b)).array = (*uint8)(p)
(*slice)(unsafe.Pointer(&b)).len = uint(size)
(*slice)(unsafe.Pointer(&b)).cap = uint(size)
for {
ms := maxstring
if uintptr(size) <= uintptr(ms) || gocasx((*uintptr)(unsafe.Pointer(&maxstring)), uintptr(ms), uintptr(size)) {
return
}
}
}
// rawbyteslice allocates a new byte slice. The byte slice is not zeroed.
func rawbyteslice(size int) (b []byte) {
cap := goroundupsize(uintptr(size))
p := gomallocgc(cap, nil, flagNoScan|flagNoZero)
if cap != uintptr(size) {
memclr(add(p, uintptr(size)), cap-uintptr(size))
}
(*slice)(unsafe.Pointer(&b)).array = (*uint8)(p)
(*slice)(unsafe.Pointer(&b)).len = uint(size)
(*slice)(unsafe.Pointer(&b)).cap = uint(cap)
return
}
// rawruneslice allocates a new rune slice. The rune slice is not zeroed.
func rawruneslice(size int) (b []rune) {
if uintptr(size) > maxMem/4 {
gothrow("out of memory")
}
mem := goroundupsize(uintptr(size) * 4)
p := gomallocgc(mem, nil, flagNoScan|flagNoZero)
if mem != uintptr(size)*4 {
memclr(add(p, uintptr(size)*4), mem-uintptr(size)*4)
}
(*slice)(unsafe.Pointer(&b)).array = (*uint8)(p)
(*slice)(unsafe.Pointer(&b)).len = uint(size)
(*slice)(unsafe.Pointer(&b)).cap = uint(mem / 4)
return
}
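
The contract spelled out above for rawstring (fill b, then drop it) is the interesting part. The runtime function itself is not callable from user code, so the following is only a user-level sketch of the same shared-storage idea; it leans on the string header being a prefix of the slice header and is not supported API:

package main

import (
	"fmt"
	"unsafe"
)

// buildString mimics the rawstring contract at user level: the caller fills a
// byte slice and the resulting string shares that storage instead of copying
// it. It depends on the string header being a prefix of the slice header;
// a sketch only, not supported API.
func buildString(size int, fill func(b []byte)) string {
	b := make([]byte, size)
	fill(b)
	s := *(*string)(unsafe.Pointer(&b)) // view the same bytes as a string
	b = nil                             // "drop b": the bytes must not be written again
	return s
}

func main() {
	s := buildString(5, func(b []byte) { copy(b, "hello") })
	fmt.Println(s, len(s))
}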


@ -15,18 +15,6 @@ const (
ptrSize = unsafe.Sizeof((*byte)(nil))
)
// rawstring allocates storage for a new string. The returned
// string and byte slice both refer to the same storage.
// The storage is not zeroed. Callers should use
// b to set the string contents and then drop b.
func rawstring(size int) (string, []byte)
// rawbyteslice allocates a new byte slice. The byte slice is not zeroed.
func rawbyteslice(size int) []byte
// rawruneslice allocates a new rune slice. The rune slice is not zeroed.
func rawruneslice(size int) []rune
//go:noescape
func gogetcallerpc(p unsafe.Pointer) uintptr
@ -44,15 +32,38 @@ func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
return unsafe.Pointer(uintptr(p) + x)
}
// Make a new object of the given type
// n must be a power of 2
func roundup(p unsafe.Pointer, n uintptr) unsafe.Pointer {
return unsafe.Pointer((uintptr(p) + n - 1) &^ (n - 1))
}
// in stubs.goc
func unsafe_New(t *_type) unsafe.Pointer
func unsafe_NewArray(t *_type, n uintptr) unsafe.Pointer
func acquirem() *m
func releasem(mp *m)
// in asm_*.s
func mcall(fn *byte)
func onM(fn *byte)
// C routines that run on the M stack. Call these like
// mcall(&mcacheRefill)
// Arguments should be passed in m->scalararg[x] and
// m->ptrarg[x]. Return values can be passed in those
// same slots.
var mcacheRefill byte
var largeAlloc byte
var mprofMalloc byte
var mgc2 byte
var setFinalizer byte
var markallocated_m byte
// memclr clears n bytes starting at ptr.
// in memclr_*.s
func memclr(ptr unsafe.Pointer, n uintptr)
func racemalloc(p unsafe.Pointer, size uintptr)
func tracealloc(p unsafe.Pointer, size uintptr, typ *_type)
// memmove copies n bytes from "from" to "to".
// in memmove_*.s
func memmove(to unsafe.Pointer, from unsafe.Pointer, n uintptr)
@ -60,11 +71,26 @@ func memmove(to unsafe.Pointer, from unsafe.Pointer, n uintptr)
// in asm_*.s
func fastrand2() uint32
const (
gcpercentUnknown = -2
concurrentSweep = true
)
// in asm_*.s
// if *p == x { *p = y; return true } else { return false }, atomically
//go:noescape
func gocas(p *uint32, x uint32, y uint32) bool
//go:noescape
func gocasx(p *uintptr, x uintptr, y uintptr) bool
func goreadgogc() int32
func gonanotime() int64
func gosched()
func starttheworld()
func stoptheworld()
func clearpools()
// in asm_*.s
//go:noescape
func gohash(a *alg, p unsafe.Pointer, size uintptr, seed uintptr) uintptr
@ -86,3 +112,8 @@ var nohashcode uintptr
// Go version of runtime.throw.
// in panic.c
func gothrow(s string)
func golock(x *lock)
func gounlock(x *lock)
func semacquire(*uint32, bool)
func semrelease(*uint32)
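
The roundup helper added earlier in this file relies on the usual power-of-two trick: (p + n - 1) &^ (n - 1) rounds up correctly only when n-1 is a contiguous mask of low bits. A tiny standalone check, with arbitrary values:

package main

import "fmt"

// roundUp mirrors the roundup helper above for plain integers:
// (x + n - 1) &^ (n - 1) rounds x up to a multiple of n, but only when n is
// a power of two, because only then is n-1 a contiguous low-bit mask.
func roundUp(x, n uintptr) uintptr {
	return (x + n - 1) &^ (n - 1)
}

func main() {
	for _, x := range []uintptr{0, 1, 7, 8, 9, 23} {
		fmt.Printf("roundUp(%2d, 8) = %d\n", x, roundUp(x, 8))
	}
	// With a non-power-of-two n the identity breaks:
	fmt.Println("roundUp(10, 6) =", roundUp(10, 6), "(not a multiple of 6)")
}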


@ -6,6 +6,7 @@ package runtime
#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
#include "stack.h"
#include "../../cmd/ld/textflag.h"
// This file contains functions called by Go but written
@ -23,51 +24,17 @@ package runtime
// finished converting runtime support code from C to Go.
#pragma textflag NOSPLIT
func rawstring(size intgo) (s String, b Slice) {
uintptr ms;
byte *p;
p = runtime·mallocgc(size, 0, FlagNoScan|FlagNoZero);
s.str = p;
s.len = size;
b.array = p;
b.len = size;
b.cap = size;
for(;;) {
ms = runtime·maxstring;
if((uintptr)size <= ms || runtime·casp((void**)&runtime·maxstring, (void*)ms, (void*)size))
break;
}
func golock(p *Lock) {
runtime·lock(p);
}
#pragma textflag NOSPLIT
func gounlock(p *Lock) {
runtime·unlock(p);
}
#pragma textflag NOSPLIT
func rawbyteslice(size intgo) (b Slice) {
uintptr cap;
byte *p;
cap = runtime·roundupsize(size);
p = runtime·mallocgc(cap, 0, FlagNoScan|FlagNoZero);
if(cap != size)
runtime·memclr(p + size, cap - size);
b.array = p;
b.len = size;
b.cap = cap;
}
#pragma textflag NOSPLIT
func rawruneslice(size intgo) (b Slice) {
uintptr mem;
byte *p;
if(size > MaxMem/sizeof(int32))
runtime·throw("out of memory");
mem = runtime·roundupsize(size*sizeof(int32));
p = runtime·mallocgc(mem, 0, FlagNoScan|FlagNoZero);
if(mem != size*sizeof(int32))
runtime·memclr(p + size*sizeof(int32), mem - size*sizeof(int32));
b.array = p;
b.len = size;
b.cap = mem/sizeof(int32);
func goreadgogc() (r int32) {
r = runtime·readgogc();
}
// entry point for testing
@ -77,16 +44,38 @@ func gostringW(str Slice) (s String) {
}
#pragma textflag NOSPLIT
func runtime·unsafe_New(t *Type) (ret *byte) {
ret = runtime·cnew(t);
}
#pragma textflag NOSPLIT
func runtime·unsafe_NewArray(t *Type, n int) (ret *byte) {
ret = runtime·cnewarray(t, n);
func gonanotime() (r int64) {
r = runtime·nanotime();
}
#pragma textflag NOSPLIT
func runtime·gocas(p *uint32, x uint32, y uint32) (ret bool) {
ret = runtime·cas(p, x, y);
}
#pragma textflag NOSPLIT
func runtime·gocasx(p *uintptr, x uintptr, y uintptr) (ret bool) {
ret = runtime·casp((void**)p, (void*)x, (void*)y);
}
#pragma textflag NOSPLIT
func runtime·acquirem() (ret *M) {
ret = g->m;
ret->locks++;
}
#pragma textflag NOSPLIT
func runtime·releasem(mp *M) {
mp->locks--;
if(mp->locks == 0 && g->preempt) {
// restore the preemption request in case we've cleared it in newstack
g->stackguard0 = StackPreempt;
}
}
// For testing.
// TODO: find a better place for this.
func GCMask(x Eface) (mask Slice) {
runtime·getgcmask(x.data, x.type, &mask.array, &mask.len);
mask.cap = mask.len;
}


@ -350,3 +350,9 @@ runtime·callers(int32 skip, uintptr *pcbuf, int32 m)
return runtime·gentraceback(pc, sp, 0, g, skip, pcbuf, m, nil, nil, false);
}
int32
runtime·gcallers(G *gp, int32 skip, uintptr *pcbuf, int32 m)
{
return runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, skip, pcbuf, m, nil, nil, false);
}


@ -428,3 +428,9 @@ runtime·callers(int32 skip, uintptr *pcbuf, int32 m)
return runtime·gentraceback(pc, sp, 0, g, skip, pcbuf, m, nil, nil, false);
}
int32
runtime·gcallers(G *gp, int32 skip, uintptr *pcbuf, int32 m)
{
return runtime·gentraceback(~(uintptr)0, ~(uintptr)0, 0, gp, skip, pcbuf, m, nil, nil, false);
}


@ -397,9 +397,9 @@ func f27defer(b bool) {
func f27go(b bool) {
x := 0
if b {
go call27(func() {x++}) // ERROR "live at call to new: &x" "live at call to newproc: &x$"
go call27(func() {x++}) // ERROR "live at call to newobject: &x" "live at call to newproc: &x$"
}
go call27(func() {x++}) // ERROR "live at call to new: &x"
go call27(func() {x++}) // ERROR "live at call to newobject: &x"
println()
}
@ -461,7 +461,7 @@ func f31(b1, b2, b3 bool) {
g31("a") // ERROR "live at call to convT2E: autotmp_[0-9]+$" "live at call to g31: autotmp_[0-9]+$"
}
if b2 {
h31("b") // ERROR "live at call to new: autotmp_[0-9]+$" "live at call to convT2E: autotmp_[0-9]+ autotmp_[0-9]+$" "live at call to h31: autotmp_[0-9]+$"
h31("b") // ERROR "live at call to newobject: autotmp_[0-9]+$" "live at call to convT2E: autotmp_[0-9]+ autotmp_[0-9]+$" "live at call to h31: autotmp_[0-9]+$"
}
if b3 {
panic("asdf") // ERROR "live at call to convT2E: autotmp_[0-9]+$" "live at call to panic: autotmp_[0-9]+$"
@ -583,13 +583,13 @@ func f39a() (x []int) {
}
func f39b() (x [10]*int) {
x = [10]*int{new(int)} // ERROR "live at call to new: x"
x = [10]*int{new(int)} // ERROR "live at call to newobject: x"
println() // ERROR "live at call to printnl: x"
return x
}
func f39c() (x [10]*int) {
x = [10]*int{new(int)} // ERROR "live at call to new: x"
x = [10]*int{new(int)} // ERROR "live at call to newobject: x"
println() // ERROR "live at call to printnl: x"
return
}