runtime: process ptr bitmaps one word at a time

[This is a retry of CL 407036 + its revert CL 422394. The only content change is the 1-line change in cmd/internal/obj/objfile.go.] Read the bitmaps one uintptr at a time instead of one byte at a time. Performance so far: Allocation heavy, no retention: ~30% faster in heapBitsSetType Scan heavy, ~no allocation: ~even in scanobject Change-Id: I04d899e1dbd23e989e9f552cdc1880318779c14c Reviewed-on: https://go-review.googlesource.com/c/go/+/422635 TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Keith Randall <khr@google.com> Run-TryBot: Keith Randall <khr@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2024-11-02 11:50:30 +00:00 · 2022-05-06 18:40:17 -07:00 · 2022-05-06 18:40:17 -07:00 · e49e876455
commit e49e876455
parent 6a9c674a09
4 changed files with 91 additions and 18 deletions
--- a/src/cmd/compile/internal/reflectdata/reflect.go
+++ b/src/cmd/compile/internal/reflectdata/reflect.go
@ -1548,7 +1548,11 @@ func dgcsym(t *types.Type, write bool) (lsym *obj.LSym, useGCProg bool, ptrdata

 // dgcptrmask emits and returns the symbol containing a pointer mask for type t.
 func dgcptrmask(t *types.Type, write bool) *obj.LSym {
-	ptrmask := make([]byte, (types.PtrDataSize(t)/int64(types.PtrSize)+7)/8)
+	// Bytes we need for the ptrmask.
+	n := (types.PtrDataSize(t)/int64(types.PtrSize) + 7) / 8
+	// Runtime wants ptrmasks padded to a multiple of uintptr in size.
+	n = (n + int64(types.PtrSize) - 1) &^ (int64(types.PtrSize) - 1)
+	ptrmask := make([]byte, n)
 	fillptrmask(t, ptrmask)
 	p := fmt.Sprintf("runtime.gcbits.%x", ptrmask)

--- a/src/cmd/internal/obj/objfile.go
+++ b/src/cmd/internal/obj/objfile.go
@ -354,7 +354,6 @@ func (w *writer) Sym(s *LSym) {
 		case strings.HasPrefix(s.Name, "go:string."),
 			strings.HasPrefix(name, "type:.namedata."),
 			strings.HasPrefix(name, "type:.importpath."),
-			strings.HasPrefix(name, "runtime.gcbits."),
 			strings.HasSuffix(name, ".opendefer"),
 			strings.HasSuffix(name, ".arginfo0"),
 			strings.HasSuffix(name, ".arginfo1"),
--- a/src/reflect/type.go
+++ b/src/reflect/type.go
@ -2271,7 +2271,10 @@ func bucketOf(ktyp, etyp *rtype) *rtype {

 	if ktyp.ptrdata != 0 || etyp.ptrdata != 0 {
 		nptr := (bucketSize*(1+ktyp.size+etyp.size) + goarch.PtrSize) / goarch.PtrSize
-		mask := make([]byte, (nptr+7)/8)
+		n := (nptr + 7) / 8
+		// Runtime needs pointer masks to be a multiple of uintptr in size.
+		n = (n + goarch.PtrSize - 1) &^ (goarch.PtrSize - 1)
+		mask := make([]byte, n)
 		base := bucketSize / goarch.PtrSize

 		if ktyp.ptrdata != 0 {
@ -2977,7 +2980,10 @@ func ArrayOf(length int, elem Type) Type {
 		// Element is small with pointer mask; array is still small.
 		// Create direct pointer mask by turning each 1 bit in elem
 		// into length 1 bits in larger mask.
-		mask := make([]byte, (array.ptrdata/goarch.PtrSize+7)/8)
+		n := (array.ptrdata/goarch.PtrSize + 7) / 8
+		// Runtime needs pointer masks to be a multiple of uintptr in size.
+		n = (n + goarch.PtrSize - 1) &^ (goarch.PtrSize - 1)
+		mask := make([]byte, n)
 		emitGCMask(mask, 0, typ, array.len)
 		array.gcdata = &mask[0]

@ -3146,8 +3152,13 @@ type bitVector struct {

 // append a bit to the bitmap.
 func (bv *bitVector) append(bit uint8) {
-	if bv.n%8 == 0 {
-		bv.data = append(bv.data, 0)
+	if bv.n%(8*goarch.PtrSize) == 0 {
+		// Runtime needs pointer masks to be a multiple of uintptr in size.
+		// Since reflect passes bv.data directly to the runtime as a pointer mask,
+		// we append a full uintptr of zeros at a time.
+		for i := 0; i < goarch.PtrSize; i++ {
+			bv.data = append(bv.data, 0)
+		}
 	}
 	bv.data[bv.n/8] |= bit << (bv.n % 8)
 	bv.n++
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@ -893,6 +893,19 @@ func (h writeHeapBits) flush(addr, size uintptr) {
 	}
 }

+// Read the bytes starting at the aligned pointer p into a uintptr.
+// Read is little-endian.
+func readUintptr(p *byte) uintptr {
+	x := *(*uintptr)(unsafe.Pointer(p))
+	if goarch.BigEndian {
+		if goarch.PtrSize == 8 {
+			return uintptr(sys.Bswap64(uint64(x)))
+		}
+		return uintptr(sys.Bswap32(uint32(x)))
+	}
+	return x
+}
+
 // heapBitsSetType records that the new allocation [x, x+size)
 // holds in [x, x+dataSize) one or more values of type typ.
 // (The number of values is given by dataSize / typ.size.)
@ -917,7 +930,7 @@ func (h writeHeapBits) flush(addr, size uintptr) {
 // machines, callers must execute a store/store (publication) barrier
 // between calling this function and making the object reachable.
 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
-	const doubleCheck = true // slow but helpful; enable to test modifications to this code
+	const doubleCheck = false // slow but helpful; enable to test modifications to this code

 	if doubleCheck && dataSize%typ.size != 0 {
 		throw("heapBitsSetType: dataSize not a multiple of typ.size")
@ -995,19 +1008,65 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// objects with scalar tails, all but the last tail does have to
 	// be initialized, because there is no way to say "skip forward".

-	for i := uintptr(0); true; i += typ.size {
-		p := typ.gcdata
-		var j uintptr
-		for j = 0; j+8*goarch.PtrSize < typ.ptrdata; j += 8 * goarch.PtrSize {
-			h = h.write(uintptr(*p), 8)
-			p = add1(p)
+	ptrs := typ.ptrdata / goarch.PtrSize
+	if typ.size == dataSize { // Single element
+		if ptrs <= ptrBits { // Single small element
+			m := readUintptr(typ.gcdata)
+			h = h.write(m, ptrs)
+		} else { // Single large element
+			p := typ.gcdata
+			for {
+				h = h.write(readUintptr(p), ptrBits)
+				p = addb(p, ptrBits/8)
+				ptrs -= ptrBits
+				if ptrs <= ptrBits {
+					break
+				}
+			}
+			m := readUintptr(p)
+			h = h.write(m, ptrs)
 		}
-		h = h.write(uintptr(*p), (typ.ptrdata-j)/goarch.PtrSize)
-		if i+typ.size == dataSize {
-			break // don't need the trailing nonptr bits on the last element.
+	} else { // Repeated element
+		words := typ.size / goarch.PtrSize // total words, including scalar tail
+		if words <= ptrBits {              // Repeated small element
+			n := dataSize / typ.size
+			m := readUintptr(typ.gcdata)
+			// Make larger unit to repeat
+			for words <= ptrBits/2 {
+				if n&1 != 0 {
+					h = h.write(m, words)
+				}
+				n /= 2
+				m |= m << words
+				ptrs += words
+				words *= 2
+				if n == 1 {
+					break
+				}
+			}
+			for n > 1 {
+				h = h.write(m, words)
+				n--
+			}
+			h = h.write(m, ptrs)
+		} else { // Repeated large element
+			for i := uintptr(0); true; i += typ.size {
+				p := typ.gcdata
+				j := ptrs
+				for j > ptrBits {
+					h = h.write(readUintptr(p), ptrBits)
+					p = addb(p, ptrBits/8)
+					j -= ptrBits
+				}
+				m := readUintptr(p)
+				h = h.write(m, j)
+				if i+typ.size == dataSize {
+					break // don't need the trailing nonptr bits on the last element.
+				}
+				// Pad with zeros to the start of the next element.
+				h = h.pad(typ.size - typ.ptrdata)
+			}
 		}
-		// Pad with zeros to the start of the next element.
-		h = h.pad(typ.size - typ.ptrdata)
 	}
 	h.flush(x, size)