runtime: port performance-critical functions to regabi

This CL ports a few performance-critical runtime assembly functions to use register arguments directly. While using the faster ABI is nice, the real win here is that we avoid ABI wrappers: since these are "builtin" functions in the compiler, it can generate calls to them without knowing that their native implementation is ABI0. Hence, it generates ABIInternal calls that go through ABI wrappers. By porting them to use ABIInternal natively, we avoid the overhead of the ABI wrapper. This significantly improves performance on several benchmarks, comparing regabiwrappers before and after this change: name old time/op new time/op delta BiogoIgor 15.7s ± 2% 15.7s ± 2% ~ (p=0.617 n=25+25) BiogoKrishna 18.5s ± 5% 17.7s ± 2% -4.61% (p=0.000 n=25+25) BleveIndexBatch100 5.91s ± 3% 5.82s ± 3% -1.60% (p=0.000 n=25+25) BleveQuery 6.76s ± 0% 6.60s ± 1% -2.31% (p=0.000 n=22+25) CompileTemplate 248ms ± 5% 245ms ± 1% ~ (p=0.643 n=25+20) CompileUnicode 94.4ms ± 3% 93.9ms ± 2% ~ (p=0.152 n=24+23) CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.059 n=24+24) CompileCompiler 104ms ± 3% 103ms ± 1% ~ (p=0.056 n=25+22) CompileSSA 10.9s ± 1% 10.9s ± 1% ~ (p=0.052 n=25+25) CompileFlate 156ms ± 8% 152ms ± 1% -2.49% (p=0.008 n=25+21) CompileGoParser 248ms ± 1% 249ms ± 2% ~ (p=0.058 n=21+20) CompileReflect 595ms ± 3% 601ms ± 4% ~ (p=0.182 n=25+25) CompileTar 211ms ± 2% 211ms ± 1% ~ (p=0.663 n=23+23) CompileXML 282ms ± 2% 284ms ± 5% ~ (p=0.456 n=21+23) CompileStdCmd 13.6s ± 2% 13.5s ± 2% ~ (p=0.112 n=25+24) FoglemanFauxGLRenderRotateBoat 8.69s ± 2% 8.67s ± 0% ~ (p=0.094 n=22+25) FoglemanPathTraceRenderGopherIter1 20.2s ± 2% 20.7s ± 3% +2.53% (p=0.000 n=24+24) GopherLuaKNucleotide 31.4s ± 1% 31.0s ± 1% -1.28% (p=0.000 n=25+24) MarkdownRenderXHTML 246ms ± 1% 244ms ± 1% -0.79% (p=0.000 n=20+21) Tile38WithinCircle100kmRequest 843µs ± 4% 818µs ± 4% -2.93% (p=0.000 n=25+25) Tile38IntersectsCircle100kmRequest 1.06ms ± 5% 1.05ms ± 3% -1.19% (p=0.021 n=24+25) Tile38KNearestLimit100Request 1.01ms ± 1% 1.01ms ± 2% ~ (p=0.335 n=22+25) [Geo mean] 596ms 592ms -0.71% (https://perf.golang.org/search?q=upload:20210411.5) It also significantly reduces the performance penalty of enabling regabiwrappers, though it doesn't yet fully close the gap on all benchmarks: name old time/op new time/op delta BiogoIgor 15.7s ± 1% 15.7s ± 2% ~ (p=0.366 n=24+25) BiogoKrishna 17.7s ± 2% 17.7s ± 2% ~ (p=0.315 n=23+25) BleveIndexBatch100 5.86s ± 4% 5.82s ± 3% ~ (p=0.137 n=24+25) BleveQuery 6.55s ± 0% 6.60s ± 1% +0.83% (p=0.000 n=24+25) CompileTemplate 244ms ± 1% 245ms ± 1% ~ (p=0.208 n=21+20) CompileUnicode 94.0ms ± 4% 93.9ms ± 2% ~ (p=0.666 n=24+23) CompileGoTypes 1.60s ± 2% 1.59s ± 2% ~ (p=0.154 n=25+24) CompileCompiler 103ms ± 1% 103ms ± 1% ~ (p=0.905 n=24+22) CompileSSA 10.9s ± 2% 10.9s ± 1% ~ (p=0.803 n=25+25) CompileFlate 153ms ± 1% 152ms ± 1% ~ (p=0.182 n=23+21) CompileGoParser 250ms ± 2% 249ms ± 2% ~ (p=0.843 n=24+20) CompileReflect 595ms ± 4% 601ms ± 4% ~ (p=0.141 n=25+25) CompileTar 212ms ± 3% 211ms ± 1% ~ (p=0.499 n=23+23) CompileXML 282ms ± 1% 284ms ± 5% ~ (p=0.129 n=20+23) CompileStdCmd 13.5s ± 2% 13.5s ± 2% ~ (p=0.480 n=24+24) FoglemanFauxGLRenderRotateBoat 8.66s ± 1% 8.67s ± 0% ~ (p=0.325 n=25+25) FoglemanPathTraceRenderGopherIter1 20.6s ± 3% 20.7s ± 3% ~ (p=0.137 n=25+24) GopherLuaKNucleotide 30.5s ± 2% 31.0s ± 1% +1.68% (p=0.000 n=23+24) MarkdownRenderXHTML 243ms ± 1% 244ms ± 1% +0.51% (p=0.000 n=23+21) Tile38WithinCircle100kmRequest 801µs ± 2% 818µs ± 4% +2.11% (p=0.000 n=25+25) Tile38IntersectsCircle100kmRequest 1.01ms ± 2% 1.05ms ± 3% +4.34% (p=0.000 n=24+25) Tile38KNearestLimit100Request 1.00ms ± 1% 1.01ms ± 2% +0.81% (p=0.008 n=21+25) [Geo mean] 589ms 592ms +0.50% (https://perf.golang.org/search?q=upload:20210411.6) Change-Id: I8f77f010b0abc658064df569a27a9c7a7b1c7bf9 Reviewed-on: https://go-review.googlesource.com/c/go/+/308931 Trust: Austin Clements <austin@google.com> Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com> TryBot-Result: Go Bot <gobot@golang.org>
2024-09-15 22:20:06 +00:00 · 2021-04-08 17:43:51 -04:00 · 2021-04-08 17:43:51 -04:00 · 849dba07a5
parent 865d2bc78e
commit 849dba07a5
4 changed files with 174 additions and 58 deletions
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@ -1011,34 +1011,62 @@ done:

 // func memhash(p unsafe.Pointer, h, s uintptr) uintptr
 // hash function using AES hardware instructions
-TEXT runtime·memhash(SB),NOSPLIT,$0-32
+TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+	// CX = size
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifndef GOEXPERIMENT_regabiargs
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	s+16(FP), CX	// size
 	LEAQ	ret+24(FP), DX
+#endif
 	JMP	aeshashbody<>(SB)
 noaes:
-	JMP	runtime·memhashFallback(SB)
+	JMP	runtime·memhashFallback<ABIInternal>(SB)

 // func strhash(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·strhash(SB),NOSPLIT,$0-24
+TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to string struct
+	// BX = seed
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifndef GOEXPERIMENT_regabiargs
 	MOVQ	p+0(FP), AX	// ptr to string struct
+#endif
 	MOVQ	8(AX), CX	// length of string
 	MOVQ	(AX), AX	// string data
+#ifndef GOEXPERIMENT_regabiargs
 	LEAQ	ret+16(FP), DX
+#endif
 	JMP	aeshashbody<>(SB)
 noaes:
-	JMP	runtime·strhashFallback(SB)
+	JMP	runtime·strhashFallback<ABIInternal>(SB)

 // AX: data
+#ifdef GOEXPERIMENT_regabiargs
+// BX: hash seed
+#else
+// h+8(FP): hash seed
+#endif
 // CX: length
+#ifdef GOEXPERIMENT_regabiargs
+// At return: AX = return value
+#else
 // DX: address to put return value
+#endif
 TEXT aeshashbody<>(SB),NOSPLIT,$0-0
 	// Fill an SSE register with our seeds.
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0				// 64 bits of per-table hash seed
+#else
 	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
+#endif
 	PINSRW	$4, CX, X0			// 16 bits of length
 	PSHUFHW $0, X0, X0			// repeat length 4 times total
 	MOVO	X0, X1				// save unscrambled seed
@ -1075,7 +1103,11 @@ final1:
 	AESENC	X1, X1	// scramble combo 3 times
 	AESENC	X1, X1
 	AESENC	X1, X1
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X1, AX	// return X1
+#else
 	MOVQ	X1, (DX)
+#endif
 	RET

 endofpage:
@ -1091,7 +1123,11 @@ endofpage:
 aes0:
 	// Return scrambled input seed
 	AESENC	X0, X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, (DX)
+#endif
 	RET

 aes16:
@ -1121,7 +1157,11 @@ aes17to32:

 	// combine results
 	PXOR	X3, X2
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X2, AX	// return X2
+#else
 	MOVQ	X2, (DX)
+#endif
 	RET

 aes33to64:
@ -1163,7 +1203,11 @@ aes33to64:
 	PXOR	X6, X4
 	PXOR	X7, X5
 	PXOR	X5, X4
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X4, AX	// return X4
+#else
 	MOVQ	X4, (DX)
+#endif
 	RET

 aes65to128:
@ -1245,7 +1289,15 @@ aes65to128:
 	PXOR	X10, X8
 	PXOR	X11, X9
 	PXOR	X9, X8
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X8, AX	// return X8
+#else
 	MOVQ	X8, (DX)
+#endif
 	RET

 aes129plus:
@ -1361,38 +1413,73 @@ aesloop:
 	PXOR	X10, X8
 	PXOR	X11, X9
 	PXOR	X9, X8
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X8, AX	// return X8
+#else
 	MOVQ	X8, (DX)
+#endif
 	RET

 // func memhash32(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash32(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0	// X0 = seed
+#else
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
+#endif
 	PINSRD	$2, (AX), X0	// data
 	AESENC	runtime·aeskeysched+0(SB), X0
 	AESENC	runtime·aeskeysched+16(SB), X0
 	AESENC	runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, ret+16(FP)
+#endif
 	RET
 noaes:
-	JMP	runtime·memhash32Fallback(SB)
+	JMP	runtime·memhash32Fallback<ABIInternal>(SB)

 // func memhash64(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·memhash64(SB),NOSPLIT,$0-24
+// ABIInternal for performance.
+TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr to data
+	// BX = seed
+#else
+#endif
 	CMPB	runtime·useAeshash(SB), $0
 	JEQ	noaes
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	BX, X0	// X0 = seed
+#else
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
+#endif
 	PINSRQ	$1, (AX), X0	// data
 	AESENC	runtime·aeskeysched+0(SB), X0
 	AESENC	runtime·aeskeysched+16(SB), X0
 	AESENC	runtime·aeskeysched+32(SB), X0
+#ifdef GOEXPERIMENT_regabiargs
+	MOVQ	X0, AX	// return X0
+#else
 	MOVQ	X0, ret+16(FP)
+#endif
 	RET
 noaes:
-	JMP	runtime·memhash64Fallback(SB)
+	JMP	runtime·memhash64Fallback<ABIInternal>(SB)

 // simple mask to get rid of data in the high part of the register.
 DATA masks<>+0x00(SB)/8, $0x0000000000000000
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@ -12,9 +12,16 @@
 // See memclrNoHeapPointers Go doc for important implementation constraints.

 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
-TEXT runtime·memclrNoHeapPointers(SB), NOSPLIT, $0-16
+// ABIInternal for performance.
+TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT, $0-16
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = ptr
+	// BX = n
+	MOVQ	AX, DI	// DI = ptr
+#else
 	MOVQ	ptr+0(FP), DI
 	MOVQ	n+8(FP), BX
+#endif
 	XORQ	AX, AX

 	// MOVOU seems always faster than REP STOSQ.
@ -31,7 +38,9 @@ tail:
 	JE	_8
 	CMPQ	BX, $16
 	JBE	_9through16
-	PXOR	X0, X0
+#ifndef GOEXPERIMENT_regabig
+	PXOR	X15, X15
+#endif
 	CMPQ	BX, $32
 	JBE	_17through32
 	CMPQ	BX, $64
@ -45,22 +54,22 @@ tail:
 	// TODO: for really big clears, use MOVNTDQ, even without AVX2.

 loop:
-	MOVOU	X0, 0(DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, 128(DI)
-	MOVOU	X0, 144(DI)
-	MOVOU	X0, 160(DI)
-	MOVOU	X0, 176(DI)
-	MOVOU	X0, 192(DI)
-	MOVOU	X0, 208(DI)
-	MOVOU	X0, 224(DI)
-	MOVOU	X0, 240(DI)
+	MOVOU	X15, 0(DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, 64(DI)
+	MOVOU	X15, 80(DI)
+	MOVOU	X15, 96(DI)
+	MOVOU	X15, 112(DI)
+	MOVOU	X15, 128(DI)
+	MOVOU	X15, 144(DI)
+	MOVOU	X15, 160(DI)
+	MOVOU	X15, 176(DI)
+	MOVOU	X15, 192(DI)
+	MOVOU	X15, 208(DI)
+	MOVOU	X15, 224(DI)
+	MOVOU	X15, 240(DI)
 	SUBQ	$256, BX
 	ADDQ	$256, DI
 	CMPQ	BX, $256
@ -141,40 +150,40 @@ _9through16:
 	MOVQ	AX, -8(DI)(BX*1)
 	RET
 _17through32:
-	MOVOU	X0, (DI)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _33through64:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _65through128:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, -64(DI)(BX*1)
+	MOVOU	X15, -48(DI)(BX*1)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
 _129through256:
-	MOVOU	X0, (DI)
-	MOVOU	X0, 16(DI)
-	MOVOU	X0, 32(DI)
-	MOVOU	X0, 48(DI)
-	MOVOU	X0, 64(DI)
-	MOVOU	X0, 80(DI)
-	MOVOU	X0, 96(DI)
-	MOVOU	X0, 112(DI)
-	MOVOU	X0, -128(DI)(BX*1)
-	MOVOU	X0, -112(DI)(BX*1)
-	MOVOU	X0, -96(DI)(BX*1)
-	MOVOU	X0, -80(DI)(BX*1)
-	MOVOU	X0, -64(DI)(BX*1)
-	MOVOU	X0, -48(DI)(BX*1)
-	MOVOU	X0, -32(DI)(BX*1)
-	MOVOU	X0, -16(DI)(BX*1)
+	MOVOU	X15, (DI)
+	MOVOU	X15, 16(DI)
+	MOVOU	X15, 32(DI)
+	MOVOU	X15, 48(DI)
+	MOVOU	X15, 64(DI)
+	MOVOU	X15, 80(DI)
+	MOVOU	X15, 96(DI)
+	MOVOU	X15, 112(DI)
+	MOVOU	X15, -128(DI)(BX*1)
+	MOVOU	X15, -112(DI)(BX*1)
+	MOVOU	X15, -96(DI)(BX*1)
+	MOVOU	X15, -80(DI)(BX*1)
+	MOVOU	X15, -64(DI)(BX*1)
+	MOVOU	X15, -48(DI)(BX*1)
+	MOVOU	X15, -32(DI)(BX*1)
+	MOVOU	X15, -16(DI)(BX*1)
 	RET
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@ -31,11 +31,20 @@
 // See memmove Go doc for important implementation constraints.

 // func memmove(to, from unsafe.Pointer, n uintptr)
-TEXT runtime·memmove(SB), NOSPLIT, $0-24
-
+// ABIInternal for performance.
+TEXT runtime·memmove<ABIInternal>(SB), NOSPLIT, $0-24
+#ifdef GOEXPERIMENT_regabiargs
+	// AX = to
+	// BX = from
+	// CX = n
+	MOVQ	AX, DI
+	MOVQ	BX, SI
+	MOVQ	CX, BX
+#else
 	MOVQ	to+0(FP), DI
 	MOVQ	from+8(FP), SI
 	MOVQ	n+16(FP), BX
+#endif

 	// REP instructions have a high startup cost, so we handle small sizes
 	// with some straightline code. The REP MOVSQ instruction is really fast
@ -244,6 +253,10 @@ move_129through256:
 	MOVOU	X13, -48(DI)(BX*1)
 	MOVOU	X14, -32(DI)(BX*1)
 	MOVOU	X15, -16(DI)(BX*1)
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
 	RET
 move_256through2048:
 	SUBQ	$256, BX
@ -283,6 +296,10 @@ move_256through2048:
 	LEAQ	256(SI), SI
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
+#ifdef GOEXPERIMENT_regabig
+	// X15 must be zero on return
+	PXOR	X15, X15
+#endif
 	JMP	tail

 avxUnaligned:
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@ -109,6 +109,9 @@ func reflect_memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr) {
 //go:noescape
 func memmove(to, from unsafe.Pointer, n uintptr)

+// Outside assembly calls memmove. Make sure it has ABI wrappers.
+//go:linkname memmove
+
 //go:linkname reflect_memmove reflect.memmove
 func reflect_memmove(to, from unsafe.Pointer, n uintptr) {
 	memmove(to, from, n)