cmd/compile: add arch-specific inlining for runtime.memmove

This CL adds runtime.memmove inlining for AMD64 and ARM64.
According to the SSA dump from the test cases, the generic rules cannot
inline memmove properly because one of its arguments is a Phi operation.
That Phi op is optimized away by later optimization stages, so memmove
can still be inlined by arch-specific rules.
This commit adds new optimization rules to the arch-specific rule files
that inline runtime.memmove, where possible, during the lowering stage.
The optimization fires 5 times in the Go source code when regabi is used.

Fixes #41662

Change-Id: Iaffaf4c482d068b5f0683d141863892202cc8824
Reviewed-on: https://go-review.googlesource.com/c/go/+/289151
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: David Chase <drchase@google.com>
Ruslan Andreev 2021-01-19 22:30:10 +08:00 committed by Keith Randall
parent 07ff596404
commit 3b321a9d12
6 changed files with 191 additions and 3 deletions
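
For context, here is a minimal sketch (not part of this CL) of the kind of Go code whose copy the new rules can lower to a Move op. The function name below is hypothetical; the CL's own codegen tests at the end of this diff (moveArchLowering1 through moveArchLowering16) exercise the same pattern.

package p

// Hypothetical illustration: the bounds hint makes the copy length a
// constant 8, so after lowering the runtime.memmove call should be
// matched by the new AMD64/ARM64 rules and replaced with an inlined
// 8-byte move (compare moveArchLowering8 in the codegen test below).
func copyFixed8(dst []byte, src *[8]byte) {
	_ = dst[8]        // ensures len(dst) > 8, fixing the copy size at 8
	copy(dst, src[:]) // expected to compile without a memmove call
}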

@@ -2216,3 +2216,22 @@
(MOVOstore [dstOff] {dstSym} ptr (MOVOload [srcOff] {srcSym} (SB) _) mem) && symIsRO(srcSym) =>
	(MOVQstore [dstOff+8] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff)+8, config.ctxt.Arch.ByteOrder))])
		(MOVQstore [dstOff] {dstSym} ptr (MOVQconst [int64(read64(srcSym, int64(srcOff), config.ctxt.Arch.ByteOrder))]) mem))
// Arch-specific inlining for small or disjoint runtime.memmove
// Match post-lowering calls, memory version.
(SelectN [0] call:(CALLstatic {sym} s1:(MOVQstoreconst _ [sc] s2:(MOVQstore _ src s3:(MOVQstore _ dst mem)))))
	&& sc.Val64() >= 0
	&& isSameCall(sym, "runtime.memmove")
	&& s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
	&& isInlinableMemmove(dst, src, sc.Val64(), config)
	&& clobber(s1, s2, s3, call)
	=> (Move [sc.Val64()] dst src mem)
// Match post-lowering calls, register version.
(SelectN [0] call:(CALLstatic {sym} dst src (MOVQconst [sz]) mem))
	&& sz >= 0
	&& isSameCall(sym, "runtime.memmove")
	&& call.Uses == 1
	&& isInlinableMemmove(dst, src, sz, config)
	&& clobber(call)
	=> (Move [sz] dst src mem)

@@ -2859,3 +2859,12 @@
(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
(MOVWUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
(MOVDload [off] {sym} (SB) _) && symIsRO(sym) => (MOVDconst [int64(read64(sym, int64(off), config.ctxt.Arch.ByteOrder))])
// Arch-specific inlining for small or disjoint runtime.memmove
(SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem)))))
	&& sz >= 0
	&& isSameCall(sym, "runtime.memmove")
	&& s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
	&& isInlinableMemmove(dst, src, sz, config)
	&& clobber(s1, s2, s3, call)
	=> (Move [sz] dst src mem)

@@ -2065,7 +2065,7 @@
(SelectN [0] call:(StaticCall {sym} s1:(Store _ (Const(64|32) [sz]) s2:(Store _ src s3:(Store {t} _ dst mem)))))
	&& sz >= 0
	&& isSameCall(sym, "runtime.memmove")
	&& t.IsPtr() // avoids TUINTPTR, see issue 30061
	&& t.IsPtr() // avoids TUNSAFEPTR, see issue 30061
	&& s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1
	&& isInlinableMemmove(dst, src, int64(sz), config)
	&& clobber(s1, s2, s3, call)
@@ -2076,7 +2076,7 @@
	&& sz >= 0
	&& call.Uses == 1 // this will exclude all calls with results
	&& isSameCall(sym, "runtime.memmove")
	&& dst.Type.IsPtr() // avoids TUINTPTR, see issue 30061
	&& dst.Type.IsPtr() // avoids TUNSAFEPTR, see issue 30061
	&& isInlinableMemmove(dst, src, int64(sz), config)
	&& clobber(call)
	=> (Move {dst.Type.Elem()} [int64(sz)] dst src mem)
@@ -2086,7 +2086,7 @@
	&& sz >= 0
	&& call.Uses == 1 // this will exclude all calls with results
	&& isSameCall(sym, "runtime.memmove")
	&& dst.Type.IsPtr() // avoids TUINTPTR, see issue 30061
	&& dst.Type.IsPtr() // avoids TUNSAFEPTR, see issue 30061
	&& isInlinableMemmove(dst, src, int64(sz), config)
	&& clobber(call)
	=> (Move {dst.Type.Elem()} [int64(sz)] dst src mem)

@@ -1038,6 +1038,8 @@ func rewriteValueAMD64(v *Value) bool {
		return rewriteValueAMD64_OpSelect0(v)
	case OpSelect1:
		return rewriteValueAMD64_OpSelect1(v)
	case OpSelectN:
		return rewriteValueAMD64_OpSelectN(v)
	case OpSignExt16to32:
		v.Op = OpAMD64MOVWQSX
		return true
@@ -32981,6 +32983,78 @@ func rewriteValueAMD64_OpSelect1(v *Value) bool {
	}
	return false
}
func rewriteValueAMD64_OpSelectN(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block
	config := b.Func.Config
	// match: (SelectN [0] call:(CALLstatic {sym} s1:(MOVQstoreconst _ [sc] s2:(MOVQstore _ src s3:(MOVQstore _ dst mem)))))
	// cond: sc.Val64() >= 0 && isSameCall(sym, "runtime.memmove") && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && isInlinableMemmove(dst, src, sc.Val64(), config) && clobber(s1, s2, s3, call)
	// result: (Move [sc.Val64()] dst src mem)
	for {
		if auxIntToInt64(v.AuxInt) != 0 {
			break
		}
		call := v_0
		if call.Op != OpAMD64CALLstatic || len(call.Args) != 1 {
			break
		}
		sym := auxToCall(call.Aux)
		s1 := call.Args[0]
		if s1.Op != OpAMD64MOVQstoreconst {
			break
		}
		sc := auxIntToValAndOff(s1.AuxInt)
		_ = s1.Args[1]
		s2 := s1.Args[1]
		if s2.Op != OpAMD64MOVQstore {
			break
		}
		_ = s2.Args[2]
		src := s2.Args[1]
		s3 := s2.Args[2]
		if s3.Op != OpAMD64MOVQstore {
			break
		}
		mem := s3.Args[2]
		dst := s3.Args[1]
		if !(sc.Val64() >= 0 && isSameCall(sym, "runtime.memmove") && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && isInlinableMemmove(dst, src, sc.Val64(), config) && clobber(s1, s2, s3, call)) {
			break
		}
		v.reset(OpMove)
		v.AuxInt = int64ToAuxInt(sc.Val64())
		v.AddArg3(dst, src, mem)
		return true
	}
	// match: (SelectN [0] call:(CALLstatic {sym} dst src (MOVQconst [sz]) mem))
	// cond: sz >= 0 && isSameCall(sym, "runtime.memmove") && call.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(call)
	// result: (Move [sz] dst src mem)
	for {
		if auxIntToInt64(v.AuxInt) != 0 {
			break
		}
		call := v_0
		if call.Op != OpAMD64CALLstatic || len(call.Args) != 4 {
			break
		}
		sym := auxToCall(call.Aux)
		mem := call.Args[3]
		dst := call.Args[0]
		src := call.Args[1]
		call_2 := call.Args[2]
		if call_2.Op != OpAMD64MOVQconst {
			break
		}
		sz := auxIntToInt64(call_2.AuxInt)
		if !(sz >= 0 && isSameCall(sym, "runtime.memmove") && call.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(call)) {
			break
		}
		v.reset(OpMove)
		v.AuxInt = int64ToAuxInt(sz)
		v.AddArg3(dst, src, mem)
		return true
	}
	return false
}
func rewriteValueAMD64_OpSlicemask(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block

@@ -984,6 +984,8 @@ func rewriteValueARM64(v *Value) bool {
		return rewriteValueARM64_OpSelect0(v)
	case OpSelect1:
		return rewriteValueARM64_OpSelect1(v)
	case OpSelectN:
		return rewriteValueARM64_OpSelectN(v)
	case OpSignExt16to32:
		v.Op = OpARM64MOVHreg
		return true
@@ -25983,6 +25985,54 @@ func rewriteValueARM64_OpSelect1(v *Value) bool {
	}
	return false
}
func rewriteValueARM64_OpSelectN(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block
	config := b.Func.Config
	// match: (SelectN [0] call:(CALLstatic {sym} s1:(MOVDstore _ (MOVDconst [sz]) s2:(MOVDstore _ src s3:(MOVDstore {t} _ dst mem)))))
	// cond: sz >= 0 && isSameCall(sym, "runtime.memmove") && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(s1, s2, s3, call)
	// result: (Move [sz] dst src mem)
	for {
		if auxIntToInt64(v.AuxInt) != 0 {
			break
		}
		call := v_0
		if call.Op != OpARM64CALLstatic {
			break
		}
		sym := auxToCall(call.Aux)
		s1 := call.Args[0]
		if s1.Op != OpARM64MOVDstore {
			break
		}
		_ = s1.Args[2]
		s1_1 := s1.Args[1]
		if s1_1.Op != OpARM64MOVDconst {
			break
		}
		sz := auxIntToInt64(s1_1.AuxInt)
		s2 := s1.Args[2]
		if s2.Op != OpARM64MOVDstore {
			break
		}
		_ = s2.Args[2]
		src := s2.Args[1]
		s3 := s2.Args[2]
		if s3.Op != OpARM64MOVDstore {
			break
		}
		mem := s3.Args[2]
		dst := s3.Args[1]
		if !(sz >= 0 && isSameCall(sym, "runtime.memmove") && s1.Uses == 1 && s2.Uses == 1 && s3.Uses == 1 && isInlinableMemmove(dst, src, sz, config) && clobber(s1, s2, s3, call)) {
			break
		}
		v.reset(OpMove)
		v.AuxInt = int64ToAuxInt(sz)
		v.AddArg3(dst, src, mem)
		return true
	}
	return false
}
func rewriteValueARM64_OpSlicemask(v *Value) bool {
	v_0 := v.Args[0]
	b := v.Block

@@ -97,6 +97,42 @@ func moveDisjointNoOverlap(a *[256]byte) {
	copy(a[:], a[128:])
}
// Check arch-specific memmove lowering. See issue 41662 for details
func moveArchLowering1(b []byte, x *[1]byte) {
	_ = b[1]
	// amd64:-".*memmove"
	// arm64:-".*memmove"
	copy(b, x[:])
}
func moveArchLowering2(b []byte, x *[2]byte) {
	_ = b[2]
	// amd64:-".*memmove"
	// arm64:-".*memmove"
	copy(b, x[:])
}
func moveArchLowering4(b []byte, x *[4]byte) {
	_ = b[4]
	// amd64:-".*memmove"
	// arm64:-".*memmove"
	copy(b, x[:])
}
func moveArchLowering8(b []byte, x *[8]byte) {
	_ = b[8]
	// amd64:-".*memmove"
	// arm64:-".*memmove"
	copy(b, x[:])
}
func moveArchLowering16(b []byte, x *[16]byte) {
	_ = b[16]
	// amd64:-".*memmove"
	copy(b, x[:])
}
// Check that no branches are generated when the pointers are [not] equal.
func ptrEqual() {