mirror of
https://github.com/golang/go
synced 2024-11-02 11:50:30 +00:00
cmd/compile: use MOVBE instruction for GOAMD64>=v3
encoding/binary benchmark on my laptop: name old time/op new time/op delta ReadSlice1000Int32s-8 4.42µs ± 5% 4.20µs ± 1% -4.94% (p=0.046 n=9+8) ReadStruct-8 359ns ± 8% 368ns ± 5% +2.35% (p=0.041 n=9+10) WriteStruct-8 349ns ± 1% 357ns ± 1% +2.15% (p=0.000 n=8+10) ReadInts-8 235ns ± 1% 233ns ± 1% -1.01% (p=0.005 n=10+10) WriteInts-8 265ns ± 1% 274ns ± 1% +3.45% (p=0.000 n=10+10) WriteSlice1000Int32s-8 4.61µs ± 5% 4.59µs ± 5% ~ (p=0.986 n=10+10) PutUint16-8 0.56ns ± 4% 0.57ns ± 4% ~ (p=0.101 n=10+10) PutUint32-8 0.83ns ± 2% 0.56ns ± 6% -32.91% (p=0.000 n=10+10) PutUint64-8 0.81ns ± 3% 0.62ns ± 4% -23.82% (p=0.000 n=10+10) LittleEndianPutUint16-8 0.55ns ± 4% 0.55ns ± 3% ~ (p=0.926 n=10+10) LittleEndianPutUint32-8 0.41ns ± 4% 0.42ns ± 3% ~ (p=0.148 n=10+9) LittleEndianPutUint64-8 0.55ns ± 2% 0.56ns ± 6% ~ (p=0.897 n=10+10) ReadFloats-8 60.4ns ± 4% 59.0ns ± 1% -2.25% (p=0.007 n=10+10) WriteFloats-8 72.3ns ± 2% 71.5ns ± 7% ~ (p=0.089 n=10+10) ReadSlice1000Float32s-8 4.21µs ± 3% 4.18µs ± 2% ~ (p=0.197 n=10+10) WriteSlice1000Float32s-8 4.61µs ± 2% 4.68µs ± 7% ~ (p=1.000 n=8+10) ReadSlice1000Uint8s-8 250ns ± 4% 247ns ± 4% ~ (p=0.324 n=10+10) WriteSlice1000Uint8s-8 227ns ± 5% 229ns ± 2% ~ (p=0.193 n=10+7) PutUvarint32-8 15.3ns ± 2% 15.4ns ± 4% ~ (p=0.782 n=10+10) PutUvarint64-8 38.5ns ± 1% 38.6ns ± 5% ~ (p=0.396 n=8+10) name old speed new speed delta ReadSlice1000Int32s-8 890MB/s ±17% 953MB/s ± 1% +7.00% (p=0.027 n=10+8) ReadStruct-8 209MB/s ± 8% 204MB/s ± 5% -2.42% (p=0.043 n=9+10) WriteStruct-8 214MB/s ± 3% 210MB/s ± 1% -1.75% (p=0.003 n=9+10) ReadInts-8 127MB/s ± 1% 129MB/s ± 1% +1.01% (p=0.006 n=10+10) WriteInts-8 113MB/s ± 1% 109MB/s ± 1% -3.34% (p=0.000 n=10+10) WriteSlice1000Int32s-8 868MB/s ± 5% 872MB/s ± 5% ~ (p=1.000 n=10+10) PutUint16-8 3.55GB/s ± 4% 3.50GB/s ± 4% ~ (p=0.093 n=10+10) PutUint32-8 4.83GB/s ± 2% 7.21GB/s ± 6% +49.16% (p=0.000 n=10+10) PutUint64-8 9.89GB/s ± 3% 12.99GB/s ± 4% +31.26% (p=0.000 n=10+10) LittleEndianPutUint16-8 3.65GB/s ± 4% 3.65GB/s ± 4% 
~ (p=0.912 n=10+10) LittleEndianPutUint32-8 9.74GB/s ± 3% 9.63GB/s ± 3% ~ (p=0.222 n=9+9) LittleEndianPutUint64-8 14.4GB/s ± 2% 14.3GB/s ± 5% ~ (p=0.912 n=10+10) ReadFloats-8 199MB/s ± 4% 203MB/s ± 1% +2.27% (p=0.007 n=10+10) WriteFloats-8 166MB/s ± 2% 168MB/s ± 7% ~ (p=0.089 n=10+10) ReadSlice1000Float32s-8 949MB/s ± 3% 958MB/s ± 2% ~ (p=0.218 n=10+10) WriteSlice1000Float32s-8 867MB/s ± 2% 857MB/s ± 6% ~ (p=1.000 n=8+10) ReadSlice1000Uint8s-8 4.00GB/s ± 4% 4.06GB/s ± 4% ~ (p=0.353 n=10+10) WriteSlice1000Uint8s-8 4.40GB/s ± 4% 4.36GB/s ± 2% ~ (p=0.193 n=10+7) PutUvarint32-8 262MB/s ± 2% 260MB/s ± 4% ~ (p=0.739 n=10+10) PutUvarint64-8 208MB/s ± 1% 207MB/s ± 5% ~ (p=0.408 n=8+10) Updates #45453 Change-Id: Ifda0d48d54665cef45d46d3aad974062633142c4 Reviewed-on: https://go-review.googlesource.com/c/go/+/354670 Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Matthew Dempsky <mdempsky@google.com>
This commit is contained in:
parent
c091767d87
commit
3e5cc4d6f6
7 changed files with 389 additions and 20 deletions
|
@ -772,7 +772,9 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = x
|
||||
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
|
||||
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
|
||||
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
|
||||
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.From.Type = obj.TYPE_MEM
|
||||
p.From.Reg = v.Args[0].Reg()
|
||||
|
@ -788,7 +790,8 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
p.To.Reg = v.Reg()
|
||||
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
|
||||
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
|
||||
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify:
|
||||
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
|
||||
ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore:
|
||||
p := s.Prog(v.Op.Asm())
|
||||
p.From.Type = obj.TYPE_REG
|
||||
p.From.Reg = v.Args[1].Reg()
|
||||
|
|
|
@ -53,7 +53,9 @@ func TestGoAMD64v1(t *testing.T) {
|
|||
opcodes := map[string]bool{}
|
||||
var features []string
|
||||
for feature, opcodeList := range featureToOpcodes {
|
||||
features = append(features, fmt.Sprintf("cpu.%s=off", feature))
|
||||
if runtimeFeatures[feature] {
|
||||
features = append(features, fmt.Sprintf("cpu.%s=off", feature))
|
||||
}
|
||||
for _, op := range opcodeList {
|
||||
opcodes[op] = true
|
||||
}
|
||||
|
@ -204,14 +206,28 @@ func clobber(t *testing.T, src string, dst *os.File, opcodes map[string]bool) {
|
|||
f.Close()
|
||||
}
|
||||
|
||||
// setOf builds a set from its arguments: it returns a map in which every
// given key maps to true. Called with no keys it returns an empty, non-nil map.
func setOf(keys ...string) map[string]bool {
|
||||
m := make(map[string]bool, len(keys))
|
||||
for _, key := range keys {
|
||||
m[key] = true
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// runtimeFeatures is the set of CPU feature names for which TestGoAMD64v1
// adds a "cpu.<feature>=off" entry to its feature list; feature names that
// are not in this set get no such entry.
// NOTE(review): presumably these are the names the runtime's cpu.* debug
// knobs recognize — confirm against the runtime before adding to the list.
var runtimeFeatures = setOf(
|
||||
"adx", "aes", "avx", "avx2", "bmi1", "bmi2", "erms", "fma",
|
||||
"pclmulqdq", "popcnt", "rdtscp", "sse3", "sse41", "sse42", "ssse3",
|
||||
)
|
||||
|
||||
var featureToOpcodes = map[string][]string{
|
||||
// Note: we include *q, *l, and plain opcodes here.
|
||||
// go tool objdump doesn't include a [QL] on popcnt instructions, until CL 351889
|
||||
// native objdump doesn't include [QL] on linux.
|
||||
"popcnt": []string{"popcntq", "popcntl", "popcnt"},
|
||||
"bmi1": []string{"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
|
||||
"sse41": []string{"roundsd"},
|
||||
"fma": []string{"vfmadd231sd"},
|
||||
"popcnt": {"popcntq", "popcntl", "popcnt"},
|
||||
"bmi1": {"andnq", "andnl", "andn", "blsiq", "blsil", "blsi", "blsmskq", "blsmskl", "blsmsk", "blsrq", "blsrl", "blsr", "tzcntq", "tzcntl", "tzcnt"},
|
||||
"sse41": {"roundsd"},
|
||||
"fma": {"vfmadd231sd"},
|
||||
"movbe": {"movbeqq", "movbeq", "movbell", "movbel", "movbe"},
|
||||
}
|
||||
|
||||
// Test to use POPCNT instruction, if available
|
||||
|
@ -364,5 +380,4 @@ func TestFMA(t *testing.T) {
|
|||
t.Errorf("FMA(%f,%f,%f) = %f, want %f", tt.x, tt.y, tt.z, got, tt.want)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2219,3 +2219,29 @@
|
|||
(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x)
|
||||
|
||||
(BSWAP(Q|L) (BSWAP(Q|L) p)) => p
|
||||
|
||||
// CPUID feature: MOVBE.
|
||||
// Fold a byte swap into the adjacent 32/64-bit store or load, producing a
// single MOVBE op. Only done when the swapped value (or the load being
// swapped) has no other use and the target is GOAMD64 >= 3.
(MOV(Q|L)store [i] {s} p x:(BSWAP(Q|L) w) mem) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)store [i] {s} p w mem)
|
||||
(BSWAP(Q|L) x:(MOV(Q|L)load [i] {s} p mem)) && x.Uses == 1 && buildcfg.GOAMD64 >= 3 => (MOVBE(Q|L)load [i] {s} p mem)
|
||||
// A byte swap stacked on a MOVBE load, or feeding a MOVBE store, cancels the
// swap the MOVBE op performs, so the pair reduces to a plain load or store.
(BSWAP(Q|L) (MOVBE(Q|L)load [i] {s} p m)) => (MOV(Q|L)load [i] {s} p m)
|
||||
(MOVBE(Q|L)store [i] {s} p (BSWAP(Q|L) x) m) => (MOV(Q|L)store [i] {s} p x m)
|
||||
|
||||
// Merge two byte-swapped 32-bit loads from the same base pointer into one
// byte-swapped 64-bit load. x1 is shifted into the upper 32 bits and is read
// at offset i1; x0 supplies the lower 32 bits and is read at offset i1+4, so
// the combined load starts at i1. Each of x0, x1 and sh must have a single
// use, since all three are clobbered by the rewrite.
(ORQ x0:(MOVBELload [i0] {s} p mem)
|
||||
sh:(SHLQconst [32] x1:(MOVBELload [i1] {s} p mem)))
|
||||
&& i0 == i1+4
|
||||
&& x0.Uses == 1
|
||||
&& x1.Uses == 1
|
||||
&& sh.Uses == 1
|
||||
&& mergePoint(b,x0,x1) != nil
|
||||
&& clobber(x0, x1, sh)
|
||||
=> @mergePoint(b,x0,x1) (MOVBEQload [i1] {s} p mem)
|
||||
|
||||
// Merge two byte-swapped 32-bit loads that use the same offset but different
// base pointers into one byte-swapped 64-bit load.
// sequentialAddresses(p1, p0, 4) proves p1+4 == p0, so x1 (the half shifted
// into the upper 32 bits) lives at the lower address p1 and x0 (the lower 32
// bits) at p1+4. A big-endian 64-bit load covering both halves must therefore
// start at p1; starting at p0 would read bytes [p1+4, p1+12) instead.
// Each of x0, x1 and sh must have a single use, since all three are clobbered.
(ORQ x0:(MOVBELload [i] {s} p0 mem)
|
||||
sh:(SHLQconst [32] x1:(MOVBELload [i] {s} p1 mem)))
|
||||
&& x0.Uses == 1
|
||||
&& x1.Uses == 1
|
||||
&& sh.Uses == 1
|
||||
&& sequentialAddresses(p1, p0, 4)
|
||||
&& mergePoint(b,x0,x1) != nil
|
||||
&& clobber(x0, x1, sh)
|
||||
=> @mergePoint(b,x0,x1) (MOVBEQload [i] {s} p1 mem)
|
||||
|
|
|
@ -922,6 +922,12 @@ func init() {
|
|||
// and BSFQ(0) is undefined. Same for TZCNTL(0)==32
|
||||
{name: "TZCNTQ", argLength: 1, reg: gp11, asm: "TZCNTQ", clobberFlags: true},
|
||||
{name: "TZCNTL", argLength: 1, reg: gp11, asm: "TZCNTL", clobberFlags: true},
|
||||
|
||||
// CPUID feature: MOVBE
|
||||
{name: "MOVBELload", argLength: 2, reg: gpload, asm: "MOVBEL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
|
||||
{name: "MOVBELstore", argLength: 3, reg: gpstore, asm: "MOVBEL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
|
||||
{name: "MOVBEQload", argLength: 2, reg: gpload, asm: "MOVBEQ", aux: "SymOff", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load and swap 8 bytes from arg0+auxint+aux. arg1=mem
|
||||
{name: "MOVBEQstore", argLength: 3, reg: gpstore, asm: "MOVBEQ", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // swap and store 8 bytes in arg1 to arg0+auxint+aux. arg2=mem
|
||||
}
|
||||
|
||||
var AMD64blocks = []blockData{
|
||||
|
|
|
@ -1043,6 +1043,10 @@ const (
|
|||
OpAMD64BLSRL
|
||||
OpAMD64TZCNTQ
|
||||
OpAMD64TZCNTL
|
||||
OpAMD64MOVBELload
|
||||
OpAMD64MOVBELstore
|
||||
OpAMD64MOVBEQload
|
||||
OpAMD64MOVBEQstore
|
||||
|
||||
OpARMADD
|
||||
OpARMADDconst
|
||||
|
@ -13780,6 +13784,66 @@ var opcodeTable = [...]opInfo{
|
|||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MOVBELload",
|
||||
auxType: auxSymOff,
|
||||
argLen: 2,
|
||||
faultOnNilArg0: true,
|
||||
symEffect: SymRead,
|
||||
asm: x86.AMOVBEL,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MOVBELstore",
|
||||
auxType: auxSymOff,
|
||||
argLen: 3,
|
||||
faultOnNilArg0: true,
|
||||
symEffect: SymWrite,
|
||||
asm: x86.AMOVBEL,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{1, 49151}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MOVBEQload",
|
||||
auxType: auxSymOff,
|
||||
argLen: 2,
|
||||
faultOnNilArg0: true,
|
||||
symEffect: SymRead,
|
||||
asm: x86.AMOVBEQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB
|
||||
},
|
||||
outputs: []outputInfo{
|
||||
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "MOVBEQstore",
|
||||
auxType: auxSymOff,
|
||||
argLen: 3,
|
||||
faultOnNilArg0: true,
|
||||
symEffect: SymWrite,
|
||||
asm: x86.AMOVBEQ,
|
||||
reg: regInfo{
|
||||
inputs: []inputInfo{
|
||||
{1, 49151}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15
|
||||
{0, 4295032831}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 g R15 SB
|
||||
},
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
name: "ADD",
|
||||
|
|
|
@ -222,6 +222,10 @@ func rewriteValueAMD64(v *Value) bool {
|
|||
return rewriteValueAMD64_OpAMD64LEAQ4(v)
|
||||
case OpAMD64LEAQ8:
|
||||
return rewriteValueAMD64_OpAMD64LEAQ8(v)
|
||||
case OpAMD64MOVBELstore:
|
||||
return rewriteValueAMD64_OpAMD64MOVBELstore(v)
|
||||
case OpAMD64MOVBEQstore:
|
||||
return rewriteValueAMD64_OpAMD64MOVBEQstore(v)
|
||||
case OpAMD64MOVBQSX:
|
||||
return rewriteValueAMD64_OpAMD64MOVBQSX(v)
|
||||
case OpAMD64MOVBQSXload:
|
||||
|
@ -3623,6 +3627,43 @@ func rewriteValueAMD64_OpAMD64BSWAPL(v *Value) bool {
|
|||
v.copyOf(p)
|
||||
return true
|
||||
}
|
||||
// match: (BSWAPL x:(MOVLload [i] {s} p mem))
|
||||
// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
|
||||
// result: (MOVBELload [i] {s} p mem)
|
||||
for {
|
||||
x := v_0
|
||||
if x.Op != OpAMD64MOVLload {
|
||||
break
|
||||
}
|
||||
i := auxIntToInt32(x.AuxInt)
|
||||
s := auxToSym(x.Aux)
|
||||
mem := x.Args[1]
|
||||
p := x.Args[0]
|
||||
if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64MOVBELload)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg2(p, mem)
|
||||
return true
|
||||
}
|
||||
// match: (BSWAPL (MOVBELload [i] {s} p m))
|
||||
// result: (MOVLload [i] {s} p m)
|
||||
for {
|
||||
if v_0.Op != OpAMD64MOVBELload {
|
||||
break
|
||||
}
|
||||
i := auxIntToInt32(v_0.AuxInt)
|
||||
s := auxToSym(v_0.Aux)
|
||||
m := v_0.Args[1]
|
||||
p := v_0.Args[0]
|
||||
v.reset(OpAMD64MOVLload)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg2(p, m)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64BSWAPQ(v *Value) bool {
|
||||
|
@ -3637,6 +3678,43 @@ func rewriteValueAMD64_OpAMD64BSWAPQ(v *Value) bool {
|
|||
v.copyOf(p)
|
||||
return true
|
||||
}
|
||||
// match: (BSWAPQ x:(MOVQload [i] {s} p mem))
|
||||
// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
|
||||
// result: (MOVBEQload [i] {s} p mem)
|
||||
for {
|
||||
x := v_0
|
||||
if x.Op != OpAMD64MOVQload {
|
||||
break
|
||||
}
|
||||
i := auxIntToInt32(x.AuxInt)
|
||||
s := auxToSym(x.Aux)
|
||||
mem := x.Args[1]
|
||||
p := x.Args[0]
|
||||
if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64MOVBEQload)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg2(p, mem)
|
||||
return true
|
||||
}
|
||||
// match: (BSWAPQ (MOVBEQload [i] {s} p m))
|
||||
// result: (MOVQload [i] {s} p m)
|
||||
for {
|
||||
if v_0.Op != OpAMD64MOVBEQload {
|
||||
break
|
||||
}
|
||||
i := auxIntToInt32(v_0.AuxInt)
|
||||
s := auxToSym(v_0.Aux)
|
||||
m := v_0.Args[1]
|
||||
p := v_0.Args[0]
|
||||
v.reset(OpAMD64MOVQload)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg2(p, m)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64BTCLconst(v *Value) bool {
|
||||
|
@ -9395,6 +9473,52 @@ func rewriteValueAMD64_OpAMD64LEAQ8(v *Value) bool {
|
|||
}
|
||||
return false
|
||||
}
|
||||
// rewriteValueAMD64_OpAMD64MOVBELstore tries the rewrite rules for a
// MOVBELstore value. When the stored value is a BSWAPL, v is rewritten in
// place into a MOVLstore of the BSWAPL's argument, keeping the same aux
// offset, symbol, pointer and memory. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64MOVBELstore(v *Value) bool {
|
||||
v_2 := v.Args[2]
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (MOVBELstore [i] {s} p (BSWAPL x) m)
|
||||
// result: (MOVLstore [i] {s} p x m)
|
||||
for {
|
||||
i := auxIntToInt32(v.AuxInt)
|
||||
s := auxToSym(v.Aux)
|
||||
p := v_0
|
||||
if v_1.Op != OpAMD64BSWAPL {
|
||||
break
|
||||
}
|
||||
x := v_1.Args[0]
|
||||
m := v_2
|
||||
v.reset(OpAMD64MOVLstore)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg3(p, x, m)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
// rewriteValueAMD64_OpAMD64MOVBEQstore tries the rewrite rules for a
// MOVBEQstore value. When the stored value is a BSWAPQ, v is rewritten in
// place into a MOVQstore of the BSWAPQ's argument, keeping the same aux
// offset, symbol, pointer and memory. It reports whether v was rewritten.
func rewriteValueAMD64_OpAMD64MOVBEQstore(v *Value) bool {
|
||||
v_2 := v.Args[2]
|
||||
v_1 := v.Args[1]
|
||||
v_0 := v.Args[0]
|
||||
// match: (MOVBEQstore [i] {s} p (BSWAPQ x) m)
|
||||
// result: (MOVQstore [i] {s} p x m)
|
||||
for {
|
||||
i := auxIntToInt32(v.AuxInt)
|
||||
s := auxToSym(v.Aux)
|
||||
p := v_0
|
||||
if v_1.Op != OpAMD64BSWAPQ {
|
||||
break
|
||||
}
|
||||
x := v_1.Args[0]
|
||||
m := v_2
|
||||
v.reset(OpAMD64MOVQstore)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg3(p, x, m)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64MOVBQSX(v *Value) bool {
|
||||
v_0 := v.Args[0]
|
||||
b := v.Block
|
||||
|
@ -12225,6 +12349,28 @@ func rewriteValueAMD64_OpAMD64MOVLstore(v *Value) bool {
|
|||
v.AddArg3(ptr, val, mem)
|
||||
return true
|
||||
}
|
||||
// match: (MOVLstore [i] {s} p x:(BSWAPL w) mem)
|
||||
// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
|
||||
// result: (MOVBELstore [i] {s} p w mem)
|
||||
for {
|
||||
i := auxIntToInt32(v.AuxInt)
|
||||
s := auxToSym(v.Aux)
|
||||
p := v_0
|
||||
x := v_1
|
||||
if x.Op != OpAMD64BSWAPL {
|
||||
break
|
||||
}
|
||||
w := x.Args[0]
|
||||
mem := v_2
|
||||
if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64MOVBELstore)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg3(p, w, mem)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64MOVLstoreconst(v *Value) bool {
|
||||
|
@ -13164,6 +13310,28 @@ func rewriteValueAMD64_OpAMD64MOVQstore(v *Value) bool {
|
|||
v.AddArg3(ptr, val, mem)
|
||||
return true
|
||||
}
|
||||
// match: (MOVQstore [i] {s} p x:(BSWAPQ w) mem)
|
||||
// cond: x.Uses == 1 && buildcfg.GOAMD64 >= 3
|
||||
// result: (MOVBEQstore [i] {s} p w mem)
|
||||
for {
|
||||
i := auxIntToInt32(v.AuxInt)
|
||||
s := auxToSym(v.Aux)
|
||||
p := v_0
|
||||
x := v_1
|
||||
if x.Op != OpAMD64BSWAPQ {
|
||||
break
|
||||
}
|
||||
w := x.Args[0]
|
||||
mem := v_2
|
||||
if !(x.Uses == 1 && buildcfg.GOAMD64 >= 3) {
|
||||
break
|
||||
}
|
||||
v.reset(OpAMD64MOVBEQstore)
|
||||
v.AuxInt = int32ToAuxInt(i)
|
||||
v.Aux = symToAux(s)
|
||||
v.AddArg3(p, w, mem)
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64MOVQstoreconst(v *Value) bool {
|
||||
|
@ -18657,6 +18825,81 @@ func rewriteValueAMD64_OpAMD64ORQ(v *Value) bool {
|
|||
}
|
||||
break
|
||||
}
|
||||
// match: (ORQ x0:(MOVBELload [i0] {s} p mem) sh:(SHLQconst [32] x1:(MOVBELload [i1] {s} p mem)))
|
||||
// cond: i0 == i1+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
|
||||
// result: @mergePoint(b,x0,x1) (MOVBEQload [i1] {s} p mem)
|
||||
for {
|
||||
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
||||
x0 := v_0
|
||||
if x0.Op != OpAMD64MOVBELload {
|
||||
continue
|
||||
}
|
||||
i0 := auxIntToInt32(x0.AuxInt)
|
||||
s := auxToSym(x0.Aux)
|
||||
mem := x0.Args[1]
|
||||
p := x0.Args[0]
|
||||
sh := v_1
|
||||
if sh.Op != OpAMD64SHLQconst || auxIntToInt8(sh.AuxInt) != 32 {
|
||||
continue
|
||||
}
|
||||
x1 := sh.Args[0]
|
||||
if x1.Op != OpAMD64MOVBELload {
|
||||
continue
|
||||
}
|
||||
i1 := auxIntToInt32(x1.AuxInt)
|
||||
if auxToSym(x1.Aux) != s {
|
||||
continue
|
||||
}
|
||||
_ = x1.Args[1]
|
||||
if p != x1.Args[0] || mem != x1.Args[1] || !(i0 == i1+4 && x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
|
||||
continue
|
||||
}
|
||||
b = mergePoint(b, x0, x1)
|
||||
v0 := b.NewValue0(x1.Pos, OpAMD64MOVBEQload, typ.UInt64)
|
||||
v.copyOf(v0)
|
||||
v0.AuxInt = int32ToAuxInt(i1)
|
||||
v0.Aux = symToAux(s)
|
||||
v0.AddArg2(p, mem)
|
||||
return true
|
||||
}
|
||||
break
|
||||
}
|
||||
// match: (ORQ x0:(MOVBELload [i] {s} p0 mem) sh:(SHLQconst [32] x1:(MOVBELload [i] {s} p1 mem)))
|
||||
// cond: x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p1, p0, 4) && mergePoint(b,x0,x1) != nil && clobber(x0, x1, sh)
|
||||
// result: @mergePoint(b,x0,x1) (MOVBEQload [i] {s} p1 mem)
|
||||
for {
|
||||
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
|
||||
x0 := v_0
|
||||
if x0.Op != OpAMD64MOVBELload {
|
||||
continue
|
||||
}
|
||||
i := auxIntToInt32(x0.AuxInt)
|
||||
s := auxToSym(x0.Aux)
|
||||
mem := x0.Args[1]
|
||||
p0 := x0.Args[0]
|
||||
sh := v_1
|
||||
if sh.Op != OpAMD64SHLQconst || auxIntToInt8(sh.AuxInt) != 32 {
|
||||
continue
|
||||
}
|
||||
x1 := sh.Args[0]
|
||||
if x1.Op != OpAMD64MOVBELload || auxIntToInt32(x1.AuxInt) != i || auxToSym(x1.Aux) != s {
|
||||
continue
|
||||
}
|
||||
_ = x1.Args[1]
|
||||
p1 := x1.Args[0]
|
||||
if mem != x1.Args[1] || !(x0.Uses == 1 && x1.Uses == 1 && sh.Uses == 1 && sequentialAddresses(p1, p0, 4) && mergePoint(b, x0, x1) != nil && clobber(x0, x1, sh)) {
|
||||
continue
|
||||
}
|
||||
b = mergePoint(b, x0, x1)
|
||||
v0 := b.NewValue0(x1.Pos, OpAMD64MOVBEQload, typ.UInt64)
|
||||
v.copyOf(v0)
|
||||
v0.AuxInt = int32ToAuxInt(i)
|
||||
v0.Aux = symToAux(s)
|
||||
// The condition proves p1+4 == p0, and x1 (read through p1) is the half
|
||||
// shifted into the upper 32 bits, so the merged big-endian 64-bit load
|
||||
// must start at the lower address p1, not p0.
v0.AddArg2(p1, mem)
|
||||
return true
|
||||
}
|
||||
break
|
||||
}
|
||||
return false
|
||||
}
|
||||
func rewriteValueAMD64_OpAMD64ORQconst(v *Value) bool {
|
||||
|
|
|
@ -70,7 +70,8 @@ func load_le16_idx(b []byte, idx int) {
|
|||
}
|
||||
|
||||
func load_be64(b []byte) {
|
||||
// amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v3:`MOVBEQ`
|
||||
// s390x:`MOVD\s\(.*\),`
|
||||
// arm64:`REV`,`MOVD\s\(R[0-9]+\),`,-`MOV[BHW]`,-`REVW`,-`REV16W`
|
||||
// ppc64le:`MOVDBR`,-`MOV[BHW]Z`
|
||||
|
@ -78,7 +79,8 @@ func load_be64(b []byte) {
|
|||
}
|
||||
|
||||
func load_be64_idx(b []byte, idx int) {
|
||||
// amd64:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v3: `MOVBEQ`
|
||||
// s390x:`MOVD\s\(.*\)\(.*\*1\),`
|
||||
// arm64:`REV`,`MOVD\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[WHB]`,-`REVW`,-`REV16W`
|
||||
// ppc64le:`MOVDBR`,-`MOV[BHW]Z`
|
||||
|
@ -86,7 +88,8 @@ func load_be64_idx(b []byte, idx int) {
|
|||
}
|
||||
|
||||
func load_be32(b []byte) {
|
||||
// amd64:`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v3: `MOVBEL`
|
||||
// s390x:`MOVWZ\s\(.*\),`
|
||||
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\),`,-`MOV[BH]`,-`REV16W`
|
||||
// ppc64le:`MOVWBR`,-`MOV[BH]Z`
|
||||
|
@ -94,7 +97,8 @@ func load_be32(b []byte) {
|
|||
}
|
||||
|
||||
func load_be32_idx(b []byte, idx int) {
|
||||
// amd64:`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v3: `MOVBEL`
|
||||
// s390x:`MOVWZ\s\(.*\)\(.*\*1\),`
|
||||
// arm64:`REVW`,`MOVWU\s\(R[0-9]+\)\(R[0-9]+\),`,-`MOV[HB]`,-`REV16W`
|
||||
// ppc64le:`MOVWBR`,-`MOV[BH]Z`
|
||||
|
@ -179,7 +183,8 @@ func load_be_byte4_uint32(s []byte) uint32 {
|
|||
|
||||
func load_be_byte4_uint32_inv(s []byte) uint32 {
|
||||
// arm64:`MOVWU\t\(R[0-9]+\)`,`REVW`,-`ORR`,-`REV16W`,-`MOV[BH]`
|
||||
// amd64:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`MOVL\s\([A-Z]+\)`,`BSWAPL`,-`MOV[BW]`,-`OR`
|
||||
// amd64/v3: `MOVBEL`
|
||||
return uint32(s[3]) | uint32(s[2])<<8 | uint32(s[1])<<16 | uint32(s[0])<<24
|
||||
}
|
||||
|
||||
|
@ -191,7 +196,8 @@ func load_be_byte8_uint64(s []byte) uint64 {
|
|||
|
||||
func load_be_byte8_uint64_inv(s []byte) uint64 {
|
||||
// arm64:`MOVD\t\(R[0-9]+\)`,`REV`,-`ORR`,-`REVW`,-`REV16W`,-`MOV[BHW]`
|
||||
// amd64:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v1,amd64/v2:`MOVQ\s\([A-Z]+\),\s[A-Z]+`,`BSWAPQ`,-`MOV[BWL]\t[^$]`,-`OR`
|
||||
// amd64/v3: `MOVBEQ`
|
||||
// ppc64le:`MOVDBR\t\(R[0-9]+\)`,-`MOV[BHW]Z`
|
||||
return uint64(s[7]) | uint64(s[6])<<8 | uint64(s[5])<<16 | uint64(s[4])<<24 | uint64(s[3])<<32 | uint64(s[2])<<40 | uint64(s[1])<<48 | uint64(s[0])<<56
|
||||
}
|
||||
|
@ -409,7 +415,8 @@ func store_le16_idx(b []byte, idx int) {
|
|||
}
|
||||
|
||||
func store_be64(b []byte) {
|
||||
// amd64:`BSWAPQ`,-`SHR.`
|
||||
// amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.`
|
||||
// amd64/v3: `MOVBEQ`
|
||||
// arm64:`MOVD`,`REV`,-`MOV[WBH]`,-`REVW`,-`REV16W`
|
||||
// ppc64le:`MOVDBR`
|
||||
// s390x:`MOVD\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
|
||||
|
@ -417,7 +424,8 @@ func store_be64(b []byte) {
|
|||
}
|
||||
|
||||
func store_be64_idx(b []byte, idx int) {
|
||||
// amd64:`BSWAPQ`,-`SHR.`
|
||||
// amd64/v1,amd64/v2:`BSWAPQ`,-`SHR.`
|
||||
// amd64/v3:`MOVBEQ`
|
||||
// arm64:`REV`,`MOVD\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BHW]`,-`REV16W`,-`REVW`
|
||||
// ppc64le:`MOVDBR`
|
||||
// s390x:`MOVD\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
|
||||
|
@ -425,7 +433,8 @@ func store_be64_idx(b []byte, idx int) {
|
|||
}
|
||||
|
||||
func store_be32(b []byte) {
|
||||
// amd64:`BSWAPL`,-`SHR.`
|
||||
// amd64/v1,amd64/v2:`BSWAPL`,-`SHR.`
|
||||
// amd64/v3:`MOVBEL`
|
||||
// arm64:`MOVW`,`REVW`,-`MOV[BH]`,-`REV16W`
|
||||
// ppc64le:`MOVWBR`
|
||||
// s390x:`MOVW\s.*\(.*\)$`,-`SRW\s`,-`SRD\s`
|
||||
|
@ -445,7 +454,8 @@ func store_be32_load(b, x *[8]byte) {
|
|||
}
|
||||
|
||||
func store_be32_idx(b []byte, idx int) {
|
||||
// amd64:`BSWAPL`,-`SHR.`
|
||||
// amd64/v1,amd64/v2:`BSWAPL`,-`SHR.`
|
||||
// amd64/v3:`MOVBEL`
|
||||
// arm64:`REVW`,`MOVW\sR[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`,-`MOV[BH]`,-`REV16W`
|
||||
// ppc64le:`MOVWBR`
|
||||
// s390x:`MOVW\s.*\(.*\)\(.*\*1\)$`,-`SRW\s`,-`SRD\s`
|
||||
|
@ -508,14 +518,16 @@ func store_be_byte_2(b []byte, val uint16) {
|
|||
func store_be_byte_4(b []byte, val uint32) {
|
||||
_ = b[4]
|
||||
// arm64:`REVW`,`MOVW\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`REV16W`
|
||||
// amd64:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`
|
||||
// amd64/v1,amd64/v2:`MOVL\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`
|
||||
// amd64/v3:`MOVBEL\s[A-Z]+,\s1\([A-Z]+\)`
|
||||
b[1], b[2], b[3], b[4] = byte(val>>24), byte(val>>16), byte(val>>8), byte(val)
|
||||
}
|
||||
|
||||
func store_be_byte_8(b []byte, val uint64) {
|
||||
_ = b[8]
|
||||
// arm64:`REV`,`MOVD\sR[0-9]+,\s1\(R[0-9]+\)`,-`MOVB`,-`MOVH`,-`MOVW`,-`REV16W`,-`REVW`
|
||||
// amd64:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL`
|
||||
// amd64/v1,amd64/v2:`MOVQ\s[A-Z]+,\s1\([A-Z]+\)`,-`MOVB`,-`MOVW`,-`MOVL`
|
||||
// amd64/v3:`MOVBEQ\s[A-Z]+,\s1\([A-Z]+\)`, -`MOVBEL`
|
||||
b[1], b[2], b[3], b[4], b[5], b[6], b[7], b[8] = byte(val>>56), byte(val>>48), byte(val>>40), byte(val>>32), byte(val>>24), byte(val>>16), byte(val>>8), byte(val)
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue