cmd/compile: use BMI1 instructions for GOAMD64=v3 and higher

BMI1 includes four instructions (ANDN, BLSI, BLSMSK, BLSR) that are
easy to peephole optimize, and which GCC always seems to favor using
when available and applicable.

Updates #45453.

Change-Id: I0274184057058f5c579e5bc3ea9c414396d3cf46
Reviewed-on: https://go-review.googlesource.com/c/go/+/351130
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
Trust: Matthew Dempsky <mdempsky@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Matthew Dempsky 2021-07-02 21:02:30 -07:00
parent 30faf968b1
commit 04572fa29b
6 changed files with 378 additions and 0 deletions

View file

@ -263,6 +263,23 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Reg = lo
p.SetFrom3Reg(hi)
case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p.SetFrom3Reg(v.Args[1].Reg())
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.

View file

@ -639,6 +639,7 @@
// Recognize bit clearing: a &^= 1<<b
(AND(Q|L) (NOT(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y)) x) => (BTR(Q|L) x y)
(ANDN(Q|L) x (SHL(Q|L) (MOV(Q|L)const [1]) y)) => (BTR(Q|L) x y)
(ANDQconst [c] x) && isUint64PowerOfTwo(int64(^c)) && uint64(^c) >= 128
=> (BTRQconst [int8(log32(^c))] x)
(ANDLconst [c] x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128
@ -2204,3 +2205,9 @@
// Prefetch instructions
(PrefetchCache ...) => (PrefetchT0 ...)
(PrefetchCacheStreamed ...) => (PrefetchNTA ...)
// CPUID feature: BMI1.
(AND(Q|L) x (NOT(Q|L) y)) && buildcfg.GOAMD64 >= 3 => (ANDN(Q|L) x y)
(AND(Q|L) x (NEG(Q|L) x)) && buildcfg.GOAMD64 >= 3 => (BLSI(Q|L) x)
(XOR(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSMSK(Q|L) x)
(AND(Q|L) x (ADD(Q|L)const [-1] x)) && buildcfg.GOAMD64 >= 3 => (BLSR(Q|L) x)

View file

@ -908,6 +908,16 @@ func init() {
// Do prefetch arg0 address. arg0=addr, arg1=memory. Instruction variant selects locality hint
{name: "PrefetchT0", argLength: 2, reg: prefreg, asm: "PREFETCHT0", hasSideEffects: true},
{name: "PrefetchNTA", argLength: 2, reg: prefreg, asm: "PREFETCHNTA", hasSideEffects: true},
// CPUID feature: BMI1.
{name: "ANDNQ", argLength: 2, reg: gp21, asm: "ANDNQ", clobberFlags: true}, // arg0 &^ arg1
{name: "ANDNL", argLength: 2, reg: gp21, asm: "ANDNL", clobberFlags: true}, // arg0 &^ arg1
{name: "BLSIQ", argLength: 1, reg: gp11, asm: "BLSIQ", clobberFlags: true}, // arg0 & -arg0
{name: "BLSIL", argLength: 1, reg: gp11, asm: "BLSIL", clobberFlags: true}, // arg0 & -arg0
{name: "BLSMSKQ", argLength: 1, reg: gp11, asm: "BLSMSKQ", clobberFlags: true}, // arg0 ^ (arg0 - 1)
{name: "BLSMSKL", argLength: 1, reg: gp11, asm: "BLSMSKL", clobberFlags: true}, // arg0 ^ (arg0 - 1)
{name: "BLSRQ", argLength: 1, reg: gp11, asm: "BLSRQ", clobberFlags: true}, // arg0 & (arg0 - 1)
{name: "BLSRL", argLength: 1, reg: gp11, asm: "BLSRL", clobberFlags: true}, // arg0 & (arg0 - 1)
}
var AMD64blocks = []blockData{

View file

@ -1033,6 +1033,14 @@ const (
OpAMD64ORLlock
OpAMD64PrefetchT0
OpAMD64PrefetchNTA
OpAMD64ANDNQ
OpAMD64ANDNL
OpAMD64BLSIQ
OpAMD64BLSIL
OpAMD64BLSMSKQ
OpAMD64BLSMSKL
OpAMD64BLSRQ
OpAMD64BLSRL
OpARMADD
OpARMADDconst
@ -13628,6 +13636,120 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "ANDNQ",
argLen: 2,
clobberFlags: true,
asm: x86.AANDNQ,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "ANDNL",
argLen: 2,
clobberFlags: true,
asm: x86.AANDNL,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
{1, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSIQ",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSIQ,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSIL",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSIL,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSMSKQ",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSMSKQ,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSMSKL",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSMSKL,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSRQ",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSRQ,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "BLSRL",
argLen: 1,
clobberFlags: true,
asm: x86.ABLSRL,
reg: regInfo{
inputs: []inputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
outputs: []outputInfo{
{0, 49135}, // AX CX DX BX BP SI DI R8 R9 R10 R11 R12 R13 R15
},
},
},
{
name: "ADD",

View file

@ -3,6 +3,7 @@
package ssa
import "internal/buildcfg"
import "math"
import "cmd/internal/obj"
import "cmd/compile/internal/types"
@ -53,6 +54,10 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpAMD64ANDLload(v)
case OpAMD64ANDLmodify:
return rewriteValueAMD64_OpAMD64ANDLmodify(v)
case OpAMD64ANDNL:
return rewriteValueAMD64_OpAMD64ANDNL(v)
case OpAMD64ANDNQ:
return rewriteValueAMD64_OpAMD64ANDNQ(v)
case OpAMD64ANDQ:
return rewriteValueAMD64_OpAMD64ANDQ(v)
case OpAMD64ANDQconst:
@ -2759,6 +2764,55 @@ func rewriteValueAMD64_OpAMD64ANDL(v *Value) bool {
}
break
}
// match: (ANDL x (NOTL y))
// cond: buildcfg.GOAMD64 >= 3
// result: (ANDNL x y)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64NOTL {
continue
}
y := v_1.Args[0]
if !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64ANDNL)
v.AddArg2(x, y)
return true
}
break
}
// match: (ANDL x (NEGL x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSIL x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64NEGL || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSIL)
v.AddArg(x)
return true
}
break
}
// match: (ANDL x (ADDLconst [-1] x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSRL x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64ADDLconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSRL)
v.AddArg(x)
return true
}
break
}
return false
}
func rewriteValueAMD64_OpAMD64ANDLconst(v *Value) bool {
@ -3037,6 +3091,48 @@ func rewriteValueAMD64_OpAMD64ANDLmodify(v *Value) bool {
}
return false
}
// rewriteValueAMD64_OpAMD64ANDNL applies the rewrite rules for ANDNL to v.
// It reports whether a rule matched, in which case v has been rewritten in place.
func rewriteValueAMD64_OpAMD64ANDNL(v *Value) bool {
	shift := v.Args[1]
	x := v.Args[0]
	// match: (ANDNL x (SHLL (MOVLconst [1]) y))
	// result: (BTRL x y)
	if shift.Op == OpAMD64SHLL {
		y := shift.Args[1]
		// The shifted operand must be the 32-bit constant 1 for the rule to apply.
		if one := shift.Args[0]; one.Op == OpAMD64MOVLconst && auxIntToInt32(one.AuxInt) == 1 {
			v.reset(OpAMD64BTRL)
			v.AddArg2(x, y)
			return true
		}
	}
	return false
}
// rewriteValueAMD64_OpAMD64ANDNQ applies the rewrite rules for ANDNQ to v.
// It reports whether a rule matched, in which case v has been rewritten in place.
func rewriteValueAMD64_OpAMD64ANDNQ(v *Value) bool {
	shift := v.Args[1]
	x := v.Args[0]
	// match: (ANDNQ x (SHLQ (MOVQconst [1]) y))
	// result: (BTRQ x y)
	if shift.Op == OpAMD64SHLQ {
		y := shift.Args[1]
		// The shifted operand must be the 64-bit constant 1 for the rule to apply.
		if one := shift.Args[0]; one.Op == OpAMD64MOVQconst && auxIntToInt64(one.AuxInt) == 1 {
			v.reset(OpAMD64BTRQ)
			v.AddArg2(x, y)
			return true
		}
	}
	return false
}
func rewriteValueAMD64_OpAMD64ANDQ(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
@ -3138,6 +3234,55 @@ func rewriteValueAMD64_OpAMD64ANDQ(v *Value) bool {
}
break
}
// match: (ANDQ x (NOTQ y))
// cond: buildcfg.GOAMD64 >= 3
// result: (ANDNQ x y)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64NOTQ {
continue
}
y := v_1.Args[0]
if !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64ANDNQ)
v.AddArg2(x, y)
return true
}
break
}
// match: (ANDQ x (NEGQ x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSIQ x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64NEGQ || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSIQ)
v.AddArg(x)
return true
}
break
}
// match: (ANDQ x (ADDQconst [-1] x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSRQ x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64ADDQconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSRQ)
v.AddArg(x)
return true
}
break
}
return false
}
func rewriteValueAMD64_OpAMD64ANDQconst(v *Value) bool {
@ -26474,6 +26619,21 @@ func rewriteValueAMD64_OpAMD64XORL(v *Value) bool {
}
break
}
// match: (XORL x (ADDLconst [-1] x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSMSKL x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64ADDLconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSMSKL)
v.AddArg(x)
return true
}
break
}
return false
}
func rewriteValueAMD64_OpAMD64XORLconst(v *Value) bool {
@ -26950,6 +27110,21 @@ func rewriteValueAMD64_OpAMD64XORQ(v *Value) bool {
}
break
}
// match: (XORQ x (ADDQconst [-1] x))
// cond: buildcfg.GOAMD64 >= 3
// result: (BLSMSKQ x)
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
x := v_0
if v_1.Op != OpAMD64ADDQconst || auxIntToInt32(v_1.AuxInt) != -1 || x != v_1.Args[0] || !(buildcfg.GOAMD64 >= 3) {
continue
}
v.reset(OpAMD64BLSMSKQ)
v.AddArg(x)
return true
}
break
}
return false
}
func rewriteValueAMD64_OpAMD64XORQconst(v *Value) bool {

47
test/codegen/bmi.go Normal file
View file

@ -0,0 +1,47 @@
// asmcheck
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// andn64 returns x with every bit that is set in y cleared (64-bit AND NOT).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func andn64(x, y int64) int64 {
// amd64/v3:"ANDNQ"
return x &^ y
}
// andn32 returns x with every bit that is set in y cleared (32-bit AND NOT).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func andn32(x, y int32) int32 {
// amd64/v3:"ANDNL"
return x &^ y
}
// blsi64 returns x & -x, which keeps only the lowest set bit of x (0 when x is 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsi64(x int64) int64 {
// amd64/v3:"BLSIQ"
return x & -x
}
// blsi32 returns x & -x, which keeps only the lowest set bit of x (0 when x is 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsi32(x int32) int32 {
// amd64/v3:"BLSIL"
return x & -x
}
// blsmsk64 returns x ^ (x - 1): a mask of all bits up to and including the
// lowest set bit of x (all ones when x is 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsmsk64(x int64) int64 {
// amd64/v3:"BLSMSKQ"
return x ^ (x - 1)
}
// blsmsk32 returns x ^ (x - 1): a mask of all bits up to and including the
// lowest set bit of x (all ones when x is 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsmsk32(x int32) int32 {
// amd64/v3:"BLSMSKL"
return x ^ (x - 1)
}
// blsr64 returns x & (x - 1), which clears the lowest set bit of x (0 stays 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsr64(x int64) int64 {
// amd64/v3:"BLSRQ"
return x & (x - 1)
}
// blsr32 returns x & (x - 1), which clears the lowest set bit of x (0 stays 0).
// The directive in the body is the assembly expectation for GOAMD64=v3 builds;
// keep it directly above the expression it checks.
func blsr32(x int32) int32 {
// amd64/v3:"BLSRL"
return x & (x - 1)
}