go/test/codegen/shift.go
Josh Bleecher Snyder 43d5f213e2 cmd/compile: optimize multi-register shifts on amd64
amd64 can shift in bits from another register instead of filling with 0/1.
This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts.
In the standard library, it shows up in pure Go math/big.

Benchmarks results on amd64 with -tags=math_big_pure_go.

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.45ns ± 3%  4.39ns ± 1%   -1.28%  (p=0.000 n=30+27)
NonZeroShifts/1/shlVU-8       4.13ns ± 4%  4.10ns ± 2%     ~     (p=0.254 n=29+28)
NonZeroShifts/2/shrVU-8       5.55ns ± 1%  5.63ns ± 2%   +1.42%  (p=0.000 n=28+29)
NonZeroShifts/2/shlVU-8       5.70ns ± 2%  5.14ns ± 1%   -9.82%  (p=0.000 n=29+28)
NonZeroShifts/3/shrVU-8       6.79ns ± 2%  6.35ns ± 2%   -6.46%  (p=0.000 n=28+29)
NonZeroShifts/3/shlVU-8       6.69ns ± 1%  6.25ns ± 1%   -6.60%  (p=0.000 n=28+27)
NonZeroShifts/4/shrVU-8       7.79ns ± 2%  7.06ns ± 2%   -9.48%  (p=0.000 n=30+30)
NonZeroShifts/4/shlVU-8       7.82ns ± 1%  7.24ns ± 1%   -7.37%  (p=0.000 n=28+29)
NonZeroShifts/5/shrVU-8       8.90ns ± 3%  7.93ns ± 1%  -10.84%  (p=0.000 n=29+26)
NonZeroShifts/5/shlVU-8       8.68ns ± 1%  7.92ns ± 1%   -8.76%  (p=0.000 n=29+29)
NonZeroShifts/10/shrVU-8      14.4ns ± 1%  12.3ns ± 2%  -14.79%  (p=0.000 n=28+29)
NonZeroShifts/10/shlVU-8      14.1ns ± 1%  11.9ns ± 2%  -15.55%  (p=0.000 n=28+27)
NonZeroShifts/100/shrVU-8      118ns ± 1%    96ns ± 3%  -18.82%  (p=0.000 n=30+29)
NonZeroShifts/100/shlVU-8      120ns ± 2%    98ns ± 2%  -18.46%  (p=0.000 n=29+28)
NonZeroShifts/1000/shrVU-8    1.10µs ± 1%  0.88µs ± 2%  -19.63%  (p=0.000 n=29+30)
NonZeroShifts/1000/shlVU-8    1.10µs ± 2%  0.88µs ± 2%  -20.28%  (p=0.000 n=29+28)
NonZeroShifts/10000/shrVU-8   10.9µs ± 1%   8.7µs ± 1%  -19.78%  (p=0.000 n=28+27)
NonZeroShifts/10000/shlVU-8   10.9µs ± 2%   8.7µs ± 1%  -19.64%  (p=0.000 n=29+27)
NonZeroShifts/100000/shrVU-8   111µs ± 2%    90µs ± 2%  -19.39%  (p=0.000 n=28+29)
NonZeroShifts/100000/shlVU-8   113µs ± 2%    90µs ± 2%  -20.43%  (p=0.000 n=30+27)

The assembly version is still faster, unfortunately, but the gap is narrowing.
Speedup from pure Go to assembly:

name                          old time/op  new time/op  delta
NonZeroShifts/1/shrVU-8       4.39ns ± 1%  3.45ns ± 2%  -21.36%  (p=0.000 n=27+29)
NonZeroShifts/1/shlVU-8       4.10ns ± 2%  3.47ns ± 3%  -15.42%  (p=0.000 n=28+30)
NonZeroShifts/2/shrVU-8       5.63ns ± 2%  3.97ns ± 0%  -29.40%  (p=0.000 n=29+25)
NonZeroShifts/2/shlVU-8       5.14ns ± 1%  3.77ns ± 2%  -26.65%  (p=0.000 n=28+26)
NonZeroShifts/3/shrVU-8       6.35ns ± 2%  4.79ns ± 2%  -24.52%  (p=0.000 n=29+29)
NonZeroShifts/3/shlVU-8       6.25ns ± 1%  4.42ns ± 1%  -29.29%  (p=0.000 n=27+26)
NonZeroShifts/4/shrVU-8       7.06ns ± 2%  5.64ns ± 1%  -20.05%  (p=0.000 n=30+29)
NonZeroShifts/4/shlVU-8       7.24ns ± 1%  5.34ns ± 2%  -26.23%  (p=0.000 n=29+29)
NonZeroShifts/5/shrVU-8       7.93ns ± 1%  6.56ns ± 2%  -17.26%  (p=0.000 n=26+30)
NonZeroShifts/5/shlVU-8       7.92ns ± 1%  6.27ns ± 1%  -20.79%  (p=0.000 n=29+25)
NonZeroShifts/10/shrVU-8      12.3ns ± 2%  10.2ns ± 2%  -17.21%  (p=0.000 n=29+29)
NonZeroShifts/10/shlVU-8      11.9ns ± 2%  10.5ns ± 2%  -12.45%  (p=0.000 n=27+29)
NonZeroShifts/100/shrVU-8     95.9ns ± 3%  77.7ns ± 1%  -19.00%  (p=0.000 n=29+30)
NonZeroShifts/100/shlVU-8     97.5ns ± 2%  66.8ns ± 2%  -31.47%  (p=0.000 n=28+30)
NonZeroShifts/1000/shrVU-8     884ns ± 2%   705ns ± 1%  -20.17%  (p=0.000 n=30+28)
NonZeroShifts/1000/shlVU-8     880ns ± 2%   590ns ± 1%  -32.96%  (p=0.000 n=28+25)
NonZeroShifts/10000/shrVU-8   8.74µs ± 1%  7.34µs ± 3%  -15.94%  (p=0.000 n=27+30)
NonZeroShifts/10000/shlVU-8   8.73µs ± 1%  6.00µs ± 1%  -31.25%  (p=0.000 n=27+28)
NonZeroShifts/100000/shrVU-8  89.6µs ± 2%  75.5µs ± 2%  -15.80%  (p=0.000 n=29+29)
NonZeroShifts/100000/shlVU-8  89.6µs ± 2%  68.0µs ± 3%  -24.09%  (p=0.000 n=27+30)

Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958
Reviewed-on: https://go-review.googlesource.com/c/go/+/297050
Trust: Josh Bleecher Snyder <josharian@gmail.com>
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
2021-03-11 19:11:46 +00:00

304 lines
7.7 KiB
Go

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// ------------------ //
// masked shifts //
// ------------------ //
func lshMask64x64(v int64, s uint64) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v << (s & 63)
}
func rshMask64Ux64(v uint64, s uint64) uint64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v >> (s & 63)
}
func rshMask64x64(v int64, s uint64) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v >> (s & 63)
}
func lshMask32x64(v int32, s uint64) int32 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ISEL",-"ORN"
// ppc64:"ISEL",-"ORN"
return v << (s & 63)
}
func rshMask32Ux64(v uint32, s uint64) uint32 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ISEL",-"ORN"
// ppc64:"ISEL",-"ORN"
return v >> (s & 63)
}
func rshMask32x64(v int32, s uint64) int32 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ISEL",-"ORN"
// ppc64:"ISEL",-"ORN"
return v >> (s & 63)
}
func lshMask64x32(v int64, s uint32) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN"
// ppc64:"ANDCC",-"ORN"
return v << (s & 63)
}
func rshMask64Ux32(v uint64, s uint32) uint64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN"
// ppc64:"ANDCC",-"ORN"
return v >> (s & 63)
}
func rshMask64x32(v int64, s uint32) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v >> (s & 63)
}
func lshMask64x32Ext(v int64, s int32) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v << uint(s&63)
}
func rshMask64Ux32Ext(v uint64, s int32) uint64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v >> uint(s&63)
}
func rshMask64x32Ext(v int64, s int32) int64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// ppc64le:"ANDCC",-"ORN",-"ISEL"
// ppc64:"ANDCC",-"ORN",-"ISEL"
return v >> uint(s&63)
}
// --------------- //
// signed shifts //
// --------------- //
// We do want to generate a test + panicshift for these cases.
func lshSigned(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
// amd64:"TESTB"
_ = x << v8
// amd64:"TESTW"
_ = x << v16
// amd64:"TESTL"
_ = x << v32
// amd64:"TESTQ"
_ = x << v64
}
// We want to avoid generating a test + panicshift for these cases.
func lshSignedMasked(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
// amd64:-"TESTB"
_ = x << (v8 & 7)
// amd64:-"TESTW"
_ = x << (v16 & 15)
// amd64:-"TESTL"
_ = x << (v32 & 31)
// amd64:-"TESTQ"
_ = x << (v64 & 63)
}
// ------------------ //
// bounded shifts //
// ------------------ //
func rshGuarded64(v int64, s uint) int64 {
if s < 64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// wasm:-"Select",-".*LtU"
return v >> s
}
panic("shift too large")
}
func rshGuarded64U(v uint64, s uint) uint64 {
if s < 64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// wasm:-"Select",-".*LtU"
return v >> s
}
panic("shift too large")
}
func lshGuarded64(v int64, s uint) int64 {
if s < 64 {
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// wasm:-"Select",-".*LtU"
return v << s
}
panic("shift too large")
}
func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byte) (uint32, uint64) {
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f := tab[byte(v)^b]
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f += tab[byte(v)&b]
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f += tab[byte(v)|b]
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f += tab[uint16(v)&h]
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f += tab[uint16(v)^h]
// ppc64le:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
// ppc64:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
f += tab[uint16(v)|h]
// ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI"
// ppc64:-".*AND",-"RLDICR",".*CLRLSLDI"
f += tab[v&0xff]
// ppc64le:-".*AND",".*CLRLSLWI"
// ppc64:-".*AND",".*CLRLSLWI"
f += 2 * uint32(uint16(d))
// ppc64le:-".*AND",-"RLDICR",".*CLRLSLDI"
// ppc64:-".*AND",-"RLDICR",".*CLRLSLDI"
g := 2 * uint64(uint32(d))
return f, g
}
func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64) (uint8, uint16, uint32, uint64, int64) {
// ppc64le:-"AND","CLRLSLWI"
// ppc64:-"AND","CLRLSLWI"
f := (v8 & 0xF) << 2
// ppc64le:"CLRLSLWI"
// ppc64:"CLRLSLWI"
f += byte(v16) << 3
// ppc64le:-"AND","CLRLSLWI"
// ppc64:-"AND","CLRLSLWI"
g := (v16 & 0xFF) << 3
// ppc64le:-"AND","CLRLSLWI"
// ppc64:-"AND","CLRLSLWI"
h := (v32 & 0xFFFFF) << 2
// ppc64le:"CLRLSLDI"
// ppc64:"CLRLSLDI"
i := (v64 & 0xFFFFFFFF) << 5
// ppc64le:-"CLRLSLDI"
// ppc64:-"CLRLSLDI"
i += (v64 & 0xFFFFFFF) << 38
// ppc64le/power9:-"CLRLSLDI"
// ppc64/power9:-"CLRLSLDI"
i += (v64 & 0xFFFF00) << 10
// ppc64le/power9:-"SLD","EXTSWSLI"
// ppc64/power9:-"SLD","EXTSWSLI"
j := int64(x32+32) * 8
return f, g, h, i, j
}
func checkWidenAfterShift(v int64, u uint64) (int64, uint64) {
// ppc64le:-".*MOVW"
f := int32(v >> 32)
// ppc64le:".*MOVW"
f += int32(v >> 31)
// ppc64le:-".*MOVH"
g := int16(v >> 48)
// ppc64le:".*MOVH"
g += int16(v >> 30)
// ppc64le:-".*MOVH"
g += int16(f >> 16)
// ppc64le:-".*MOVB"
h := int8(v >> 56)
// ppc64le:".*MOVB"
h += int8(v >> 28)
// ppc64le:-".*MOVB"
h += int8(f >> 24)
// ppc64le:".*MOVB"
h += int8(f >> 16)
return int64(h), uint64(g)
}
func checkShiftAndMask32(v []uint32) {
i := 0
// ppc64le: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
// ppc64: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
v[i] = (v[i] & 0xFF00000) >> 8
i++
// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
v[i] = (v[i] & 0xFF00) >> 6
i++
// ppc64le: "MOVW\tR0"
// ppc64: "MOVW\tR0"
v[i] = (v[i] & 0xFF) >> 8
i++
// ppc64le: "MOVW\tR0"
// ppc64: "MOVW\tR0"
v[i] = (v[i] & 0xF000000) >> 28
i++
// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
v[i] = (v[i] >> 6) & 0xFF
i++
// ppc64le: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
// ppc64: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
v[i] = (v[i] >> 6) & 0xFF000
i++
// ppc64le: "MOVW\tR0"
// ppc64: "MOVW\tR0"
v[i] = (v[i] >> 20) & 0xFF000
i++
// ppc64le: "MOVW\tR0"
// ppc64: "MOVW\tR0"
v[i] = (v[i] >> 24) & 0xFF00
i++
}
func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
//ppc64le: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
//ppc64: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
a[0] = a[uint8(v>>24)]
//ppc64le: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
//ppc64: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[0] = b[uint8(v>>24)]
//ppc64le: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
//ppc64: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[1] = b[(v>>20)&0xFF]
//ppc64le: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
//ppc64: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
b[2] = b[v>>25]
}
// 128 bit shifts
func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
s := bits & 63
ŝ := (64 - bits) & 63
// check that the shift operation has two commas (three operands)
// amd64:"SHRQ.*,.*,"
shr := x>>s | y<<ŝ
// amd64:"SHLQ.*,.*,"
shl := x<<s | y>>ŝ
return shr, shl
}