cmd/compile: optimize comparisons using load merging where available

Multi-byte comparison operations were used on amd64, arm64, i386
and s390x for comparisons with constant arrays, but only on amd64 and
i386 for comparisons with string constants. This CL combines the two
platform capability checks into one, since both optimizations have the
same requirements, and also enables both on ppc64le, which also
supports load merging.

Note that these optimizations currently use little endian byte order,
which results in byte reversal instructions on s390x. This should
be fixed at some point.

Change-Id: Ie612d13359b50c77f4d7c6e73fea4a59fa11f322
Reviewed-on: https://go-review.googlesource.com/102558
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Michael Munday 2018-03-26 21:18:27 +01:00
parent 53c8f6161c
commit b65122f99a
2 changed files with 41 additions and 22 deletions

View file

@ -8,6 +8,7 @@ import (
"cmd/compile/internal/types" "cmd/compile/internal/types"
"cmd/internal/objabi" "cmd/internal/objabi"
"cmd/internal/sys" "cmd/internal/sys"
"encoding/binary"
"fmt" "fmt"
"strings" "strings"
) )
@ -1281,21 +1282,14 @@ opswitch:
maxRewriteLen := 6 maxRewriteLen := 6
// Some architectures can load unaligned byte sequence as 1 word. // Some architectures can load unaligned byte sequence as 1 word.
// So we can cover longer strings with the same amount of code. // So we can cover longer strings with the same amount of code.
canCombineLoads := false canCombineLoads := canMergeLoads()
combine64bit := false combine64bit := false
// TODO: does this improve performance on any other architectures? if canCombineLoads {
switch thearch.LinkArch.Family { // Keep this low enough to generate less code than a function call.
case sys.AMD64: maxRewriteLen = 2 * thearch.LinkArch.RegSize
// Larger compare require longer instructions, so keep this reasonably low. combine64bit = thearch.LinkArch.RegSize >= 8
// Data from CL 26758 shows that longer strings are rare.
// If we really want we can do 16 byte SSE comparisons in the future.
maxRewriteLen = 16
canCombineLoads = true
combine64bit = true
case sys.I386:
maxRewriteLen = 8
canCombineLoads = true
} }
var and Op var and Op
switch cmp { switch cmp {
case OEQ: case OEQ:
@ -3288,15 +3282,10 @@ func walkcompare(n *Node, init *Nodes) *Node {
var inline bool var inline bool
maxcmpsize := int64(4) maxcmpsize := int64(4)
unalignedLoad := false unalignedLoad := canMergeLoads()
switch thearch.LinkArch.Family { if unalignedLoad {
case sys.AMD64, sys.ARM64, sys.S390X: // Keep this low enough to generate less code than a function call.
// Keep this low enough, to generate less code than function call. maxcmpsize = 2 * int64(thearch.LinkArch.RegSize)
maxcmpsize = 16
unalignedLoad = true
case sys.I386:
maxcmpsize = 8
unalignedLoad = true
} }
switch t.Etype { switch t.Etype {
@ -3913,3 +3902,18 @@ func substArgTypes(old *Node, types_ ...*types.Type) *Node {
} }
return n return n
} }
// canMergeLoads reports whether, for the architecture being compiled
// for, the backend optimization passes are able to fuse neighbouring
// loads into one wider load that may be unaligned. The optimizations
// relying on this must currently be able to handle little endian byte
// order.
func canMergeLoads() bool {
	arch := thearch.LinkArch
	if arch.Family == sys.PPC64 {
		// Load combining is only supported on ppc64le.
		return arch.ByteOrder == binary.LittleEndian
	}
	mergeable := arch.Family == sys.ARM64 ||
		arch.Family == sys.AMD64 ||
		arch.Family == sys.I386 ||
		arch.Family == sys.S390X
	return mergeable
}

View file

@ -19,16 +19,25 @@ import "unsafe"
func CompareString1(s string) bool { func CompareString1(s string) bool {
// amd64:`CMPW\t\(.*\), [$]` // amd64:`CMPW\t\(.*\), [$]`
// arm64:`MOVHU\t\(.*\), [R]`,`CMPW\t[$]`
// ppc64le:`MOVHZ\t\(.*\), [R]`,`CMPW\t.*, [$]`
// s390x:`MOVHBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
return s == "xx" return s == "xx"
} }
func CompareString2(s string) bool { func CompareString2(s string) bool {
// amd64:`CMPL\t\(.*\), [$]` // amd64:`CMPL\t\(.*\), [$]`
// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
return s == "xxxx" return s == "xxxx"
} }
func CompareString3(s string) bool { func CompareString3(s string) bool {
// amd64:`CMPQ\t\(.*\), [A-Z]` // amd64:`CMPQ\t\(.*\), [A-Z]`
// arm64:-`CMPW\t`
// ppc64le:-`CMPW\t`
// s390x:-`CMPW\t`
return s == "xxxxxxxx" return s == "xxxxxxxx"
} }
@ -36,6 +45,9 @@ func CompareString3(s string) bool {
func CompareArray1(a, b [2]byte) bool { func CompareArray1(a, b [2]byte) bool {
// amd64:`CMPW\t""[.+_a-z0-9]+\(SP\), [A-Z]` // amd64:`CMPW\t""[.+_a-z0-9]+\(SP\), [A-Z]`
// arm64:-`MOVBU\t`
// ppc64le:-`MOVBZ\t`
// s390x:-`MOVBZ\t`
return a == b return a == b
} }
@ -65,6 +77,9 @@ func CompareArray5(a, b [15]byte) bool {
// This was a TODO in mapaccess1_faststr // This was a TODO in mapaccess1_faststr
func CompareArray6(a, b unsafe.Pointer) bool { func CompareArray6(a, b unsafe.Pointer) bool {
// amd64:`CMPL\t\(.*\), [A-Z]` // amd64:`CMPL\t\(.*\), [A-Z]`
// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [R]`
return *((*[4]byte)(a)) != *((*[4]byte)(b)) return *((*[4]byte)(a)) != *((*[4]byte)(b))
} }