mirror of
https://github.com/golang/go
synced 2024-11-02 11:50:30 +00:00
cmd/compile: optimize comparisons using load merging where available
Multi-byte comparison operations were used on amd64, arm64, i386 and s390x for comparisons with constant arrays, but only amd64 and i386 for comparisons with string constants. This CL combines the check for platform capability, since they have the same requirements, and also enables both on ppc64le which also supports load merging. Note that these optimizations currently use little endian byte order which results in byte reversal instructions on s390x. This should be fixed at some point. Change-Id: Ie612d13359b50c77f4d7c6e73fea4a59fa11f322 Reviewed-on: https://go-review.googlesource.com/102558 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
53c8f6161c
commit
b65122f99a
2 changed files with 41 additions and 22 deletions
|
@ -8,6 +8,7 @@ import (
|
|||
"cmd/compile/internal/types"
|
||||
"cmd/internal/objabi"
|
||||
"cmd/internal/sys"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
@ -1281,21 +1282,14 @@ opswitch:
|
|||
maxRewriteLen := 6
|
||||
// Some architectures can load unaligned byte sequence as 1 word.
|
||||
// So we can cover longer strings with the same amount of code.
|
||||
canCombineLoads := false
|
||||
canCombineLoads := canMergeLoads()
|
||||
combine64bit := false
|
||||
// TODO: does this improve performance on any other architectures?
|
||||
switch thearch.LinkArch.Family {
|
||||
case sys.AMD64:
|
||||
// Larger compare require longer instructions, so keep this reasonably low.
|
||||
// Data from CL 26758 shows that longer strings are rare.
|
||||
// If we really want we can do 16 byte SSE comparisons in the future.
|
||||
maxRewriteLen = 16
|
||||
canCombineLoads = true
|
||||
combine64bit = true
|
||||
case sys.I386:
|
||||
maxRewriteLen = 8
|
||||
canCombineLoads = true
|
||||
if canCombineLoads {
|
||||
// Keep this low enough to generate less code than a function call.
|
||||
maxRewriteLen = 2 * thearch.LinkArch.RegSize
|
||||
combine64bit = thearch.LinkArch.RegSize >= 8
|
||||
}
|
||||
|
||||
var and Op
|
||||
switch cmp {
|
||||
case OEQ:
|
||||
|
@ -3288,15 +3282,10 @@ func walkcompare(n *Node, init *Nodes) *Node {
|
|||
var inline bool
|
||||
|
||||
maxcmpsize := int64(4)
|
||||
unalignedLoad := false
|
||||
switch thearch.LinkArch.Family {
|
||||
case sys.AMD64, sys.ARM64, sys.S390X:
|
||||
// Keep this low enough, to generate less code than function call.
|
||||
maxcmpsize = 16
|
||||
unalignedLoad = true
|
||||
case sys.I386:
|
||||
maxcmpsize = 8
|
||||
unalignedLoad = true
|
||||
unalignedLoad := canMergeLoads()
|
||||
if unalignedLoad {
|
||||
// Keep this low enough to generate less code than a function call.
|
||||
maxcmpsize = 2 * int64(thearch.LinkArch.RegSize)
|
||||
}
|
||||
|
||||
switch t.Etype {
|
||||
|
@ -3913,3 +3902,18 @@ func substArgTypes(old *Node, types_ ...*types.Type) *Node {
|
|||
}
|
||||
return n
|
||||
}
|
||||
|
||||
// canMergeLoads reports whether the backend optimization passes for
|
||||
// the current architecture can combine adjacent loads into a single
|
||||
// larger, possibly unaligned, load. Note that currently the
|
||||
// optimizations must be able to handle little endian byte order.
|
||||
func canMergeLoads() bool {
|
||||
switch thearch.LinkArch.Family {
|
||||
case sys.ARM64, sys.AMD64, sys.I386, sys.S390X:
|
||||
return true
|
||||
case sys.PPC64:
|
||||
// Load combining only supported on ppc64le.
|
||||
return thearch.LinkArch.ByteOrder == binary.LittleEndian
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -19,16 +19,25 @@ import "unsafe"
|
|||
|
||||
func CompareString1(s string) bool {
|
||||
// amd64:`CMPW\t\(.*\), [$]`
|
||||
// arm64:`MOVHU\t\(.*\), [R]`,`CMPW\t[$]`
|
||||
// ppc64le:`MOVHZ\t\(.*\), [R]`,`CMPW\t.*, [$]`
|
||||
// s390x:`MOVHBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
|
||||
return s == "xx"
|
||||
}
|
||||
|
||||
func CompareString2(s string) bool {
|
||||
// amd64:`CMPL\t\(.*\), [$]`
|
||||
// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
|
||||
// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
|
||||
// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
|
||||
return s == "xxxx"
|
||||
}
|
||||
|
||||
func CompareString3(s string) bool {
|
||||
// amd64:`CMPQ\t\(.*\), [A-Z]`
|
||||
// arm64:-`CMPW\t`
|
||||
// ppc64le:-`CMPW\t`
|
||||
// s390x:-`CMPW\t`
|
||||
return s == "xxxxxxxx"
|
||||
}
|
||||
|
||||
|
@ -36,6 +45,9 @@ func CompareString3(s string) bool {
|
|||
|
||||
func CompareArray1(a, b [2]byte) bool {
|
||||
// amd64:`CMPW\t""[.+_a-z0-9]+\(SP\), [A-Z]`
|
||||
// arm64:-`MOVBU\t`
|
||||
// ppc64le:-`MOVBZ\t`
|
||||
// s390x:-`MOVBZ\t`
|
||||
return a == b
|
||||
}
|
||||
|
||||
|
@ -65,6 +77,9 @@ func CompareArray5(a, b [15]byte) bool {
|
|||
// This was a TODO in mapaccess1_faststr
|
||||
func CompareArray6(a, b unsafe.Pointer) bool {
|
||||
// amd64:`CMPL\t\(.*\), [A-Z]`
|
||||
// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
|
||||
// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
|
||||
// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [R]`
|
||||
return *((*[4]byte)(a)) != *((*[4]byte)(b))
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue