cmd/compile: optimize comparisons using load merging where available

Multi-byte comparison operations were used on amd64, arm64, i386 and s390x for comparisons with constant arrays, but only amd64 and i386 for comparisons with string constants. This CL combines the check for platform capability, since they have the same requirements, and also enables both on ppc64le which also supports load merging. Note that these optimizations currently use little endian byte order which results in byte reversal instructions on s390x. This should be fixed at some point. Change-Id: Ie612d13359b50c77f4d7c6e73fea4a59fa11f322 Reviewed-on: https://go-review.googlesource.com/102558 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2024-09-15 22:20:06 +00:00 · 2018-03-26 21:18:27 +01:00 · 2018-03-26 21:18:27 +01:00 · b65122f99a
parent 53c8f6161c
commit b65122f99a
2 changed files with 41 additions and 22 deletions
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@ -8,6 +8,7 @@ import (
 	"cmd/compile/internal/types"
 	"cmd/internal/objabi"
 	"cmd/internal/sys"
 	"encoding/binary"
 	"fmt"
 	"strings"
 )
@ -1281,21 +1282,14 @@ opswitch:
 			maxRewriteLen := 6
 			// Some architectures can load unaligned byte sequence as 1 word.
 			// So we can cover longer strings with the same amount of code.
-			canCombineLoads := false
+			canCombineLoads := canMergeLoads()
 			combine64bit := false
-			// TODO: does this improve performance on any other architectures?
+			if canCombineLoads {
-			switch thearch.LinkArch.Family {
+				// Keep this low enough to generate less code than a function call.
-			case sys.AMD64:
+				maxRewriteLen = 2 * thearch.LinkArch.RegSize
-				// Larger compare require longer instructions, so keep this reasonably low.
+				combine64bit = thearch.LinkArch.RegSize >= 8
 				// Data from CL 26758 shows that longer strings are rare.
 				// If we really want we can do 16 byte SSE comparisons in the future.
 				maxRewriteLen = 16
 				canCombineLoads = true
 				combine64bit = true
 			case sys.I386:
 				maxRewriteLen = 8
 				canCombineLoads = true
 			}
 			var and Op
 			switch cmp {
 			case OEQ:
@ -3288,15 +3282,10 @@ func walkcompare(n *Node, init *Nodes) *Node {
 	var inline bool
 	maxcmpsize := int64(4)
-	unalignedLoad := false
+	unalignedLoad := canMergeLoads()
-	switch thearch.LinkArch.Family {
+	if unalignedLoad {
-	case sys.AMD64, sys.ARM64, sys.S390X:
+		// Keep this low enough to generate less code than a function call.
-		// Keep this low enough, to generate less code than function call.
+		maxcmpsize = 2 * int64(thearch.LinkArch.RegSize)
 		maxcmpsize = 16
 		unalignedLoad = true
 	case sys.I386:
 		maxcmpsize = 8
 		unalignedLoad = true
 	}
 	switch t.Etype {
@ -3913,3 +3902,18 @@ func substArgTypes(old *Node, types_ ...*types.Type) *Node {
 	}
 	return n
 }
 // canMergeLoads reports whether the backend optimization passes for
 // the current architecture can combine adjacent loads into a single
 // larger, possibly unaligned, load. Note that currently the
 // optimizations must be able to handle little endian byte order.
 func canMergeLoads() bool {
 	switch thearch.LinkArch.Family {
 	case sys.ARM64, sys.AMD64, sys.I386, sys.S390X:
 		return true
 	case sys.PPC64:
 		// Load combining only supported on ppc64le.
 		return thearch.LinkArch.ByteOrder == binary.LittleEndian
 	}
 	return false
 }
--- a/test/codegen/comparisons.go
+++ b/test/codegen/comparisons.go
@ -19,16 +19,25 @@ import "unsafe"
 func CompareString1(s string) bool {
 	// amd64:`CMPW\t\(.*\), [$]`
 	// arm64:`MOVHU\t\(.*\), [R]`,`CMPW\t[$]`
 	// ppc64le:`MOVHZ\t\(.*\), [R]`,`CMPW\t.*, [$]`
 	// s390x:`MOVHBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
 	return s == "xx"
 }
 func CompareString2(s string) bool {
 	// amd64:`CMPL\t\(.*\), [$]`
 	// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
 	return s == "xxxx"
 }
 func CompareString3(s string) bool {
 	// amd64:`CMPQ\t\(.*\), [A-Z]`
 	// arm64:-`CMPW\t`
 	// ppc64le:-`CMPW\t`
 	// s390x:-`CMPW\t`
 	return s == "xxxxxxxx"
 }
@ -36,6 +45,9 @@ func CompareString3(s string) bool {
 func CompareArray1(a, b [2]byte) bool {
 	// amd64:`CMPW\t""[.+_a-z0-9]+\(SP\), [A-Z]`
 	// arm64:-`MOVBU\t`
 	// ppc64le:-`MOVBZ\t`
 	// s390x:-`MOVBZ\t`
 	return a == b
 }
@ -65,6 +77,9 @@ func CompareArray5(a, b [15]byte) bool {
 // This was a TODO in mapaccess1_faststr
 func CompareArray6(a, b unsafe.Pointer) bool {
 	// amd64:`CMPL\t\(.*\), [A-Z]`
 	// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	return *((*[4]byte)(a)) != *((*[4]byte)(b))
 }