diff --git a/src/cmd/compile/internal/gc/walk.go b/src/cmd/compile/internal/gc/walk.go
index 27df285a63..69cc56da59 100644
--- a/src/cmd/compile/internal/gc/walk.go
+++ b/src/cmd/compile/internal/gc/walk.go
@@ -8,6 +8,7 @@ import (
 	"cmd/compile/internal/types"
 	"cmd/internal/objabi"
 	"cmd/internal/sys"
+	"encoding/binary"
 	"fmt"
 	"strings"
 )
@@ -1281,21 +1282,14 @@ opswitch:
 			maxRewriteLen := 6
 			// Some architectures can load unaligned byte sequence as 1 word.
 			// So we can cover longer strings with the same amount of code.
-			canCombineLoads := false
+			canCombineLoads := canMergeLoads()
 			combine64bit := false
-			// TODO: does this improve performance on any other architectures?
-			switch thearch.LinkArch.Family {
-			case sys.AMD64:
-				// Larger compare require longer instructions, so keep this reasonably low.
-				// Data from CL 26758 shows that longer strings are rare.
-				// If we really want we can do 16 byte SSE comparisons in the future.
-				maxRewriteLen = 16
-				canCombineLoads = true
-				combine64bit = true
-			case sys.I386:
-				maxRewriteLen = 8
-				canCombineLoads = true
+			if canCombineLoads {
+				// Keep this low enough to generate less code than a function call.
+				maxRewriteLen = 2 * thearch.LinkArch.RegSize
+				combine64bit = thearch.LinkArch.RegSize >= 8
 			}
+
 			var and Op
 			switch cmp {
 			case OEQ:
@@ -3288,15 +3282,10 @@ func walkcompare(n *Node, init *Nodes) *Node {
 	var inline bool
 
 	maxcmpsize := int64(4)
-	unalignedLoad := false
-	switch thearch.LinkArch.Family {
-	case sys.AMD64, sys.ARM64, sys.S390X:
-		// Keep this low enough, to generate less code than function call.
-		maxcmpsize = 16
-		unalignedLoad = true
-	case sys.I386:
-		maxcmpsize = 8
-		unalignedLoad = true
+	unalignedLoad := canMergeLoads()
+	if unalignedLoad {
+		// Keep this low enough to generate less code than a function call.
+		maxcmpsize = 2 * int64(thearch.LinkArch.RegSize)
 	}
 
 	switch t.Etype {
@@ -3913,3 +3902,18 @@ func substArgTypes(old *Node, types_ ...*types.Type) *Node {
 	}
 	return n
 }
+
+// canMergeLoads reports whether the backend optimization passes for
+// the current architecture can combine adjacent loads into a single
+// larger, possibly unaligned, load. Note that currently the
+// optimizations must be able to handle little endian byte order.
+func canMergeLoads() bool {
+	switch thearch.LinkArch.Family {
+	case sys.ARM64, sys.AMD64, sys.I386, sys.S390X:
+		return true
+	case sys.PPC64:
+		// Load combining only supported on ppc64le.
+		return thearch.LinkArch.ByteOrder == binary.LittleEndian
+	}
+	return false
+}
diff --git a/test/codegen/comparisons.go b/test/codegen/comparisons.go
index 15a659a4e6..2f010bcbae 100644
--- a/test/codegen/comparisons.go
+++ b/test/codegen/comparisons.go
@@ -19,16 +19,25 @@ import "unsafe"
 
 func CompareString1(s string) bool {
 	// amd64:`CMPW\t\(.*\), [$]`
+	// arm64:`MOVHU\t\(.*\), [R]`,`CMPW\t[$]`
+	// ppc64le:`MOVHZ\t\(.*\), [R]`,`CMPW\t.*, [$]`
+	// s390x:`MOVHBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
 	return s == "xx"
 }
 
 func CompareString2(s string) bool {
 	// amd64:`CMPL\t\(.*\), [$]`
+	// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
+	// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
+	// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [$]`
 	return s == "xxxx"
 }
 
 func CompareString3(s string) bool {
 	// amd64:`CMPQ\t\(.*\), [A-Z]`
+	// arm64:-`CMPW\t`
+	// ppc64le:-`CMPW\t`
+	// s390x:-`CMPW\t`
 	return s == "xxxxxxxx"
 }
 
@@ -36,6 +45,9 @@ func CompareString3(s string) bool {
 
 func CompareArray1(a, b [2]byte) bool {
 	// amd64:`CMPW\t""[.+_a-z0-9]+\(SP\), [A-Z]`
+	// arm64:-`MOVBU\t`
+	// ppc64le:-`MOVBZ\t`
+	// s390x:-`MOVBZ\t`
 	return a == b
 }
 
@@ -65,6 +77,9 @@ func CompareArray5(a, b [15]byte) bool {
 // This was a TODO in mapaccess1_faststr
 func CompareArray6(a, b unsafe.Pointer) bool {
 	// amd64:`CMPL\t\(.*\), [A-Z]`
+	// arm64:`MOVWU\t\(.*\), [R]`,`CMPW\t.*, [R]`
+	// ppc64le:`MOVWZ\t\(.*\), [R]`,`CMPW\t.*, [R]`
+	// s390x:`MOVWBR\t\(.*\), [R]`,`CMPW\t.*, [R]`
 	return *((*[4]byte)(a)) != *((*[4]byte)(b))
 }
 
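To illustrate what the patch enables (this sketch is not part of the patch, and the function name eqXXXX is hypothetical): for a comparison such as s == "xxxx", walkexpr rewrites the expression into a length check plus adjacent byte loads OR-ed together, roughly as below. On any architecture where canMergeLoads reports true, the SSA backend then collapses those byte loads into a single (possibly unaligned) 32-bit load and one compare, which is the MOVWU/MOVWZ/MOVWBR + CMPW pattern the codegen tests above assert.

package main

// eqXXXX approximates the expression the compiler builds for `s == "xxxx"`:
// a length test, then the four bytes of s combined in little-endian order
// into one 32-bit value compared against the constant for "xxxx".
// With load merging available, the four byte loads become a single
// 32-bit load, so no call to the runtime string-equality routine is needed.
func eqXXXX(s string) bool {
	if len(s) != 4 {
		return false
	}
	// Four adjacent byte loads, merged by the backend into one load.
	v := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
	const xxxx = uint32('x') | uint32('x')<<8 | uint32('x')<<16 | uint32('x')<<24
	return v == xxxx
}

func main() {
	println(eqXXXX("xxxx"), eqXXXX("xxx"), eqXXXX("abcd")) // true false false
}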