mirror of
https://github.com/golang/go
synced 2024-10-14 11:53:56 +00:00
cmd/compile: make encoding/binary loads/stores cheaper to inline
The encoding/binary little- and big-endian load and store routines are frequently used in performance sensitive code. They look fairly complex to the inliner. Though the routines themselves can be inlined, code using them typically cannot be. Yet they typically compile down to an instruction or two on architectures that support merging such loads. This change teaches the inliner to treat calls to these methods as cheap, so that code using them will be more inlineable. It'd be better to teach the inliner that this pattern of code is cheap, rather than these particular methods. However, that is difficult to do robustly when working with the IR representation. And the broader project of which that would be a part, namely to model the rest of the compiler in the inliner, is probably a non-starter. By way of contrast, imperfect though it is, this change is an easy, cheap, and useful heuristic. If/when we base inlining decisions on more accurate information obtained later in the compilation process, or on PGO/FGO, we can remove this and other such heuristics. Newly inlineable functions in the standard library: crypto/cipher.gcmInc32 crypto/sha512.appendUint64 crypto/md5.appendUint64 crypto/sha1.appendUint64 crypto/sha256.appendUint64 vendor/golang.org/x/crypto/poly1305.initialize encoding/gob.(*encoderState).encodeUint vendor/golang.org/x/text/unicode/norm.buildRecompMap net/http.(*http2SettingsFrame).Setting net/http.http2parseGoAwayFrame net/http.http2parseWindowUpdateFrame Benchmark impact for encoding/gob (the only package I measured): name old time/op new time/op delta EndToEndPipe-8 2.25µs ± 1% 2.21µs ± 3% -1.79% (p=0.000 n=28+27) EndToEndByteBuffer-8 93.3ns ± 5% 94.2ns ± 5% ~ (p=0.174 n=30+30) EndToEndSliceByteBuffer-8 10.5µs ± 1% 10.6µs ± 1% +0.87% (p=0.000 n=30+30) EncodeComplex128Slice-8 1.81µs ± 0% 1.75µs ± 1% -3.23% (p=0.000 n=28+30) EncodeFloat64Slice-8 900ns ± 1% 847ns ± 0% -5.91% (p=0.000 n=29+28) EncodeInt32Slice-8 1.02µs ± 0% 0.90µs ± 0% -11.82% (p=0.000 n=28+26) EncodeStringSlice-8 1.16µs ± 1% 1.04µs ± 1% -10.20% (p=0.000 n=29+26) EncodeInterfaceSlice-8 28.7µs ± 3% 29.2µs ± 6% ~ (p=0.067 n=29+30) DecodeComplex128Slice-8 7.98µs ± 1% 7.96µs ± 1% -0.27% (p=0.017 n=30+30) DecodeFloat64Slice-8 4.33µs ± 1% 4.34µs ± 1% +0.24% (p=0.022 n=30+29) DecodeInt32Slice-8 4.18µs ± 1% 4.18µs ± 0% ~ (p=0.074 n=30+28) DecodeStringSlice-8 13.2µs ± 1% 13.1µs ± 1% -0.64% (p=0.000 n=28+28) DecodeStringsSlice-8 31.9µs ± 1% 31.8µs ± 1% -0.34% (p=0.001 n=30+30) DecodeBytesSlice-8 8.88µs ± 1% 8.84µs ± 1% -0.48% (p=0.000 n=30+30) DecodeInterfaceSlice-8 64.1µs ± 1% 64.2µs ± 1% ~ (p=0.173 n=30+28) DecodeMap-8 74.3µs ± 0% 74.2µs ± 0% ~ (p=0.131 n=29+30) Fixes #42958 Change-Id: Ie048b8976fb403d8bcc72ac6bde4b33e133e2a47 Reviewed-on: https://go-review.googlesource.com/c/go/+/349931 Trust: Josh Bleecher Snyder <josharian@gmail.com> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
058fa255bc
commit
17c513e722
|
@ -275,14 +275,31 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
|
|||
}
|
||||
if n.X.Op() == ir.OMETHEXPR {
|
||||
if meth := ir.MethodExprName(n.X); meth != nil {
|
||||
fn := meth.Func
|
||||
if fn != nil && types.IsRuntimePkg(fn.Sym().Pkg) && fn.Sym().Name == "heapBits.nextArena" {
|
||||
// Special case: explicitly allow
|
||||
// mid-stack inlining of
|
||||
// runtime.heapBits.next even though
|
||||
// it calls slow-path
|
||||
// runtime.heapBits.nextArena.
|
||||
break
|
||||
if fn := meth.Func; fn != nil {
|
||||
s := fn.Sym()
|
||||
var cheap bool
|
||||
if types.IsRuntimePkg(s.Pkg) && s.Name == "heapBits.nextArena" {
|
||||
// Special case: explicitly allow mid-stack inlining of
|
||||
// runtime.heapBits.next even though it calls slow-path
|
||||
// runtime.heapBits.nextArena.
|
||||
cheap = true
|
||||
}
|
||||
// Special case: on architectures that can do unaligned loads,
|
||||
// explicitly mark encoding/binary methods as cheap,
|
||||
// because in practice they are, even though our inlining
|
||||
// budgeting system does not see that. See issue 42958.
|
||||
if base.Ctxt.Arch.CanMergeLoads && s.Pkg.Path == "encoding/binary" {
|
||||
switch s.Name {
|
||||
case "littleEndian.Uint64", "littleEndian.Uint32", "littleEndian.Uint16",
|
||||
"bigEndian.Uint64", "bigEndian.Uint32", "bigEndian.Uint16",
|
||||
"littleEndian.PutUint64", "littleEndian.PutUint32", "littleEndian.PutUint16",
|
||||
"bigEndian.PutUint64", "bigEndian.PutUint32", "bigEndian.PutUint16":
|
||||
cheap = true
|
||||
}
|
||||
}
|
||||
if cheap {
|
||||
break // treat like any other node, that is, cost of 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
package walk
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"go/constant"
|
||||
|
||||
"cmd/compile/internal/base"
|
||||
|
@ -14,7 +13,6 @@ import (
|
|||
"cmd/compile/internal/ssagen"
|
||||
"cmd/compile/internal/typecheck"
|
||||
"cmd/compile/internal/types"
|
||||
"cmd/internal/sys"
|
||||
)
|
||||
|
||||
// The result of walkCompare MUST be assigned back to n, e.g.
|
||||
|
@ -81,7 +79,7 @@ func walkCompare(n *ir.BinaryExpr, init *ir.Nodes) ir.Node {
|
|||
var inline bool
|
||||
|
||||
maxcmpsize := int64(4)
|
||||
unalignedLoad := canMergeLoads()
|
||||
unalignedLoad := ssagen.Arch.LinkArch.CanMergeLoads
|
||||
if unalignedLoad {
|
||||
// Keep this low enough to generate less code than a function call.
|
||||
maxcmpsize = 2 * int64(ssagen.Arch.LinkArch.RegSize)
|
||||
|
@ -311,7 +309,7 @@ func walkCompareString(n *ir.BinaryExpr, init *ir.Nodes) ir.Node {
|
|||
maxRewriteLen := 6
|
||||
// Some architectures can load unaligned byte sequence as 1 word.
|
||||
// So we can cover longer strings with the same amount of code.
|
||||
canCombineLoads := canMergeLoads()
|
||||
canCombineLoads := ssagen.Arch.LinkArch.CanMergeLoads
|
||||
combine64bit := false
|
||||
if canCombineLoads {
|
||||
// Keep this low enough to generate less code than a function call.
|
||||
|
@ -491,18 +489,3 @@ func tracecmpArg(n ir.Node, t *types.Type, init *ir.Nodes) ir.Node {
|
|||
|
||||
return typecheck.Conv(n, t)
|
||||
}
|
||||
|
||||
// canMergeLoads reports whether the backend optimization passes for
|
||||
// the current architecture can combine adjacent loads into a single
|
||||
// larger, possibly unaligned, load. Note that currently the
|
||||
// optimizations must be able to handle little endian byte order.
|
||||
func canMergeLoads() bool {
|
||||
switch ssagen.Arch.LinkArch.Family {
|
||||
case sys.ARM64, sys.AMD64, sys.I386, sys.S390X:
|
||||
return true
|
||||
case sys.PPC64:
|
||||
// Load combining only supported on ppc64le.
|
||||
return ssagen.Arch.LinkArch.ByteOrder == binary.LittleEndian
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -47,6 +47,11 @@ type Arch struct {
|
|||
// Loads or stores smaller than Alignment must be naturally aligned.
|
||||
// Loads or stores larger than Alignment need only be Alignment-aligned.
|
||||
Alignment int8
|
||||
|
||||
// CanMergeLoads reports whether the backend optimization passes
|
||||
// can combine adjacent loads into a single larger, possibly unaligned, load.
|
||||
// Note that currently the optimizations must be able to handle little endian byte order.
|
||||
CanMergeLoads bool
|
||||
}
|
||||
|
||||
// InFamily reports whether a is a member of any of the specified
|
||||
|
@ -61,143 +66,157 @@ func (a *Arch) InFamily(xs ...ArchFamily) bool {
|
|||
}
|
||||
|
||||
var Arch386 = &Arch{
|
||||
Name: "386",
|
||||
Family: I386,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
Name: "386",
|
||||
Family: I386,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: true,
|
||||
}
|
||||
|
||||
var ArchAMD64 = &Arch{
|
||||
Name: "amd64",
|
||||
Family: AMD64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
Name: "amd64",
|
||||
Family: AMD64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: true,
|
||||
}
|
||||
|
||||
var ArchARM = &Arch{
|
||||
Name: "arm",
|
||||
Family: ARM,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4, // TODO: just for arm5?
|
||||
Name: "arm",
|
||||
Family: ARM,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4, // TODO: just for arm5?
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchARM64 = &Arch{
|
||||
Name: "arm64",
|
||||
Family: ARM64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
Name: "arm64",
|
||||
Family: ARM64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: true,
|
||||
}
|
||||
|
||||
var ArchLoong64 = &Arch{
|
||||
Name: "loong64",
|
||||
Family: Loong64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8, // Unaligned accesses are not guaranteed to be fast
|
||||
Name: "loong64",
|
||||
Family: Loong64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8, // Unaligned accesses are not guaranteed to be fast
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchMIPS = &Arch{
|
||||
Name: "mips",
|
||||
Family: MIPS,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4,
|
||||
Name: "mips",
|
||||
Family: MIPS,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchMIPSLE = &Arch{
|
||||
Name: "mipsle",
|
||||
Family: MIPS,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4,
|
||||
Name: "mipsle",
|
||||
Family: MIPS,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 4,
|
||||
RegSize: 4,
|
||||
MinLC: 4,
|
||||
Alignment: 4,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchMIPS64 = &Arch{
|
||||
Name: "mips64",
|
||||
Family: MIPS64,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8,
|
||||
Name: "mips64",
|
||||
Family: MIPS64,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchMIPS64LE = &Arch{
|
||||
Name: "mips64le",
|
||||
Family: MIPS64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8,
|
||||
Name: "mips64le",
|
||||
Family: MIPS64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchPPC64 = &Arch{
|
||||
Name: "ppc64",
|
||||
Family: PPC64,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
Name: "ppc64",
|
||||
Family: PPC64,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchPPC64LE = &Arch{
|
||||
Name: "ppc64le",
|
||||
Family: PPC64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
Name: "ppc64le",
|
||||
Family: PPC64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: true,
|
||||
}
|
||||
|
||||
var ArchRISCV64 = &Arch{
|
||||
Name: "riscv64",
|
||||
Family: RISCV64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8, // riscv unaligned loads work, but are really slow (trap + simulated by OS)
|
||||
Name: "riscv64",
|
||||
Family: RISCV64,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 4,
|
||||
Alignment: 8, // riscv unaligned loads work, but are really slow (trap + simulated by OS)
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var ArchS390X = &Arch{
|
||||
Name: "s390x",
|
||||
Family: S390X,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 2,
|
||||
Alignment: 1,
|
||||
Name: "s390x",
|
||||
Family: S390X,
|
||||
ByteOrder: binary.BigEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 2,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: true,
|
||||
}
|
||||
|
||||
var ArchWasm = &Arch{
|
||||
Name: "wasm",
|
||||
Family: Wasm,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
Name: "wasm",
|
||||
Family: Wasm,
|
||||
ByteOrder: binary.LittleEndian,
|
||||
PtrSize: 8,
|
||||
RegSize: 8,
|
||||
MinLC: 1,
|
||||
Alignment: 1,
|
||||
CanMergeLoads: false,
|
||||
}
|
||||
|
||||
var Archs = [...]*Arch{
|
||||
|
|
22
test/inline_endian.go
Normal file
22
test/inline_endian.go
Normal file
|
@ -0,0 +1,22 @@
|
|||
// errorcheckwithauto -0 -m -d=inlfuncswithclosures=1
|
||||
|
||||
//go:build 386 || amd64 || arm64 || ppc64le || s390x
|
||||
// +build 386 amd64 arm64 ppc64le s390x
|
||||
|
||||
// Copyright 2021 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Similar to inline.go, but only for architectures that can merge loads.
|
||||
|
||||
package foo
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
)
|
||||
|
||||
// Ensure that simple encoding/binary functions are cheap enough
|
||||
// that functions using them can also be inlined (issue 42958).
|
||||
func endian(b []byte) uint64 { // ERROR "can inline endian" "b does not escape"
|
||||
return binary.LittleEndian.Uint64(b) + binary.BigEndian.Uint64(b) // ERROR "inlining call to binary.littleEndian.Uint64" "inlining call to binary.bigEndian.Uint64"
|
||||
}
|
Loading…
Reference in a new issue