go/test/writebarrier.go

292 lines
5.6 KiB
Go
Raw Normal View History

// errorcheck -0 -l -d=wb
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test where write barriers are and are not emitted.
package p
import "unsafe"
func f(x **byte, y *byte) {
*x = y // no barrier (dead store)
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f1(x *[]byte, y []byte) {
*x = y // no barrier (dead store)
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f1a(x *[]byte, y *[]byte) {
*x = *y // ERROR "write barrier"
z := *y // no barrier
cmd/internal/gc: optimize append + write barrier The code generated for x = append(x, v) is roughly: t := x if len(t)+1 > cap(t) { t = grow(t) } t[len(t)] = v len(t)++ x = t We used to generate this code as Go pseudocode during walk. Generate it instead as actual instructions during gen. Doing so lets us apply a few optimizations. The most important is that when, as in the above example, the source slice and the destination slice are the same, the code can instead do: t := x if len(t)+1 > cap(t) { t = grow(t) x = {base(t), len(t)+1, cap(t)} } else { len(x)++ } t[len(t)] = v That is, in the fast path that does not reallocate the array, only the updated length needs to be written back to x, not the array pointer and not the capacity. This is more like what you'd write by hand in C. It's faster in general, since the fast path elides two of the three stores, but it's especially faster when the form of x is such that the base pointer write would turn into a write barrier. No write, no barrier. name old mean new mean delta BinaryTree17 5.68s × (0.97,1.04) 5.81s × (0.98,1.03) +2.35% (p=0.023) Fannkuch11 4.41s × (0.98,1.03) 4.35s × (1.00,1.00) ~ (p=0.090) FmtFprintfEmpty 92.7ns × (0.91,1.16) 86.0ns × (0.94,1.11) -7.31% (p=0.038) FmtFprintfString 281ns × (0.96,1.08) 276ns × (0.98,1.04) ~ (p=0.219) FmtFprintfInt 288ns × (0.97,1.06) 274ns × (0.98,1.06) -4.94% (p=0.002) FmtFprintfIntInt 493ns × (0.97,1.04) 506ns × (0.99,1.01) +2.65% (p=0.009) FmtFprintfPrefixedInt 423ns × (0.97,1.04) 391ns × (0.99,1.01) -7.52% (p=0.000) FmtFprintfFloat 598ns × (0.99,1.01) 566ns × (0.99,1.01) -5.27% (p=0.000) FmtManyArgs 1.89µs × (0.98,1.05) 1.91µs × (0.99,1.01) ~ (p=0.231) GobDecode 14.8ms × (0.98,1.03) 15.3ms × (0.99,1.02) +3.01% (p=0.000) GobEncode 12.3ms × (0.98,1.01) 11.5ms × (0.97,1.03) -5.93% (p=0.000) Gzip 656ms × (0.99,1.05) 645ms × (0.99,1.01) ~ (p=0.055) Gunzip 142ms × (1.00,1.00) 142ms × (1.00,1.00) -0.32% (p=0.034) HTTPClientServer 91.2µs × (0.97,1.04) 90.5µs × (0.97,1.04) ~ (p=0.468) JSONEncode 32.6ms × (0.97,1.08) 32.0ms × (0.98,1.03) ~ (p=0.190) JSONDecode 114ms × (0.97,1.05) 114ms × (0.99,1.01) ~ (p=0.887) Mandelbrot200 6.11ms × (0.98,1.04) 6.04ms × (1.00,1.01) ~ (p=0.167) GoParse 6.66ms × (0.97,1.04) 6.47ms × (0.97,1.05) -2.81% (p=0.014) RegexpMatchEasy0_32 159ns × (0.99,1.00) 171ns × (0.93,1.07) +7.19% (p=0.002) RegexpMatchEasy0_1K 538ns × (1.00,1.01) 550ns × (0.98,1.01) +2.30% (p=0.000) RegexpMatchEasy1_32 138ns × (1.00,1.00) 135ns × (0.99,1.02) -1.60% (p=0.000) RegexpMatchEasy1_1K 869ns × (0.99,1.01) 879ns × (1.00,1.01) +1.08% (p=0.000) RegexpMatchMedium_32 252ns × (0.99,1.01) 243ns × (1.00,1.00) -3.71% (p=0.000) RegexpMatchMedium_1K 72.7µs × (1.00,1.00) 70.3µs × (1.00,1.00) -3.34% (p=0.000) RegexpMatchHard_32 3.85µs × (1.00,1.00) 3.82µs × (1.00,1.01) -0.81% (p=0.000) RegexpMatchHard_1K 118µs × (1.00,1.00) 117µs × (1.00,1.00) -0.56% (p=0.000) Revcomp 920ms × (0.97,1.07) 917ms × (0.97,1.04) ~ (p=0.808) Template 129ms × (0.98,1.03) 114ms × (0.99,1.01) -12.06% (p=0.000) TimeParse 619ns × (0.99,1.01) 622ns × (0.99,1.01) ~ (p=0.062) TimeFormat 661ns × (0.98,1.04) 665ns × (0.99,1.01) ~ (p=0.524) See next CL for combination with a similar optimization for slice. The benchmarks that are slower in this CL are still faster overall with the combination of the two. Change-Id: I2a7421658091b2488c64741b4db15ab6c3b4cb7e Reviewed-on: https://go-review.googlesource.com/9812 Reviewed-by: David Chase <drchase@google.com>
2015-05-06 16:34:30 +00:00
*x = z // ERROR "write barrier"
}
func f2(x *interface{}, y interface{}) {
*x = y // no barrier (dead store)
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f2a(x *interface{}, y *interface{}) {
*x = *y // no barrier (dead store)
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f3(x *string, y string) {
*x = y // no barrier (dead store)
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f3a(x *string, y *string) {
*x = *y // ERROR "write barrier"
z := *y // no barrier
cmd/internal/gc: optimize append + write barrier The code generated for x = append(x, v) is roughly: t := x if len(t)+1 > cap(t) { t = grow(t) } t[len(t)] = v len(t)++ x = t We used to generate this code as Go pseudocode during walk. Generate it instead as actual instructions during gen. Doing so lets us apply a few optimizations. The most important is that when, as in the above example, the source slice and the destination slice are the same, the code can instead do: t := x if len(t)+1 > cap(t) { t = grow(t) x = {base(t), len(t)+1, cap(t)} } else { len(x)++ } t[len(t)] = v That is, in the fast path that does not reallocate the array, only the updated length needs to be written back to x, not the array pointer and not the capacity. This is more like what you'd write by hand in C. It's faster in general, since the fast path elides two of the three stores, but it's especially faster when the form of x is such that the base pointer write would turn into a write barrier. No write, no barrier. name old mean new mean delta BinaryTree17 5.68s × (0.97,1.04) 5.81s × (0.98,1.03) +2.35% (p=0.023) Fannkuch11 4.41s × (0.98,1.03) 4.35s × (1.00,1.00) ~ (p=0.090) FmtFprintfEmpty 92.7ns × (0.91,1.16) 86.0ns × (0.94,1.11) -7.31% (p=0.038) FmtFprintfString 281ns × (0.96,1.08) 276ns × (0.98,1.04) ~ (p=0.219) FmtFprintfInt 288ns × (0.97,1.06) 274ns × (0.98,1.06) -4.94% (p=0.002) FmtFprintfIntInt 493ns × (0.97,1.04) 506ns × (0.99,1.01) +2.65% (p=0.009) FmtFprintfPrefixedInt 423ns × (0.97,1.04) 391ns × (0.99,1.01) -7.52% (p=0.000) FmtFprintfFloat 598ns × (0.99,1.01) 566ns × (0.99,1.01) -5.27% (p=0.000) FmtManyArgs 1.89µs × (0.98,1.05) 1.91µs × (0.99,1.01) ~ (p=0.231) GobDecode 14.8ms × (0.98,1.03) 15.3ms × (0.99,1.02) +3.01% (p=0.000) GobEncode 12.3ms × (0.98,1.01) 11.5ms × (0.97,1.03) -5.93% (p=0.000) Gzip 656ms × (0.99,1.05) 645ms × (0.99,1.01) ~ (p=0.055) Gunzip 142ms × (1.00,1.00) 142ms × (1.00,1.00) -0.32% (p=0.034) HTTPClientServer 91.2µs × (0.97,1.04) 90.5µs × (0.97,1.04) ~ (p=0.468) JSONEncode 32.6ms × (0.97,1.08) 32.0ms × (0.98,1.03) ~ (p=0.190) JSONDecode 114ms × (0.97,1.05) 114ms × (0.99,1.01) ~ (p=0.887) Mandelbrot200 6.11ms × (0.98,1.04) 6.04ms × (1.00,1.01) ~ (p=0.167) GoParse 6.66ms × (0.97,1.04) 6.47ms × (0.97,1.05) -2.81% (p=0.014) RegexpMatchEasy0_32 159ns × (0.99,1.00) 171ns × (0.93,1.07) +7.19% (p=0.002) RegexpMatchEasy0_1K 538ns × (1.00,1.01) 550ns × (0.98,1.01) +2.30% (p=0.000) RegexpMatchEasy1_32 138ns × (1.00,1.00) 135ns × (0.99,1.02) -1.60% (p=0.000) RegexpMatchEasy1_1K 869ns × (0.99,1.01) 879ns × (1.00,1.01) +1.08% (p=0.000) RegexpMatchMedium_32 252ns × (0.99,1.01) 243ns × (1.00,1.00) -3.71% (p=0.000) RegexpMatchMedium_1K 72.7µs × (1.00,1.00) 70.3µs × (1.00,1.00) -3.34% (p=0.000) RegexpMatchHard_32 3.85µs × (1.00,1.00) 3.82µs × (1.00,1.01) -0.81% (p=0.000) RegexpMatchHard_1K 118µs × (1.00,1.00) 117µs × (1.00,1.00) -0.56% (p=0.000) Revcomp 920ms × (0.97,1.07) 917ms × (0.97,1.04) ~ (p=0.808) Template 129ms × (0.98,1.03) 114ms × (0.99,1.01) -12.06% (p=0.000) TimeParse 619ns × (0.99,1.01) 622ns × (0.99,1.01) ~ (p=0.062) TimeFormat 661ns × (0.98,1.04) 665ns × (0.99,1.01) ~ (p=0.524) See next CL for combination with a similar optimization for slice. The benchmarks that are slower in this CL are still faster overall with the combination of the two. Change-Id: I2a7421658091b2488c64741b4db15ab6c3b4cb7e Reviewed-on: https://go-review.googlesource.com/9812 Reviewed-by: David Chase <drchase@google.com>
2015-05-06 16:34:30 +00:00
*x = z // ERROR "write barrier"
}
func f4(x *[2]string, y [2]string) {
*x = y // ERROR "write barrier"
z := y // no barrier
*x = z // ERROR "write barrier"
}
func f4a(x *[2]string, y *[2]string) {
*x = *y // ERROR "write barrier"
z := *y // no barrier
cmd/internal/gc: optimize append + write barrier The code generated for x = append(x, v) is roughly: t := x if len(t)+1 > cap(t) { t = grow(t) } t[len(t)] = v len(t)++ x = t We used to generate this code as Go pseudocode during walk. Generate it instead as actual instructions during gen. Doing so lets us apply a few optimizations. The most important is that when, as in the above example, the source slice and the destination slice are the same, the code can instead do: t := x if len(t)+1 > cap(t) { t = grow(t) x = {base(t), len(t)+1, cap(t)} } else { len(x)++ } t[len(t)] = v That is, in the fast path that does not reallocate the array, only the updated length needs to be written back to x, not the array pointer and not the capacity. This is more like what you'd write by hand in C. It's faster in general, since the fast path elides two of the three stores, but it's especially faster when the form of x is such that the base pointer write would turn into a write barrier. No write, no barrier. name old mean new mean delta BinaryTree17 5.68s × (0.97,1.04) 5.81s × (0.98,1.03) +2.35% (p=0.023) Fannkuch11 4.41s × (0.98,1.03) 4.35s × (1.00,1.00) ~ (p=0.090) FmtFprintfEmpty 92.7ns × (0.91,1.16) 86.0ns × (0.94,1.11) -7.31% (p=0.038) FmtFprintfString 281ns × (0.96,1.08) 276ns × (0.98,1.04) ~ (p=0.219) FmtFprintfInt 288ns × (0.97,1.06) 274ns × (0.98,1.06) -4.94% (p=0.002) FmtFprintfIntInt 493ns × (0.97,1.04) 506ns × (0.99,1.01) +2.65% (p=0.009) FmtFprintfPrefixedInt 423ns × (0.97,1.04) 391ns × (0.99,1.01) -7.52% (p=0.000) FmtFprintfFloat 598ns × (0.99,1.01) 566ns × (0.99,1.01) -5.27% (p=0.000) FmtManyArgs 1.89µs × (0.98,1.05) 1.91µs × (0.99,1.01) ~ (p=0.231) GobDecode 14.8ms × (0.98,1.03) 15.3ms × (0.99,1.02) +3.01% (p=0.000) GobEncode 12.3ms × (0.98,1.01) 11.5ms × (0.97,1.03) -5.93% (p=0.000) Gzip 656ms × (0.99,1.05) 645ms × (0.99,1.01) ~ (p=0.055) Gunzip 142ms × (1.00,1.00) 142ms × (1.00,1.00) -0.32% (p=0.034) HTTPClientServer 91.2µs × (0.97,1.04) 90.5µs × (0.97,1.04) ~ (p=0.468) JSONEncode 32.6ms × (0.97,1.08) 32.0ms × (0.98,1.03) ~ (p=0.190) JSONDecode 114ms × (0.97,1.05) 114ms × (0.99,1.01) ~ (p=0.887) Mandelbrot200 6.11ms × (0.98,1.04) 6.04ms × (1.00,1.01) ~ (p=0.167) GoParse 6.66ms × (0.97,1.04) 6.47ms × (0.97,1.05) -2.81% (p=0.014) RegexpMatchEasy0_32 159ns × (0.99,1.00) 171ns × (0.93,1.07) +7.19% (p=0.002) RegexpMatchEasy0_1K 538ns × (1.00,1.01) 550ns × (0.98,1.01) +2.30% (p=0.000) RegexpMatchEasy1_32 138ns × (1.00,1.00) 135ns × (0.99,1.02) -1.60% (p=0.000) RegexpMatchEasy1_1K 869ns × (0.99,1.01) 879ns × (1.00,1.01) +1.08% (p=0.000) RegexpMatchMedium_32 252ns × (0.99,1.01) 243ns × (1.00,1.00) -3.71% (p=0.000) RegexpMatchMedium_1K 72.7µs × (1.00,1.00) 70.3µs × (1.00,1.00) -3.34% (p=0.000) RegexpMatchHard_32 3.85µs × (1.00,1.00) 3.82µs × (1.00,1.01) -0.81% (p=0.000) RegexpMatchHard_1K 118µs × (1.00,1.00) 117µs × (1.00,1.00) -0.56% (p=0.000) Revcomp 920ms × (0.97,1.07) 917ms × (0.97,1.04) ~ (p=0.808) Template 129ms × (0.98,1.03) 114ms × (0.99,1.01) -12.06% (p=0.000) TimeParse 619ns × (0.99,1.01) 622ns × (0.99,1.01) ~ (p=0.062) TimeFormat 661ns × (0.98,1.04) 665ns × (0.99,1.01) ~ (p=0.524) See next CL for combination with a similar optimization for slice. The benchmarks that are slower in this CL are still faster overall with the combination of the two. Change-Id: I2a7421658091b2488c64741b4db15ab6c3b4cb7e Reviewed-on: https://go-review.googlesource.com/9812 Reviewed-by: David Chase <drchase@google.com>
2015-05-06 16:34:30 +00:00
*x = z // ERROR "write barrier"
}
type T struct {
X *int
Y int
M map[int]int
}
func f5(t, u *T) {
t.X = &u.Y // ERROR "write barrier"
}
func f6(t *T) {
t.M = map[int]int{1: 2} // ERROR "write barrier"
}
func f7(x, y *int) []*int {
var z [3]*int
i := 0
z[i] = x // ERROR "write barrier"
i++
z[i] = y // ERROR "write barrier"
i++
return z[:i]
}
func f9(x *interface{}, v *byte) {
*x = v // ERROR "write barrier"
}
func f10(x *byte, f func(interface{})) {
f(x)
}
func f11(x *unsafe.Pointer, y unsafe.Pointer) {
*x = unsafe.Pointer(uintptr(y) + 1) // ERROR "write barrier"
}
cmd/internal/gc: optimize append + write barrier The code generated for x = append(x, v) is roughly: t := x if len(t)+1 > cap(t) { t = grow(t) } t[len(t)] = v len(t)++ x = t We used to generate this code as Go pseudocode during walk. Generate it instead as actual instructions during gen. Doing so lets us apply a few optimizations. The most important is that when, as in the above example, the source slice and the destination slice are the same, the code can instead do: t := x if len(t)+1 > cap(t) { t = grow(t) x = {base(t), len(t)+1, cap(t)} } else { len(x)++ } t[len(t)] = v That is, in the fast path that does not reallocate the array, only the updated length needs to be written back to x, not the array pointer and not the capacity. This is more like what you'd write by hand in C. It's faster in general, since the fast path elides two of the three stores, but it's especially faster when the form of x is such that the base pointer write would turn into a write barrier. No write, no barrier. name old mean new mean delta BinaryTree17 5.68s × (0.97,1.04) 5.81s × (0.98,1.03) +2.35% (p=0.023) Fannkuch11 4.41s × (0.98,1.03) 4.35s × (1.00,1.00) ~ (p=0.090) FmtFprintfEmpty 92.7ns × (0.91,1.16) 86.0ns × (0.94,1.11) -7.31% (p=0.038) FmtFprintfString 281ns × (0.96,1.08) 276ns × (0.98,1.04) ~ (p=0.219) FmtFprintfInt 288ns × (0.97,1.06) 274ns × (0.98,1.06) -4.94% (p=0.002) FmtFprintfIntInt 493ns × (0.97,1.04) 506ns × (0.99,1.01) +2.65% (p=0.009) FmtFprintfPrefixedInt 423ns × (0.97,1.04) 391ns × (0.99,1.01) -7.52% (p=0.000) FmtFprintfFloat 598ns × (0.99,1.01) 566ns × (0.99,1.01) -5.27% (p=0.000) FmtManyArgs 1.89µs × (0.98,1.05) 1.91µs × (0.99,1.01) ~ (p=0.231) GobDecode 14.8ms × (0.98,1.03) 15.3ms × (0.99,1.02) +3.01% (p=0.000) GobEncode 12.3ms × (0.98,1.01) 11.5ms × (0.97,1.03) -5.93% (p=0.000) Gzip 656ms × (0.99,1.05) 645ms × (0.99,1.01) ~ (p=0.055) Gunzip 142ms × (1.00,1.00) 142ms × (1.00,1.00) -0.32% (p=0.034) HTTPClientServer 91.2µs × (0.97,1.04) 90.5µs × (0.97,1.04) ~ (p=0.468) JSONEncode 32.6ms × (0.97,1.08) 32.0ms × (0.98,1.03) ~ (p=0.190) JSONDecode 114ms × (0.97,1.05) 114ms × (0.99,1.01) ~ (p=0.887) Mandelbrot200 6.11ms × (0.98,1.04) 6.04ms × (1.00,1.01) ~ (p=0.167) GoParse 6.66ms × (0.97,1.04) 6.47ms × (0.97,1.05) -2.81% (p=0.014) RegexpMatchEasy0_32 159ns × (0.99,1.00) 171ns × (0.93,1.07) +7.19% (p=0.002) RegexpMatchEasy0_1K 538ns × (1.00,1.01) 550ns × (0.98,1.01) +2.30% (p=0.000) RegexpMatchEasy1_32 138ns × (1.00,1.00) 135ns × (0.99,1.02) -1.60% (p=0.000) RegexpMatchEasy1_1K 869ns × (0.99,1.01) 879ns × (1.00,1.01) +1.08% (p=0.000) RegexpMatchMedium_32 252ns × (0.99,1.01) 243ns × (1.00,1.00) -3.71% (p=0.000) RegexpMatchMedium_1K 72.7µs × (1.00,1.00) 70.3µs × (1.00,1.00) -3.34% (p=0.000) RegexpMatchHard_32 3.85µs × (1.00,1.00) 3.82µs × (1.00,1.01) -0.81% (p=0.000) RegexpMatchHard_1K 118µs × (1.00,1.00) 117µs × (1.00,1.00) -0.56% (p=0.000) Revcomp 920ms × (0.97,1.07) 917ms × (0.97,1.04) ~ (p=0.808) Template 129ms × (0.98,1.03) 114ms × (0.99,1.01) -12.06% (p=0.000) TimeParse 619ns × (0.99,1.01) 622ns × (0.99,1.01) ~ (p=0.062) TimeFormat 661ns × (0.98,1.04) 665ns × (0.99,1.01) ~ (p=0.524) See next CL for combination with a similar optimization for slice. The benchmarks that are slower in this CL are still faster overall with the combination of the two. Change-Id: I2a7421658091b2488c64741b4db15ab6c3b4cb7e Reviewed-on: https://go-review.googlesource.com/9812 Reviewed-by: David Chase <drchase@google.com>
2015-05-06 16:34:30 +00:00
func f12(x []*int, y *int) []*int {
// write barrier for storing y in x's underlying array
x = append(x, y) // ERROR "write barrier"
return x
}
func f12a(x []int, y int) []int {
// y not a pointer, so no write barriers in this function
x = append(x, y)
return x
}
func f13(x []int, y *[]int) {
*y = append(x, 1) // ERROR "write barrier"
}
func f14(y *[]int) {
*y = append(*y, 1) // ERROR "write barrier"
}
type T1 struct {
X *int
}
func f15(x []T1, y T1) []T1 {
return append(x, y) // ERROR "write barrier"
}
type T8 struct {
X [8]*int
}
func f16(x []T8, y T8) []T8 {
return append(x, y) // ERROR "write barrier"
}
func t1(i interface{}) **int {
// From issue 14306, make sure we have write barriers in a type switch
// where the assigned variable escapes.
cmd/compile: improve generated code for concrete cases in type switches Consider switch x:= x.(type) { case int: // int stmts case error: // error stmts } Prior to this change, we lowered this roughly as: if x, ok := x.(int); ok { // int stmts } else if x, ok := x.(error); ok { // error stmts } x, ok := x.(error) is implemented with a call to runtime.assertE2I2 or runtime.assertI2I2. x, ok := x.(int) generates inline code that checks whether x has type int, and populates x and ok as appropriate. We then immediately branch again on ok. The shortcircuit pass in the SSA backend is designed to recognize situations like this, in which we are immediately branching on a bool value that we just calculated with a branch. However, the shortcircuit pass has limitations when the intermediate state has phis. In this case, the phi value is x (the int). CL 222923 improved the situation, but many cases are still unhandled. I have further improvements in progress, which is how I found this particular problem, but they are expensive, and may or may not see the light of day. In the common case of a lone concrete type in a type switch case, it is easier and cheaper to simply lower a different way, roughly: if _, ok := x.(int); ok { x := x.(int) // int stmts } Instead of using a type assertion, though, we extract the value of x from the interface directly. This removes the need to track x (the int) across the branch on ok, which removes the phi, which lets the shortcircuit pass do its job. Benchmarks for encoding/binary show improvements, as well as some wild swings on the super fast benchmarks (alignment effects?): name old time/op new time/op delta ReadSlice1000Int32s-8 5.25µs ± 2% 4.87µs ± 3% -7.11% (p=0.000 n=44+49) ReadStruct-8 451ns ± 2% 417ns ± 2% -7.39% (p=0.000 n=45+46) WriteStruct-8 412ns ± 2% 405ns ± 3% -1.58% (p=0.000 n=46+48) ReadInts-8 296ns ± 8% 275ns ± 3% -7.23% (p=0.000 n=48+50) WriteInts-8 324ns ± 1% 318ns ± 2% -1.67% (p=0.000 n=44+49) WriteSlice1000Int32s-8 5.21µs ± 2% 4.92µs ± 1% -5.67% (p=0.000 n=46+44) PutUint16-8 0.58ns ± 2% 0.59ns ± 2% +0.63% (p=0.000 n=49+49) PutUint32-8 0.87ns ± 1% 0.58ns ± 1% -33.10% (p=0.000 n=46+44) PutUint64-8 0.66ns ± 2% 0.87ns ± 2% +33.07% (p=0.000 n=47+48) LittleEndianPutUint16-8 0.86ns ± 2% 0.87ns ± 2% +0.55% (p=0.003 n=47+50) LittleEndianPutUint32-8 0.87ns ± 1% 0.87ns ± 1% ~ (p=0.547 n=45+47) LittleEndianPutUint64-8 0.87ns ± 2% 0.87ns ± 1% ~ (p=0.451 n=46+47) ReadFloats-8 79.8ns ± 5% 75.9ns ± 2% -4.83% (p=0.000 n=50+47) WriteFloats-8 89.3ns ± 1% 88.9ns ± 1% -0.48% (p=0.000 n=46+44) ReadSlice1000Float32s-8 5.51µs ± 1% 4.87µs ± 2% -11.74% (p=0.000 n=47+46) WriteSlice1000Float32s-8 5.51µs ± 1% 4.93µs ± 1% -10.60% (p=0.000 n=48+47) PutUvarint32-8 25.9ns ± 2% 24.0ns ± 2% -7.02% (p=0.000 n=48+50) PutUvarint64-8 75.1ns ± 1% 61.5ns ± 2% -18.12% (p=0.000 n=45+47) [Geo mean] 57.3ns 54.3ns -5.33% Despite the rarity of type switches, this generates noticeably smaller binaries. file before after Δ % addr2line 4413296 4409200 -4096 -0.093% api 5982648 5962168 -20480 -0.342% cgo 4854168 4833688 -20480 -0.422% compile 19694784 19682560 -12224 -0.062% cover 5278008 5265720 -12288 -0.233% doc 4694824 4682536 -12288 -0.262% fix 3411336 3394952 -16384 -0.480% link 6721496 6717400 -4096 -0.061% nm 4371152 4358864 -12288 -0.281% objdump 4760960 4752768 -8192 -0.172% pprof 14810820 14790340 -20480 -0.138% trace 11681076 11668788 -12288 -0.105% vet 8285464 8244504 -40960 -0.494% total 115824120 115627576 -196544 -0.170% Compiler performance is marginally improved (note that go/types has many type switches): name old alloc/op new alloc/op delta Template 35.0MB ± 0% 35.0MB ± 0% +0.09% (p=0.008 n=5+5) Unicode 28.5MB ± 0% 28.5MB ± 0% ~ (p=0.548 n=5+5) GoTypes 114MB ± 0% 114MB ± 0% -0.76% (p=0.008 n=5+5) Compiler 541MB ± 0% 541MB ± 0% -0.03% (p=0.008 n=5+5) SSA 1.17GB ± 0% 1.17GB ± 0% ~ (p=0.841 n=5+5) Flate 21.9MB ± 0% 21.9MB ± 0% ~ (p=0.421 n=5+5) GoParser 26.9MB ± 0% 26.9MB ± 0% ~ (p=0.222 n=5+5) Reflect 74.6MB ± 0% 74.6MB ± 0% ~ (p=1.000 n=5+5) Tar 32.9MB ± 0% 32.8MB ± 0% ~ (p=0.056 n=5+5) XML 42.4MB ± 0% 42.1MB ± 0% -0.77% (p=0.008 n=5+5) [Geo mean] 73.2MB 73.1MB -0.15% name old allocs/op new allocs/op delta Template 377k ± 0% 377k ± 0% +0.06% (p=0.008 n=5+5) Unicode 354k ± 0% 354k ± 0% ~ (p=0.095 n=5+5) GoTypes 1.31M ± 0% 1.30M ± 0% -0.73% (p=0.008 n=5+5) Compiler 5.44M ± 0% 5.44M ± 0% -0.04% (p=0.008 n=5+5) SSA 11.7M ± 0% 11.7M ± 0% ~ (p=1.000 n=5+5) Flate 239k ± 0% 239k ± 0% ~ (p=1.000 n=5+5) GoParser 302k ± 0% 302k ± 0% -0.04% (p=0.008 n=5+5) Reflect 977k ± 0% 977k ± 0% ~ (p=0.690 n=5+5) Tar 346k ± 0% 346k ± 0% ~ (p=0.889 n=5+5) XML 431k ± 0% 430k ± 0% -0.25% (p=0.008 n=5+5) [Geo mean] 806k 806k -0.10% For packages with many type switches, this considerably shrinks function text size. Some examples: file before after Δ % encoding/binary.s 30726 29504 -1222 -3.977% go/printer.s 77597 76005 -1592 -2.052% cmd/vendor/golang.org/x/tools/go/ast/astutil.s 65704 63318 -2386 -3.631% cmd/vendor/golang.org/x/tools/go/analysis/passes/unreachable.s 8047 7714 -333 -4.138% Text size regressions are rare. Change-Id: Ic10982bbb04876250eaa5bfee97990141ae5fc28 Reviewed-on: https://go-review.googlesource.com/c/go/+/228106 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cuong Manh Le <cuong.manhle.vn@gmail.com> Reviewed-by: Keith Randall <khr@golang.org>
2020-04-13 00:34:33 +00:00
switch x := i.(type) {
case *int: // ERROR "write barrier"
return &x
}
cmd/compile: improve generated code for concrete cases in type switches Consider switch x:= x.(type) { case int: // int stmts case error: // error stmts } Prior to this change, we lowered this roughly as: if x, ok := x.(int); ok { // int stmts } else if x, ok := x.(error); ok { // error stmts } x, ok := x.(error) is implemented with a call to runtime.assertE2I2 or runtime.assertI2I2. x, ok := x.(int) generates inline code that checks whether x has type int, and populates x and ok as appropriate. We then immediately branch again on ok. The shortcircuit pass in the SSA backend is designed to recognize situations like this, in which we are immediately branching on a bool value that we just calculated with a branch. However, the shortcircuit pass has limitations when the intermediate state has phis. In this case, the phi value is x (the int). CL 222923 improved the situation, but many cases are still unhandled. I have further improvements in progress, which is how I found this particular problem, but they are expensive, and may or may not see the light of day. In the common case of a lone concrete type in a type switch case, it is easier and cheaper to simply lower a different way, roughly: if _, ok := x.(int); ok { x := x.(int) // int stmts } Instead of using a type assertion, though, we extract the value of x from the interface directly. This removes the need to track x (the int) across the branch on ok, which removes the phi, which lets the shortcircuit pass do its job. Benchmarks for encoding/binary show improvements, as well as some wild swings on the super fast benchmarks (alignment effects?): name old time/op new time/op delta ReadSlice1000Int32s-8 5.25µs ± 2% 4.87µs ± 3% -7.11% (p=0.000 n=44+49) ReadStruct-8 451ns ± 2% 417ns ± 2% -7.39% (p=0.000 n=45+46) WriteStruct-8 412ns ± 2% 405ns ± 3% -1.58% (p=0.000 n=46+48) ReadInts-8 296ns ± 8% 275ns ± 3% -7.23% (p=0.000 n=48+50) WriteInts-8 324ns ± 1% 318ns ± 2% -1.67% (p=0.000 n=44+49) WriteSlice1000Int32s-8 5.21µs ± 2% 4.92µs ± 1% -5.67% (p=0.000 n=46+44) PutUint16-8 0.58ns ± 2% 0.59ns ± 2% +0.63% (p=0.000 n=49+49) PutUint32-8 0.87ns ± 1% 0.58ns ± 1% -33.10% (p=0.000 n=46+44) PutUint64-8 0.66ns ± 2% 0.87ns ± 2% +33.07% (p=0.000 n=47+48) LittleEndianPutUint16-8 0.86ns ± 2% 0.87ns ± 2% +0.55% (p=0.003 n=47+50) LittleEndianPutUint32-8 0.87ns ± 1% 0.87ns ± 1% ~ (p=0.547 n=45+47) LittleEndianPutUint64-8 0.87ns ± 2% 0.87ns ± 1% ~ (p=0.451 n=46+47) ReadFloats-8 79.8ns ± 5% 75.9ns ± 2% -4.83% (p=0.000 n=50+47) WriteFloats-8 89.3ns ± 1% 88.9ns ± 1% -0.48% (p=0.000 n=46+44) ReadSlice1000Float32s-8 5.51µs ± 1% 4.87µs ± 2% -11.74% (p=0.000 n=47+46) WriteSlice1000Float32s-8 5.51µs ± 1% 4.93µs ± 1% -10.60% (p=0.000 n=48+47) PutUvarint32-8 25.9ns ± 2% 24.0ns ± 2% -7.02% (p=0.000 n=48+50) PutUvarint64-8 75.1ns ± 1% 61.5ns ± 2% -18.12% (p=0.000 n=45+47) [Geo mean] 57.3ns 54.3ns -5.33% Despite the rarity of type switches, this generates noticeably smaller binaries. file before after Δ % addr2line 4413296 4409200 -4096 -0.093% api 5982648 5962168 -20480 -0.342% cgo 4854168 4833688 -20480 -0.422% compile 19694784 19682560 -12224 -0.062% cover 5278008 5265720 -12288 -0.233% doc 4694824 4682536 -12288 -0.262% fix 3411336 3394952 -16384 -0.480% link 6721496 6717400 -4096 -0.061% nm 4371152 4358864 -12288 -0.281% objdump 4760960 4752768 -8192 -0.172% pprof 14810820 14790340 -20480 -0.138% trace 11681076 11668788 -12288 -0.105% vet 8285464 8244504 -40960 -0.494% total 115824120 115627576 -196544 -0.170% Compiler performance is marginally improved (note that go/types has many type switches): name old alloc/op new alloc/op delta Template 35.0MB ± 0% 35.0MB ± 0% +0.09% (p=0.008 n=5+5) Unicode 28.5MB ± 0% 28.5MB ± 0% ~ (p=0.548 n=5+5) GoTypes 114MB ± 0% 114MB ± 0% -0.76% (p=0.008 n=5+5) Compiler 541MB ± 0% 541MB ± 0% -0.03% (p=0.008 n=5+5) SSA 1.17GB ± 0% 1.17GB ± 0% ~ (p=0.841 n=5+5) Flate 21.9MB ± 0% 21.9MB ± 0% ~ (p=0.421 n=5+5) GoParser 26.9MB ± 0% 26.9MB ± 0% ~ (p=0.222 n=5+5) Reflect 74.6MB ± 0% 74.6MB ± 0% ~ (p=1.000 n=5+5) Tar 32.9MB ± 0% 32.8MB ± 0% ~ (p=0.056 n=5+5) XML 42.4MB ± 0% 42.1MB ± 0% -0.77% (p=0.008 n=5+5) [Geo mean] 73.2MB 73.1MB -0.15% name old allocs/op new allocs/op delta Template 377k ± 0% 377k ± 0% +0.06% (p=0.008 n=5+5) Unicode 354k ± 0% 354k ± 0% ~ (p=0.095 n=5+5) GoTypes 1.31M ± 0% 1.30M ± 0% -0.73% (p=0.008 n=5+5) Compiler 5.44M ± 0% 5.44M ± 0% -0.04% (p=0.008 n=5+5) SSA 11.7M ± 0% 11.7M ± 0% ~ (p=1.000 n=5+5) Flate 239k ± 0% 239k ± 0% ~ (p=1.000 n=5+5) GoParser 302k ± 0% 302k ± 0% -0.04% (p=0.008 n=5+5) Reflect 977k ± 0% 977k ± 0% ~ (p=0.690 n=5+5) Tar 346k ± 0% 346k ± 0% ~ (p=0.889 n=5+5) XML 431k ± 0% 430k ± 0% -0.25% (p=0.008 n=5+5) [Geo mean] 806k 806k -0.10% For packages with many type switches, this considerably shrinks function text size. Some examples: file before after Δ % encoding/binary.s 30726 29504 -1222 -3.977% go/printer.s 77597 76005 -1592 -2.052% cmd/vendor/golang.org/x/tools/go/ast/astutil.s 65704 63318 -2386 -3.631% cmd/vendor/golang.org/x/tools/go/analysis/passes/unreachable.s 8047 7714 -333 -4.138% Text size regressions are rare. Change-Id: Ic10982bbb04876250eaa5bfee97990141ae5fc28 Reviewed-on: https://go-review.googlesource.com/c/go/+/228106 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cuong Manh Le <cuong.manhle.vn@gmail.com> Reviewed-by: Keith Randall <khr@golang.org>
2020-04-13 00:34:33 +00:00
switch y := i.(type) {
case **int: // no write barrier here
return y
}
return nil
}
type T17 struct {
f func(*T17)
}
func f17(x *T17) {
cmd/compile: disable various write barrier optimizations Several of our current write barrier elision optimizations are invalid with the hybrid barrier. Eliding the hybrid barrier requires that *both* the current and new pointer be already shaded and, since we don't have the flow analysis to figure out anything about the slot's current value, for now we have to just disable several of these optimizations. This has a slight impact on binary size. On linux/amd64, the go tool binary increases by 0.7% and the compile binary increases by 1.5%. It also has a slight impact on performance, as one would expect. We'll win some of this back in subsequent commits. name old time/op new time/op delta BinaryTree17-12 2.38s ± 1% 2.40s ± 1% +0.82% (p=0.000 n=18+20) Fannkuch11-12 2.84s ± 1% 2.70s ± 0% -4.97% (p=0.000 n=18+18) FmtFprintfEmpty-12 44.2ns ± 1% 46.4ns ± 2% +4.89% (p=0.000 n=16+18) FmtFprintfString-12 131ns ± 0% 134ns ± 1% +2.05% (p=0.000 n=12+19) FmtFprintfInt-12 114ns ± 1% 117ns ± 1% +3.26% (p=0.000 n=19+20) FmtFprintfIntInt-12 176ns ± 1% 181ns ± 1% +3.25% (p=0.000 n=20+20) FmtFprintfPrefixedInt-12 185ns ± 1% 190ns ± 1% +2.77% (p=0.000 n=19+18) FmtFprintfFloat-12 249ns ± 1% 254ns ± 1% +1.71% (p=0.000 n=18+20) FmtManyArgs-12 747ns ± 1% 743ns ± 1% -0.58% (p=0.000 n=19+18) GobDecode-12 6.57ms ± 1% 6.61ms ± 0% +0.73% (p=0.000 n=19+20) GobEncode-12 5.58ms ± 1% 5.60ms ± 0% +0.27% (p=0.001 n=18+18) Gzip-12 223ms ± 1% 223ms ± 1% ~ (p=0.351 n=19+20) Gunzip-12 37.9ms ± 0% 37.9ms ± 1% ~ (p=0.095 n=16+20) HTTPClientServer-12 77.8µs ± 1% 78.5µs ± 1% +0.97% (p=0.000 n=19+20) JSONEncode-12 14.8ms ± 1% 14.8ms ± 1% ~ (p=0.079 n=20+19) JSONDecode-12 53.7ms ± 1% 54.2ms ± 1% +0.92% (p=0.000 n=20+19) Mandelbrot200-12 3.81ms ± 1% 3.81ms ± 0% ~ (p=0.916 n=19+18) GoParse-12 3.19ms ± 1% 3.19ms ± 1% ~ (p=0.175 n=20+19) RegexpMatchEasy0_32-12 71.9ns ± 1% 70.6ns ± 1% -1.87% (p=0.000 n=19+20) RegexpMatchEasy0_1K-12 946ns ± 0% 944ns ± 0% -0.22% (p=0.000 n=19+16) RegexpMatchEasy1_32-12 67.3ns ± 2% 66.8ns ± 1% -0.72% (p=0.008 n=20+20) RegexpMatchEasy1_1K-12 374ns ± 1% 384ns ± 1% +2.69% (p=0.000 n=18+20) RegexpMatchMedium_32-12 107ns ± 1% 107ns ± 1% ~ (p=1.000 n=20+20) RegexpMatchMedium_1K-12 34.3µs ± 1% 34.6µs ± 1% +0.90% (p=0.000 n=20+20) RegexpMatchHard_32-12 1.78µs ± 1% 1.80µs ± 1% +1.45% (p=0.000 n=20+19) RegexpMatchHard_1K-12 53.6µs ± 0% 54.5µs ± 1% +1.52% (p=0.000 n=19+18) Revcomp-12 417ms ± 5% 391ms ± 1% -6.42% (p=0.000 n=16+19) Template-12 61.1ms ± 1% 64.2ms ± 0% +5.07% (p=0.000 n=19+20) TimeParse-12 302ns ± 1% 305ns ± 1% +0.90% (p=0.000 n=18+18) TimeFormat-12 319ns ± 1% 315ns ± 1% -1.25% (p=0.000 n=18+18) [Geo mean] 54.0µs 54.3µs +0.58% name old time/op new time/op delta XGarbage-12 2.24ms ± 2% 2.28ms ± 1% +1.68% (p=0.000 n=18+17) XHTTP-12 11.4µs ± 1% 11.6µs ± 2% +1.63% (p=0.000 n=18+18) XJSON-12 11.6ms ± 0% 12.5ms ± 0% +7.84% (p=0.000 n=18+17) Updates #17503. Change-Id: I1899f8e35662971e24bf692b517dfbe2b533c00c Reviewed-on: https://go-review.googlesource.com/31572 Reviewed-by: Keith Randall <khr@golang.org>
2016-10-18 14:26:28 +00:00
// Originally from golang.org/issue/13901, but the hybrid
// barrier requires both to have barriers.
x.f = f17 // ERROR "write barrier"
x.f = func(y *T17) { *y = *x } // ERROR "write barrier"
}
type T18 struct {
a []int
s string
}
func f18(p *T18, x *[]int) {
p.a = p.a[:5] // no barrier
*x = (*x)[0:5] // no barrier
p.a = p.a[3:5] // ERROR "write barrier"
p.a = p.a[1:2:3] // ERROR "write barrier"
p.s = p.s[8:9] // ERROR "write barrier"
*x = (*x)[3:5] // ERROR "write barrier"
}
func f19(x, y *int, i int) int {
// Constructing a temporary slice on the stack should not
// require any write barriers. See issue 14263.
a := []*int{x, y} // no barrier
return *a[i]
}
func f20(x, y *int, i int) []*int {
// ... but if that temporary slice escapes, then the
// write barriers are necessary.
a := []*int{x, y} // ERROR "write barrier"
return a
}
runtime: don't rescan globals Currently the runtime rescans globals during mark 2 and mark termination. This costs as much as 500µs/MB in STW time, which is enough to surpass the 10ms STW limit with only 20MB of globals. It's also basically unnecessary. The compiler already generates write barriers for global -> heap pointer updates and the regular write barrier doesn't check whether the slot is a global or in the heap. Some less common write barriers do cause problems. heapBitsBulkBarrier, which is used by typedmemmove and related functions, currently depends on having access to the pointer bitmap and as a result ignores writes to globals. Likewise, the reflect-related write barriers reflect_typedmemmovepartial and callwritebarrier ignore non-heap destinations; though it appears they can never be called with global pointers anyway. This commit makes heapBitsBulkBarrier issue write barriers for writes to global pointers using the data and BSS pointer bitmaps, removes the inheap checks from the reflection write barriers, and eliminates the rescans during mark 2 and mark termination. It also adds a test that writes to globals have write barriers. Programs with large data+BSS segments (with pointers) aren't common, but for programs that do have large data+BSS segments, this significantly reduces pause time: name \ 95%ile-time/markTerm old new delta LargeBSS/bss:1GB/gomaxprocs:4 148200µs ± 6% 302µs ±52% -99.80% (p=0.008 n=5+5) This very slightly improves the go1 benchmarks: name old time/op new time/op delta BinaryTree17-12 2.62s ± 3% 2.62s ± 4% ~ (p=0.904 n=20+20) Fannkuch11-12 2.15s ± 1% 2.13s ± 0% -1.29% (p=0.000 n=18+20) FmtFprintfEmpty-12 48.3ns ± 2% 47.6ns ± 1% -1.52% (p=0.000 n=20+16) FmtFprintfString-12 152ns ± 0% 152ns ± 1% ~ (p=0.725 n=18+18) FmtFprintfInt-12 150ns ± 1% 149ns ± 1% -1.14% (p=0.000 n=19+20) FmtFprintfIntInt-12 250ns ± 0% 244ns ± 1% -2.12% (p=0.000 n=20+18) FmtFprintfPrefixedInt-12 219ns ± 1% 217ns ± 1% -1.20% (p=0.000 n=19+20) FmtFprintfFloat-12 280ns ± 0% 281ns ± 1% +0.47% (p=0.000 n=19+19) FmtManyArgs-12 928ns ± 0% 923ns ± 1% -0.53% (p=0.000 n=19+18) GobDecode-12 7.21ms ± 1% 7.24ms ± 2% ~ (p=0.091 n=19+19) GobEncode-12 6.07ms ± 1% 6.05ms ± 1% -0.36% (p=0.002 n=20+17) Gzip-12 265ms ± 1% 265ms ± 1% ~ (p=0.496 n=20+19) Gunzip-12 39.6ms ± 1% 39.3ms ± 1% -0.85% (p=0.000 n=19+19) HTTPClientServer-12 74.0µs ± 2% 73.8µs ± 1% ~ (p=0.569 n=20+19) JSONEncode-12 15.4ms ± 1% 15.3ms ± 1% -0.25% (p=0.049 n=17+17) JSONDecode-12 53.7ms ± 2% 53.0ms ± 1% -1.29% (p=0.000 n=18+17) Mandelbrot200-12 3.97ms ± 1% 3.97ms ± 0% ~ (p=0.072 n=17+18) GoParse-12 3.35ms ± 2% 3.36ms ± 1% +0.51% (p=0.005 n=18+20) RegexpMatchEasy0_32-12 72.7ns ± 2% 72.2ns ± 1% -0.70% (p=0.005 n=19+19) RegexpMatchEasy0_1K-12 246ns ± 1% 245ns ± 0% -0.60% (p=0.000 n=18+16) RegexpMatchEasy1_32-12 72.8ns ± 1% 72.5ns ± 1% -0.37% (p=0.011 n=18+18) RegexpMatchEasy1_1K-12 380ns ± 1% 385ns ± 1% +1.34% (p=0.000 n=20+19) RegexpMatchMedium_32-12 115ns ± 2% 115ns ± 1% +0.44% (p=0.047 n=20+20) RegexpMatchMedium_1K-12 35.4µs ± 1% 35.5µs ± 1% ~ (p=0.079 n=18+19) RegexpMatchHard_32-12 1.83µs ± 0% 1.80µs ± 1% -1.76% (p=0.000 n=18+18) RegexpMatchHard_1K-12 55.1µs ± 0% 54.3µs ± 1% -1.42% (p=0.000 n=18+19) Revcomp-12 386ms ± 1% 381ms ± 1% -1.14% (p=0.000 n=18+18) Template-12 61.5ms ± 2% 61.5ms ± 2% ~ (p=0.647 n=19+20) TimeParse-12 338ns ± 0% 336ns ± 1% -0.72% (p=0.000 n=14+19) TimeFormat-12 350ns ± 0% 357ns ± 0% +2.05% (p=0.000 n=19+18) [Geo mean] 55.3µs 55.0µs -0.41% Change-Id: I57e8720385a1b991aeebd111b6874354308e2a6b Reviewed-on: https://go-review.googlesource.com/20829 Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Rick Hudson <rlh@golang.org>
2016-03-18 15:27:59 +00:00
var x21 *int
var y21 struct {
x *int
}
var z21 int
// f21x: Global -> heap pointer updates must have write barriers.
func f21a(x *int) {
x21 = x // ERROR "write barrier"
y21.x = x // ERROR "write barrier"
}
func f21b(x *int) {
x21 = &z21 // ERROR "write barrier"
y21.x = &z21 // ERROR "write barrier"
}
func f21c(x *int) {
runtime: don't rescan globals Currently the runtime rescans globals during mark 2 and mark termination. This costs as much as 500µs/MB in STW time, which is enough to surpass the 10ms STW limit with only 20MB of globals. It's also basically unnecessary. The compiler already generates write barriers for global -> heap pointer updates and the regular write barrier doesn't check whether the slot is a global or in the heap. Some less common write barriers do cause problems. heapBitsBulkBarrier, which is used by typedmemmove and related functions, currently depends on having access to the pointer bitmap and as a result ignores writes to globals. Likewise, the reflect-related write barriers reflect_typedmemmovepartial and callwritebarrier ignore non-heap destinations; though it appears they can never be called with global pointers anyway. This commit makes heapBitsBulkBarrier issue write barriers for writes to global pointers using the data and BSS pointer bitmaps, removes the inheap checks from the reflection write barriers, and eliminates the rescans during mark 2 and mark termination. It also adds a test that writes to globals have write barriers. Programs with large data+BSS segments (with pointers) aren't common, but for programs that do have large data+BSS segments, this significantly reduces pause time: name \ 95%ile-time/markTerm old new delta LargeBSS/bss:1GB/gomaxprocs:4 148200µs ± 6% 302µs ±52% -99.80% (p=0.008 n=5+5) This very slightly improves the go1 benchmarks: name old time/op new time/op delta BinaryTree17-12 2.62s ± 3% 2.62s ± 4% ~ (p=0.904 n=20+20) Fannkuch11-12 2.15s ± 1% 2.13s ± 0% -1.29% (p=0.000 n=18+20) FmtFprintfEmpty-12 48.3ns ± 2% 47.6ns ± 1% -1.52% (p=0.000 n=20+16) FmtFprintfString-12 152ns ± 0% 152ns ± 1% ~ (p=0.725 n=18+18) FmtFprintfInt-12 150ns ± 1% 149ns ± 1% -1.14% (p=0.000 n=19+20) FmtFprintfIntInt-12 250ns ± 0% 244ns ± 1% -2.12% (p=0.000 n=20+18) FmtFprintfPrefixedInt-12 219ns ± 1% 217ns ± 1% -1.20% (p=0.000 n=19+20) FmtFprintfFloat-12 280ns ± 0% 281ns ± 1% +0.47% (p=0.000 n=19+19) FmtManyArgs-12 928ns ± 0% 923ns ± 1% -0.53% (p=0.000 n=19+18) GobDecode-12 7.21ms ± 1% 7.24ms ± 2% ~ (p=0.091 n=19+19) GobEncode-12 6.07ms ± 1% 6.05ms ± 1% -0.36% (p=0.002 n=20+17) Gzip-12 265ms ± 1% 265ms ± 1% ~ (p=0.496 n=20+19) Gunzip-12 39.6ms ± 1% 39.3ms ± 1% -0.85% (p=0.000 n=19+19) HTTPClientServer-12 74.0µs ± 2% 73.8µs ± 1% ~ (p=0.569 n=20+19) JSONEncode-12 15.4ms ± 1% 15.3ms ± 1% -0.25% (p=0.049 n=17+17) JSONDecode-12 53.7ms ± 2% 53.0ms ± 1% -1.29% (p=0.000 n=18+17) Mandelbrot200-12 3.97ms ± 1% 3.97ms ± 0% ~ (p=0.072 n=17+18) GoParse-12 3.35ms ± 2% 3.36ms ± 1% +0.51% (p=0.005 n=18+20) RegexpMatchEasy0_32-12 72.7ns ± 2% 72.2ns ± 1% -0.70% (p=0.005 n=19+19) RegexpMatchEasy0_1K-12 246ns ± 1% 245ns ± 0% -0.60% (p=0.000 n=18+16) RegexpMatchEasy1_32-12 72.8ns ± 1% 72.5ns ± 1% -0.37% (p=0.011 n=18+18) RegexpMatchEasy1_1K-12 380ns ± 1% 385ns ± 1% +1.34% (p=0.000 n=20+19) RegexpMatchMedium_32-12 115ns ± 2% 115ns ± 1% +0.44% (p=0.047 n=20+20) RegexpMatchMedium_1K-12 35.4µs ± 1% 35.5µs ± 1% ~ (p=0.079 n=18+19) RegexpMatchHard_32-12 1.83µs ± 0% 1.80µs ± 1% -1.76% (p=0.000 n=18+18) RegexpMatchHard_1K-12 55.1µs ± 0% 54.3µs ± 1% -1.42% (p=0.000 n=18+19) Revcomp-12 386ms ± 1% 381ms ± 1% -1.14% (p=0.000 n=18+18) Template-12 61.5ms ± 2% 61.5ms ± 2% ~ (p=0.647 n=19+20) TimeParse-12 338ns ± 0% 336ns ± 1% -0.72% (p=0.000 n=14+19) TimeFormat-12 350ns ± 0% 357ns ± 0% +2.05% (p=0.000 n=19+18) [Geo mean] 55.3µs 55.0µs -0.41% Change-Id: I57e8720385a1b991aeebd111b6874354308e2a6b Reviewed-on: https://go-review.googlesource.com/20829 Run-TryBot: Austin Clements <austin@google.com> Reviewed-by: Rick Hudson <rlh@golang.org>
2016-03-18 15:27:59 +00:00
y21 = struct{ x *int }{x} // ERROR "write barrier"
}
func f22(x *int) (y *int) {
// pointer write on stack should have no write barrier.
// this is a case that the frontend failed to eliminate.
p := &y
*p = x // no barrier
return
}
type T23 struct {
p *int
a int
}
var t23 T23
var i23 int
// f23x: zeroing global needs write barrier for the hybrid barrier.
func f23a() {
t23 = T23{} // ERROR "write barrier"
}
func f23b() {
// also test partial assignments
t23 = T23{a: 1} // ERROR "write barrier"
}
func f23c() {
t23 = T23{} // no barrier (dead store)
// also test partial assignments
t23 = T23{p: &i23} // ERROR "write barrier"
}
var g int
func f24() **int {
p := new(*int)
*p = &g // no write barrier here
return p
}
func f25() []string {
return []string{"abc", "def", "ghi"} // no write barrier here
}
type T26 struct {
a, b, c int
d, e, f *int
}
var g26 int
func f26(p *int) *T26 { // see issue 29573
return &T26{
a: 5,
b: 6,
c: 7,
d: &g26, // no write barrier: global ptr
e: nil, // no write barrier: nil ptr
f: p, // ERROR "write barrier"
}
}
func f27(p *int) []interface{} {
return []interface{}{
nil, // no write barrier: zeroed memory, nil ptr
(*T26)(nil), // no write barrier: zeroed memory, type ptr & nil ptr
&g26, // no write barrier: zeroed memory, type ptr & global ptr
7, // no write barrier: zeroed memory, type ptr & global ptr
p, // ERROR "write barrier"
}
}