sync: allow inlining the Mutex.Unlock fast path

Make use of the newly-enabled limited midstack inlining.
Similar changes will be done in followup CLs.

name                    old time/op  new time/op  delta
MutexUncontended        19.3ns ± 1%  18.9ns ± 0%   -1.92%  (p=0.000 n=20+19)
MutexUncontended-4      5.24ns ± 0%  4.75ns ± 1%   -9.25%  (p=0.000 n=20+20)
MutexUncontended-16     2.10ns ± 0%  2.05ns ± 0%   -2.38%  (p=0.000 n=15+19)
Mutex                   19.6ns ± 0%  19.3ns ± 1%   -1.92%  (p=0.000 n=20+17)
Mutex-4                 54.6ns ± 5%  52.4ns ± 4%   -4.09%  (p=0.000 n=20+20)
Mutex-16                 133ns ± 5%   139ns ± 2%   +4.23%  (p=0.000 n=20+16)
MutexSlack              33.4ns ± 2%  18.9ns ± 1%  -43.56%  (p=0.000 n=19+20)
MutexSlack-4             206ns ± 5%   225ns ± 8%   +9.12%  (p=0.000 n=20+18)
MutexSlack-16           89.4ns ± 1%  98.4ns ± 1%  +10.10%  (p=0.000 n=18+17)
MutexWork               60.5ns ± 0%  58.2ns ± 3%   -3.75%  (p=0.000 n=12+20)
MutexWork-4              105ns ± 5%   103ns ± 7%   -1.68%  (p=0.007 n=20+20)
MutexWork-16             157ns ± 1%   163ns ± 2%   +3.90%  (p=0.000 n=18+18)
MutexWorkSlack          70.2ns ± 5%  57.7ns ± 1%  -17.81%  (p=0.000 n=19+20)
MutexWorkSlack-4         277ns ±13%   276ns ±13%     ~     (p=0.682 n=20+19)
MutexWorkSlack-16        156ns ± 0%   147ns ± 0%   -5.62%  (p=0.000 n=16+14)
MutexNoSpin              966ns ± 0%   968ns ± 0%   +0.11%  (p=0.029 n=15+20)
MutexNoSpin-4            269ns ± 4%   270ns ± 2%     ~     (p=0.807 n=20+19)
MutexNoSpin-16           122ns ± 0%   120ns ± 4%   -1.63%  (p=0.000 n=19+19)
MutexSpin               3.13µs ± 0%  3.13µs ± 1%   +0.16%  (p=0.004 n=18+20)
MutexSpin-4              826ns ± 1%   832ns ± 2%   +0.74%  (p=0.000 n=19+16)
MutexSpin-16             397ns ± 1%   395ns ± 0%   -0.50%  (p=0.000 n=19+17)
RWMutexUncontended      71.4ns ± 0%  69.5ns ± 0%   -2.72%  (p=0.000 n=16+20)
RWMutexUncontended-4    18.4ns ± 4%  17.5ns ± 0%   -4.92%  (p=0.000 n=20+18)
RWMutexUncontended-16   8.01ns ± 0%  7.92ns ± 0%   -1.15%  (p=0.000 n=18+18)
RWMutexWrite100         24.9ns ± 0%  24.9ns ± 1%     ~     (p=0.099 n=19+20)
RWMutexWrite100-4       46.5ns ± 3%  46.2ns ± 4%     ~     (p=0.253 n=17+19)
RWMutexWrite100-16      68.9ns ± 3%  69.9ns ± 5%   +1.46%  (p=0.012 n=18+20)
RWMutexWrite10          27.1ns ± 0%  27.0ns ± 2%     ~     (p=0.128 n=17+20)
RWMutexWrite10-4        34.8ns ± 1%  34.7ns ± 2%     ~     (p=0.180 n=20+18)
RWMutexWrite10-16       37.5ns ± 2%  37.2ns ± 4%   -0.89%  (p=0.023 n=20+20)
RWMutexWorkWrite100      164ns ± 0%   164ns ± 0%     ~     (p=0.106 n=12+20)
RWMutexWorkWrite100-4    186ns ± 3%   193ns ± 3%   +3.46%  (p=0.000 n=20+20)
RWMutexWorkWrite100-16   204ns ± 2%   210ns ± 3%   +2.96%  (p=0.000 n=18+20)
RWMutexWorkWrite10       153ns ± 0%   153ns ± 0%   -0.20%  (p=0.017 n=20+19)
RWMutexWorkWrite10-4     179ns ± 1%   178ns ± 2%     ~     (p=0.215 n=19+20)
RWMutexWorkWrite10-16    191ns ± 1%   192ns ± 2%     ~     (p=0.166 n=15+19)

linux/amd64 bin/go 14630572 (previous commit 14605947, +24625/+0.17%)

Change-Id: I3f9d1765801fe0b8deb1bc2728b8bba8a7508e23
Reviewed-on: https://go-review.googlesource.com/c/go/+/148958
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
Carlo Alberto Ferraris 2018-11-10 08:28:44 +09:00 committed by Brad Fitzpatrick
parent 94cbfc2f7f
commit 4c3f26076b
8 changed files with 50 additions and 16 deletions

View file

@ -62,8 +62,8 @@ func poll_runtime_Semacquire(addr *uint32) {
}
//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
func sync_runtime_Semrelease(addr *uint32, handoff bool) {
semrelease1(addr, handoff)
func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) {
semrelease1(addr, handoff, skipframes)
}
//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
@ -153,10 +153,10 @@ func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags) {
}
func semrelease(addr *uint32) {
semrelease1(addr, false)
semrelease1(addr, false, 0)
}
func semrelease1(addr *uint32, handoff bool) {
func semrelease1(addr *uint32, handoff bool, skipframes int) {
root := semroot(addr)
atomic.Xadd(addr, 1)
@ -183,7 +183,7 @@ func semrelease1(addr *uint32, handoff bool) {
if s != nil { // May be slow, so unlock first
acquiretime := s.acquiretime
if acquiretime != 0 {
mutexevent(t0-acquiretime, 3)
mutexevent(t0-acquiretime, 3+skipframes)
}
if s.ticket != 0 {
throw("corrupted semaphore ticket")
@ -191,7 +191,7 @@ func semrelease1(addr *uint32, handoff bool) {
if handoff && cansemacquire(addr) {
s.ticket = 1
}
readyWithTime(s, 5)
readyWithTime(s, 5+skipframes)
}
}

View file

@ -180,6 +180,14 @@ func (m *Mutex) Unlock() {
// Fast path: drop lock bit.
new := atomic.AddInt32(&m.state, -mutexLocked)
if new != 0 {
// Outlined slow path to allow inlining the fast path.
// To hide unlockSlow during tracing we skip one extra frame when tracing GoUnblock.
m.unlockSlow(new)
}
}
func (m *Mutex) unlockSlow(new int32) {
if (new+mutexLocked)&mutexLocked == 0 {
throw("sync: unlock of unlocked mutex")
}
@ -198,7 +206,7 @@ func (m *Mutex) Unlock() {
// Grab the right to wake someone.
new = (old - 1<<mutexWaiterShift) | mutexWoken
if atomic.CompareAndSwapInt32(&m.state, old, new) {
runtime_Semrelease(&m.sema, false)
runtime_Semrelease(&m.sema, false, 1)
return
}
old = m.state
@ -208,6 +216,6 @@ func (m *Mutex) Unlock() {
// Note: mutexLocked is not set, the waiter will set it after wakeup.
// But mutex is still considered locked if mutexStarving is set,
// so new coming goroutines won't acquire it.
runtime_Semrelease(&m.sema, true)
runtime_Semrelease(&m.sema, true, 1)
}
}

View file

@ -21,7 +21,7 @@ import (
func HammerSemaphore(s *uint32, loops int, cdone chan bool) {
for i := 0; i < loops; i++ {
Runtime_Semacquire(s)
Runtime_Semrelease(s, false)
Runtime_Semrelease(s, false, 0)
}
cdone <- true
}

View file

@ -22,7 +22,9 @@ func runtime_SemacquireMutex(s *uint32, lifo bool)
// It is intended as a simple wakeup primitive for use by the synchronization
// library and should not be used directly.
// If handoff is true, pass count directly to the first waiter.
func runtime_Semrelease(s *uint32, handoff bool)
// skipframes is the number of frames to omit during tracing, counting from
// runtime_Semrelease's caller.
func runtime_Semrelease(s *uint32, handoff bool, skipframes int)
// Approximation of notifyList in runtime/sema.go. Size and alignment must
// agree.

View file

@ -18,7 +18,7 @@ func BenchmarkSemaUncontended(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
sem := new(PaddedSem)
for pb.Next() {
Runtime_Semrelease(&sem.sem, false)
Runtime_Semrelease(&sem.sem, false, 0)
Runtime_Semacquire(&sem.sem)
}
})
@ -44,7 +44,7 @@ func benchmarkSema(b *testing.B, block, work bool) {
b.RunParallel(func(pb *testing.PB) {
foo := 0
for pb.Next() {
Runtime_Semrelease(&sem, false)
Runtime_Semrelease(&sem, false, 0)
if work {
for i := 0; i < 100; i++ {
foo *= 2
@ -54,7 +54,7 @@ func benchmarkSema(b *testing.B, block, work bool) {
Runtime_Semacquire(&sem)
}
_ = foo
Runtime_Semrelease(&sem, false)
Runtime_Semrelease(&sem, false, 0)
})
}

View file

@ -73,7 +73,7 @@ func (rw *RWMutex) RUnlock() {
// A writer is pending.
if atomic.AddInt32(&rw.readerWait, -1) == 0 {
// The last reader unblocks the writer.
runtime_Semrelease(&rw.writerSem, false)
runtime_Semrelease(&rw.writerSem, false, 0)
}
}
if race.Enabled {
@ -125,7 +125,7 @@ func (rw *RWMutex) Unlock() {
}
// Unblock blocked readers, if any.
for i := 0; i < int(r); i++ {
runtime_Semrelease(&rw.readerSem, false)
runtime_Semrelease(&rw.readerSem, false, 0)
}
// Allow other writers to proceed.
rw.w.Unlock()

View file

@ -90,7 +90,7 @@ func (wg *WaitGroup) Add(delta int) {
// Reset waiters count to 0.
*statep = 0
for ; w != 0; w-- {
runtime_Semrelease(semap, false)
runtime_Semrelease(semap, false, 0)
}
}

24
test/inline_sync.go Normal file
View file

@ -0,0 +1,24 @@
// +build !nacl,!386
// errorcheck -0 -m
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test, using compiler diagnostic flags, that inlining of functions
// imported from the sync package is working.
// Compiles but does not run.
// FIXME: nacl-386 is excluded as inlining currently does not work there.
package foo
import (
"sync"
)
var mutex *sync.Mutex
func small5() { // ERROR "can inline small5"
// the Unlock fast path should be inlined
mutex.Unlock() // ERROR "inlining call to sync\.\(\*Mutex\)\.Unlock" "&sync\.m\.state escapes to heap"
}