sync: allow inlining the Mutex.Lock fast path

name                    old time/op  new time/op  delta
MutexUncontended        18.9ns ± 0%  16.2ns ± 0%  -14.29%  (p=0.000 n=19+19)
MutexUncontended-4      4.75ns ± 1%  4.08ns ± 0%  -14.20%  (p=0.000 n=20+19)
MutexUncontended-16     2.05ns ± 0%  2.11ns ± 0%   +2.93%  (p=0.000 n=19+16)
Mutex                   19.3ns ± 1%  16.2ns ± 0%  -15.86%  (p=0.000 n=17+19)
Mutex-4                 52.4ns ± 4%  48.6ns ± 9%   -7.22%  (p=0.000 n=20+20)
Mutex-16                 139ns ± 2%   140ns ± 3%   +1.03%  (p=0.011 n=16+20)
MutexSlack              18.9ns ± 1%  16.2ns ± 1%  -13.96%  (p=0.000 n=20+20)
MutexSlack-4             225ns ± 8%   211ns ±10%   -5.94%  (p=0.000 n=18+19)
MutexSlack-16           98.4ns ± 1%  90.9ns ± 1%   -7.60%  (p=0.000 n=17+18)
MutexWork               58.2ns ± 3%  55.4ns ± 0%   -4.82%  (p=0.000 n=20+17)
MutexWork-4              103ns ± 7%    95ns ±18%   -8.03%  (p=0.000 n=20+20)
MutexWork-16             163ns ± 2%   155ns ± 2%   -4.47%  (p=0.000 n=18+18)
MutexWorkSlack          57.7ns ± 1%  55.4ns ± 0%   -3.99%  (p=0.000 n=20+13)
MutexWorkSlack-4         276ns ±13%   260ns ±10%   -5.64%  (p=0.001 n=19+19)
MutexWorkSlack-16        147ns ± 0%   156ns ± 1%   +5.87%  (p=0.000 n=14+19)
MutexNoSpin              968ns ± 0%   900ns ± 1%   -6.98%  (p=0.000 n=20+18)
MutexNoSpin-4            270ns ± 2%   255ns ± 2%   -5.74%  (p=0.000 n=19+20)
MutexNoSpin-16           120ns ± 4%   112ns ± 0%   -6.99%  (p=0.000 n=19+14)
MutexSpin               3.13µs ± 1%  3.19µs ± 6%     ~     (p=0.401 n=20+20)
MutexSpin-4              832ns ± 2%   831ns ± 1%   -0.17%  (p=0.023 n=16+18)
MutexSpin-16             395ns ± 0%   399ns ± 0%   +0.94%  (p=0.000 n=17+19)
RWMutexUncontended      69.5ns ± 0%  68.4ns ± 0%   -1.59%  (p=0.000 n=20+20)
RWMutexUncontended-4    17.5ns ± 0%  16.7ns ± 0%   -4.30%  (p=0.000 n=18+17)
RWMutexUncontended-16   7.92ns ± 0%  7.87ns ± 0%   -0.61%  (p=0.000 n=18+17)
RWMutexWrite100         24.9ns ± 1%  25.0ns ± 1%   +0.32%  (p=0.000 n=20+20)
RWMutexWrite100-4       46.2ns ± 4%  46.2ns ± 5%     ~     (p=0.840 n=19+20)
RWMutexWrite100-16      69.9ns ± 5%  69.9ns ± 3%     ~     (p=0.545 n=20+19)
RWMutexWrite10          27.0ns ± 2%  26.8ns ± 2%   -0.98%  (p=0.001 n=20+20)
RWMutexWrite10-4        34.7ns ± 2%  35.0ns ± 4%     ~     (p=0.191 n=18+20)
RWMutexWrite10-16       37.2ns ± 4%  37.3ns ± 2%     ~     (p=0.438 n=20+19)
RWMutexWorkWrite100      164ns ± 0%   163ns ± 0%   -0.24%  (p=0.025 n=20+20)
RWMutexWorkWrite100-4    193ns ± 3%   191ns ± 2%   -1.06%  (p=0.027 n=20+20)
RWMutexWorkWrite100-16   210ns ± 3%   207ns ± 3%   -1.22%  (p=0.038 n=20+20)
RWMutexWorkWrite10       153ns ± 0%   153ns ± 0%     ~     (all equal)
RWMutexWorkWrite10-4     178ns ± 2%   179ns ± 2%     ~     (p=0.186 n=20+20)
RWMutexWorkWrite10-16    192ns ± 2%   192ns ± 2%     ~     (p=0.731 n=19+20)

linux/amd64 bin/go 14663387 (previous commit 14630572, +32815/+0.22%)

Change-Id: I98171006dce14069b1a62da07c3d165455a7906b
Reviewed-on: https://go-review.googlesource.com/c/go/+/148959
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
This commit is contained in:
Carlo Alberto Ferraris 2018-11-09 22:49:38 +09:00 committed by Brad Fitzpatrick
parent 83a33d3855
commit 41cb0aedff
5 changed files with 28 additions and 13 deletions

View file

@ -53,12 +53,12 @@ var semtable [semTabSize]struct {
//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
func sync_runtime_Semacquire(addr *uint32) {
semacquire1(addr, false, semaBlockProfile)
semacquire1(addr, false, semaBlockProfile, 0)
}
//go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire
func poll_runtime_Semacquire(addr *uint32) {
semacquire1(addr, false, semaBlockProfile)
semacquire1(addr, false, semaBlockProfile, 0)
}
//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
@ -67,8 +67,8 @@ func sync_runtime_Semrelease(addr *uint32, handoff bool, skipframes int) {
}
//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
func sync_runtime_SemacquireMutex(addr *uint32, lifo bool) {
semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile)
func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) {
semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes)
}
//go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease
@ -92,10 +92,10 @@ const (
// Called from runtime.
func semacquire(addr *uint32) {
semacquire1(addr, false, 0)
semacquire1(addr, false, 0, 0)
}
func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags) {
func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int) {
gp := getg()
if gp != gp.m.curg {
throw("semacquire not on the G stack")
@ -141,13 +141,13 @@ func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags) {
// Any semrelease after the cansemacquire knows we're waiting
// (we set nwait above), so go to sleep.
root.queue(addr, s, lifo)
goparkunlock(&root.lock, waitReasonSemacquire, traceEvGoBlockSync, 4)
goparkunlock(&root.lock, waitReasonSemacquire, traceEvGoBlockSync, 4+skipframes)
if s.ticket != 0 || cansemacquire(addr) {
break
}
}
if s.releasetime > 0 {
blockevent(s.releasetime-t0, 3)
blockevent(s.releasetime-t0, 3+skipframes)
}
releaseSudog(s)
}

View file

@ -77,7 +77,11 @@ func (m *Mutex) Lock() {
}
return
}
// Slow path (outlined so that the fast path can be inlined)
m.lockSlow()
}
func (m *Mutex) lockSlow() {
var waitStartTime int64
starving := false
awoke := false
@ -131,7 +135,7 @@ func (m *Mutex) Lock() {
if waitStartTime == 0 {
waitStartTime = runtime_nanotime()
}
runtime_SemacquireMutex(&m.sema, queueLifo)
runtime_SemacquireMutex(&m.sema, queueLifo, 1)
starving = starving || runtime_nanotime()-waitStartTime > starvationThresholdNs
old = m.state
if old&mutexStarving != 0 {

View file

@ -15,7 +15,9 @@ func runtime_Semacquire(s *uint32)
// SemacquireMutex is like Semacquire, but for profiling contended Mutexes.
// If lifo is true, queue waiter at the head of wait queue.
func runtime_SemacquireMutex(s *uint32, lifo bool)
// skipframes is the number of frames to omit during tracing, counting from
// runtime_SemacquireMutex's caller.
func runtime_SemacquireMutex(s *uint32, lifo bool, skipframes int)
// Semrelease atomically increments *s and notifies a waiting goroutine
// if one is blocked in Semacquire.

View file

@ -47,7 +47,7 @@ func (rw *RWMutex) RLock() {
}
if atomic.AddInt32(&rw.readerCount, 1) < 0 {
// A writer is pending, wait for it.
runtime_SemacquireMutex(&rw.readerSem, false)
runtime_SemacquireMutex(&rw.readerSem, false, 0)
}
if race.Enabled {
race.Enable()
@ -95,7 +95,7 @@ func (rw *RWMutex) Lock() {
r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
// Wait for active readers.
if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 {
runtime_SemacquireMutex(&rw.writerSem, false)
runtime_SemacquireMutex(&rw.writerSem, false, 0)
}
if race.Enabled {
race.Enable()

View file

@ -8,7 +8,11 @@
// Test, using compiler diagnostic flags, that inlining of functions
// imported from the sync package is working.
// Compiles but does not run.
// FIXME: nacl-386 is excluded as inlining currently does not work there.
// FIXME: This test is disabled on architectures where atomic operations
// are function calls rather than intrinsics, since this prevents inlining
// of the sync fast paths. This test should be re-enabled once the problem
// is solved.
package foo
@ -22,3 +26,8 @@ func small5() { // ERROR "can inline small5"
// the Unlock fast path should be inlined
mutex.Unlock() // ERROR "inlining call to sync\.\(\*Mutex\)\.Unlock" "&sync\.m\.state escapes to heap"
}
func small6() { // ERROR "can inline small6"
// the Lock fast path should be inlined
mutex.Lock() // ERROR "inlining call to sync\.\(\*Mutex\)\.Lock" "&sync\.m\.state escapes to heap"
}