runtime: double-link list of waiting Ms

When an M unlocks a contended mutex, it needs to consult a list of the
Ms that had to wait during its critical section. This allows the M to
attribute the appropriate amount of blame to the unlocking call stack.

Mirroring the implementation for users' sync.Mutex contention (via
sudog), we can (in a future commit) use the time that the head and tail
of the wait list started waiting, and the number of waiters, to estimate
the sum of the Ms' delays.
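
As a rough sketch of how that estimate could work (the exact formula is
left to that future commit; the helper below is hypothetical and only
assumes the waiters' start times are spread evenly between the head's
and the tail's):

	// estimateTotalDelay is an illustrative sketch only. The head is the
	// oldest waiter and the tail is the newest; if the other waiters'
	// start times are spread roughly evenly between those two, the
	// average delay is the mean of the two extremes.
	func estimateTotalDelay(now, headStart, tailStart int64, waiters int32) int64 {
		headDelay := now - headStart // longest individual wait
		tailDelay := now - tailStart // shortest individual wait
		return (headDelay + tailDelay) / 2 * int64(waiters)
	}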

When an M acquires the mutex, it needs to remove itself from the list of
waiters. Since the futex-based lock implementation leaves the OS in
control of the order of M wakeups, we need to be prepared to remove
any M from the list quickly (in constant time).

First, have each M add itself to a singly-linked wait list when it finds
that its lock call will need to sleep. This case is safe against
live-lock, since any delay to one M adding itself to the list would be
due to another M making durable progress.

Second, have the M that holds the lock (either right before releasing,
or right after acquiring) update metadata on the list of waiting Ms to
double-link the list and maintain a tail pointer and waiter count. That
work is amortized-constant, which keeps contended locks from becoming
proportionally more contended and collapsing in performance.
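
A standalone sketch of that second phase, using hypothetical names (the
real version in this commit is fixMutexWaitList, which operates on Ms
linked through muintptr fields):

	type waiter struct {
		next, prev, tail *waiter
		waiters          int
	}

	// fixList walks from the head, setting prev links, and stops as soon
	// as it reaches a node that already carries a tail pointer -- a part
	// of the list that an earlier pass repaired. Each node is visited by
	// such a pass at most once after it is enqueued, so the total work is
	// proportional to the number of enqueues: amortized constant per
	// lock/unlock pair.
	func fixList(head *waiter) {
		if head == nil {
			return
		}
		node, count := head, 0
		var tail *waiter
		for {
			if node.tail != nil { // already-repaired suffix: reuse its metadata
				tail = node.tail
				count += node.waiters
				break
			}
			count++
			if node.next == nil {
				break
			}
			node.next.prev = node
			node = node.next
		}
		if tail == nil {
			tail = node
		}
		head.tail, head.waiters = tail, count
	}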

For #66999

Change-Id: If75cdea915afb59ccec47294e0b52c466aac8736
Reviewed-on: https://go-review.googlesource.com/c/go/+/585637
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Rhys Hiltner <rhys.hiltner@gmail.com>
Rhys Hiltner 2024-05-13 13:00:52 -07:00 committed by Gopher Robot
parent dfb7073bb8
commit d881ed6384
3 changed files with 324 additions and 25 deletions


@@ -37,6 +37,13 @@ const (
// independently: a thread can enter lock2, observe that another thread is
// already asleep, and immediately try to grab the lock anyway without waiting
// for its "fair" turn.
//
// The rest of mutex.key holds a pointer to the head of a linked list of the Ms
// that are waiting for the mutex. The pointer portion is set if and only if the
// mutex_sleeping flag is set. Because the futex syscall operates on 32 bits but
// a uintptr may be larger, the flag lets us be sure the futexsleep call will
// only commit if the pointer portion is unset. Otherwise an M allocated at an
// address like 0x123_0000_0000 might miss its wakeups.
// We use the uintptr mutex.key and note.key as a uint32.
//
@@ -67,18 +74,53 @@ func lock2(l *mutex) {
timer := &lockTimer{lock: l}
timer.begin()
// If a goroutine's stack needed to grow during a lock2 call, the M could
// end up with two active lock2 calls (one each on curg and g0). If both are
// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
stackguard0, throwsplit := gp.stackguard0, gp.throwsplit
if gp == gp.m.curg {
gp.stackguard0, gp.throwsplit = stackPreempt, true
}
// On uniprocessors, no point spinning.
// On multiprocessors, spin for ACTIVE_SPIN attempts.
spin := 0
if ncpu > 1 {
spin = active_spin
}
var enqueued bool
Loop:
for i := 0; ; i++ {
v := atomic.Loaduintptr(&l.key)
if v&mutex_locked == 0 {
// Unlocked. Try to lock.
if atomic.Casuintptr(&l.key, v, v|mutex_locked) {
// We now own the mutex
v = v | mutex_locked
for {
old := v
head := muintptr(v &^ (mutex_sleeping | mutex_locked))
fixMutexWaitList(head)
if enqueued {
head = removeMutexWaitList(head, gp.m)
}
v = mutex_locked
if head != 0 {
v = v | uintptr(head) | mutex_sleeping
}
if v == old || atomic.Casuintptr(&l.key, old, v) {
gp.m.mWaitList.clearLinks()
break
}
v = atomic.Loaduintptr(&l.key)
}
if gp == gp.m.curg {
gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
}
timer.end()
return
}
@@ -90,21 +132,28 @@ Loop:
osyield()
} else {
// Someone else has it.
// l->key points to a linked list of M's waiting
// for this lock, chained through m->mWaitList.next.
// Queue this M.
for {
head := v &^ (mutex_locked | mutex_sleeping)
if !enqueued {
gp.m.mWaitList.next = muintptr(head)
head = uintptr(unsafe.Pointer(gp.m))
if atomic.Casuintptr(&l.key, v, head|mutex_locked|mutex_sleeping) {
enqueued = true
break
}
gp.m.mWaitList.next = 0
}
v = atomic.Loaduintptr(&l.key)
if v&mutex_locked == 0 {
continue Loop
}
}
if v&mutex_locked != 0 {
// Queued. Wait.
futexsleep(key32(&l.key), uint32(v), -1)
i = 0
}
}
}
}


@@ -54,18 +54,49 @@ func lock2(l *mutex) {
timer := &lockTimer{lock: l}
timer.begin()
// If a goroutine's stack needed to grow during a lock2 call, the M could
// end up with two active lock2 calls (one each on curg and g0). If both are
// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
stackguard0, throwsplit := gp.stackguard0, gp.throwsplit
if gp == gp.m.curg {
gp.stackguard0, gp.throwsplit = stackPreempt, true
}
// On uniprocessors, no point spinning.
// On multiprocessors, spin for ACTIVE_SPIN attempts.
spin := 0
if ncpu > 1 {
spin = active_spin
}
var enqueued bool
Loop:
for i := 0; ; i++ {
v := atomic.Loaduintptr(&l.key)
if v&locked == 0 {
// Unlocked. Try to lock.
if atomic.Casuintptr(&l.key, v, v|locked) {
// We now own the mutex
v = v | locked
for {
old := v
head := muintptr(v &^ locked)
fixMutexWaitList(head)
if enqueued {
head = removeMutexWaitList(head, gp.m)
}
v = locked | uintptr(head)
if v == old || atomic.Casuintptr(&l.key, old, v) {
gp.m.mWaitList.clearLinks()
break
}
v = atomic.Loaduintptr(&l.key)
}
if gp == gp.m.curg {
gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
}
timer.end()
return
}
@@ -81,20 +112,29 @@ Loop:
// for this lock, chained through m.mWaitList.next.
// Queue this M.
for {
if !enqueued {
gp.m.mWaitList.next = muintptr(v &^ locked)
if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) {
enqueued = true
break
}
gp.m.mWaitList.next = 0
}
v = atomic.Loaduintptr(&l.key)
if v&locked == 0 {
continue Loop
}
}
if v&locked != 0 {
// Queued. Wait.
semasleep(-1)
i = 0
}
enqueued = false
// unlock2 removed this M from the list (it was at the head). We
// need to erase the metadata about its former position in the
// list -- and since it's no longer a published member we can do
// so without races.
gp.m.mWaitList.clearLinks()
}
}
}


@@ -572,6 +572,215 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
releasem(mp)
}
// mWaitList is part of the M struct, and holds the list of Ms that are waiting
// for a particular runtime.mutex.
//
// When an M is unable to immediately obtain a mutex, it notes the current time
// and it adds itself to the list of Ms waiting for the mutex. It does that via
// this struct's next field, forming a singly-linked list with the mutex's key
// field pointing to the head of the list.
//
// Immediately before releasing the mutex, the previous holder calculates how
// much delay it caused for the Ms that had to wait. First, it sets the prev
// links of each node in the list -- starting at the head and continuing until
// it finds the portion of the list that is already doubly linked. That part of
// the list also has correct values for the tail pointer and the waiters count,
// which we'll apply to the head of the wait list. This is amortized-constant
// work, though it takes place within the critical section of the contended
// mutex.
//
// Having found the head and tail nodes and a correct waiters count, the
// unlocking M can read and update those two nodes' acquireTimes fields and thus
// take responsibility for (an estimate of) the entire list's delay since the
// last unlock call.
//
// Finally, the M that is then able to acquire the mutex needs to remove itself
// from the list of waiters. This is simpler than with many lock-free linked
// lists, since deletion here is guarded by the mutex itself. If the M's prev
// field isn't set and the M isn't at the head of the list, it does the same
// amortized-constant double-linking as in unlock, enabling quick deletion
// regardless of where the M is in the list. Note that with lock_sema.go the
// runtime controls the order of thread wakeups (it's a LIFO stack), but with
// lock_futex.go the OS can wake an arbitrary thread.
type mWaitList struct {
acquireTimes timePair // start of current wait (set by us, updated by others during unlock)
next muintptr // next m waiting for lock (set by us, cleared by another during unlock)
prev muintptr // previous m waiting for lock (an amortized hint, set by another during unlock)
tail muintptr // final m waiting for lock (an amortized hint, set by others during unlock)
waiters int32 // length of waiting m list (an amortized hint, set by another during unlock)
}
type timePair struct {
nanotime int64
cputicks int64
}
// clearLinks resets the fields related to the M's position in the list of Ms
// waiting for a mutex. It leaves acquireTimes intact, since this M may still be
// waiting and may have had its acquireTimes updated by an unlock2 call.
//
// In lock_sema.go, the previous owner of the mutex dequeues an M and then wakes
// it; with semaphore-based sleep, it's important that each M receives only one
// wakeup for each time they sleep. If the dequeued M fails to obtain the lock,
// it will need to sleep again -- and may have a different position in the list.
//
// With lock_futex.go, each thread is responsible for removing itself from the
// list, upon securing ownership of the mutex.
//
// Called while stack splitting is disabled in lock2.
//
//go:nosplit
func (l *mWaitList) clearLinks() {
l.next = 0
l.prev = 0
l.tail = 0
l.waiters = 0
}
// verifyMutexWaitList instructs fixMutexWaitList to confirm that the mutex wait
// list invariants are intact. Operations on the list are typically
// amortized-constant; but when active, these extra checks require visiting
// every other M that is waiting for the lock.
const verifyMutexWaitList = false
// fixMutexWaitList restores the invariants of the linked list of Ms waiting for
// a particular mutex.
//
// It takes as an argument the pointer bits of the mutex's key. (The caller is
// responsible for clearing flag values.)
//
// On return, the list will be doubly-linked, and the head of the list (if not
// nil) will point to an M where mWaitList.tail points to the end of the linked
// list and where mWaitList.waiters is the number of Ms in the list.
//
// The caller must hold the mutex that the Ms of the list are waiting to
// acquire.
//
// Called while stack splitting is disabled in lock2.
//
//go:nosplit
func fixMutexWaitList(head muintptr) {
if head == 0 {
return
}
hp := head.ptr()
node := hp
var waiters int32
var tail *m
for {
// For amortized-constant cost, stop searching once we reach part of the
// list that's been visited before. Identify it by the presence of a
// tail pointer.
if node.mWaitList.tail.ptr() != nil {
tail = node.mWaitList.tail.ptr()
waiters += node.mWaitList.waiters
break
}
waiters++
next := node.mWaitList.next.ptr()
if next == nil {
break
}
next.mWaitList.prev.set(node)
node = next
}
if tail == nil {
tail = node
}
hp.mWaitList.tail.set(tail)
hp.mWaitList.waiters = waiters
if verifyMutexWaitList {
var revisit int32
var reTail *m
for node := hp; node != nil; node = node.mWaitList.next.ptr() {
revisit++
reTail = node
}
if revisit != waiters {
throw("miscounted mutex waiters")
}
if reTail != tail {
throw("incorrect mutex wait list tail")
}
}
}
// removeMutexWaitList removes mp from the list of Ms waiting for a particular
// mutex. It relies on (and keeps up to date) the invariants that
// fixMutexWaitList establishes and repairs.
//
// It modifies the nodes that are to remain in the list. It returns the value to
// assign as the head of the list, with the caller responsible for ensuring that
// the (atomic, contended) head assignment worked and subsequently clearing the
// list-related fields of mp.
//
// The only change it makes to mp is to clear the tail field -- so a subsequent
// call to fixMutexWaitList will be able to re-establish the prev link from its
// next node (just in time for another removeMutexWaitList call to clear it
// again).
//
// The caller must hold the mutex that the Ms of the list are waiting to
// acquire.
//
// Called while stack splitting is disabled in lock2.
//
//go:nosplit
func removeMutexWaitList(head muintptr, mp *m) muintptr {
if head == 0 {
return 0
}
hp := head.ptr()
tail := hp.mWaitList.tail
waiters := hp.mWaitList.waiters
headTimes := hp.mWaitList.acquireTimes
tailTimes := hp.mWaitList.tail.ptr().mWaitList.acquireTimes
mp.mWaitList.tail = 0
if head.ptr() == mp {
// mp is the head
if mp.mWaitList.prev.ptr() != nil {
throw("removeMutexWaitList node at head of list, but has prev field set")
}
head = mp.mWaitList.next
} else {
// mp is not the head
if mp.mWaitList.prev.ptr() == nil {
throw("removeMutexWaitList node not in list (not at head, no prev pointer)")
}
mp.mWaitList.prev.ptr().mWaitList.next = mp.mWaitList.next
if tail.ptr() == mp {
// mp is the tail
if mp.mWaitList.next.ptr() != nil {
throw("removeMutexWaitList node at tail of list, but has next field set")
}
tail = mp.mWaitList.prev
} else {
if mp.mWaitList.next.ptr() == nil {
throw("removeMutexWaitList node in body of list, but without next field set")
}
mp.mWaitList.next.ptr().mWaitList.prev = mp.mWaitList.prev
}
}
// head and tail nodes are responsible for having current versions of
// certain metadata
if hp := head.ptr(); hp != nil {
hp.mWaitList.prev = 0
hp.mWaitList.tail = tail
hp.mWaitList.waiters = waiters - 1
hp.mWaitList.acquireTimes = headTimes
}
if tp := tail.ptr(); tp != nil {
tp.mWaitList.acquireTimes = tailTimes
}
return head
}
// lockTimer assists with profiling contention on runtime-internal locks.
//
// There are several steps between the time that an M experiences contention and
@@ -667,17 +876,18 @@ func (lt *lockTimer) end() {
}
}
// mLockProfile is part of the M struct to hold information relating to mutex
// contention delay attributed to this M.
//
// Adding records to the process-wide mutex contention profile involves
// acquiring mutexes, so the M uses this to buffer a single contention event
// until it can safely transfer it to the shared profile.
//
// When the M unlocks its last mutex, it transfers the local buffer into the
// profile. As part of that step, it also transfers any "additional contention"
// time to the profile. Any lock contention that it experiences while adding
// samples to the profile will be recorded later as "additional contention" and
// not include a call stack, to avoid an echo.
type mLockProfile struct {
waitTime atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
stack []uintptr // stack that experienced contention in runtime.lockWithRank