runtime: only acquire trace.lock on the system stack

Currently, trace.lock can be acquired while on a user G, and stack
splits can happen while holding trace.lock. That means every lock used
by the stack allocator must be okay to acquire while holding
trace.lock, including various locks related to span allocation. In
turn, we cannot safely emit trace events while holding any
allocation-related locks because this would cause a cycle in the lock
rank graph.

To fix this, require that trace.lock only be acquired on the system
stack, like mheap.lock. This pushes it into the "bottom half" and
eliminates the lock rank relationship between tracing and stack
allocation, making it safe to emit trace events in many more places.
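
The resulting pattern looks roughly like this (a minimal sketch, not
code from this CL; systemstack, lock, and unlock are the real
runtime-internal primitives, but the surrounding context is
hypothetical):

	// Sketch: every path that takes trace.lock switches to the
	// system stack first. Stack splits cannot happen there, so the
	// stack allocator's locks are never needed while trace.lock
	// is held.
	systemstack(func() {
		lock(&trace.lock)
		// ... read or update trace state ...
		unlock(&trace.lock)
	})

Functions that are always entered on the system stack are instead
marked //go:systemstack, which makes the runtime check the invariant
dynamically; call sites that may run on a user G, such as
traceEventLocked, wrap only the locking path (traceFlush) in
systemstack.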

One subtlety is that the trace code has race annotations, and it
uses maps, which also have race annotations. Race annotations
normally can't run on the system stack because g0 has no race
context, so we borrow the user G's race context for these
situations.
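
Concretely, the new readTrace0 below borrows the user G's race
context while on g0 (quoted from the diff):

	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}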

We'll update the lock graph itself in the next CL.

For #53979. This CL technically fixes the problem, but the lock rank
checker doesn't know that yet.

Change-Id: I9f5187a9c52a67bee4f7064db124b1ad53e5178f
Reviewed-on: https://go-review.googlesource.com/c/go/+/418956
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Author: Austin Clements <austin@google.com>
Date:   2022-07-21 15:43:11 -04:00
Commit: 6c2e327e35
Parent: 9923162f1c

@@ -109,6 +109,8 @@ const (
// trace is global tracing context.
var trace struct {
// trace.lock must only be acquired on the system stack where
// stack splits cannot happen while it is held.
lock mutex // protects the following members
lockOwner *g // to avoid deadlocks during recursive lock acquisition
enabled bool // when set, the runtime traces events
@@ -385,31 +387,33 @@ func StopTrace() {
raceacquire(unsafe.Pointer(&trace.shutdownSema))
}
// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
lock(&trace.lock)
for _, p := range allp[:cap(allp)] {
if p.tracebuf != 0 {
throw("trace: non-empty trace buffer in proc")
systemstack(func() {
// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
lock(&trace.lock)
for _, p := range allp[:cap(allp)] {
if p.tracebuf != 0 {
throw("trace: non-empty trace buffer in proc")
}
}
}
if trace.buf != 0 {
throw("trace: non-empty global trace buffer")
}
if trace.fullHead != 0 || trace.fullTail != 0 {
throw("trace: non-empty full trace buffer")
}
if trace.reading != 0 || trace.reader.Load() != nil {
throw("trace: reading after shutdown")
}
for trace.empty != 0 {
buf := trace.empty
trace.empty = buf.ptr().link
sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
}
trace.strings = nil
trace.shutdown = false
trace.cpuLogRead = nil
unlock(&trace.lock)
if trace.buf != 0 {
throw("trace: non-empty global trace buffer")
}
if trace.fullHead != 0 || trace.fullTail != 0 {
throw("trace: non-empty full trace buffer")
}
if trace.reading != 0 || trace.reader.Load() != nil {
throw("trace: reading after shutdown")
}
for trace.empty != 0 {
buf := trace.empty
trace.empty = buf.ptr().link
sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
}
trace.strings = nil
trace.shutdown = false
trace.cpuLogRead = nil
unlock(&trace.lock)
})
}
// ReadTrace returns the next chunk of binary tracing data, blocking until data
@@ -419,48 +423,12 @@ func StopTrace() {
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
top:
// This function may need to lock trace.lock recursively
// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
// To allow this we use trace.lockOwner.
// Also this function must not allocate while holding trace.lock:
// allocation can call heap allocate, which will try to emit a trace
// event while holding heap lock.
lock(&trace.lock)
trace.lockOwner = getg()
if trace.reader.Load() != nil {
// More than one goroutine reads trace. This is bad.
// But we rather do not crash the program because of tracing,
// because tracing can be enabled at runtime on prod servers.
trace.lockOwner = nil
unlock(&trace.lock)
println("runtime: ReadTrace called from multiple goroutines simultaneously")
return nil
}
// Recycle the old buffer.
if buf := trace.reading; buf != 0 {
buf.ptr().link = trace.empty
trace.empty = buf
trace.reading = 0
}
// Write trace header.
if !trace.headerWritten {
trace.headerWritten = true
trace.lockOwner = nil
unlock(&trace.lock)
return []byte("go 1.19 trace\x00\x00\x00")
}
// Optimistically look for CPU profile samples. This may write new stack
// records, and may write new tracing buffers.
if !trace.footerWritten && !trace.shutdown {
traceReadCPU()
}
// Wait for new data.
if trace.fullHead == 0 && !trace.shutdown {
// We don't simply use a note because the scheduler
// executes this goroutine directly when it wakes up
// (also a note would consume an M).
unlock(&trace.lock)
var buf []byte
var park bool
systemstack(func() {
buf, park = readTrace0()
})
if park {
gopark(func(gp *g, _ unsafe.Pointer) bool {
if !trace.reader.CompareAndSwapNoWB(nil, gp) {
// We're racing with another reader.
@@ -484,6 +452,70 @@ top:
goto top
}
return buf
}
// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
if raceenabled {
// g0 doesn't have a race context. Borrow the user G's.
if getg().racectx != 0 {
throw("expected racectx == 0")
}
getg().racectx = getg().m.curg.racectx
// (This defer should get open-coded, which is safe on
// the system stack.)
defer func() { getg().racectx = 0 }()
}
// This function may need to lock trace.lock recursively
// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
// To allow this we use trace.lockOwner.
// Also this function must not allocate while holding trace.lock:
// allocation can call heap allocate, which will try to emit a trace
// event while holding heap lock.
lock(&trace.lock)
trace.lockOwner = getg().m.curg
if trace.reader.Load() != nil {
// More than one goroutine reads trace. This is bad.
// But we rather do not crash the program because of tracing,
// because tracing can be enabled at runtime on prod servers.
trace.lockOwner = nil
unlock(&trace.lock)
println("runtime: ReadTrace called from multiple goroutines simultaneously")
return nil, false
}
// Recycle the old buffer.
if buf := trace.reading; buf != 0 {
buf.ptr().link = trace.empty
trace.empty = buf
trace.reading = 0
}
// Write trace header.
if !trace.headerWritten {
trace.headerWritten = true
trace.lockOwner = nil
unlock(&trace.lock)
return []byte("go 1.19 trace\x00\x00\x00"), false
}
// Optimistically look for CPU profile samples. This may write new stack
// records, and may write new tracing buffers.
if !trace.footerWritten && !trace.shutdown {
traceReadCPU()
}
// Wait for new data.
if trace.fullHead == 0 && !trace.shutdown {
// We don't simply use a note because the scheduler
// executes this goroutine directly when it wakes up
// (also a note would consume an M).
trace.lockOwner = nil
unlock(&trace.lock)
return nil, true
}
newFull:
assertLockHeld(&trace.lock)
// Write a buffer.
@@ -492,7 +524,7 @@ newFull:
trace.reading = buf
trace.lockOwner = nil
unlock(&trace.lock)
return buf.ptr().arr[:buf.ptr().pos]
return buf.ptr().arr[:buf.ptr().pos], false
}
// Write footer with timer frequency.
@@ -534,17 +566,21 @@ newFull:
}
// trace.enabled is already reset, so can call traceable functions.
semrelease(&trace.shutdownSema)
return nil
return nil, false
}
// Also bad, but see the comment above.
trace.lockOwner = nil
unlock(&trace.lock)
println("runtime: spurious wakeup of trace reader")
return nil
return nil, false
}
// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check that trace.enabled or trace.shutdown is set.
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
// Optimistic check first
if traceReaderAvailable() == nil {
@@ -571,6 +607,10 @@ func traceReaderAvailable() *g {
}
// traceProcFree frees trace buffer associated with pp.
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceProcFree(pp *p) {
buf := pp.tracebuf
pp.tracebuf = 0
@@ -661,7 +701,9 @@ func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev by
// TODO: test on non-zero extraBytes param.
maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params
if buf == nil || len(buf.arr)-buf.pos < maxSize {
buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
systemstack(func() {
buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
})
bufp.set(buf)
}
@@ -802,7 +844,9 @@ func traceReadCPU() {
buf := bufp.ptr()
if buf == nil {
*bufp = traceFlush(*bufp, 0)
systemstack(func() {
*bufp = traceFlush(*bufp, 0)
})
buf = bufp.ptr()
}
for i := range stk {
@@ -856,6 +900,10 @@ func traceReleaseBuffer(pid int32) {
}
// traceFlush puts buf onto stack of full buffers and returns an empty buffer.
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
owner := trace.lockOwner
dolock := owner == nil || owner != getg().m.curg
@@ -933,8 +981,10 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr)
buf := bufp.ptr()
size := 1 + 2*traceBytesPerNumber + len(s)
if buf == nil || len(buf.arr)-buf.pos < size {
buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
bufp.set(buf)
systemstack(func() {
buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
bufp.set(buf)
})
}
buf.byte(traceEvString)
buf.varint(id)
@@ -1088,6 +1138,10 @@ func traceFrames(bufp traceBufPtr, pcs []uintptr) ([]traceFrame, traceBufPtr) {
// dump writes all previously cached stacks to trace buffers,
// releases all memory and resets state.
//
// This must run on the system stack because it calls traceFlush.
//
//go:systemstack
func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
for i, _ := range tab.tab {
stk := tab.tab[i].ptr()