cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints

This closely resembles helpers added for the global cgroup_rstat_lock in
commit fc29e04ae1 ("cgroup/rstat: add cgroup_rstat_lock helpers and
tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.

Based on production workloads, we observe the fast-path "update" function
cgroup_rstat_updated() is invoked around 3 million times per sec, while the
"flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
can see periodic spikes of 700 invocations/sec.

For this reason, the tracepoints are split into normal and fastpath
versions for this per-CPU lock. Making it feasible for production to
continuously monitor the non-fastpath tracepoint to detect lock contention
issues. The reason for monitoring is that lock disables IRQs which can
disturb e.g. softirq processing on the local CPUs involved. When the
global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
this per CPU lock becomes the next bottleneck that can introduce latency
variations.

A practical bpftrace script for monitoring contention latency:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
     @start[tid]=nsecs; @cnt[probe]=count()}
   tracepoint:cgroup:cgroup_rstat_cpu_locked {
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
     @cnt[probe]=count()}
   interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
Jesper Dangaard Brouer 2024-05-01 16:04:11 +02:00 committed by Tejun Heo
parent c1457d9aad
commit 21c38a3bd4
2 changed files with 108 additions and 18 deletions

View file

@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
DECLARE_EVENT_CLASS(cgroup_rstat,
TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu_in_loop, contended),
TP_ARGS(cgrp, cpu, contended),
TP_STRUCT__entry(
__field( int, root )
__field( int, level )
__field( u64, id )
__field( int, cpu_in_loop )
__field( int, cpu )
__field( bool, contended )
),
@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
__entry->root = cgrp->root->hierarchy_id;
__entry->id = cgroup_id(cgrp);
__entry->level = cgrp->level;
__entry->cpu_in_loop = cpu_in_loop;
__entry->cpu = cpu;
__entry->contended = contended;
),
TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
__entry->root, __entry->id, __entry->level,
__entry->cpu_in_loop, __entry->contended)
__entry->cpu, __entry->contended)
);
/* Related to global: cgroup_rstat_lock */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
TP_ARGS(cgrp, cpu, contended)
);
/* Related to per CPU: cgroup_rstat_cpu_lock */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
TP_ARGS(cgrp, cpu, contended)
);
#endif /* _TRACE_CGROUP_H */
/* This part must be outside protection */

View file

@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
return per_cpu_ptr(cgrp->rstat_cpu, cpu);
}
/*
* Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
*
* This makes it easier to diagnose locking issues and contention in
* production environments. The parameter @fast_path determine the
* tracepoints being added, allowing us to diagnose "flush" related
* operations without handling high-frequency fast-path "update" events.
*/
static __always_inline
unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
struct cgroup *cgrp, const bool fast_path)
{
unsigned long flags;
bool contended;
/*
* The _irqsave() is needed because cgroup_rstat_lock is
* spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
* this lock with the _irq() suffix only disables interrupts on
* a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
* interrupts on both configurations. The _irqsave() ensures
* that interrupts are always disabled and later restored.
*/
contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
if (contended) {
if (fast_path)
trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
else
trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
raw_spin_lock_irqsave(cpu_lock, flags);
}
if (fast_path)
trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
else
trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
return flags;
}
static __always_inline
void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
struct cgroup *cgrp, unsigned long flags,
const bool fast_path)
{
if (fast_path)
trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
else
trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
/**
* cgroup_rstat_updated - keep track of updated rstat_cpu
* @cgrp: target cgroup
@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
/* put @cgrp and all ancestors on the corresponding updated lists */
while (true) {
@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
cgrp = parent;
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}
/**
@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
struct cgroup *head = NULL, *parent, *child;
unsigned long flags;
/*
* The _irqsave() is needed because cgroup_rstat_lock is
* spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
* this lock with the _irq() suffix only disables interrupts on
* a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
* interrupts on both configurations. The _irqsave() ensures
* that interrupts are always disabled and later restored.
*/
raw_spin_lock_irqsave(cpu_lock, flags);
flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
if (child != root)
head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
raw_spin_unlock_irqrestore(cpu_lock, flags);
_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
return head;
}