linux/kernel/trace/trace_clock.c
Steven Rostedt (VMware) 89529d8b8f tracing: Do no increment trace_clock_global() by one
The trace_clock_global() tries to make sure the events between CPUs is
somewhat in order. A global value is used and updated by the latest read
of a clock. If one CPU is ahead by a little, and is read by another CPU, a
lock is taken, and if the timestamp of the other CPU is behind, it will
simply use the other CPUs timestamp.

The lock is also only taken with a "trylock" due to tracing, and strange
recursions can happen. The lock is not taken at all in NMI context.

In the case where the lock is not able to be taken, the non synced
timestamp is returned. But it will not be less than the saved global
timestamp.

The problem arises because when the time goes "backwards" the time
returned is the saved timestamp plus 1. If the lock is not taken, and the
plus one to the timestamp is returned, there's a small race that can cause
the time to go backwards!

	CPU0				CPU1
	----				----
				trace_clock_global() {
				    ts = clock() [ 1000 ]
				    trylock(clock_lock) [ success ]
				    global_ts = ts; [ 1000 ]

				    <interrupted by NMI>
 trace_clock_global() {
    ts = clock() [ 999 ]
    if (ts < global_ts)
	ts = global_ts + 1 [ 1001 ]

    trylock(clock_lock) [ fail ]

    return ts [ 1001]
 }
				    unlock(clock_lock);
				    return ts; [ 1000 ]
				}

 trace_clock_global() {
    ts = clock() [ 1000 ]
    if (ts < global_ts) [ false 1000 == 1000 ]

    trylock(clock_lock) [ success ]
    global_ts = ts; [ 1000 ]
    unlock(clock_lock)

    return ts; [ 1000 ]
 }

The above case shows to reads of trace_clock_global() on the same CPU, but
the second read returns one less than the first read. That is, time when
backwards, and this is not what is allowed by trace_clock_global().

This was triggered by heavy tracing and the ring buffer checker that tests
for the clock going backwards:

 Ring buffer clock went backwards: 20613921464 -> 20613921463
 ------------[ cut here ]------------
 WARNING: CPU: 2 PID: 0 at kernel/trace/ring_buffer.c:3412 check_buffer+0x1b9/0x1c0
 Modules linked in:
 [..]
 [CPU: 2]TIME DOES NOT MATCH expected:20620711698 actual:20620711697 delta:6790234 before:20613921463 after:20613921463
   [20613915818] PAGE TIME STAMP
   [20613915818] delta:0
   [20613915819] delta:1
   [20613916035] delta:216
   [20613916465] delta:430
   [20613916575] delta:110
   [20613916749] delta:174
   [20613917248] delta:499
   [20613917333] delta:85
   [20613917775] delta:442
   [20613917921] delta:146
   [20613918321] delta:400
   [20613918568] delta:247
   [20613918768] delta:200
   [20613919306] delta:538
   [20613919353] delta:47
   [20613919980] delta:627
   [20613920296] delta:316
   [20613920571] delta:275
   [20613920862] delta:291
   [20613921152] delta:290
   [20613921464] delta:312
   [20613921464] delta:0 TIME EXTEND
   [20613921464] delta:0

This happened more than once, and always for an off by one result. It also
started happening after commit aafe104aa9 was added.

Cc: stable@vger.kernel.org
Fixes: aafe104aa9 ("tracing: Restructure trace_clock_global() to never block")
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
2021-06-18 09:10:00 -04:00

159 lines
4.2 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* tracing clocks
*
* Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Implements 3 trace clock variants, with differing scalability/precision
* tradeoffs:
*
* - local: CPU-local trace clock
* - medium: scalable global clock with some jitter
* - global: globally monotonic, serialized clock
*
* Tracer plugins will chose a default from these clocks.
*/
#include <linux/spinlock.h>
#include <linux/irqflags.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
* Useful for tracing that does not cross to other CPUs nor
* does it go through idle events.
*/
u64 notrace trace_clock_local(void)
{
u64 clock;
/*
* sched_clock() is an architecture implemented, fast, scalable,
* lockless clock. It is not guaranteed to be coherent across
* CPUs, nor across CPU idle events.
*/
preempt_disable_notrace();
clock = sched_clock();
preempt_enable_notrace();
return clock;
}
EXPORT_SYMBOL_GPL(trace_clock_local);
/*
* trace_clock(): 'between' trace clock. Not completely serialized,
* but not completely incorrect when crossing CPUs either.
*
* This is based on cpu_clock(), which will allow at most ~1 jiffy of
* jitter between CPUs. So it's a pretty scalable clock, but there
* can be offsets in the trace data.
*/
u64 notrace trace_clock(void)
{
return local_clock();
}
EXPORT_SYMBOL_GPL(trace_clock);
/*
* trace_jiffy_clock(): Simply use jiffies as a clock counter.
* Note that this use of jiffies_64 is not completely safe on
* 32-bit systems. But the window is tiny, and the effect if
* we are affected is that we will have an obviously bogus
* timestamp on a trace event - i.e. not life threatening.
*/
u64 notrace trace_clock_jiffies(void)
{
return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES);
}
EXPORT_SYMBOL_GPL(trace_clock_jiffies);
/*
* trace_clock_global(): special globally coherent trace clock
*
* It has higher overhead than the other trace clocks but is still
* an order of magnitude faster than GTOD derived hardware clocks.
*
* Used by plugins that need globally coherent timestamps.
*/
/* keep prev_time and lock in the same cacheline. */
static struct {
u64 prev_time;
arch_spinlock_t lock;
} trace_clock_struct ____cacheline_aligned_in_smp =
{
.lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
};
u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
u64 now, prev_time;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
/*
* The global clock "guarantees" that the events are ordered
* between CPUs. But if two events on two different CPUS call
* trace_clock_global at roughly the same time, it really does
* not matter which one gets the earlier time. Just make sure
* that the same CPU will always show a monotonic clock.
*
* Use a read memory barrier to get the latest written
* time that was recorded.
*/
smp_rmb();
prev_time = READ_ONCE(trace_clock_struct.prev_time);
now = sched_clock_cpu(this_cpu);
/* Make sure that now is always greater than or equal to prev_time */
if ((s64)(now - prev_time) < 0)
now = prev_time;
/*
* If in an NMI context then dont risk lockups and simply return
* the current time.
*/
if (unlikely(in_nmi()))
goto out;
/* Tracing can cause strange recursion, always use a try lock */
if (arch_spin_trylock(&trace_clock_struct.lock)) {
/* Reread prev_time in case it was already updated */
prev_time = READ_ONCE(trace_clock_struct.prev_time);
if ((s64)(now - prev_time) < 0)
now = prev_time;
trace_clock_struct.prev_time = now;
/* The unlock acts as the wmb for the above rmb */
arch_spin_unlock(&trace_clock_struct.lock);
}
out:
raw_local_irq_restore(flags);
return now;
}
EXPORT_SYMBOL_GPL(trace_clock_global);
static atomic64_t trace_counter;
/*
* trace_clock_counter(): simply an atomic counter.
* Use the trace_counter "counter" for cases where you do not care
* about timings, but are interested in strict ordering.
*/
u64 notrace trace_clock_counter(void)
{
return atomic64_add_return(1, &trace_counter);
}