From 0264c8c9e1b53e9dbb41fae5e54756e84644bc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:36 -0500 Subject: [PATCH 01/33] ftrace: Move the recursion testing into global headers Currently, if a callback is registered to a ftrace function and its ftrace_ops does not have the RECURSION flag set, it is encapsulated in a helper function that does the recursion for it. Really, all the callbacks should have their own recursion protection for performance reasons. But they should not all implement their own. Move the recursion helpers to global headers, so that all callbacks can use them. Link: https://lkml.kernel.org/r/20201028115612.460535535@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.166456258@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 + include/linux/trace_recursion.h | 187 ++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 177 ------------------------------ 3 files changed, 188 insertions(+), 177 deletions(-) create mode 100644 include/linux/trace_recursion.h diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae4..0e4164a7f56d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H +#include #include #include #include diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h new file mode 100644 index 000000000000..dbb7b6d4c94c --- /dev/null +++ b/include/linux/trace_recursion.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_RECURSION_H +#define _LINUX_TRACE_RECURSION_H + +#include +#include + +#ifdef CONFIG_TRACING + +/* Only current can touch trace_recursion */ + +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not support a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + /* Function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_BRANCH_BIT, +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ + TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit + 1; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit--; + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +#endif /* CONFIG_TRACING */ +#endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dadef445cd1..9462251cab92 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -558,183 +558,6 @@ struct tracer { bool noboot; }; - -/* Only current can touch trace_recursion */ - -/* - * For function tracing recursion: - * The order of these bits are important. - * - * When function tracing occurs, the following steps are made: - * If arch does not support a ftrace feature: - * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... - * The function callback, which can use the FTRACE bits to - * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. - */ -enum { - /* Function recursion bits */ - TRACE_FTRACE_BIT, - TRACE_FTRACE_NMI_BIT, - TRACE_FTRACE_IRQ_BIT, - TRACE_FTRACE_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than FTRACE_BITs */ - TRACE_INTERNAL_BIT, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, - - TRACE_BRANCH_BIT, -/* - * Abuse of the trace_recursion. - * As we need a way to maintain state if we are tracing the function - * graph in irq because we want to trace a particular function that - * was called in irq context but we have irq tracing off. Since this - * can only be modified by current, we can reuse trace_recursion. - */ - TRACE_IRQ_BIT, - - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. - */ - TRACE_TRANSITION_BIT, -}; - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) - -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - -#define TRACE_CONTEXT_BITS 4 - -#define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX - -static __always_inline int trace_get_context_bit(void) -{ - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; -} - -static __always_inline int trace_test_and_set_recursion(int start, int max) -{ - unsigned int val = current->trace_recursion; - int bit; - - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - - bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) { - /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. - */ - bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) - return -1; - trace_recursion_set(bit); - barrier(); - return bit + 1; - } - - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - - val |= 1 << bit; - current->trace_recursion = val; - barrier(); - - return bit + 1; -} - -static __always_inline void trace_clear_recursion(int bit) -{ - unsigned int val = current->trace_recursion; - - if (!bit) - return; - - bit--; - bit = 1 << bit; - val &= ~bit; - - barrier(); - current->trace_recursion = val; -} - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { From 6e4eb9cb22fc8a893cb708ed42644de5ee7c3827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:37 -0500 Subject: [PATCH 02/33] ftrace: Add ftrace_test_recursion_trylock() helper function To make it easier for ftrace callbacks to have recursion protection, provide a ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() helper that tests for recursion. Link: https://lkml.kernel.org/r/20201028115612.634927593@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.378584067@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 25 +++++++++++++++++++++++++ kernel/trace/trace_functions.c | 12 +++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index dbb7b6d4c94c..f2a949dbfec7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -183,5 +183,30 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } +/** + * ftrace_test_recursion_trylock - tests for recursion in same context + * + * Use this for ftrace callbacks. This will detect if the function + * tracing recursed in the same context (normal vs interrupt), + * + * Returns: -1 if a recursion happened. + * >= 0 if no recursion + */ +static __always_inline int ftrace_test_recursion_trylock(void) +{ + return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); +} + +/** + * ftrace_test_recursion_unlock - called when function callback is complete + * @bit: The return of a successful ftrace_test_recursion_trylock() + * + * This is used at the end of a ftrace callback. + */ +static __always_inline void ftrace_test_recursion_unlock(int bit) +{ + trace_clear_recursion(bit); +} + #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2c2126e1871d..943756c01190 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,22 +141,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + pc = preempt_count(); preempt_disable_notrace(); - bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); - if (bit < 0) - goto out; - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } - trace_clear_recursion(bit); - - out: + ftrace_test_recursion_unlock(bit); preempt_enable_notrace(); } From da5afbeb1724609996ca7bb4fbce2cd104c95914 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:38 -0500 Subject: [PATCH 03/33] ftrace: Optimize testing what context current is in The preempt_count() is not a simple location in memory, it could be part of per_cpu code or more. Each access to preempt_count(), or one of its accessor functions (like in_interrupt()) takes several cycles. By reading preempt_count() once, and then doing tests to find the context against the value return is slightly faster than using in_nmi() and in_interrupt(). Link: https://lkml.kernel.org/r/20201028115612.780796355@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.558881845@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index f2a949dbfec7..ac3d73484cb2 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -117,22 +117,29 @@ enum { #define TRACE_CONTEXT_MASK TRACE_LIST_MAX +/* + * Used for setting context + * NMI = 0 + * IRQ = 1 + * SOFTIRQ = 2 + * NORMAL = 3 + */ +enum { + TRACE_CTX_NMI, + TRACE_CTX_IRQ, + TRACE_CTX_SOFTIRQ, + TRACE_CTX_NORMAL, +}; + static __always_inline int trace_get_context_bit(void) { - int bit; + unsigned long pc = preempt_count(); - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + return TRACE_CTX_NORMAL; + else + return pc & NMI_MASK ? TRACE_CTX_NMI : + pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } static __always_inline int trace_test_and_set_recursion(int start, int max) From 6cdf941871ec9cb1cf03227038a45a73afd8dc9a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:39 -0500 Subject: [PATCH 04/33] pstore/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. Link: https://lkml.kernel.org/r/20201028115612.990886844@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.720372267@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Thomas Meyer Reviewed-by: Kees Cook Signed-off-by: Steven Rostedt (VMware) --- fs/pstore/ftrace.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 5c0450701293..816210fc5d3a 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -28,6 +28,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, struct ftrace_ops *op, struct pt_regs *regs) { + int bit; unsigned long flags; struct pstore_ftrace_record rec = {}; struct pstore_record record = { @@ -40,6 +41,10 @@ static void notrace pstore_ftrace_call(unsigned long ip, if (unlikely(oops_in_progress)) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + local_irq_save(flags); rec.ip = ip; @@ -49,6 +54,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, psinfo->write(&record); local_irq_restore(flags); + ftrace_test_recursion_unlock(bit); } static struct ftrace_ops pstore_ftrace_ops __read_mostly = { From c536aa1c5b17fac1ee395932031ff7d82827f2c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:40 -0500 Subject: [PATCH 05/33] kprobes/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. Link: https://lkml.kernel.org/r/20201028115613.140212174@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.944907560@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Andrew Morton Cc: Guo Ren Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: linux-csky@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 12 ++++++++++-- arch/parisc/kernel/ftrace.c | 16 +++++++++++++--- arch/powerpc/kernel/kprobes-ftrace.c | 11 ++++++++++- arch/s390/kernel/ftrace.c | 16 +++++++++++++--- arch/x86/kernel/kprobes/ftrace.c | 12 ++++++++++-- 5 files changed, 56 insertions(+), 11 deletions(-) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 5264763d05be..5eb2604fdf71 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -13,16 +13,21 @@ int arch_check_ftrace_location(struct kprobe *p) void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { + int bit; bool lr_saver = false; struct kprobe *p; struct kprobe_ctlblk *kcb; - /* Preempt is disabled by ftrace */ + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { p = get_kprobe((kprobe_opcode_t *)(ip - MCOUNT_INSN_SIZE)); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; lr_saver = true; } @@ -56,6 +61,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 63e3ecb9da81..13d85042810a 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -207,14 +207,21 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe_ctlblk *kcb; - struct kprobe *p = get_kprobe((kprobe_opcode_t *)ip); + struct kprobe *p; + int bit; - if (unlikely(!p) || kprobe_disabled(p)) + bit = ftrace_test_recursion_trylock(); + if (bit < 0) return; + preempt_disable_notrace(); + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto out; + if (kprobe_running()) { kprobes_inc_nmissed_count(p); - return; + goto out; } __this_cpu_write(current_kprobe, p); @@ -235,6 +242,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } } __this_cpu_write(current_kprobe, NULL); +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 972cb28174b2..5df8d50c65ae 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -18,10 +18,16 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, { struct kprobe *p; struct kprobe_ctlblk *kcb; + int bit; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index b388e87a08bf..8f31c726537a 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -201,14 +201,21 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) { struct kprobe_ctlblk *kcb; - struct kprobe *p = get_kprobe((kprobe_opcode_t *)ip); + struct kprobe *p; + int bit; - if (unlikely(!p) || kprobe_disabled(p)) + bit = ftrace_test_recursion_trylock(); + if (bit < 0) return; + preempt_disable_notrace(); + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto out; + if (kprobe_running()) { kprobes_inc_nmissed_count(p); - return; + goto out; } __this_cpu_write(current_kprobe, p); @@ -228,6 +235,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } } __this_cpu_write(current_kprobe, NULL); +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 681a4b36e9bb..a40a6cdfcca3 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -18,11 +18,16 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, { struct kprobe *p; struct kprobe_ctlblk *kcb; + int bit; - /* Preempt is disabled by ftrace */ + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + + preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) - return; + goto out; kcb = get_kprobe_ctlblk(); if (kprobe_running()) { @@ -52,6 +57,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, */ __this_cpu_write(current_kprobe, NULL); } +out: + preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); From 13f3ea9a2c829f28610bb8772a8b9c328412930e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:41 -0500 Subject: [PATCH 06/33] livepatch/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. Link: https://lkml.kernel.org/r/20201028115613.291169246@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.122802424@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index b552cf2d85f8..6c0164d24bbd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -45,9 +45,13 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct klp_ops *ops; struct klp_func *func; int patch_state; + int bit; ops = container_of(fops, struct klp_ops, fops); + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; /* * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). @@ -117,6 +121,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, unlock: preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } /* From 4b750b573c5b3ee10e33c1573eaa94a9dad62f19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:42 -0500 Subject: [PATCH 07/33] livepatch: Trigger WARNING if livepatch function fails due to recursion If for some reason a function is called that triggers the recursion detection of live patching, trigger a warning. By not executing the live patch code, it is possible that the old unpatched function will be called placing the system into an unknown state. Link: https://lore.kernel.org/r/20201029145709.GD16774@alley Link: https://lkml.kernel.org/r/20201106023547.312639435@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Suggested-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 6c0164d24bbd..15480bf3ce88 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -50,7 +50,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); bit = ftrace_test_recursion_trylock(); - if (bit < 0) + if (WARN_ON_ONCE(bit < 0)) return; /* * A variant of synchronize_rcu() is used to allow patching functions From 5d15a624c34b11c8d1c04c8cc004782e7ac2888d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:43 -0500 Subject: [PATCH 08/33] perf/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. Link: https://lkml.kernel.org/r/20201028115613.444477858@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.466892083@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 643e0b19920d..fd58d83861d8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -439,10 +439,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct hlist_head head; struct pt_regs regs; int rctx; + int bit; if ((unsigned long)ops->private != smp_processor_id()) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + event = container_of(ops, struct perf_event, ftrace_ops); /* @@ -463,13 +468,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) - return; + goto out; entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, ®s, &head, NULL); +out: + ftrace_test_recursion_unlock(bit); #undef ENTRY_SIZE } From 5d029b035bf112466541b844ee1b86197936db65 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:44 -0500 Subject: [PATCH 09/33] perf/ftrace: Check for rcu_is_watching() in callback function If a ftrace callback requires "rcu_is_watching", then it adds the FTRACE_OPS_FL_RCU flag and it will not be called if RCU is not "watching". But this means that it will use a trampoline when called, and this slows down the function tracing a tad. By checking rcu_is_watching() from within the callback, it no longer needs the RCU flag set in the ftrace_ops and it can be safely called directly. Link: https://lkml.kernel.org/r/20201028115613.591878956@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.711035826@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fd58d83861d8..a2b9fddb8148 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -441,6 +441,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, int rctx; int bit; + if (!rcu_is_watching()) + return; + if ((unsigned long)ops->private != smp_processor_id()) return; @@ -484,7 +487,6 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; From a25d036d939a30623ff73ecad9c8b9116b02e823 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:45 -0500 Subject: [PATCH 10/33] ftrace: Reverse what the RECURSION flag means in the ftrace_ops Now that all callbacks are recursion safe, reverse the meaning of the RECURSION flag and rename it from RECURSION_SAFE to simply RECURSION. Now only callbacks that request to have recursion protecting it will have the added trampoline to do so. Also remove the outdated comment about "PER_CPU" when determining to use the ftrace_ops_assist_func. Link: https://lkml.kernel.org/r/20201028115613.742454631@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.904270143@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jonathan Corbet Cc: Sebastian Andrzej Siewior Cc: Miroslav Benes Cc: Kamalesh Babulal Cc: Petr Mladek Cc: linux-doc@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 78 +++++++++++++++++++++-------- include/linux/ftrace.h | 12 ++--- kernel/trace/fgraph.c | 3 +- kernel/trace/ftrace.c | 20 ++++---- kernel/trace/trace_events.c | 1 - kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_selftest.c | 7 +-- kernel/trace/trace_stack.c | 1 - 8 files changed, 77 insertions(+), 47 deletions(-) diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index a4955f7e3d19..86cd14b8e126 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -30,8 +30,8 @@ The ftrace context This requires extra care to what can be done inside a callback. A callback can be called outside the protective scope of RCU. -The ftrace infrastructure has some protections against recursions and RCU -but one must still be very careful how they use the callbacks. +There are helper functions to help against recursion, and making sure +RCU is watching. These are explained below. The ftrace_ops structure @@ -108,6 +108,50 @@ The prototype of the callback function is as follows (as of v4.14): at the start of the function where ftrace was tracing. Otherwise it either contains garbage, or NULL. +Protect your callback +===================== + +As functions can be called from anywhere, and it is possible that a function +called by a callback may also be traced, and call that same callback, +recursion protection must be used. There are two helper functions that +can help in this regard. If you start your code with: + + int bit; + + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + +and end it with: + + ftrace_test_recursion_unlock(bit); + +The code in between will be safe to use, even if it ends up calling a +function that the callback is tracing. Note, on success, +ftrace_test_recursion_trylock() will disable preemption, and the +ftrace_test_recursion_unlock() will enable it again (if it was previously +enabled). + +Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for recursion for the callback and no recursion test needs to be done. +But this is at the expense of a slightly more overhead from an extra +function call. + +If your callback accesses any data or critical section that requires RCU +protection, it is best to make sure that RCU is "watching", otherwise +that data or critical section will not be protected as expected. In this +case add: + + if (!rcu_is_watching()) + return; + +Alternatively, if the FTRACE_OPS_FL_RCU flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for rcu_is_watching for the callback and no other test needs to be done. +But this is at the expense of a slightly more overhead from an extra +function call. + The ftrace FLAGS ================ @@ -128,26 +172,20 @@ FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED will not fail with this flag set. But the callback must check if regs is NULL or not to determine if the architecture supports it. -FTRACE_OPS_FL_RECURSION_SAFE - By default, a wrapper is added around the callback to - make sure that recursion of the function does not occur. That is, - if a function that is called as a result of the callback's execution - is also traced, ftrace will prevent the callback from being called - again. But this wrapper adds some overhead, and if the callback is - safe from recursion, it can set this flag to disable the ftrace - protection. +FTRACE_OPS_FL_RECURSION + By default, it is expected that the callback can handle recursion. + But if the callback is not that worried about overehead, then + setting this bit will add the recursion protection around the + callback by calling a helper function that will do the recursion + protection and only call the callback if it did not recurse. - Note, if this flag is set, and recursion does occur, it could cause - the system to crash, and possibly reboot via a triple fault. + Note, if this flag is not set, and recursion does occur, it could + cause the system to crash, and possibly reboot via a triple fault. - It is OK if another callback traces a function that is called by a - callback that is marked recursion safe. Recursion safe callbacks - must never trace any function that are called by the callback - itself or any nested functions that those functions call. - - If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPTION is set), - but this is not guaranteed. + Not, if this flag is set, then the callback will always be called + with preemption disabled. If it is not set, then it is possible + (but not guaranteed) that the callback will be called in + preemptable context. FTRACE_OPS_FL_IPMODIFY Requires FTRACE_OPS_FL_SAVE_REGS set. If the callback is to "hijack" diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0e4164a7f56d..806196345c3f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -98,7 +98,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are * set in the flags member. - * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and + * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION, STUB and * IPMODIFY are a kind of attribute flags which can be set only before * registering the ftrace_ops, and can not be modified while registered. * Changing those attribute flags after registering ftrace_ops will @@ -121,10 +121,10 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * passing regs to the handler. * Note, if this flag is set, the SAVE_REGS flag will automatically * get set upon registering the ftrace_ops, if the arch supports it. - * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure - * that the call back has its own recursion protection. If it does - * not set this, then the ftrace infrastructure will add recursion - * protection for the caller. + * RECURSION - The ftrace_ops can set this to tell the ftrace infrastructure + * that the call back needs recursion protection. If it does + * not set this, then the ftrace infrastructure will assume + * that the callback can handle recursion on its own. * STUB - The ftrace_ops is just a place holder. * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) @@ -156,7 +156,7 @@ enum { FTRACE_OPS_FL_DYNAMIC = BIT(1), FTRACE_OPS_FL_SAVE_REGS = BIT(2), FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = BIT(3), - FTRACE_OPS_FL_RECURSION_SAFE = BIT(4), + FTRACE_OPS_FL_RECURSION = BIT(4), FTRACE_OPS_FL_STUB = BIT(5), FTRACE_OPS_FL_INITIALIZED = BIT(6), FTRACE_OPS_FL_DELETED = BIT(7), diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5658f13037b3..73edb9e4f354 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, static struct ftrace_ops graph_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..39f2bba89b76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,7 +80,7 @@ enum { struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + .flags = FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) }; @@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_INITIALIZED, INIT_OPS_HASH(ftrace_profile_ops) }; @@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = { .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -2382,7 +2381,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* @@ -6864,8 +6863,7 @@ void ftrace_init_trace_array(struct trace_array *tr) struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -7023,11 +7021,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the function does not handle recursion, needs to be RCU safe, - * or does per cpu logic, then we need to call the assist handler. + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & FTRACE_OPS_FL_RCU) + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) return ftrace_ops_assist_func; return ops->func; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..244abbcd1db5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 943756c01190..89c414ce1388 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack version is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; + ops->flags = FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4738ad48a667..8ee3c0bb5d8a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -150,17 +150,14 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, static struct ftrace_ops test_rec_probe = { .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, }; static struct ftrace_ops test_recsafe_probe = { .func = trace_selftest_test_recursion_safe_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int @@ -561,7 +558,7 @@ static void trace_selftest_test_regs_func(unsigned long ip, static struct ftrace_ops test_regs_probe = { .func = trace_selftest_test_regs_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..969db526a563 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t From 773c16705058e9be7b0f4ce124e89cd231c120a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:46 -0500 Subject: [PATCH 11/33] ftrace: Add recording of functions that caused recursion This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file "recursed_functions" all the functions that caused recursion while a callback to the function tracer was running. Link: https://lkml.kernel.org/r/20201106023548.102375687@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Guo Ren Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Kamalesh Babulal Cc: Mauro Carvalho Chehab Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-csky@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: live-patching@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 6 +- arch/csky/kernel/probes/ftrace.c | 2 +- arch/parisc/kernel/ftrace.c | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/x86/kernel/kprobes/ftrace.c | 2 +- fs/pstore/ftrace.c | 2 +- include/linux/trace_recursion.h | 29 +++- kernel/livepatch/patch.c | 2 +- kernel/trace/Kconfig | 25 +++ kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 4 +- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_output.c | 6 +- kernel/trace/trace_output.h | 1 + kernel/trace/trace_recursion_record.c | 236 ++++++++++++++++++++++++++ 17 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 kernel/trace/trace_recursion_record.c diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 86cd14b8e126..5981d5691745 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -118,7 +118,7 @@ can help in this regard. If you start your code with: int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -130,7 +130,9 @@ The code in between will be safe to use, even if it ends up calling a function that the callback is tracing. Note, on success, ftrace_test_recursion_trylock() will disable preemption, and the ftrace_test_recursion_unlock() will enable it again (if it was previously -enabled). +enabled). The instruction pointer (ip) and its parent (parent_ip) is passed to +ftrace_test_recursion_trylock() to record where the recursion happened +(if CONFIG_FTRACE_RECORD_RECURSION is set). Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops (as explained below), then a helper trampoline will be used to test diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 5eb2604fdf71..f30b179924ef 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -18,7 +18,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 13d85042810a..1c5d3732bda2 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -210,7 +210,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 5df8d50c65ae..fdfee39938ea 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 8f31c726537a..657c1ab45408 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -204,7 +204,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index a40a6cdfcca3..954d930a7127 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 816210fc5d3a..adb0935eb062 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -41,7 +41,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, if (unlikely(oops_in_progress)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ac3d73484cb2..228cc56ed66e 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -91,6 +91,9 @@ enum { * not be correct. Allow for a single recursion to cover this case. */ TRACE_TRANSITION_BIT, + + /* Used to prevent recursion recording from recursing. */ + TRACE_RECORD_RECURSION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -142,7 +145,22 @@ static __always_inline int trace_get_context_bit(void) pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } -static __always_inline int trace_test_and_set_recursion(int start, int max) +#ifdef CONFIG_FTRACE_RECORD_RECURSION +extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); +# define do_ftrace_record_recursion(ip, pip) \ + do { \ + if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + ftrace_record_recursion(ip, pip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + } while (0) +#else +# define do_ftrace_record_recursion(ip, pip) do { } while (0) +#endif + +static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, + int start, int max) { unsigned int val = current->trace_recursion; int bit; @@ -158,8 +176,10 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) * a switch between contexts. Allow for a single recursion. */ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) + if (trace_recursion_test(bit)) { + do_ftrace_record_recursion(ip, pip); return -1; + } trace_recursion_set(bit); barrier(); return bit + 1; @@ -199,9 +219,10 @@ static __always_inline void trace_clear_recursion(int bit) * Returns: -1 if a recursion happened. * >= 0 if no recursion */ -static __always_inline int ftrace_test_recursion_trylock(void) +static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, + unsigned long parent_ip) { - return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); } /** diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 15480bf3ce88..875c5dbbdd33 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,7 +49,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..9b11c096d139 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -727,6 +727,31 @@ config TRACE_EVAL_MAP_FILE If unsure, say N. +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to the function tracing have some sort + of protection against recursion. Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit of number of functions that can be + listed in the "recursed_functions" file, that lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit can not change in + size at runtime. + config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e153be351548..7e44cea89fdc 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39f2bba89b76..03aad2b5cd5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6918,7 +6918,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -6993,7 +6993,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a2b9fddb8148..1b202e28dfaa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -447,7 +447,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if ((unsigned long)ops->private != smp_processor_id()) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 89c414ce1388..646eda6c44a5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,7 +141,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..92b1575ae0ca 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static void -seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; @@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2f742b74e7e6..4c954636caf0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,7 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 000000000000..b2edac1fe156 --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this is racey, but + * so is memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there's two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. + */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already added this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worse that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again. + */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? &recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + struct dentry *dentry; + + dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, + &recursed_functions_fops); + if (!dentry) + pr_warn("WARNING: Failed to create recursed_functions\n"); + return 0; +} + +fs_initcall(create_recursed_functions); From 60602cb549f1965a7edbc96026760dfb93911fab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 08:19:24 -0400 Subject: [PATCH 12/33] fgraph: Make overruns 4 bytes in graph stack structure Inspecting the data structures of the function graph tracer, I found that the overrun value is unsigned long, which is 8 bytes on a 64 bit machine, and not only that, the depth is an int (4 bytes). The overrun can be simply an unsigned int (4 bytes) and pack the ftrace_graph_ret structure better. The depth is moved up next to the func, as it is used more often with func, and improves cache locality. Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/trace_entries.h | 4 ++-- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 806196345c3f..8dde9c17aaa5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -864,11 +864,11 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ + int depth; /* Number of functions that overran the depth limit for current task */ - unsigned long overrun; + unsigned int overrun; unsigned long long calltime; unsigned long long rettime; - int depth; } __packed; /* Type of the callback handlers for tracing function graph*/ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 18c4a58aff79..ceafe2dc97e1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, overrun ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) __field_packed( unsigned long long, ret, calltime) __field_packed( unsigned long long, ret, rettime ) - __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..d874dec87131 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) - trace_seq_printf(s, " (Overruns: %lu)\n", + trace_seq_printf(s, " (Overruns: %u)\n", trace->overrun); print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, From 7b68621f8d16689cbb4203aceaca86ffb165f1d0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 30 Oct 2020 17:21:00 -0400 Subject: [PATCH 13/33] ftrace: Clean up the recursion code a bit In trace_test_and_set_recursion(), current->trace_recursion is placed into a variable, and that variable should be used for the processing, as there's no reason to dereference current multiple times. On trace_clear_recursion(), current->trace_recursion is modified and there's no reason to copy it over to a variable. Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 228cc56ed66e..a9f9c5714e65 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -162,7 +162,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { - unsigned int val = current->trace_recursion; + unsigned int val = READ_ONCE(current->trace_recursion); int bit; /* A previous recursion check was made */ @@ -176,18 +176,15 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign * a switch between contexts. Allow for a single recursion. */ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) { + if (val & (1 << bit)) { do_ftrace_record_recursion(ip, pip); return -1; } - trace_recursion_set(bit); - barrier(); - return bit + 1; + } else { + /* Normal check passed, clear the transition to allow it again */ + val &= ~(1 << TRACE_TRANSITION_BIT); } - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - val |= 1 << bit; current->trace_recursion = val; barrier(); @@ -197,17 +194,12 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign static __always_inline void trace_clear_recursion(int bit) { - unsigned int val = current->trace_recursion; - if (!bit) return; - bit--; - bit = 1 << bit; - val &= ~bit; - barrier(); - current->trace_recursion = val; + bit--; + trace_recursion_clear(bit); } /** From 28575c61ea602537a3d86fe301a53554e59452ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 14:43:10 -0500 Subject: [PATCH 14/33] ring-buffer: Add recording of ring buffer recursion into recursed_functions Add a new config RING_BUFFER_RECORD_RECURSION that will place functions that recurse from the ring buffer into the ftrace recused_functions file. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 14 ++++++++++++++ kernel/trace/ring_buffer.c | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9b11c096d139..6aa36ec73ccb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -752,6 +752,20 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config RING_BUFFER_RECORD_RECURSION + bool "Record functions that recurse in the ring buffer" + depends on FTRACE_RECORD_RECURSION + # default y, because it is coupled with FTRACE_RECORD_RECURSION + default y + help + The ring buffer has its own internal recursion. Although when + recursion happens it wont cause harm because of the protection, + but it does cause an unwanted overhead. Enabling this option will + place where recursion was detected into the ftrace "recursed_functions" + file. + + This will add more overhead to cases that have recursion. + config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc83b3fa9fe7..ab68f28b8f4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -3006,6 +3007,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) irq_work_queue(&cpu_buffer->irq_work.work); } +#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION +# define do_ring_buffer_record_recursion() \ + do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) +#else +# define do_ring_buffer_record_recursion() do { } while (0) +#endif + /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified @@ -3088,8 +3096,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; - if (val & (1 << (bit + cpu_buffer->nest))) + if (val & (1 << (bit + cpu_buffer->nest))) { + do_ring_buffer_record_recursion(); return 1; + } } val |= (1 << (bit + cpu_buffer->nest)); From 045e269c1eb2db5b5df9e034af617af8f4c1b35c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 6 Nov 2020 22:54:46 +0800 Subject: [PATCH 15/33] ftrace: Remove unused varible 'ret' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'ret' in 2 functions are not used. and one of them is a void function. So remove them to avoid gcc warning: kernel/trace/ftrace.c:4166:6: warning: variable ‘ret’ set but not used [-Wunused-but-set-variable] kernel/trace/ftrace.c:5571:6: warning: variable ‘ret’ set but not used [-Wunused-but-set-variable] Link: https://lkml.kernel.org/r/1604674486-52350-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03aad2b5cd5e..3db64fb0cce8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4162,7 +4162,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, struct ftrace_hash **orig_hash, *new_hash; LIST_HEAD(process_mods); char *func; - int ret; mutex_lock(&ops->func_hash->regex_lock); @@ -4215,7 +4214,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); mutex_unlock(&ftrace_lock); @@ -5567,7 +5566,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) struct ftrace_hash **orig_hash; struct trace_parser *parser; int filter_hash; - int ret; if (file->f_mode & FMODE_READ) { iter = m->private; @@ -5595,7 +5593,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); } else { From 2b5894cc33e9dea189a7010c7ed57d414786d174 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 29 Oct 2020 23:05:54 +0800 Subject: [PATCH 16/33] tracing: Fix some typos in comments s/detetector/detector/ s/enfoced/enforced/ s/writen/written/ s/actualy/actually/ s/bascially/basically/ s/Regarldess/Regardless/ s/zeroes/zeros/ s/followd/followed/ s/incrememented/incremented/ s/separatelly/separately/ s/accesible/accessible/ s/sythetic/synthetic/ s/enabed/enabled/ s/heurisitc/heuristic/ s/assocated/associated/ s/otherwides/otherwise/ s/specfied/specified/ s/seaching/searching/ s/hierachry/hierarchy/ s/internel/internal/ s/Thise/This/ Link: https://lkml.kernel.org/r/20201029150554.3354-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/blktrace.c | 4 ++-- kernel/trace/bpf_trace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace_benchmark.c | 6 +++--- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_dynevent.h | 6 +++--- kernel/trace/trace_entries.h | 2 +- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_filter.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_synth.c | 4 ++-- kernel/trace/trace_export.c | 2 +- kernel/trace/trace_hwlat.c | 4 ++-- kernel/trace/tracing_map.c | 6 +++--- kernel/trace/tracing_map.h | 2 +- 15 files changed, 25 insertions(+), 25 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..1c3d0f57d763 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1343,7 +1343,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, * ones now use the 64bit ino as the whole ID and * no longer use generation. * - * Regarldess of the content, always output + * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). @@ -1385,7 +1385,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, i == 0 ? "" : " ", pdu_buf[i]); /* - * stop when the rest is just zeroes and indicate so + * stop when the rest is just zeros and indicate so * with a ".." appended */ if (i == end && end != pdu_len - 1) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..f4172b870377 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -113,7 +113,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is - * a heurisitc to speed up execution. + * a heuristic to speed up execution. * * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 410cfeb16db5..6a282bbc7e7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3118,7 +3118,7 @@ struct trace_buffer_struct { static struct trace_buffer_struct *trace_percpu_buffer; /* - * Thise allows for lockless recording. If we're nested too deeply, then + * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 2e9a4746ea85..801c2a7f7605 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -31,7 +31,7 @@ static bool ok_to_run; * it simply writes "START". As the first write is cold cache and * the rest is hot, we save off that time in bm_first and it is * reported as "first", which is shown in the second write to the - * tracepoint. The "first" field is writen within the statics from + * tracepoint. The "first" field is written within the statics from * then on but never changes. */ static void trace_do_benchmark(void) @@ -112,7 +112,7 @@ static void trace_do_benchmark(void) int i = 0; /* * stddev is the square of standard deviation but - * we want the actualy number. Use the average + * we want the actually number. Use the average * as our seed to find the std. * * The next try is: @@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg) /* * We don't go to sleep, but let others run as well. - * This is bascially a "yield()" to let any task that + * This is basically a "yield()" to let any task that * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5fa49cfd2bb6..4f967d5cd917 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -276,7 +276,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, * arguments of the form 'type variable_name;' or 'x+y'. * * The lhs argument string will be appended to the current cmd string, - * followed by an operator, if applicable, followd by the rhs string, + * followed by an operator, if applicable, followed by the rhs string, * followed finally by a separator, if applicable. Before the * argument is added, the @check_arg function, if present, will be * used to check the sanity of the current arg strings. diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index d6857a254ede..d6f72dcb7269 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -29,10 +29,10 @@ struct dyn_event; * @show: Showing method. This is invoked when user reads the event definitions * via dynamic_events interface. * @is_busy: Check whether given event is busy so that it can not be deleted. - * Return true if it is busy, otherwides false. - * @free: Delete the given event. Return 0 if success, otherwides error. + * Return true if it is busy, otherwise false. + * @free: Delete the given event. Return 0 if success, otherwise error. * @match: Check whether given event and system name match this event. The argc - * and argv is used for exact match. Return true if it matches, otherwides + * and argv is used for exact match. Return true if it matches, otherwise * false. * * Except for @create, these methods are called under holding event_mutex. diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ceafe2dc97e1..4547ac59da61 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -32,7 +32,7 @@ * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the - * internel structures are just tracing helpers, this is not + * internal structures are just tracing helpers, this is not * an issue. * * When an internal structure is used, it should use: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 244abbcd1db5..f4b459bb6d33 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2436,7 +2436,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) /* * Since calls are grouped by systems, the likelyhood that the * next call in the iteration belongs to the same system as the - * previous call is high. As an optimization, we skip seaching + * previous call is high. As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i @@ -3271,7 +3271,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also - * creates the event hierachry in the @parent/events directory. + * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success. * diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78a678eeb140..d0f515ac9b7c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1950,7 +1950,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, /* * The 'ip' field could have multiple filters set, separated * either by space or comma. We first cut the filter and apply - * all pieces separatelly. + * all pieces separately. */ re = ftrace_function_filter_re(buf, len, &re_cnt); if (!re) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 96c3f86b81c5..39ebe1826fc3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3355,7 +3355,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, } else { field_var = NULL; /* - * If no explicit system.event is specfied, default to + * If no explicit system.event is specified, default to * looking for fields on the onmatch(system.event.xxx) * event. */ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 881df991742a..5a8bc0b421f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1276,7 +1276,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) /** * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event + * @name: The name of the new synthetic event * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @mod: The module creating the event, NULL if not created from a module @@ -1446,7 +1446,7 @@ __synth_event_trace_init(struct trace_event_file *file, * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need * to honor not logging when disabled. For the iterated - * trace case, we save the enabed state upon start and just + * trace case, we save the enabled state upon start and just * ignore the following data calls. */ if (!(file->flags & EVENT_FILE_FL_ENABLED) || diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 90f81d33fa3f..d960f6b11b5e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,7 +26,7 @@ static int ftrace_event_register(struct trace_event_call *call, /* * The FTRACE_ENTRY_REG macro allows ftrace entry to define register - * function and thus become accesible via perf. + * function and thus become accessible via perf. */ #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d3ab2f4a77df 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -485,11 +485,11 @@ hwlat_width_write(struct file *filp, const char __user *ubuf, * @ppos: The current position in @file * * This function provides a write implementation for the "window" interface - * to the hardware latency detetector. The window is the total time + * to the hardware latency detector. The window is the total time * in us that will be considered one sample period. Conceptually, windows * occur back-to-back and contain a sample width period during which * actual sampling occurs. Can be used to write a new total window size. It - * is enfoced that any value written must be greater than the sample width + * is enforced that any value written must be greater than the sample width * size, or an error results. */ static ssize_t diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 4b50fc0cb12c..d6bddb157ef2 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -609,7 +609,7 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) * signal that state. There are two user-visible tracing_map * variables, 'hits' and 'drops', which are updated by this function. * Every time an element is either successfully inserted or retrieved, - * the 'hits' value is incrememented. Every time an element insertion + * the 'hits' value is incremented. Every time an element insertion * fails, the 'drops' value is incremented. * * This is a lock-free tracing map insertion function implementing a @@ -642,9 +642,9 @@ struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) * tracing_map_elt. This is a lock-free lookup; see * tracing_map_insert() for details on tracing_map and how it works. * Every time an element is retrieved, the 'hits' value is - * incrememented. There is one user-visible tracing_map variable, + * incremented. There is one user-visible tracing_map variable, * 'hits', which is updated by this function. Every time an element - * is successfully retrieved, the 'hits' value is incrememented. The + * is successfully retrieved, the 'hits' value is incremented. The * 'drops' value is never updated by this function. * * Return: the tracing_map_elt pointer val associated with the key. diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index a6de61fc22de..2c765ee2a4d4 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -50,7 +50,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); * an instance of tracing_map_elt, where 'elt' in the latter part of * that variable name is short for 'element'. The purpose of a * tracing_map_elt is to hold values specific to the particular - * 32-bit hashed key it's assocated with. Things such as the unique + * 32-bit hashed key it's associated with. Things such as the unique * set of aggregated sums associated with the 32-bit hashed key, along * with a copy of the full key associated with the entry, and which * was used to produce the 32-bit hashed key. From 58954b3be8b7a8a0ebf1ced6fbbab808e8ccc4b6 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 9 Nov 2020 13:22:50 +0100 Subject: [PATCH 17/33] MAINTAINERS: assign ./fs/tracefs to TRACING A check with ./scripts/get_maintainer.pl --letters -f fs/tracefs/ shows that the tracefs is not assigned to the TRACING section in MAINTAINERS. Add the file pattern for the TRACING section to rectify that. Link: https://lkml.kernel.org/r/20201109122250.31915-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index b516bb34a8d5..5c714c19fadc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17751,6 +17751,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git perf/core F: Documentation/trace/ftrace.rst F: arch/*/*/*/ftrace.h F: arch/*/kernel/ftrace.c +F: fs/tracefs/ F: include/*/ftrace.h F: include/linux/trace*.h F: include/trace/ From d19ad0775dcd64b49eecf4fa79c17959ebfbd26b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:42:17 -0400 Subject: [PATCH 18/33] ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs In preparation to have arguments of a function passed to callbacks attached to functions as default, change the default callback prototype to receive a struct ftrace_regs as the forth parameter instead of a pt_regs. For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they will now need to get the pt_regs via a ftrace_get_regs() helper call. If this is called by a callback that their ftrace_ops did not have a FL_SAVE_REGS flag set, it that helper function will return NULL. This will allow the ftrace_regs to hold enough just to get the parameters and stack pointer, but without the worry that callbacks may have a pt_regs that is not completely filled. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 4 +++- arch/nds32/kernel/ftrace.c | 4 ++-- arch/parisc/kernel/ftrace.c | 8 +++++--- arch/powerpc/kernel/kprobes-ftrace.c | 4 +++- arch/s390/kernel/ftrace.c | 4 +++- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c | 2 +- include/linux/ftrace.h | 16 ++++++++++++++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++++++++++++++------------ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 9 ++++----- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++++++++++--------- kernel/trace/trace_stack.c | 2 +- 18 files changed, 71 insertions(+), 45 deletions(-) diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index f30b179924ef..ae2b1c7b3b5c 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -11,17 +11,19 @@ int arch_check_ftrace_location(struct kprobe *p) /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { int bit; bool lr_saver = false; struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 3763b3f8c3db..414f8a780cc3 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -10,7 +10,7 @@ extern void (*ftrace_trace_function)(unsigned long, unsigned long, extern void ftrace_graph_caller(void); noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } @@ -38,7 +38,7 @@ EXPORT_SYMBOL(_mcount); #else /* CONFIG_DYNAMIC_FTRACE */ noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 1c5d3732bda2..0a1e75af5382 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -51,7 +51,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, void notrace __hot ftrace_function_trampoline(unsigned long parent, unsigned long self_addr, unsigned long org_sp_gr3, - struct pt_regs *regs) + struct ftrace_regs *fregs) { #ifndef CONFIG_DYNAMIC_FTRACE extern ftrace_func_t ftrace_trace_function; @@ -61,7 +61,7 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, if (function_trace_op->flags & FTRACE_OPS_FL_ENABLED && ftrace_trace_function != ftrace_stub) ftrace_trace_function(self_addr, parent, - function_trace_op, regs); + function_trace_op, fregs); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (dereference_function_descriptor(ftrace_graph_return) != @@ -204,9 +204,10 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -214,6 +215,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index fdfee39938ea..660138f6c4b2 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -14,16 +14,18 @@ /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; int bit; bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 657c1ab45408..67b80f4412f9 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -198,9 +198,10 @@ int ftrace_disable_ftrace_graph_caller(void) #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -208,6 +209,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 954d930a7127..373e5fa3ce1f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -14,8 +14,9 @@ /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; int bit; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index adb0935eb062..5939595f0115 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -26,7 +26,7 @@ static u64 pstore_ftrace_stamp; static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, - struct pt_regs *regs) + struct ftrace_regs *fregs) { int bit; unsigned long flags; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8dde9c17aaa5..24e1fa52337d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) +{ + if (!fregs) + return NULL; + + return &fregs->regs; +} + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); #endif diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 875c5dbbdd33..f89f9e7e9b07 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func) static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *fops, - struct pt_regs *regs) + struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3db64fb0cce8..67888311784e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; int pid; @@ -154,7 +154,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, return; } - op->saved_func(ip, parent_ip, op, regs); + op->saved_func(ip, parent_ip, op, fregs); } static void ftrace_sync_ipi(void *data) @@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -2143,6 +2143,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2368,8 +2369,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) } static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); @@ -4292,7 +4294,7 @@ static int __init ftrace_mod_cmd_init(void) core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct ftrace_probe_ops *probe_ops; struct ftrace_func_probe *probe; @@ -6911,8 +6913,9 @@ void ftrace_reset_array_ops(struct trace_array *tr) static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored, struct pt_regs *regs) + struct ftrace_ops *ignored, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct ftrace_ops *op; int bit; @@ -6945,7 +6948,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, pr_warn("op=%p %pS\n", op, op); goto out; } - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); } } while_for_each_ftrace_op(op); out: @@ -6968,9 +6971,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { - __ftrace_ops_list_func(ip, parent_ip, NULL, regs); + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else @@ -6987,7 +6990,7 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops); * this function will be called by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { int bit; @@ -6998,7 +7001,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); preempt_enable_notrace(); trace_clear_recursion(bit); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 1b202e28dfaa..a71181655958 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -432,7 +432,7 @@ NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4b459bb6d33..98d194d8460e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 646eda6c44a5..c5095dd28e20 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr); static void tracing_stop_function_trace(struct trace_array *tr); static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ @@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; - /* * Instance trace_arrays get their ops allocated * at instance creation. Unless it failed @@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; @@ -178,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 10bbb0f381d5..d06aab4dcbb8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..c0181066dbe9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s) */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 8ee3c0bb5d8a..5ed081c6471c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe1_cnt++; } @@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe2_cnt++; } @@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe3_cnt++; } @@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_global_cnt++; } @@ -143,7 +143,7 @@ static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_dyn_cnt++; } @@ -414,7 +414,7 @@ static int trace_selftest_recursion_cnt; static void trace_selftest_test_recursion_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * This function is registered without the recursion safe flag. @@ -429,7 +429,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip, static void trace_selftest_test_recursion_safe_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * We said we would provide our own recursion. By calling @@ -548,9 +548,11 @@ static enum { static void trace_selftest_test_regs_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { - if (pt_regs) + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; else trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 969db526a563..63c285042051 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) static void stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long stack; From 02a474ca266a47ea8f4d5a11f4ffa120f83730ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Oct 2020 10:55:55 -0400 Subject: [PATCH 19/33] ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default Currently, the only way to get access to the registers of a function via a ftrace callback is to set the "FL_SAVE_REGS" bit in the ftrace_ops. But as this saves all regs as if a breakpoint were to trigger (for use with kprobes), it is expensive. The regs are already saved on the stack for the default ftrace callbacks, as that is required otherwise a function being traced will get the wrong arguments and possibly crash. And on x86, the arguments are already stored where they would be on a pt_regs structure to use that code for both the regs version of a callback, it makes sense to pass that information always to all functions. If an architecture does this (as x86_64 now does), it is to set HAVE_DYNAMIC_FTRACE_WITH_ARGS, and this will let the generic code that it could have access to arguments without having to set the flags. This also includes having the stack pointer being saved, which could be used for accessing arguments on the stack, as well as having the function graph tracer not require its own trampoline! Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 15 +++++++++++++++ arch/x86/kernel/ftrace_64.S | 11 +++++++++-- include/linux/ftrace.h | 7 ++++++- kernel/trace/Kconfig | 9 +++++++++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..478526aabe5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -167,6 +167,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..e00fe88146e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,21 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..60e3b64f5ea6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,12 +140,19 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 24e1fa52337d..588ea7023a7a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,16 +90,21 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + struct ftrace_regs { struct pt_regs regs; }; +#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) return NULL; - return &fregs->regs; + return arch_ftrace_get_regs(fregs); } typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6aa36ec73ccb..c9b64dea1216 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the pt_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of regs_get_kernel_argument() and + kernel_stack_pointer(). + config HAVE_FTRACE_MCOUNT_RECORD bool help From 2860cd8a235375df3c8ec8039d9fe5eb2f658b86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:15:27 -0400 Subject: [PATCH 20/33] livepatch: Use the default ftrace_ops instead of REGS when ARGS is available When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, the ftrace call will be able to set the ip of the calling function. This will improve the performance of live kernel patching where it does not need all the regs to be stored just to change the instruction pointer. If all archs that support live kernel patching also support HAVE_DYNAMIC_FTRACE_WITH_ARGS, then the architecture specific function klp_arch_set_pc() could be made generic. It is possible that an arch can support HAVE_DYNAMIC_FTRACE_WITH_ARGS but not HAVE_DYNAMIC_FTRACE_WITH_REGS and then have access to live patching. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: live-patching@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- arch/powerpc/include/asm/livepatch.h | 4 +++- arch/s390/include/asm/livepatch.h | 5 ++++- arch/x86/include/asm/ftrace.h | 3 +++ arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/ftrace_64.S | 4 ++++ include/linux/ftrace.h | 7 +++++++ kernel/livepatch/Kconfig | 2 +- kernel/livepatch/patch.c | 9 +++++---- 8 files changed, 29 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4a3d5d25fed5..ae25e6e72997 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -12,8 +12,10 @@ #include #ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->nip = ip; } diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index 818612b784cd..d578a8c76676 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -11,10 +11,13 @@ #ifndef ASM_LIVEPATCH_H #define ASM_LIVEPATCH_H +#include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->psw.addr = ip; } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e00fe88146e0..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -54,6 +54,9 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return NULL; return &fregs->regs; } + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 60e3b64f5ea6..0d54099c2a3a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -157,6 +157,10 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 588ea7023a7a..9a8ce28e4485 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -97,6 +97,13 @@ struct ftrace_regs { }; #define arch_ftrace_get_regs(fregs) (&(fregs)->regs) +/* + * ftrace_instruction_pointer_set() is to be defined by the architecture + * if to allow setting of the instruction pointer from the ftrace_regs + * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports + * live kernel patching. + */ +#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 54102deb50ba..53d51ed619a3 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -6,7 +6,7 @@ config HAVE_LIVEPATCH config LIVEPATCH bool "Kernel Live Patching" - depends on DYNAMIC_FTRACE_WITH_REGS + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f89f9e7e9b07..e8029aea67f1 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -42,7 +42,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; @@ -118,7 +117,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(regs, (unsigned long)func->new_func); + klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: preempt_enable_notrace(); @@ -200,8 +199,10 @@ static int klp_patch_func(struct klp_func *func) return -ENOMEM; ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | + ops->fops.flags = FTRACE_OPS_FL_DYNAMIC | +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + FTRACE_OPS_FL_SAVE_REGS | +#endif FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_PERMANENT; From b111545d26c0d66dd9aae668d9373669e752b075 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 14 Nov 2020 00:02:40 +0800 Subject: [PATCH 21/33] tracing: Remove the useless value assignment in test_create_synth_event() The value of variable ret is overwritten on the delete branch in the test_create_synth_event() and we care more about the above error than this delete portion. Remove it. Link: https://lkml.kernel.org/r/1605283360-6804-1-git-send-email-kaixuxia@tencent.com Reported-by: Tosk Robot Signed-off-by: Kaixu Xia Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index edd912cd14aa..a4b4bbf8c3bf 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -307,7 +307,7 @@ static int __init test_create_synth_event(void) return ret; delete: /* We got an error after creating the event, delete it */ - ret = synth_event_delete("create_synth_test"); + synth_event_delete("create_synth_test"); goto out; } From 76980f5fa06d505879ba936b1b5066a056991de0 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 15 Nov 2020 16:53:36 +0100 Subject: [PATCH 22/33] tracing: Clean up after filter logic rewriting The functions event_{set,clear,}_no_set_filter_flag were only used in replace_system_preds() [now, renamed to process_system_preds()]. Commit 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster") removed the use of those functions in replace_system_preds(). Since then, the functions event_{set,clear,}_no_set_filter_flag were unused. Fortunately, make CC=clang W=1 indicates this with -Wunused-function warnings on those three functions. So, clean up these obsolete unused functions. Link: https://lkml.kernel.org/r/20201115155336.20248-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d0f515ac9b7c..e91259f6a722 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1561,27 +1561,6 @@ static inline void event_clear_filter(struct trace_event_file *file) RCU_INIT_POINTER(file->filter, NULL); } -static inline void -event_set_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline void -event_clear_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline bool -event_no_set_filter_flag(struct trace_event_file *file) -{ - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) - return true; - - return false; -} - struct filter_list { struct list_head list; struct event_filter *filter; From 3a37b918946e04da7902b83917764f73cc0bd90c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 16 Nov 2020 15:46:52 -0500 Subject: [PATCH 23/33] ftrace/documentation: Fix RST C code blocks Some C code in the ftrace-users.rst document is missing RST C block annotation, which has to be added. Link: https://lore.kernel.org/r/20201116173502.392a769c@canb.auug.org.au Acked-by: Jonathan Corbet Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 5981d5691745..f7d98ae5b885 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -116,6 +116,8 @@ called by a callback may also be traced, and call that same callback, recursion protection must be used. There are two helper functions that can help in this regard. If you start your code with: +.. code-block:: c + int bit; bit = ftrace_test_recursion_trylock(ip, parent_ip); @@ -124,6 +126,8 @@ can help in this regard. If you start your code with: and end it with: +.. code-block:: c + ftrace_test_recursion_unlock(bit); The code in between will be safe to use, even if it ends up calling a @@ -145,6 +149,8 @@ protection, it is best to make sure that RCU is "watching", otherwise that data or critical section will not be protected as expected. In this case add: +.. code-block:: c + if (!rcu_is_watching()) return; From 5b7be9c709e10e88531f1f81e1150bbad65be1aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 30 Nov 2020 23:37:33 -0500 Subject: [PATCH 24/33] ring-buffer: Add test to validate the time stamp deltas While debugging a situation where a delta for an event was calucalted wrong, I realize there was nothing making sure that the delta of events are correct. If a single event has an incorrect delta, then all events after it will also have one. If the discrepency gets large enough, it could cause the time stamps to go backwards when crossing sub buffers, that record a full 64 bit time stamp, and the new deltas are added to that. Add a way to validate the events at most events and when crossing a buffer page. This will help make sure that the deltas are always correct. This test will detect if they are ever corrupted. The test adds a high overhead to the ring buffer recording, as it does the audit for almost every event, and should only be used for testing the ring buffer. This will catch the bug that is fixed by commit 55ea4cf40380 ("ring-buffer: Update write stamp with the correct ts"), which is not applied when this commit is applied. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 +++++ kernel/trace/ring_buffer.c | 150 +++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c9b64dea1216..fe60f9d7a0e6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -845,6 +845,26 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config RING_BUFFER_VALIDATE_TIME_DELTAS + bool "Verify ring buffer time stamp deltas" + depends on RING_BUFFER + help + This will audit the time stamps on the ring buffer sub + buffer to make sure that all the time deltas for the + events on a sub buffer matches the current time stamp. + This audit is performed for every event that is not + interrupted, or interrupting another event. A check + is also made when traversing sub buffers to make sure + that all the deltas on the previous sub buffer do not + add up to be greater than the current time stamp. + + NOTE: This adds significant overhead to recording of events, + and should only be used to test the logic of the ring buffer. + Do not use it on production systems. + + Only say Y if you understand what this does, and you + still want it enabled. Otherwise say N + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ab68f28b8f4b..7cd888ee9ac7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3193,6 +3193,153 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/* Special value to validate all deltas on a page. */ +#define CHECK_FULL_PAGE 1L + +#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS +static void dump_buffer_page(struct buffer_data_page *bpage, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int e; + + ts = bpage->time_stamp; + pr_warn(" [%lld] PAGE TIME STAMP\n", ts); + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + break; + + case RINGBUF_TYPE_PADDING: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + break; + + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + break; + + default: + break; + } + } +} + +static DEFINE_PER_CPU(atomic_t, checking); +static atomic_t ts_dump; + +/* + * Check if the current event time stamp matches the deltas on + * the buffer page. + */ +static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + u64 ts, delta; + bool full = false; + int e; + + bpage = info->tail_page->page; + + if (tail == CHECK_FULL_PAGE) { + full = true; + tail = local_read(&bpage->commit); + } else if (info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { + /* Ignore events with absolute time stamps */ + return; + } + + /* + * Do not check the first event (skip possible extends too). + * Also do not check if previous events have not been committed. + */ + if (tail <= 8 || tail > local_read(&bpage->commit)) + return; + + /* + * If this interrupted another event, + */ + if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) + goto out; + + ts = bpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + /* fall through */ + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + break; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + } + if ((full && ts > info->ts) || + (!full && ts + info->delta != info->ts)) { + /* If another report is happening, ignore this one */ + if (atomic_inc_return(&ts_dump) != 1) { + atomic_dec(&ts_dump); + goto out; + } + atomic_inc(&cpu_buffer->record_disabled); + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, info->after); + dump_buffer_page(bpage, info, tail); + atomic_dec(&ts_dump); + /* Do not re-enable checking */ + return; + } +out: + atomic_dec(this_cpu_ptr(&checking)); +} +#else +static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ +} +#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -3252,6 +3399,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, info->before, info->after); } + if (a_ok && b_ok) + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3272,6 +3421,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* Just use full timestamp for inerrupting event */ info->delta = info->ts; barrier(); + check_buffer(cpu_buffer, info, tail); if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ From a32ded3389abcc51a39fc7cb5f1793f7e5abaa88 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 17 Nov 2020 06:37:03 +0100 Subject: [PATCH 25/33] ring-buffer: Remove obsolete rb_event_is_commit() Commit a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp") removed the only uses of rb_event_is_commit() in rb_update_event() and rb_update_write_stamp(). Hence, since then, make CC=clang W=1 warns: kernel/trace/ring_buffer.c:2763:1: warning: unused function 'rb_event_is_commit' [-Wunused-function] Remove this obsolete function. Link: https://lkml.kernel.org/r/20201117053703.11275-1-lukas.bulwahn@gmail.com Reviewed-by: Nathan Chancellor Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7cd888ee9ac7..1b9da155ff00 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2629,9 +2629,6 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) return skip_time_extend(event); } -static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); - #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { @@ -2759,20 +2756,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - static u64 rb_time_delta(struct ring_buffer_event *event) { switch (event->type_len) { From 888834903d362b48c879ce8ab9966428367360c9 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 12 Nov 2020 23:18:00 +0800 Subject: [PATCH 26/33] ring-buffer: Fix a typo in function description s/ring_buffer_commit_discard/ring_buffer_discard_commit/ Link: https://lkml.kernel.org/r/20201112151800.14382-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1b9da155ff00..f09d3f5911cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3645,7 +3645,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, } /** - * ring_buffer_commit_discard - discard an event that has not been committed + * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * From d9a9280a0d0ae51dc1d4142138b99242b7ec8ac6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:10:58 +0100 Subject: [PATCH 27/33] seq_buf: Avoid type mismatch for seq_buf_init Building with W=2 prints a number of warnings for one function that has a pointer type mismatch: linux/seq_buf.h: In function 'seq_buf_init': linux/seq_buf.h:35:12: warning: pointer targets in assignment from 'unsigned char *' to 'char *' differ in signedness [-Wpointer-sign] Change the type in the function prototype according to the type in the structure. Link: https://lkml.kernel.org/r/20201026161108.3707783-1-arnd@kernel.org Fixes: 9a7777935c34 ("tracing: Convert seq_buf fields to be like seq_file fields") Reviewed-by: Cezary Rojewski Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- include/linux/seq_buf.h | 2 +- include/linux/trace_seq.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index fb0205d87d3c..9d6c28cc4d8f 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -30,7 +30,7 @@ static inline void seq_buf_clear(struct seq_buf *s) } static inline void -seq_buf_init(struct seq_buf *s, unsigned char *buf, unsigned int size) +seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) { s->buffer = buf; s->size = size; diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 6c30508fca19..5a2c650d9e1c 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -12,7 +12,7 @@ */ struct trace_seq { - unsigned char buffer[PAGE_SIZE]; + char buffer[PAGE_SIZE]; struct seq_buf seq; int full; }; @@ -51,7 +51,7 @@ static inline int trace_seq_used(struct trace_seq *s) * that is about to be written to and then return the result * of that write. */ -static inline unsigned char * +static inline char * trace_seq_buffer_ptr(struct trace_seq *s) { return s->buffer + seq_buf_used(&s->seq); From 60efe21e5976d3d4170a8190ca76a271d6419754 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2020 17:54:09 +0900 Subject: [PATCH 28/33] tracing: Disable ftrace selftests when any tracer is running Disable ftrace selftests when any tracer (kernel command line options like ftrace=, trace_events=, kprobe_events=, and boot-time tracing) starts running because selftest can disturb it. Currently ftrace= and trace_events= are checked, but kprobe_events has a different flag, and boot-time tracing didn't checked. This unifies the disabled flag and all of those boot-time tracing features sets the flag. This also fixes warnings on kprobe-event selftest (CONFIG_FTRACE_STARTUP_TEST=y and CONFIG_KPROBE_EVENTS=y) with boot-time tracing (ftrace.event.kprobes.EVENT.probes) like below; [ 59.803496] trace_kprobe: Testing kprobe tracing: [ 59.804258] ------------[ cut here ]------------ [ 59.805682] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:1987 kprobe_trace_self_tests_ib [ 59.806944] Modules linked in: [ 59.807335] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc7+ #172 [ 59.808029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/204 [ 59.808999] RIP: 0010:kprobe_trace_self_tests_init+0x5f/0x42b [ 59.809696] Code: e8 03 00 00 48 c7 c7 30 8e 07 82 e8 6d 3c 46 ff 48 c7 c6 00 b2 1a 81 48 c7 c7 7 [ 59.812439] RSP: 0018:ffffc90000013e78 EFLAGS: 00010282 [ 59.813038] RAX: 00000000ffffffef RBX: 0000000000000000 RCX: 0000000000049443 [ 59.813780] RDX: 0000000000049403 RSI: 0000000000049403 RDI: 000000000002deb0 [ 59.814589] RBP: ffffc90000013e90 R08: 0000000000000001 R09: 0000000000000001 [ 59.815349] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000ffffffef [ 59.816138] R13: ffff888004613d80 R14: ffffffff82696940 R15: ffff888004429138 [ 59.816877] FS: 0000000000000000(0000) GS:ffff88807dcc0000(0000) knlGS:0000000000000000 [ 59.817772] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 59.818395] CR2: 0000000001a8dd38 CR3: 0000000002222000 CR4: 00000000000006a0 [ 59.819144] Call Trace: [ 59.819469] ? init_kprobe_trace+0x6b/0x6b [ 59.819948] do_one_initcall+0x5f/0x300 [ 59.820392] ? rcu_read_lock_sched_held+0x4f/0x80 [ 59.820916] kernel_init_freeable+0x22a/0x271 [ 59.821416] ? rest_init+0x241/0x241 [ 59.821841] kernel_init+0xe/0x10f [ 59.822251] ret_from_fork+0x22/0x30 [ 59.822683] irq event stamp: 16403349 [ 59.823121] hardirqs last enabled at (16403359): [] console_unlock+0x48e/0x580 [ 59.824074] hardirqs last disabled at (16403368): [] console_unlock+0x3f6/0x580 [ 59.825036] softirqs last enabled at (16403200): [] __do_softirq+0x33a/0x484 [ 59.825982] softirqs last disabled at (16403087): [] asm_call_irq_on_stack+0x10 [ 59.827034] ---[ end trace 200c544775cdfeb3 ]--- [ 59.827635] trace_kprobe: error on probing function entry. Link: https://lkml.kernel.org/r/160741764955.3448999.3347769358299456915.stgit@devnote2 Fixes: 4d655281eb1b ("tracing/boot Add kprobe event support") Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 19 +++++++++++++------ kernel/trace/trace.h | 5 +++++ kernel/trace/trace_boot.c | 2 ++ kernel/trace/trace_events.c | 2 +- kernel/trace/trace_kprobe.c | 9 +++------ kernel/trace/trace_selftest.c | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a282bbc7e7f..eee484afcc51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -68,10 +68,21 @@ bool ring_buffer_expanded; static bool __read_mostly tracing_selftest_running; /* - * If a tracer is running, we do not want to run SELFTEST. + * If boot-time tracing including tracers/events via kernel cmdline + * is running, we do not want to run SELFTEST. */ bool __read_mostly tracing_selftest_disabled; +#ifdef CONFIG_FTRACE_STARTUP_TEST +void __init disable_tracing_selftest(const char *reason) +{ + if (!tracing_selftest_disabled) { + tracing_selftest_disabled = true; + pr_info("Ftrace startup test is disabled due to %s\n", reason); + } +} +#endif + /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; @@ -2112,11 +2123,7 @@ int __init register_tracer(struct tracer *type) apply_trace_boot_options(); /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = true; -#ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", - type->name); -#endif + disable_tracing_selftest("running a tracer"); out_unlock: return ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9462251cab92..e448d2da0b99 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -719,6 +719,8 @@ extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST +extern void __init disable_tracing_selftest(const char *reason); + extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_function_graph(struct tracer *trace, @@ -742,6 +744,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace, */ #define __tracer_data __refdata #else +static inline void __init disable_tracing_selftest(const char *reason) +{ +} /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index c22a152ef0b4..a82f03f385f8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -344,6 +344,8 @@ static int __init trace_boot_init(void) trace_boot_init_one_instance(tr, trace_node); trace_boot_init_instances(trace_node); + disable_tracing_selftest("running boot-time tracing"); + return 0; } /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98d194d8460e..7d207c5e9802 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3201,7 +3201,7 @@ static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; - tracing_selftest_disabled = true; + disable_tracing_selftest("running event tracing"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b911e9f6d9f5..b29f92c51b1a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,11 +25,12 @@ /* Kprobe early definition from command line */ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; -static bool kprobe_boot_events_enabled __initdata; static int __init set_kprobe_boot_events(char *str) { strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + disable_tracing_selftest("running kprobe events"); + return 0; } __setup("kprobe_event=", set_kprobe_boot_events); @@ -1887,8 +1888,6 @@ static __init void setup_boot_kprobe_events(void) ret = trace_run_command(cmd, create_or_delete_trace_kprobe); if (ret) pr_warn("Failed to add event(%d): %s\n", ret, cmd); - else - kprobe_boot_events_enabled = true; cmd = p; } @@ -1973,10 +1972,8 @@ static __init int kprobe_trace_self_tests_init(void) if (tracing_is_disabled()) return -ENODEV; - if (kprobe_boot_events_enabled) { - pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n"); + if (tracing_selftest_disabled) return 0; - } target = kprobe_trace_selftest_target; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ed081c6471c..73ef12092250 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -786,7 +786,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* Have we just recovered from a hang? */ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - tracing_selftest_disabled = true; + disable_tracing_selftest("recovering from a hang"); ret = -1; goto out; } From 3b3493531c4d415044442349c9d37ad48ad44c85 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 14 Dec 2020 09:45:03 +0100 Subject: [PATCH 29/33] tracing: Drop unneeded assignment in ring_buffer_resize() Since commit 0a1754b2a97e ("ring-buffer: Return 0 on success from ring_buffer_resize()"), computing the size is not needed anymore. Drop unneeded assignment in ring_buffer_resize(). Link: https://lkml.kernel.org/r/20201214084503.3079-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f09d3f5911cb..8b57251ebf9d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1974,8 +1974,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; - size = nr_pages * BUF_PAGE_SIZE; - /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); From 82db909e6be667f2993802f3a1e86426cab57049 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Wed, 14 Oct 2020 23:27:49 +0800 Subject: [PATCH 30/33] ring-buffer: Fix two typos in comments s/inerrupting/interrupting/ s/beween/between/ Link: https://lkml.kernel.org/r/20201014152749.29986-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b57251ebf9d..e97ecf72c727 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3399,7 +3399,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* This did not interrupt any time update */ info->delta = info->ts - info->after; else - /* Just use full timestamp for inerrupting event */ + /* Just use full timestamp for interrupting event */ info->delta = info->ts; barrier(); check_buffer(cpu_buffer, info, tail); @@ -3436,7 +3436,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->ts = ts; } else { /* - * Interrupted beween C and E: + * Interrupted between C and E: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that From 74e2afc6df5782ea34bc7ac350aeb206c3666f9a Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 15 Oct 2020 19:38:42 +0800 Subject: [PATCH 31/33] ring-buffer: Add rb_check_bpage in __rb_allocate_pages It may be better to check each page is aligned by 4 bytes. The 2 least significant bits of the address will be used as flags. Link: https://lkml.kernel.org/r/20201015113842.2921-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e97ecf72c727..e03bc4e5d482 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1423,7 +1423,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + long nr_pages, struct list_head *pages) { struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; @@ -1463,13 +1464,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu)); + mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1501,7 +1504,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(!nr_pages); - if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* @@ -2008,8 +2011,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); - if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu)) { + if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; @@ -2074,8 +2077,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && - __rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu_id)) { + __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } From adab66b71abfe206a020f11e561f4df41f0b2aba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 14 Dec 2020 12:33:51 -0500 Subject: [PATCH 32/33] Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS" It was believed that metag was the only architecture that required the ring buffer to keep 8 byte words aligned on 8 byte architectures, and with its removal, it was assumed that the ring buffer code did not need to handle this case. It appears that sparc64 also requires this. The following was reported on a sparc64 boot up: kernel: futex hash table entries: 65536 (order: 9, 4194304 bytes, linear) kernel: Running postponed tracer tests: kernel: Testing tracer function: kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140 kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140 kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: PASSED Need to put back the 64BIT aligned code for the ring buffer. Link: https://lore.kernel.org/r/CADxRZqzXQRYgKc=y-KV=S_yHL+Y8Ay2mh5ezeZUnpRvg+syWKw@mail.gmail.com Cc: stable@vger.kernel.org Fixes: 86b3de60a0b6 ("ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS") Reported-by: Anatoly Pugachev Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 16 ++++++++++++++++ kernel/trace/ring_buffer.c | 17 +++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..fa716994f77e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -143,6 +143,22 @@ config UPROBES managed by the kernel and kept transparent to the probed application. ) +config HAVE_64BIT_ALIGNED_ACCESS + def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS + help + Some architectures require 64 bit accesses to be 64 bit + aligned, which also requires structs containing 64 bit values + to be 64 bit aligned too. This includes some 32 bit + architectures which can do 64 bit accesses, as well as 64 bit + architectures without unaligned access. + + This symbol should be selected by an architecture if 64 bit + accesses are required to be 64 bit aligned in this way even + though it is not a 64 bit architecture. + + See Documentation/unaligned-memory-access.txt for more + information on the topic of unaligned memory accesses. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e03bc4e5d482..926845eb5ab5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -130,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT) + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -2718,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else @@ -2733,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it From f6a694665f132cbf6e2222dd2f173dc35330a8aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 14 Dec 2020 21:03:27 -0500 Subject: [PATCH 33/33] tracing: Offload eval map updates to a work queue In order for tracepoints to export their enums to user space, the use of the TRACE_DEFINE_ENUM() macro is used. On boot up, the strings shown in the tracefs "print fmt" lines are processed, and all the enums registered by TRACE_DEFINE_ENUM are replaced with the interger value. This way, userspace tools that read the raw binary data, knows how to evaluate the raw events. This is currently done in an initcall, but it has been noticed that slow embedded boards that have tracing may take a few seconds to process them all, and a few seconds slow down on an embedded device is detrimental to the system. Instead, offload the work to a work queue and make sure that its finished by destroying the work queue (which flushes all work) in a late initcall. This will allow the system to continue to boot and run the updates in the background, and this speeds up the boot time. Note, the strings being updated are only used by user space, so finishing the process before the system is fully booted will prevent any race issues. Link: https://lore.kernel.org/r/68d7b3327052757d0cd6359a6c9015a85b437232.camel@pengutronix.de Reported-by: Lucas Stach Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eee484afcc51..eb5205e48733 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9066,7 +9066,10 @@ int tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_eval_init(void) +static struct workqueue_struct *eval_map_wq __initdata; +static struct work_struct eval_map_work __initdata; + +static void __init eval_map_work_func(struct work_struct *work) { int len; @@ -9074,6 +9077,33 @@ static void __init trace_eval_init(void) trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } +static int __init trace_eval_init(void) +{ + INIT_WORK(&eval_map_work, eval_map_work_func); + + eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); + if (!eval_map_wq) { + pr_err("Unable to allocate eval_map_wq\n"); + /* Do work here */ + eval_map_work_func(&eval_map_work); + return -ENOMEM; + } + + queue_work(eval_map_wq, &eval_map_work); + return 0; +} + +static int __init trace_eval_sync(void) +{ + /* Make sure the eval map updates are finished */ + if (eval_map_wq) + destroy_workqueue(eval_map_wq); + return 0; +} + +late_initcall_sync(trace_eval_sync); + + #ifdef CONFIG_MODULES static void trace_module_add_evals(struct module *mod) {