linux/kernel/events/internal.h
Daniel Borkmann 7e3f977edd perf, events: add non-linear data support for raw records
This patch adds support for non-linear data on raw records. It
extends raw records to have one or multiple fragments that will
be written linearly into the ring slot, where each fragment can
optionally have a custom callback handler to walk and extract
complex, possibly non-linear data.

If a callback handler is provided for a fragment, then the new
__output_custom() will be used instead of __output_copy() for
the perf_output_sample() part. perf_prepare_sample() does all
the size calculation only once, so perf_output_sample() doesn't
need to redo the same work anymore, meaning real_size and padding
will be cached in the raw record. The raw record becomes 32 bytes
in size without holes; to not increase it further and to avoid
doing unnecessary recalculations in fast-path, we can reuse
next pointer of the last fragment, idea here is borrowed from
ZERO_OR_NULL_PTR(), which should keep the perf_output_sample()
path for PERF_SAMPLE_RAW minimal.

This facility is needed for BPF's event output helper as a first
user that will, in a follow-up, add an additional perf_raw_frag
to its perf_raw_record in order to be able to more efficiently
dump skb context after a linear head meta data related to it.
skbs can be non-linear and thus need a custom output function to
dump buffers. Currently, the skb data needs to be copied twice;
with the help of __output_custom() this work only needs to be
done once. Future users could be things like XDP/BPF programs
that work on different context though and would thus also have
a different callback function.

The few users of raw records are adapted to initialize their frag
data from the raw record itself, no change in behavior for them.
The code is based upon a PoC diff provided by Peter Zijlstra [1].

  [1] http://thread.gmane.org/gmane.linux.network/421294

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
2016-07-15 14:23:56 -07:00

247 lines
5.6 KiB
C

#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
#include <linux/uaccess.h>
/* Buffer handling */
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
atomic_t refcount;
struct rcu_head rcu_head;
#ifdef CONFIG_PERF_USE_VMALLOC
struct work_struct work;
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
int overwrite; /* can overwrite itself */
int paused; /* can write into ring buffer */
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
local_t nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
long aux_watermark;
/* poll crap */
spinlock_t event_lock;
struct list_head event_list;
atomic_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
/* AUX area */
local_t aux_head;
local_t aux_nest;
local_t aux_wakeup;
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
atomic_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
atomic_t aux_refcount;
void **aux_pages;
void *aux_priv;
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
extern void rb_free(struct ring_buffer *rb);
static inline void rb_free_rcu(struct rcu_head *rcu_head)
{
struct ring_buffer *rb;
rb = container_of(rcu_head, struct ring_buffer, rcu_head);
rb_free(rb);
}
static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
{
if (!pause && rb->nr_pages)
rb->paused = 0;
else
rb->paused = 1;
}
extern struct ring_buffer *
rb_alloc(int nr_pages, long watermark, int cpu, int flags);
extern void perf_event_wakeup(struct perf_event *event);
extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags);
extern void rb_free_aux(struct ring_buffer *rb);
extern struct ring_buffer *ring_buffer_get(struct perf_event *event);
extern void ring_buffer_put(struct ring_buffer *rb);
static inline bool rb_has_aux(struct ring_buffer *rb)
{
return !!rb->aux_nr_pages;
}
void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags);
extern struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
#ifdef CONFIG_PERF_USE_VMALLOC
/*
* Back perf_mmap() with vmalloc memory.
*
* Required for architectures that have d-cache aliasing issues.
*/
static inline int page_order(struct ring_buffer *rb)
{
return rb->page_order;
}
#else
static inline int page_order(struct ring_buffer *rb)
{
return 0;
}
#endif
static inline unsigned long perf_data_size(struct ring_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
static inline unsigned long perf_aux_size(struct ring_buffer *rb)
{
return rb->aux_nr_pages << PAGE_SHIFT;
}
#define __DEFINE_OUTPUT_COPY_BODY(memcpy_func) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
written = memcpy_func(handle->addr, buf, size); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
\
handle->page++; \
handle->page &= rb->nr_pages - 1; \
handle->addr = rb->data_pages[handle->page]; \
handle->size = PAGE_SIZE << page_order(rb); \
} \
} while (len && written == size); \
\
return len; \
}
#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
static inline unsigned long \
func_name(struct perf_output_handle *handle, \
const void *buf, unsigned long len) \
__DEFINE_OUTPUT_COPY_BODY(memcpy_func)
static inline unsigned long
__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
const void *buf, unsigned long len)
__DEFINE_OUTPUT_COPY_BODY(copy_func)
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{
memcpy(dst, src, n);
return 0;
}
DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
static inline unsigned long
memcpy_skip(void *dst, const void *src, unsigned long n)
{
return 0;
}
DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
#ifndef arch_perf_out_copy_user
#define arch_perf_out_copy_user arch_perf_out_copy_user
static inline unsigned long
arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
{
unsigned long ret;
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, n);
pagefault_enable();
return ret;
}
#endif
DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
/* Callchain handling */
extern struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs);
static inline int get_recursion_context(int *recursion)
{
int rctx;
if (in_nmi())
rctx = 3;
else if (in_irq())
rctx = 2;
else if (in_softirq())
rctx = 1;
else
rctx = 0;
if (recursion[rctx])
return -1;
recursion[rctx]++;
barrier();
return rctx;
}
static inline void put_recursion_context(int *recursion, int rctx)
{
barrier();
recursion[rctx]--;
}
#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
static inline bool arch_perf_have_user_stack_dump(void)
{
return true;
}
#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
#else
static inline bool arch_perf_have_user_stack_dump(void)
{
return false;
}
#define perf_user_stack_pointer(regs) 0
#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
#endif /* _KERNEL_EVENTS_INTERNAL_H */