linux/kernel/events/core.c
Linus Torvalds 6c373ca893 Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) Add BQL support to via-rhine, from Tino Reichardt.

 2) Integrate SWITCHDEV layer support into the DSA layer, so DSA drivers
    can support hw switch offloading.  From Floria Fainelli.

 3) Allow 'ip address' commands to initiate multicast group join/leave,
    from Madhu Challa.

 4) Many ipv4 FIB lookup optimizations from Alexander Duyck.

 5) Support EBPF in cls_bpf classifier and act_bpf action, from Daniel
    Borkmann.

 6) Remove the ugly compat support in ARP for ugly layers like ax25,
    rose, etc.  And use this to clean up the neigh layer, then use it to
    implement MPLS support.  All from Eric Biederman.

 7) Support L3 forwarding offloading in switches, from Scott Feldman.

 8) Collapse the LOCAL and MAIN ipv4 FIB tables when possible, to speed
    up route lookups even further.  From Alexander Duyck.

 9) Many improvements and bug fixes to the rhashtable implementation,
    from Herbert Xu and Thomas Graf.  In particular, in the case where
    an rhashtable user bulk adds a large number of items into an empty
    table, we expand the table much more sanely.

10) Don't make the tcp_metrics hash table per-namespace, from Eric
    Biederman.

11) Extend EBPF to access SKB fields, from Alexei Starovoitov.

12) Split out new connection request sockets so that they can be
    established in the main hash table.  Much less false sharing since
    hash lookups go direct to the request sockets instead of having to
    go first to the listener then to the request socks hashed
    underneath.  From Eric Dumazet.

13) Add async I/O support for crytpo AF_ALG sockets, from Tadeusz Struk.

14) Support stable privacy address generation for RFC7217 in IPV6.  From
    Hannes Frederic Sowa.

15) Hash network namespace into IP frag IDs, also from Hannes Frederic
    Sowa.

16) Convert PTP get/set methods to use 64-bit time, from Richard
    Cochran.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1816 commits)
  fm10k: Bump driver version to 0.15.2
  fm10k: corrected VF multicast update
  fm10k: mbx_update_max_size does not drop all oversized messages
  fm10k: reset head instead of calling update_max_size
  fm10k: renamed mbx_tx_dropped to mbx_tx_oversized
  fm10k: update xcast mode before synchronizing multicast addresses
  fm10k: start service timer on probe
  fm10k: fix function header comment
  fm10k: comment next_vf_mbx flow
  fm10k: don't handle mailbox events in iov_event path and always process mailbox
  fm10k: use separate workqueue for fm10k driver
  fm10k: Set PF queues to unlimited bandwidth during virtualization
  fm10k: expose tx_timeout_count as an ethtool stat
  fm10k: only increment tx_timeout_count in Tx hang path
  fm10k: remove extraneous "Reset interface" message
  fm10k: separate PF only stats so that VF does not display them
  fm10k: use hw->mac.max_queues for stats
  fm10k: only show actual queues, not the maximum in hardware
  fm10k: allow creation of VLAN on default vid
  fm10k: fix unused warnings
  ...
2015-04-15 09:00:47 -07:00

9058 lines
210 KiB
C

/*
* Performance events core code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include "internal.h"
#include <asm/irq_regs.h>
static struct workqueue_struct *perf_wq;
struct remote_function_call {
struct task_struct *p;
int (*func)(void *info);
void *info;
int ret;
};
static void remote_function(void *data)
{
struct remote_function_call *tfc = data;
struct task_struct *p = tfc->p;
if (p) {
tfc->ret = -EAGAIN;
if (task_cpu(p) != smp_processor_id() || !task_curr(p))
return;
}
tfc->ret = tfc->func(tfc->info);
}
/**
* task_function_call - call a function on the cpu on which a task runs
* @p: the task to evaluate
* @func: the function to be called
* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
* be on the current CPU, which just calls the function directly
*
* returns: @func return value, or
* -ESRCH - when the process isn't running
* -EAGAIN - when the process moved away
*/
static int
task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
{
struct remote_function_call data = {
.p = p,
.func = func,
.info = info,
.ret = -ESRCH, /* No such (running) process */
};
if (task_curr(p))
smp_call_function_single(task_cpu(p), remote_function, &data, 1);
return data.ret;
}
/**
* cpu_function_call - call a function on the cpu
* @func: the function to be called
* @info: the function call argument
*
* Calls the function @func on the remote cpu.
*
* returns: @func return value or -ENXIO when the cpu is offline
*/
static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
{
struct remote_function_call data = {
.p = NULL,
.func = func,
.info = info,
.ret = -ENXIO, /* No such CPU */
};
smp_call_function_single(cpu, remote_function, &data, 1);
return data.ret;
}
#define EVENT_OWNER_KERNEL ((void *) -1)
static bool is_kernel_event(struct perf_event *event)
{
return event->owner == EVENT_OWNER_KERNEL;
}
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
PERF_FLAG_FD_CLOEXEC)
/*
* branch priv levels that need permission checks
*/
#define PERF_SAMPLE_BRANCH_PERM_PLM \
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
/*
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
/*
* perf event paranoia level:
* -1 - not paranoid at all
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
/*
* max perf event sample rate
*/
#define DEFAULT_MAX_SAMPLE_RATE 100000
#define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
static int perf_sample_allowed_ns __read_mostly =
DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
void update_perf_cpu_limits(void)
{
u64 tmp = perf_sample_period_ns;
tmp *= sysctl_perf_cpu_time_max_percent;
do_div(tmp, 100);
ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx);
int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
return 0;
}
int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
return ret;
update_perf_cpu_limits();
return 0;
}
/*
* perf samples are done in some very critical code paths (NMIs).
* If they take too much CPU time, the system can lock up and not
* get any real work done. This will drop the sample rate when
* we detect that events are taking too long.
*/
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
static void perf_duration_warn(struct irq_work *w)
{
u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
u64 avg_local_sample_len;
u64 local_samples_len;
local_samples_len = __this_cpu_read(running_sample_length);
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
printk_ratelimited(KERN_WARNING
"perf interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
avg_local_sample_len, allowed_ns >> 1,
sysctl_perf_event_sample_rate);
}
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
void perf_sample_event_took(u64 sample_len_ns)
{
u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
u64 avg_local_sample_len;
u64 local_samples_len;
if (allowed_ns == 0)
return;
/* decay the counter by 1 average sample */
local_samples_len = __this_cpu_read(running_sample_length);
local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
local_samples_len += sample_len_ns;
__this_cpu_write(running_sample_length, local_samples_len);
/*
* note: this will be biased artifically low until we have
* seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
* from having to maintain a count.
*/
avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
if (avg_local_sample_len <= allowed_ns)
return;
if (max_samples_per_tick <= 1)
return;
max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
if (!irq_work_queue(&perf_duration_work)) {
early_printk("perf interrupt took too long (%lld > %lld), lowering "
"kernel.perf_event_max_sample_rate to %d\n",
avg_local_sample_len, allowed_ns >> 1,
sysctl_perf_event_sample_rate);
}
}
static atomic64_t perf_event_id;
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
{
return "pmu";
}
static inline u64 perf_clock(void)
{
return local_clock();
}
static inline u64 perf_event_clock(struct perf_event *event)
{
return event->clock();
}
static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
if (ctx)
raw_spin_lock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
}
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
/* @event doesn't care about cgroup */
if (!event->cgrp)
return true;
/* wants specific cgroup scope but @cpuctx isn't associated with any */
if (!cpuctx->cgrp)
return false;
/*
* Cgroup scoping is recursive. An event enabled for a cgroup is
* also enabled for all its descendant cgroups. If @cpuctx's
* cgroup is a descendant of @event's (the test covers identity
* case), it's a match.
*/
return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
event->cgrp->css.cgroup);
}
static inline void perf_detach_cgroup(struct perf_event *event)
{
css_put(&event->cgrp->css);
event->cgrp = NULL;
}
static inline int is_cgroup_event(struct perf_event *event)
{
return event->cgrp != NULL;
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
return t->time;
}
static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
{
struct perf_cgroup_info *info;
u64 now;
now = perf_clock();
info = this_cpu_ptr(cgrp->info);
info->time += now - info->timestamp;
info->timestamp = now;
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
struct perf_cgroup *cgrp_out = cpuctx->cgrp;
if (cgrp_out)
__update_cgrp_time(cgrp_out);
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
struct perf_cgroup *cgrp;
/*
* ensure we access cgroup data only when needed and
* when we know the cgroup is pinned (css_get)
*/
if (!is_cgroup_event(event))
return;
cgrp = perf_cgroup_from_task(current);
/*
* Do not update time when cgroup is not active
*/
if (cgrp == event->cgrp)
__update_cgrp_time(event->cgrp);
}
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
struct perf_event_context *ctx)
{
struct perf_cgroup *cgrp;
struct perf_cgroup_info *info;
/*
* ctx->lock held by caller
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
if (!task || !ctx->nr_cgroups)
return;
cgrp = perf_cgroup_from_task(task);
info = this_cpu_ptr(cgrp->info);
info->timestamp = ctx->timestamp;
}
#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
/*
* reschedule events based on the cgroup constraint of task.
*
* mode SWOUT : schedule out everything
* mode SWIN : schedule in based on cgroup for next
*/
void perf_cgroup_switch(struct task_struct *task, int mode)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
unsigned long flags;
/*
* disable interrupts to avoid geting nr_cgroup
* changes via __perf_event_disable(). Also
* avoids preemption.
*/
local_irq_save(flags);
/*
* we reschedule only in the presence of cgroup
* constrained events.
*/
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
continue; /* ensure we process each cpuctx once */
/*
* perf_cgroup_events says at least one
* context on this CPU has cgroup events.
*
* ctx->nr_cgroups reports the number of cgroup
* events for a context.
*/
if (cpuctx->ctx.nr_cgroups > 0) {
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
if (mode & PERF_CGROUP_SWOUT) {
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
/*
* must not be done before ctxswout due
* to event_filter_match() in event_sched_out()
*/
cpuctx->cgrp = NULL;
}
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
/*
* set cgrp before ctxsw in to allow
* event_filter_match() to not have to pass
* task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
}
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
}
rcu_read_unlock();
local_irq_restore(flags);
}
static inline void perf_cgroup_sched_out(struct task_struct *task,
struct task_struct *next)
{
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;
/*
* we come here when we know perf_cgroup_events > 0
*/
cgrp1 = perf_cgroup_from_task(task);
/*
* next is NULL when called from perf_event_enable_on_exec()
* that will systematically cause a cgroup_switch()
*/
if (next)
cgrp2 = perf_cgroup_from_task(next);
/*
* only schedule out current cgroup events if we know
* that we are switching to a different cgroup. Otherwise,
* do no touch the cgroup events.
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
}
static inline void perf_cgroup_sched_in(struct task_struct *prev,
struct task_struct *task)
{
struct perf_cgroup *cgrp1;
struct perf_cgroup *cgrp2 = NULL;
/*
* we come here when we know perf_cgroup_events > 0
*/
cgrp1 = perf_cgroup_from_task(task);
/* prev can never be NULL */
cgrp2 = perf_cgroup_from_task(prev);
/*
* only need to schedule in cgroup events if we are changing
* cgroup during ctxsw. Cgroup events were not scheduled
* out of ctxsw out if that was not the case.
*/
if (cgrp1 != cgrp2)
perf_cgroup_switch(task, PERF_CGROUP_SWIN);
}
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
{
struct perf_cgroup *cgrp;
struct cgroup_subsys_state *css;
struct fd f = fdget(fd);
int ret = 0;
if (!f.file)
return -EBADF;
css = css_tryget_online_from_dir(f.file->f_path.dentry,
&perf_event_cgrp_subsys);
if (IS_ERR(css)) {
ret = PTR_ERR(css);
goto out;
}
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
/*
* all events in a group must monitor
* the same cgroup because a task belongs
* to only one perf cgroup at a time
*/
if (group_leader && group_leader->cgrp != cgrp) {
perf_detach_cgroup(event);
ret = -EINVAL;
}
out:
fdput(f);
return ret;
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
struct perf_cgroup_info *t;
t = per_cpu_ptr(event->cgrp->info, event->cpu);
event->shadow_ctx_time = now - t->timestamp;
}
static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
/*
* when the current task's perf cgroup does not match
* the event's, we need to remember to call the
* perf_mark_enable() function the first time a task with
* a matching perf cgroup is scheduled in.
*/
if (is_cgroup_event(event) && !perf_cgroup_match(event))
event->cgrp_defer_enabled = 1;
}
static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *sub;
u64 tstamp = perf_event_time(event);
if (!event->cgrp_defer_enabled)
return;
event->cgrp_defer_enabled = 0;
event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
sub->tstamp_enabled = tstamp - sub->total_time_enabled;
sub->cgrp_defer_enabled = 0;
}
}
}
#else /* !CONFIG_CGROUP_PERF */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
return true;
}
static inline void perf_detach_cgroup(struct perf_event *event)
{}
static inline int is_cgroup_event(struct perf_event *event)
{
return 0;
}
static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
{
return 0;
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
{
}
static inline void perf_cgroup_sched_out(struct task_struct *task,
struct task_struct *next)
{
}
static inline void perf_cgroup_sched_in(struct task_struct *prev,
struct task_struct *task)
{
}
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
{
return -EINVAL;
}
static inline void
perf_cgroup_set_timestamp(struct task_struct *task,
struct perf_event_context *ctx)
{
}
void
perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
static inline void
perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
{
}
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
return 0;
}
static inline void
perf_cgroup_defer_enabled(struct perf_event *event)
{
}
static inline void
perf_cgroup_mark_enabled(struct perf_event *event,
struct perf_event_context *ctx)
{
}
#endif
/*
* set default to be dependent on timer tick just
* like original code
*/
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
* function must be called with interrupts disbled
*/
static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
{
struct perf_cpu_context *cpuctx;
enum hrtimer_restart ret = HRTIMER_NORESTART;
int rotations = 0;
WARN_ON(!irqs_disabled());
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
rotations = perf_rotate_context(cpuctx);
/*
* arm timer if needed
*/
if (rotations) {
hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
ret = HRTIMER_RESTART;
}
return ret;
}
/* CPU is going down */
void perf_cpu_hrtimer_cancel(int cpu)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
unsigned long flags;
if (WARN_ON(cpu != smp_processor_id()))
return;
local_irq_save(flags);
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
if (pmu->task_ctx_nr == perf_sw_context)
continue;
hrtimer_cancel(&cpuctx->hrtimer);
}
rcu_read_unlock();
local_irq_restore(flags);
}
static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
{
struct hrtimer *hr = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
int timer;
/* no multiplexing needed for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
return;
/*
* check default is sane, if not set then force to
* default interval (1/tick)
*/
timer = pmu->hrtimer_interval_ms;
if (timer < 1)
timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
hr->function = perf_cpu_hrtimer_handler;
}
static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
{
struct hrtimer *hr = &cpuctx->hrtimer;
struct pmu *pmu = cpuctx->ctx.pmu;
/* not for SW PMU */
if (pmu->task_ctx_nr == perf_sw_context)
return;
if (hrtimer_active(hr))
return;
if (!hrtimer_callback_running(hr))
__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
0, HRTIMER_MODE_REL_PINNED, 0);
}
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
if (!--(*count))
pmu->pmu_enable(pmu);
}
static DEFINE_PER_CPU(struct list_head, active_ctx_list);
/*
* perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
* perf_event_task_tick() are fully serialized because they're strictly cpu
* affine and perf_event_ctx{activate,deactivate} are called with IRQs
* disabled, while perf_event_task_tick is called from IRQ context.
*/
static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
WARN_ON(!irqs_disabled());
WARN_ON(!list_empty(&ctx->active_ctx_list));
list_add(&ctx->active_ctx_list, head);
}
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
WARN_ON(!irqs_disabled());
WARN_ON(list_empty(&ctx->active_ctx_list));
list_del_init(&ctx->active_ctx_list);
}
static void get_ctx(struct perf_event_context *ctx)
{
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
static void free_ctx(struct rcu_head *head)
{
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
kfree(ctx->task_ctx_data);
kfree(ctx);
}
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
}
}
/*
* Because of perf_event::ctx migration in sys_perf_event_open::move_group and
* perf_pmu_migrate_context() we need some magic.
*
* Those places that change perf_event::ctx will hold both
* perf_event_ctx::mutex of the 'old' and 'new' ctx value.
*
* Lock ordering is by mutex address. There is one other site where
* perf_event_context::mutex nests and that is put_event(). But remember that
* that is a parent<->child context relation, and migration does not affect
* children, therefore these two orderings should not interact.
*
* The change in perf_event::ctx does not affect children (as claimed above)
* because the sys_perf_event_open() case will install a new event and break
* the ctx parent<->child relation, and perf_pmu_migrate_context() is only
* concerned with cpuctx and that doesn't have children.
*
* The places that change perf_event::ctx will issue:
*
* perf_remove_from_context();
* synchronize_rcu();
* perf_install_in_context();
*
* to affect the change. The remove_from_context() + synchronize_rcu() should
* quiesce the event, after which we can install it in the new location. This
* means that only external vectors (perf_fops, prctl) can perturb the event
* while in transit. Therefore all such accessors should also acquire
* perf_event_context::mutex to serialize against this.
*
* However; because event->ctx can change while we're waiting to acquire
* ctx->mutex we must be careful and use the below perf_event_ctx_lock()
* function.
*
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event_context::lock
* perf_event::child_mutex;
* perf_event::mmap_mutex
* mmap_sem
*/
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
struct perf_event_context *ctx;
again:
rcu_read_lock();
ctx = ACCESS_ONCE(event->ctx);
if (!atomic_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
}
rcu_read_unlock();
mutex_lock_nested(&ctx->mutex, nesting);
if (event->ctx != ctx) {
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
goto again;
}
return ctx;
}
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
return perf_event_ctx_lock_nested(event, 0);
}
static void perf_event_ctx_unlock(struct perf_event *event,
struct perf_event_context *ctx)
{
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
}
/*
* This must be done under the ctx->lock, such as to serialize against
* context_equiv(), therefore we cannot call put_ctx() since that might end up
* calling scheduler related locks and ctx->lock nests inside those.
*/
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
struct perf_event_context *parent_ctx = ctx->parent_ctx;
lockdep_assert_held(&ctx->lock);
if (parent_ctx)
ctx->parent_ctx = NULL;
ctx->generation++;
return parent_ctx;
}
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
return task_tgid_nr_ns(p, event->ns);
}
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
return task_pid_nr_ns(p, event->ns);
}
/*
* If we inherit events we want to return the parent event id
* to userspace.
*/
static u64 primary_event_id(struct perf_event *event)
{
u64 id = event->id;
if (event->parent)
id = event->parent->id;
return id;
}
/*
* Get the perf_event_context for a task and lock it.
* This has to cope with with the fact that until it is locked,
* the context could get moved to another task.
*/
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
retry:
/*
* One of the few rules of preemptible RCU is that one cannot do
* rcu_read_unlock() while holding a scheduler (or nested) lock when
* part of the read side critical section was preemptible -- see
* rcu_read_unlock_special().
*
* Since ctx->lock nests under rq->lock we must ensure the entire read
* side critical section is non-preemptible.
*/
preempt_disable();
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
* If this context is a clone of another, it might
* get swapped for another underneath us by
* perf_event_task_sched_out, though the
* rcu_read_lock() protects us from any context
* getting freed. Lock the context and check if it
* got swapped before we could get the lock, and retry
* if so. If we locked the right context, then it
* can't get swapped on us any more.
*/
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
rcu_read_unlock();
preempt_enable();
goto retry;
}
if (!atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
ctx = NULL;
}
}
rcu_read_unlock();
preempt_enable();
return ctx;
}
/*
* Get the context for a task and increment its pin_count so it
* can't get swapped to another task. This also increments its
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task, int ctxn)
{
struct perf_event_context *ctx;
unsigned long flags;
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
return ctx;
}
static void perf_unpin_context(struct perf_event_context *ctx)
{
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
--ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
/*
* Update the record of the current time in a context.
*/
static void update_context_time(struct perf_event_context *ctx)
{
u64 now = perf_clock();
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
return ctx ? ctx->time : 0;
}
/*
* Update the total_time_enabled and total_time_running fields for a event.
* The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
* tasks were in the monitored cgroup. This is
* independent of the activity of the context as
* there may be a mix of cgroup and non-cgroup events.
*
* That is why we treat cgroup events differently
* here.
*/
if (is_cgroup_event(event))
run_end = perf_cgroup_event_time(event);
else if (ctx->is_active)
run_end = ctx->time;
else
run_end = event->tstamp_stopped;
event->total_time_enabled = run_end - event->tstamp_enabled;
if (event->state == PERF_EVENT_STATE_INACTIVE)
run_end = event->tstamp_stopped;
else
run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
}
/*
* Update total_time_enabled and total_time_running for all events in a group.
*/
static void update_group_times(struct perf_event *leader)
{
struct perf_event *event;
update_event_times(leader);
list_for_each_entry(event, &leader->sibling_list, group_entry)
update_event_times(event);
}
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
if (event->attr.pinned)
return &ctx->pinned_groups;
else
return &ctx->flexible_groups;
}
/*
* Add a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
* perf_group_detach can, at all times, locate all siblings.
*/
if (event->group_leader == event) {
struct list_head *list;
if (is_software_event(event))
event->group_flags |= PERF_GROUP_SOFTWARE;
list = ctx_group_list(event, ctx);
list_add_tail(&event->group_entry, list);
}
if (is_cgroup_event(event))
ctx->nr_cgroups++;
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
ctx->generation++;
}
/*
* Initialize event state based on the perf_event_attr::disabled.
*/
static inline void perf_event__state_init(struct perf_event *event)
{
event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
PERF_EVENT_STATE_INACTIVE;
}
/*
* Called at perf_event creation and when events are attached/detached from a
* group.
*/
static void perf_event__read_size(struct perf_event *event)
{
int entry = sizeof(u64); /* value */
int size = 0;
int nr = 1;
if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
size += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
size += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += event->group_leader->nr_siblings;
size += sizeof(u64);
}
size += entry * nr;
event->read_size = size;
}
static void perf_event__header_size(struct perf_event *event)
{
struct perf_sample_data *data;
u64 sample_type = event->attr.sample_type;
u16 size = 0;
perf_event__read_size(event);
if (sample_type & PERF_SAMPLE_IP)
size += sizeof(data->ip);
if (sample_type & PERF_SAMPLE_ADDR)
size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_WEIGHT)
size += sizeof(data->weight);
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
if (sample_type & PERF_SAMPLE_DATA_SRC)
size += sizeof(data->data_src.val);
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
event->header_size = size;
}
static void perf_event__id_header_size(struct perf_event *event)
{
struct perf_sample_data *data;
u64 sample_type = event->attr.sample_type;
u16 size = 0;
if (sample_type & PERF_SAMPLE_TID)
size += sizeof(data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
size += sizeof(data->time);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_ID)
size += sizeof(data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
size += sizeof(data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
size += sizeof(data->cpu_entry);
event->id_header_size = size;
}
static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
/*
* We can have double attach due to group movement in perf_event_open.
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
event->attach_state |= PERF_ATTACH_GROUP;
if (group_leader == event)
return;
WARN_ON_ONCE(group_leader->ctx != event->ctx);
if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
!is_software_event(event))
group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
list_add_tail(&event->group_entry, &group_leader->sibling_list);
group_leader->nr_siblings++;
perf_event__header_size(group_leader);
list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
perf_event__header_size(pos);
}
/*
* Remove a event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
/*
* We can have double detach due to exit/hot-unplug + close.
*/
if (!(event->attach_state & PERF_ATTACH_CONTEXT))
return;
event->attach_state &= ~PERF_ATTACH_CONTEXT;
if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
cpuctx = __get_cpu_context(ctx);
/*
* if there are no more cgroup events
* then cler cgrp to avoid stale pointer
* in update_cgrp_time_from_cpuctx()
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
}
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
list_del_rcu(&event->event_entry);
if (event->group_leader == event)
list_del_init(&event->group_entry);
update_group_times(event);
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
* returned on read(). The only way to get out
* of error state is by explicit re-enabling
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_OFF;
ctx->generation++;
}
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
/*
* We can have double detach due to exit/hot-unplug + close.
*/
if (!(event->attach_state & PERF_ATTACH_GROUP))
return;
event->attach_state &= ~PERF_ATTACH_GROUP;
/*
* If this is a sibling, remove it from its group.
*/
if (event->group_leader != event) {
list_del_init(&event->group_entry);
event->group_leader->nr_siblings--;
goto out;
}
if (!list_empty(&event->group_entry))
list = &event->group_entry;
/*
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
if (list)
list_move_tail(&sibling->group_entry, list);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
sibling->group_flags = event->group_flags;
WARN_ON_ONCE(sibling->ctx != event->ctx);
}
out:
perf_event__header_size(event->group_leader);
list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
perf_event__header_size(tmp);
}
/*
* User event without the task.
*/
static bool is_orphaned_event(struct perf_event *event)
{
return event && !is_kernel_event(event) && !event->owner;
}
/*
* Event has a parent but parent's task finished and it's
* alive only because of children holding refference.
*/
static bool is_orphaned_child(struct perf_event *event)
{
return is_orphaned_event(event->parent);
}
static void orphans_remove_work(struct work_struct *work);
static void schedule_orphans_remove(struct perf_event_context *ctx)
{
if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
return;
if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
get_ctx(ctx);
ctx->orphans_remove_sched = true;
}
}
static int __init perf_workqueue_init(void)
{
perf_wq = create_singlethread_workqueue("perf");
WARN(!perf_wq, "failed to create perf workqueue\n");
return perf_wq ? 0 : -1;
}
core_initcall(perf_workqueue_init);
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id())
&& perf_cgroup_match(event);
}
static void
event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
u64 delta;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
/*
* An event which could not be activated because of
* filter mismatch still needs to have its timings
* maintained, otherwise bogus information is return
* via read() for time_enabled, time_running:
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
&& !event_filter_match(event)) {
delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
perf_pmu_disable(event->pmu);
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
if (!--ctx->nr_active)
perf_event_ctx_deactivate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
if (is_orphaned_child(event))
schedule_orphans_remove(ctx);
perf_pmu_enable(event->pmu);
}
static void
group_sched_out(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
struct perf_event *event;
int state = group_event->state;
event_sched_out(group_event, cpuctx, ctx);
/*
* Schedule out siblings (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry)
event_sched_out(event, cpuctx, ctx);
if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
struct remove_event {
struct perf_event *event;
bool detach_group;
};
/*
* Cross CPU call to remove a performance event
*
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
static int __perf_remove_from_context(void *info)
{
struct remove_event *re = info;
struct perf_event *event = re->event;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
if (re->detach_group)
perf_group_detach(event);
list_del_event(event, ctx);
if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
ctx->is_active = 0;
cpuctx->task_ctx = NULL;
}
raw_spin_unlock(&ctx->lock);
return 0;
}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
* call when the task is on a CPU.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
* that only calls us on the top-level context, which can't be a clone.
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
static void perf_remove_from_context(struct perf_event *event, bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
struct remove_event re = {
.event = event,
.detach_group = detach_group,
};
lockdep_assert_held(&ctx->mutex);
if (!task) {
/*
* Per cpu events are removed via an smp call. The removal can
* fail if the CPU is currently offline, but in that case we
* already called __perf_remove_from_context from
* perf_event_exit_cpu.
*/
cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
}
retry:
if (!task_function_call(task, __perf_remove_from_context, &re))
return;
raw_spin_lock_irq(&ctx->lock);
/*
* If we failed to find a running task, but find the context active now
* that we've acquired the ctx->lock, retry.
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
goto retry;
}
/*
* Since the task isn't running, its safe to remove the event, us
* holding the ctx->lock ensures the task won't get scheduled in.
*/
if (detach_group)
perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Cross CPU call to disable a performance event
*/
int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
/*
* If this is a per-task event, need to check whether this
* event's task is the current task on this cpu.
*
* Can trigger due to concurrent perf_event_context_sched_out()
* flipping contexts around.
*/
if (ctx->task && cpuctx->task_ctx != ctx)
return -EINVAL;
raw_spin_lock(&ctx->lock);
/*
* If the event is on, turn it off.
* If it is in error state, leave it in error state.
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
event->state = PERF_EVENT_STATE_OFF;
}
raw_spin_unlock(&ctx->lock);
return 0;
}
/*
* Disable a event.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This condition is satisifed when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
* goes to exit will block in sync_child_event.
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
*/
static void _perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
if (!task) {
/*
* Disable the event on the cpu that it's on
*/
cpu_function_call(event->cpu, __perf_event_disable, event);
return;
}
retry:
if (!task_function_call(task, __perf_event_disable, event))
return;
raw_spin_lock_irq(&ctx->lock);
/*
* If the event is still active, we need to retry the cross-call.
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
raw_spin_unlock_irq(&ctx->lock);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
goto retry;
}
/*
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
if (event->state == PERF_EVENT_STATE_INACTIVE) {
update_group_times(event);
event->state = PERF_EVENT_STATE_OFF;
}
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Strictly speaking kernel users cannot create groups and therefore this
* interface does not need the perf_event_ctx_lock() magic.
*/
void perf_event_disable(struct perf_event *event)
{
struct perf_event_context *ctx;
ctx = perf_event_ctx_lock(event);
_perf_event_disable(event);
perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx,
u64 tstamp)
{
/*
* use the correct time source for the time snapshot
*
* We could get by without this by leveraging the
* fact that to get to this function, the caller
* has most likely already called update_context_time()
* and update_cgrp_time_xx() and thus both timestamp
* are identical (or very close). Given that tstamp is,
* already adjusted for cgroup, we could say that:
* tstamp - ctx->timestamp
* is equivalent to
* tstamp - cgrp->timestamp.
*
* Then, in perf_output_read(), the calculation would
* work with no changes because:
* - event is guaranteed scheduled in
* - no scheduled out in between
* - thus the timestamp would be the same
*
* But this is a bit hairy.
*
* So instead, we have an explicit cgroup call to remain
* within the time time source all along. We believe it
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
perf_cgroup_set_shadow_time(event, tstamp);
else
event->shadow_ctx_time = tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
int ret = 0;
lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
event->state = PERF_EVENT_STATE_ACTIVE;
event->oncpu = smp_processor_id();
/*
* Unthrottle events, since we scheduled we might have missed several
* ticks already, also for a heavily scheduling task there is little
* guarantee it'll get a tick in a timely manner.
*/
if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
perf_log_throttle(event, 1);
event->hw.interrupts = 0;
}
/*
* The new state must be visible before we turn it on in the hardware:
*/
smp_wmb();
perf_pmu_disable(event->pmu);
event->tstamp_running += tstamp - event->tstamp_stopped;
perf_set_shadow_time(event, ctx, tstamp);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
event->state = PERF_EVENT_STATE_INACTIVE;
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
perf_event_ctx_activate(ctx);
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
cpuctx->exclusive = 1;
if (is_orphaned_child(event))
schedule_orphans_remove(ctx);
out:
perf_pmu_enable(event->pmu);
return ret;
}
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu);
if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
/*
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
}
if (!pmu->commit_txn(pmu))
return 0;
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
* The events up to the failed event are scheduled out normally,
* tstamp_stopped will be updated.
*
* The failed events and the remaining siblings need to have
* their timings updated as if they had gone thru event_sched_in()
* and event_sched_out(). This is required to get consistent timings
* across the group. This also takes care of the case where the group
* could never be scheduled by ensuring tstamp_stopped is set to mark
* the time the event was actually stopped, such that time delta
* calculation in update_event_times() is correct.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
simulate = true;
if (simulate) {
event->tstamp_running += now - event->tstamp_stopped;
event->tstamp_stopped = now;
} else {
event_sched_out(event, cpuctx, ctx);
}
}
event_sched_out(group_event, cpuctx, ctx);
pmu->cancel_txn(pmu);
perf_cpu_hrtimer_restart(cpuctx);
return -EAGAIN;
}
/*
* Work out whether we can put this event group on the CPU now.
*/
static int group_can_go_on(struct perf_event *event,
struct perf_cpu_context *cpuctx,
int can_add_hw)
{
/*
* Groups consisting entirely of software events can always go on.
*/
if (event->group_flags & PERF_GROUP_SOFTWARE)
return 1;
/*
* If an exclusive group is already on, no other hardware
* events can go on.
*/
if (cpuctx->exclusive)
return 0;
/*
* If this group is exclusive and there are already
* events on the CPU, it can't go on.
*/
if (event->attr.exclusive && cpuctx->active_oncpu)
return 0;
/*
* Otherwise, try to add it if all previous groups were able
* to go on.
*/
return can_add_hw;
}
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
u64 tstamp = perf_event_time(event);
list_add_event(event, ctx);
perf_group_attach(event);
event->tstamp_enabled = tstamp;
event->tstamp_running = tstamp;
event->tstamp_stopped = tstamp;
}
static void task_ctx_sched_out(struct perf_event_context *ctx);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
struct task_struct *task)
{
cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
if (ctx)
ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
if (ctx)
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
/*
* Cross CPU call to install and enable a performance event
*
* Must be called with ctx->mutex held
*/
static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
struct task_struct *task = current;
perf_ctx_lock(cpuctx, task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
/*
* If there was an active task_ctx schedule it out.
*/
if (task_ctx)
task_ctx_sched_out(task_ctx);
/*
* If the context we're installing events in is not the
* active task_ctx, flip them.
*/
if (ctx->task && task_ctx != ctx) {
if (task_ctx)
raw_spin_unlock(&task_ctx->lock);
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
}
if (task_ctx) {
cpuctx->task_ctx = task_ctx;
task = task_ctx->task;
}
cpu_ctx_sched_out(cpuctx, EVENT_ALL);
update_context_time(ctx);
/*
* update cgrp time only if current cgrp
* matches event->cgrp. Must be done before
* calling add_event_to_ctx()
*/
update_cgrp_time_from_event(event);
add_event_to_ctx(event, ctx);
/*
* Schedule everything back in
*/
perf_event_sched_in(cpuctx, task_ctx, task);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, task_ctx);
return 0;
}
/*
* Attach a performance event to a context
*
* First we add the event to the list with the hardware enable bit
* in event->hw_config cleared.
*
* If the event is attached to a task which is on a CPU we use a smp
* call to enable it in the task context. The task might have been
* scheduled away, but we check this in the smp call again.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
struct perf_event *event,
int cpu)
{
struct task_struct *task = ctx->task;
lockdep_assert_held(&ctx->mutex);
event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
if (!task) {
/*
* Per cpu events are installed via an smp call and
* the install is always successful.
*/
cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
retry:
if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
/*
* If we failed to find a running task, but find the context active now
* that we've acquired the ctx->lock, retry.
*/
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
goto retry;
}
/*
* Since the task isn't running, its safe to add the event, us holding
* the ctx->lock ensures the task won't get scheduled in.
*/
add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Put a event into inactive state and update time fields.
* Enabling the leader of a group effectively enables all
* the group members that aren't explicitly disabled, so we
* have to update their ->tstamp_enabled also.
* Note: this works for group members as well as group leaders
* since the non-leader members' sibling_lists will be empty.
*/
static void __perf_event_mark_enabled(struct perf_event *event)
{
struct perf_event *sub;
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
event->tstamp_enabled = tstamp - event->total_time_enabled;
list_for_each_entry(sub, &event->sibling_list, group_entry) {
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
sub->tstamp_enabled = tstamp - sub->total_time_enabled;
}
}
/*
* Cross CPU call to enable a performance event
*/
static int __perf_event_enable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
/*
* There's a time window between 'ctx->is_active' check
* in perf_event_enable function and this place having:
* - IRQs on
* - ctx->lock unlocked
*
* where the task could be killed and 'ctx' deactivated
* by perf_event_exit_task.
*/
if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
update_context_time(ctx);
if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock;
/*
* set current task's cgroup time reference point
*/
perf_cgroup_set_timestamp(current, ctx);
__perf_event_mark_enabled(event);
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
goto unlock;
}
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
goto unlock;
if (!group_can_go_on(event, cpuctx, 1)) {
err = -EEXIST;
} else {
if (event == leader)
err = group_sched_in(event, cpuctx, ctx);
else
err = event_sched_in(event, cpuctx, ctx);
}
if (err) {
/*
* If this event can't go on and it's part of a
* group, then the whole group has to come off.
*/
if (leader != event) {
group_sched_out(leader, cpuctx, ctx);
perf_cpu_hrtimer_restart(cpuctx);
}
if (leader->attr.pinned) {
update_group_times(leader);
leader->state = PERF_EVENT_STATE_ERROR;
}
}
unlock:
raw_spin_unlock(&ctx->lock);
return 0;
}
/*
* Enable a event.
*
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This condition is satisfied when called through
* perf_event_for_each_child or perf_event_for_each as described
* for perf_event_disable.
*/
static void _perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
if (!task) {
/*
* Enable the event on the cpu that it's on
*/
cpu_function_call(event->cpu, __perf_event_enable, event);
return;
}
raw_spin_lock_irq(&ctx->lock);
if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto out;
/*
* If the event is in error state, clear that first.
* That way, if we see the event in error state below, we
* know that it has gone back into error state, as distinct
* from the task having been scheduled away before the
* cross-call arrived.
*/
if (event->state == PERF_EVENT_STATE_ERROR)
event->state = PERF_EVENT_STATE_OFF;
retry:
if (!ctx->is_active) {
__perf_event_mark_enabled(event);
goto out;
}
raw_spin_unlock_irq(&ctx->lock);
if (!task_function_call(task, __perf_event_enable, event))
return;
raw_spin_lock_irq(&ctx->lock);
/*
* If the context is active and the event is still off,
* we need to retry the cross-call.
*/
if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
/*
* task could have been flipped by a concurrent
* perf_event_context_sched_out()
*/
task = ctx->task;
goto retry;
}
out:
raw_spin_unlock_irq(&ctx->lock);
}
/*
* See perf_event_disable();
*/
void perf_event_enable(struct perf_event *event)
{
struct perf_event_context *ctx;
ctx = perf_event_ctx_lock(event);
_perf_event_enable(event);
perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
* not supported on inherited events
*/
if (event->attr.inherit || !is_sampling_event(event))
return -EINVAL;
atomic_add(refresh, &event->event_limit);
_perf_event_enable(event);
return 0;
}
/*
* See perf_event_disable()
*/
int perf_event_refresh(struct perf_event *event, int refresh)
{
struct perf_event_context *ctx;
int ret;
ctx = perf_event_ctx_lock(event);
ret = _perf_event_refresh(event, refresh);
perf_event_ctx_unlock(event, ctx);
return ret;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);
static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
struct perf_event *event;
int is_active = ctx->is_active;
ctx->is_active &= ~event_type;
if (likely(!ctx->nr_events))
return;
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
return;
perf_pmu_disable(ctx->pmu);
if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
perf_pmu_enable(ctx->pmu);
}
/*
* Test whether two contexts are equivalent, i.e. whether they have both been
* cloned from the same version of the same context.
*
* Equivalence is measured using a generation number in the context that is
* incremented on each modification to it; see unclone_ctx(), list_add_event()
* and list_del_event().
*/
static int context_equiv(struct perf_event_context *ctx1,
struct perf_event_context *ctx2)
{
lockdep_assert_held(&ctx1->lock);
lockdep_assert_held(&ctx2->lock);
/* Pinning disables the swap optimization */
if (ctx1->pin_count || ctx2->pin_count)
return 0;
/* If ctx1 is the parent of ctx2 */
if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
return 1;
/* If ctx2 is the parent of ctx1 */
if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
return 1;
/*
* If ctx1 and ctx2 have the same parent; we flatten the parent
* hierarchy, see perf_event_init_context().
*/
if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
ctx1->parent_gen == ctx2->parent_gen)
return 1;
/* Unmatched */
return 0;
}
static void __perf_event_sync_stat(struct perf_event *event,
struct perf_event *next_event)
{
u64 value;
if (!event->attr.inherit_stat)
return;
/*
* Update the event value, we cannot use perf_event_read()
* because we're in the middle of a context switch and have IRQs
* disabled, which upsets smp_call_function_single(), however
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
switch (event->state) {
case PERF_EVENT_STATE_ACTIVE:
event->pmu->read(event);
/* fall-through */
case PERF_EVENT_STATE_INACTIVE:
update_event_times(event);
break;
default:
break;
}
/*
* In order to keep per-task stats reliable we need to flip the event
* values when we flip the contexts.
*/
value = local64_read(&next_event->count);
value = local64_xchg(&event->count, value);
local64_set(&next_event->count, value);
swap(event->total_time_enabled, next_event->total_time_enabled);
swap(event->total_time_running, next_event->total_time_running);
/*
* Since we swizzled the values, update the user visible data too.
*/
perf_event_update_userpage(event);
perf_event_update_userpage(next_event);
}
static void perf_event_sync_stat(struct perf_event_context *ctx,
struct perf_event_context *next_ctx)
{
struct perf_event *event, *next_event;
if (!ctx->nr_stat)
return;
update_context_time(ctx);
event = list_first_entry(&ctx->event_list,
struct perf_event, event_entry);
next_event = list_first_entry(&next_ctx->event_list,
struct perf_event, event_entry);
while (&event->event_entry != &ctx->event_list &&
&next_event->event_entry != &next_ctx->event_list) {
__perf_event_sync_stat(event, next_event);
event = list_next_entry(event, event_entry);
next_event = list_next_entry(next_event, event_entry);
}
}
static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
struct task_struct *next)
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
struct perf_cpu_context *cpuctx;
int do_switch = 1;
if (likely(!ctx))
return;
cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
rcu_read_lock();
next_ctx = next->perf_event_ctxp[ctxn];
if (!next_ctx)
goto unlock;
parent = rcu_dereference(ctx->parent_ctx);
next_parent = rcu_dereference(next_ctx->parent_ctx);
/* If neither context have a parent context; they cannot be clones. */
if (!parent && !next_parent)
goto unlock;
if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
/*
* Looks like the two contexts are clones, so we might be
* able to optimize the context switch. We lock both
* contexts and check that they are clones under the
* lock (including re-checking that neither has been
* uncloned in the meantime). It doesn't matter which
* order we take the locks because no other cpu could
* be trying to lock both of these tasks.
*/
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
/*
* XXX do we need a memory barrier of sorts
* wrt to rcu_dereference() of perf_event_ctxp
*/
task->perf_event_ctxp[ctxn] = next_ctx;
next->perf_event_ctxp[ctxn] = ctx;
ctx->task = next;
next_ctx->task = task;
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
}
raw_spin_unlock(&next_ctx->lock);
raw_spin_unlock(&ctx->lock);
}
unlock:
rcu_read_unlock();
if (do_switch) {
raw_spin_lock(&ctx->lock);
ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
raw_spin_unlock(&ctx->lock);
}
}
void perf_sched_cb_dec(struct pmu *pmu)
{
this_cpu_dec(perf_sched_cb_usages);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
this_cpu_inc(perf_sched_cb_usages);
}
/*
* This function provides the context switch callback to the lower code
* layer. It is invoked ONLY when the context switch callback is enabled.
*/
static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
struct perf_cpu_context *cpuctx;
struct pmu *pmu;
unsigned long flags;
if (prev == next)
return;
local_irq_save(flags);
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
if (pmu->sched_task) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
pmu->sched_task(cpuctx->task_ctx, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
}
rcu_read_unlock();
local_irq_restore(flags);
}
#define for_each_task_context_nr(ctxn) \
for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
*
* We stop each event and update the event value in event->count.
*
* This does not protect us against NMI, but disable()
* sets the disabled bit in the control field of event _before_
* accessing the event control register. If a NMI hits, then it will
* not restart the event.
*/
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
int ctxn;
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch out PMU state.
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_out(task, next);
}
static void task_ctx_sched_out(struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
ctx_sched_out(ctx, cpuctx, EVENT_ALL);
cpuctx->task_ctx = NULL;
}
/*
* Called with IRQs disabled
*/
static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
struct perf_event *event;
list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
if (!event_filter_match(event))
continue;
/* may need to reset tstamp_enabled */
if (is_cgroup_event(event))
perf_cgroup_mark_enabled(event, ctx);
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
/*
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
if (event->state == PERF_EVENT_STATE_INACTIVE) {
update_group_times(event);
event->state = PERF_EVENT_STATE_ERROR;
}
}
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
struct perf_event *event;
int can_add_hw = 1;
list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
/* Ignore events in OFF or ERROR state */
if (event->state <= PERF_EVENT_STATE_OFF)
continue;
/*
* Listen to the 'cpu' scheduling filter constraint
* of events:
*/
if (!event_filter_match(event))
continue;
/* may need to reset tstamp_enabled */
if (is_cgroup_event(event))
perf_cgroup_mark_enabled(event, ctx);
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
}
}
}
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task)
{
u64 now;
int is_active = ctx->is_active;
ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
return;
now = perf_clock();
ctx->timestamp = now;
perf_cgroup_set_timestamp(task, ctx);
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
ctx_flexible_sched_in(ctx, cpuctx);
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task)
{
struct perf_event_context *ctx = &cpuctx->ctx;
ctx_sched_in(ctx, cpuctx, event_type, task);
}
static void perf_event_context_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
cpuctx = __get_cpu_context(ctx);
if (cpuctx->task_ctx == ctx)
return;
perf_ctx_lock(cpuctx, ctx);
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
* cpu flexible, task flexible.
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx->nr_events)
cpuctx->task_ctx = ctx;
perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
}
/*
* Called from scheduler to add the events of the current task
* with interrupts disabled.
*
* We restore the event value and then enable it.
*
* This does not protect us against NMI, but enable()
* sets the enabled bit in the control field of event _before_
* accessing the event control register. If a NMI hits, then it will
* keep the event running.
*/
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
struct perf_event_context *ctx;
int ctxn;
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
continue;
perf_event_context_sched_in(ctx, task);
}
/*
* if cgroup events exist on this CPU, then we need
* to check if we have to switch in PMU state.
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
perf_cgroup_sched_in(prev, task);
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
u64 frequency = event->attr.sample_freq;
u64 sec = NSEC_PER_SEC;
u64 divisor, dividend;
int count_fls, nsec_fls, frequency_fls, sec_fls;
count_fls = fls64(count);
nsec_fls = fls64(nsec);
frequency_fls = fls64(frequency);
sec_fls = 30;
/*
* We got @count in @nsec, with a target of sample_freq HZ
* the target period becomes:
*
* @count * 10^9
* period = -------------------
* @nsec * sample_freq
*
*/
/*
* Reduce accuracy by one bit such that @a and @b converge
* to a similar magnitude.
*/
#define REDUCE_FLS(a, b) \
do { \
if (a##_fls > b##_fls) { \
a >>= 1; \
a##_fls--; \
} else { \
b >>= 1; \
b##_fls--; \
} \
} while (0)
/*
* Reduce accuracy until either term fits in a u64, then proceed with
* the other, so that finally we can do a u64/u64 division.
*/
while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
REDUCE_FLS(nsec, frequency);
REDUCE_FLS(sec, count);
}
if (count_fls + sec_fls > 64) {
divisor = nsec * frequency;
while (count_fls + sec_fls > 64) {
REDUCE_FLS(count, sec);
divisor >>= 1;
}
dividend = count * sec;
} else {
dividend = count * sec;
while (nsec_fls + frequency_fls > 64) {
REDUCE_FLS(nsec, frequency);
dividend >>= 1;
}
divisor = nsec * frequency;
}
if (!divisor)
return dividend;
return div64_u64(dividend, divisor);
}
static DEFINE_PER_CPU(int, perf_throttled_count);
static DEFINE_PER_CPU(u64, perf_throttled_seq);
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
struct hw_perf_event *hwc = &event->hw;
s64 period, sample_period;
s64 delta;
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
delta = (delta + 7) / 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
if (!sample_period)
sample_period = 1;
hwc->sample_period = sample_period;
if (local64_read(&hwc->period_left) > 8*sample_period) {
if (disable)
event->pmu->stop(event, PERF_EF_UPDATE);
local64_set(&hwc->period_left, 0);
if (disable)
event->pmu->start(event, PERF_EF_RELOAD);
}
}
/*
* combine freq adjustment with unthrottling to avoid two passes over the
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
int needs_unthr)
{
struct perf_event *event;
struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC;
s64 delta;
/*
* only need to iterate over all events iff:
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
if (!(ctx->nr_freq || needs_unthr))
return;
raw_spin_lock(&ctx->lock);
perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
if (!event_filter_match(event))
continue;
perf_pmu_disable(event->pmu);
hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
if (!event->attr.freq || !event->attr.sample_freq)
goto next;
/*
* stop the event and update event->count
*/
event->pmu->stop(event, PERF_EF_UPDATE);
now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
/*
* restart the event
* reload only if value has changed
* we have stopped the event so tell that
* to perf_adjust_period() to avoid stopping it
* twice.
*/
if (delta > 0)
perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
next:
perf_pmu_enable(event->pmu);
}
perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
/*
* Round-robin a context's events:
*/
static void rotate_ctx(struct perf_event_context *ctx)
{
/*
* Rotate the first entry last of non-pinned groups. Rotation might be
* disabled by the inheritance code.
*/
if (!ctx->rotate_disable)
list_rotate_left(&ctx->flexible_groups);
}
static int perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event_context *ctx = NULL;
int rotate = 0;
if (cpuctx->ctx.nr_events) {
if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
rotate = 1;
}
ctx = cpuctx->task_ctx;
if (ctx && ctx->nr_events) {
if (ctx->nr_events != ctx->nr_active)
rotate = 1;
}
if (!rotate)
goto done;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx);
if (ctx)
rotate_ctx(ctx);
perf_event_sched_in(cpuctx, ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:
return rotate;
}
#ifdef CONFIG_NO_HZ_FULL
bool perf_event_can_stop_tick(void)
{
if (atomic_read(&nr_freq_events) ||
__this_cpu_read(perf_throttled_count))
return false;
else
return true;
}
#endif
void perf_event_task_tick(void)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
struct perf_event_context *ctx, *tmp;
int throttled;
WARN_ON(!irqs_disabled());
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
}
static int event_enable_on_exec(struct perf_event *event,
struct perf_event_context *ctx)
{
if (!event->attr.enable_on_exec)
return 0;
event->attr.enable_on_exec = 0;
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
__perf_event_mark_enabled(event);
return 1;
}
/*
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
struct perf_event_context *clone_ctx = NULL;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
int ret;
local_irq_save(flags);
if (!ctx || !ctx->nr_events)
goto out;
/*
* We must ctxsw out cgroup events to avoid conflict
* when invoking perf_task_event_sched_in() later on
* in this function. Otherwise we end up trying to
* ctxswin cgroup events which are already scheduled
* in.
*/
perf_cgroup_sched_out(current, NULL);
raw_spin_lock(&ctx->lock);
task_ctx_sched_out(ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
ret = event_enable_on_exec(event, ctx);
if (ret)
enabled = 1;
}
/*
* Unclone this context if we enabled any event.
*/
if (enabled)
clone_ctx = unclone_ctx(ctx);
raw_spin_unlock(&ctx->lock);
/*
* Also calls ctxswin for cgroup events, if any:
*/
perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
if (clone_ctx)
put_ctx(clone_ctx);
}
void perf_event_exec(void)
{
struct perf_event_context *ctx;
int ctxn;
rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = current->perf_event_ctxp[ctxn];
if (!ctx)
continue;
perf_event_enable_on_exec(ctx);
}
rcu_read_unlock();
}
/*
* Cross CPU call to read the hardware event
*/
static void __perf_event_read(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
/*
* If this is a task context, we need to check whether it is
* the current task context of this cpu. If not it has been
* scheduled out before the smp call arrived. In that case
* event->count would have been updated to a recent sample
* when the event was scheduled out.
*/
if (ctx->task && cpuctx->task_ctx != ctx)
return;
raw_spin_lock(&ctx->lock);
if (ctx->is_active) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
update_event_times(event);
if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
raw_spin_unlock(&ctx->lock);
}
static inline u64 perf_event_count(struct perf_event *event)
{
if (event->pmu->count)
return event->pmu->count(event);
return __perf_event_count(event);
}
static u64 perf_event_read(struct perf_event *event)
{
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
smp_call_function_single(event->oncpu,
__perf_event_read, event, 1);
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
/*
* may read while context is not active
* (e.g., thread is blocked), in that case
* we cannot update context time
*/
if (ctx->is_active) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
return perf_event_count(event);
}
/*
* Initialize the perf_event context in a task_struct:
*/
static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->active_ctx_list);
INIT_LIST_HEAD(&ctx->pinned_groups);
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
alloc_perf_context(struct pmu *pmu, struct task_struct *task)
{
struct perf_event_context *ctx;
ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
if (!ctx)
return NULL;
__perf_event_init_context(ctx);
if (task) {
ctx->task = task;
get_task_struct(task);
}
ctx->pmu = pmu;
return ctx;
}
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
struct task_struct *task;
int err;
rcu_read_lock();
if (!vpid)
task = current;
else
task = find_task_by_vpid(vpid);
if (task)
get_task_struct(task);
rcu_read_unlock();
if (!task)
return ERR_PTR(-ESRCH);
/* Reuse ptrace permission checks for now. */
err = -EACCES;
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto errout;
return task;
errout:
put_task_struct(task);
return ERR_PTR(err);
}
/*
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task,
struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
void *task_ctx_data = NULL;
unsigned long flags;
int ctxn, err;
int cpu = event->cpu;
if (!task) {
/* Must be root to operate on a CPU event: */
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/*
* We could be clever and allow to attach a event to an
* offline CPU and activate it when the CPU comes up, but
* that's for later.
*/
if (!cpu_online(cpu))
return ERR_PTR(-ENODEV);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
++ctx->pin_count;
return ctx;
}
err = -EINVAL;
ctxn = pmu->task_ctx_nr;
if (ctxn < 0)
goto errout;
if (event->attach_state & PERF_ATTACH_TASK_DATA) {
task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
if (!task_ctx_data) {
err = -ENOMEM;
goto errout;
}
}
retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
if (task_ctx_data && !ctx->task_ctx_data) {
ctx->task_ctx_data = task_ctx_data;
task_ctx_data = NULL;
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
ctx = alloc_perf_context(pmu, task);
err = -ENOMEM;
if (!ctx)
goto errout;
if (task_ctx_data) {
ctx->task_ctx_data = task_ctx_data;
task_ctx_data = NULL;
}
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
* If it has already passed perf_event_exit_task().
* we must see PF_EXITING, it takes this mutex too.
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
else if (task->perf_event_ctxp[ctxn])
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
}
mutex_unlock(&task->perf_event_mutex);
if (unlikely(err)) {
put_ctx(ctx);
if (err == -EAGAIN)
goto retry;
goto errout;
}
}
kfree(task_ctx_data);
return ctx;
errout:
kfree(task_ctx_data);
return ERR_PTR(err);
}
static void perf_event_free_filter(struct perf_event *event);
static void perf_event_free_bpf_prog(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
struct perf_event *event;
event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
perf_event_free_bpf_prog(event);
kfree(event);
}
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
return;
if (is_cgroup_event(event))
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
static void unaccount_event(struct perf_event *event)
{
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
static_key_slow_dec_deferred(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
atomic_dec(&nr_comm_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
if (is_cgroup_event(event))
static_key_slow_dec_deferred(&perf_sched_events);
if (has_branch_stack(event))
static_key_slow_dec_deferred(&perf_sched_events);
unaccount_event_cpu(event, event->cpu);
}
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
* at a time, so we disallow creating events that might conflict, namely:
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* __free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
return 0;
/*
* Prevent co-existence of per-task and cpu-wide events on the
* same exclusive pmu.
*
* Negative pmu::exclusive_cnt means there are cpu-wide
* events on this "exclusive" pmu, positive means there are
* per-task events.
*
* Since this is called in perf_event_alloc() path, event::ctx
* doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
* to mean "per-task event", because unlike other attach states it
* never gets cleared.
*/
if (event->attach_state & PERF_ATTACH_TASK) {
if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
return -EBUSY;
} else {
if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
return -EBUSY;
}
return 0;
}
static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
return;
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
(e1->cpu == e2->cpu ||
e1->cpu == -1 ||
e2->cpu == -1))
return true;
return false;
}
/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
if (exclusive_event_match(iter_event, event))
return false;
}
return true;
}
static void __free_event(struct perf_event *event)
{
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
}
if (event->destroy)
event->destroy(event);
if (event->ctx)
put_ctx(event->ctx);
if (event->pmu) {
exclusive_event_destroy(event);
module_put(event->pmu->module);
}
call_rcu(&event->rcu_head, free_event_rcu);
}
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
unaccount_event(event);
if (event->rb) {
/*
* Can happen when we close an event with re-directed output.
*
* Since we have a 0 refcount, perf_mmap_close() will skip
* over us; possibly making our ring_buffer_put() the last.
*/
mutex_lock(&event->mmap_mutex);
ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
perf_detach_cgroup(event);
__free_event(event);
}
/*
* Used to free events which have a known refcount of 1, such as in error paths
* where the event isn't exposed yet and inherited events.
*/
static void free_event(struct perf_event *event)
{
if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
"unexpected event refcount: %ld; ptr=%p\n",
atomic_long_read(&event->refcount), event)) {
/* leak to avoid use-after-free */
return;
}
_free_event(event);
}
/*
* Remove user event from the owner task.
*/
static void perf_remove_from_owner(struct perf_event *event)
{
struct task_struct *owner;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
/*
* Matches the smp_wmb() in perf_event_exit_task(). If we observe
* !owner it means the list deletion is complete and we can indeed
* free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
smp_read_barrier_depends();
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
* task reference we can safely take a new reference
* while holding the rcu_read_lock().
*/
get_task_struct(owner);
}
rcu_read_unlock();
if (owner) {
/*
* If we're here through perf_event_exit_task() we're already
* holding ctx->mutex which would be an inversion wrt. the
* normal lock order.
*
* However we can safely take this lock because its the child
* ctx->mutex.
*/
mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
/*
* We have to re-check the event->owner field, if it is cleared
* we raced with perf_event_exit_task(), acquiring the mutex
* ensured they're done, and we can proceed with freeing the
* event.
*/
if (event->owner)
list_del_init(&event->owner_entry);
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
}
/*
* Called when the last reference to the file is gone.
*/
static void put_event(struct perf_event *event)
{
struct perf_event_context *ctx;
if (!atomic_long_dec_and_test(&event->refcount))
return;
if (!is_kernel_event(event))
perf_remove_from_owner(event);
/*
* There are two ways this annotation is useful:
*
* 1) there is a lock recursion from perf_event_exit_task
* see the comment there.
*
* 2) there is a lock-inversion with mmap_sem through
* perf_event_read_group(), which takes faults while
* holding ctx->mutex, however this is called after
* the last filedesc died, so there is no possibility
* to trigger the AB-BA case.
*/
ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
WARN_ON_ONCE(ctx->parent_ctx);
perf_remove_from_context(event, true);
perf_event_ctx_unlock(event, ctx);
_free_event(event);
}
int perf_event_release_kernel(struct perf_event *event)
{
put_event(event);
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
static int perf_release(struct inode *inode, struct file *file)
{
put_event(file->private_data);
return 0;
}
/*
* Remove all orphanes events from the context.
*/
static void orphans_remove_work(struct work_struct *work)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
ctx = container_of(work, struct perf_event_context,
orphans_remove.work);
mutex_lock(&ctx->mutex);
list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
struct perf_event *parent_event = event->parent;
if (!is_orphaned_child(event))
continue;
perf_remove_from_context(event, true);
mutex_lock(&parent_event->child_mutex);
list_del_init(&event->child_list);
mutex_unlock(&parent_event->child_mutex);
free_event(event);
put_event(parent_event);
}
raw_spin_lock_irq(&ctx->lock);
ctx->orphans_remove_sched = false;
raw_spin_unlock_irq(&ctx->lock);
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
*enabled = 0;
*running = 0;
mutex_lock(&event->child_mutex);
total += perf_event_read(event);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
*running += event->total_time_running +
atomic64_read(&event->child_total_time_running);
list_for_each_entry(child, &event->child_list, child_list) {
total += perf_event_read(child);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
mutex_unlock(&event->child_mutex);
return total;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int perf_event_read_group(struct perf_event *event,
u64 read_format, char __user *buf)
{
struct perf_event *leader = event->group_leader, *sub;
struct perf_event_context *ctx = leader->ctx;
int n = 0, size = 0, ret;
u64 count, enabled, running;
u64 values[5];
lockdep_assert_held(&ctx->mutex);
count = perf_event_read_value(leader, &enabled, &running);
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
values[n++] = count;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
size = n * sizeof(u64);
if (copy_to_user(buf, values, size))
return -EFAULT;
ret = size;
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
values[n++] = perf_event_read_value(sub, &enabled, &running);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
size = n * sizeof(u64);
if (copy_to_user(buf + ret, values, size)) {
return -EFAULT;
}
ret += size;
}
return ret;
}
static int perf_event_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
u64 values[4];
int n = 0;
values[n++] = perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
return n * sizeof(u64);
}
static bool is_event_hup(struct perf_event *event)
{
bool no_children;
if (event->state != PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
no_children = list_empty(&event->child_list);
mutex_unlock(&event->child_mutex);
return no_children;
}
/*
* Read the performance event - simple non blocking version for now
*/
static ssize_t
perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
{
u64 read_format = event->attr.read_format;
int ret;
/*
* Return end-of-file for a read on a event that is in
* error state (i.e. because it was pinned but it couldn't be
* scheduled on to the CPU at some point).
*/
if (event->state == PERF_EVENT_STATE_ERROR)
return 0;
if (count < event->read_size)
return -ENOSPC;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (read_format & PERF_FORMAT_GROUP)
ret = perf_event_read_group(event, read_format, buf);
else
ret = perf_event_read_one(event, read_format, buf);
return ret;
}
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
struct perf_event *event = file->private_data;
struct perf_event_context *ctx;
int ret;
ctx = perf_event_ctx_lock(event);
ret = perf_read_hw(event, buf, count);
perf_event_ctx_unlock(event, ctx);
return ret;
}
static unsigned int perf_poll(struct file *file, poll_table *wait)
{
struct perf_event *event = file->private_data;
struct ring_buffer *rb;
unsigned int events = POLLHUP;
poll_wait(file, &event->waitq, wait);
if (is_event_hup(event))
return events;
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
rb = event->rb;
if (rb)
events = atomic_xchg(&rb->poll, 0);
mutex_unlock(&event->mmap_mutex);
return events;
}
static void _perf_event_reset(struct perf_event *event)
{
(void)perf_event_read(event);
local64_set(&event->count, 0);
perf_event_update_userpage(event);
}
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
* in sync_child_event if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
void (*func)(struct perf_event *))
{
struct perf_event *child;
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->child_mutex);
func(event);
list_for_each_entry(child, &event->child_list, child_list)
func(child);
mutex_unlock(&event->child_mutex);
}
static void perf_event_for_each(struct perf_event *event,
void (*func)(struct perf_event *))
{
struct perf_event_context *ctx = event->ctx;
struct perf_event *sibling;
lockdep_assert_held(&ctx->mutex);
event = event->group_leader;
perf_event_for_each_child(event, func);
list_for_each_entry(sibling, &event->sibling_list, group_entry)
perf_event_for_each_child(sibling, func);
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
struct perf_event_context *ctx = event->ctx;
int ret = 0, active;
u64 value;
if (!is_sampling_event(event))
return -EINVAL;
if (copy_from_user(&value, arg, sizeof(value)))
return -EFAULT;
if (!value)
return -EINVAL;
raw_spin_lock_irq(&ctx->lock);
if (event->attr.freq) {
if (value > sysctl_perf_event_sample_rate) {
ret = -EINVAL;
goto unlock;
}
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
event->hw.sample_period = value;
}
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
perf_pmu_disable(ctx->pmu);
event->pmu->stop(event, PERF_EF_UPDATE);
}
local64_set(&event->hw.period_left, 0);
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
unlock:
raw_spin_unlock_irq(&ctx->lock);
return ret;
}
static const struct file_operations perf_fops;
static inline int perf_fget_light(int fd, struct fd *p)
{
struct fd f = fdget(fd);
if (!f.file)
return -EBADF;
if (f.file->f_op != &perf_fops) {
fdput(f);
return -EBADF;
}
*p = f;
return 0;
}
static int perf_event_set_output(struct perf_event *event,
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
void (*func)(struct perf_event *);
u32 flags = arg;
switch (cmd) {
case PERF_EVENT_IOC_ENABLE:
func = _perf_event_enable;
break;
case PERF_EVENT_IOC_DISABLE:
func = _perf_event_disable;
break;
case PERF_EVENT_IOC_RESET:
func = _perf_event_reset;
break;
case PERF_EVENT_IOC_REFRESH:
return _perf_event_refresh(event, arg);
case PERF_EVENT_IOC_PERIOD:
return perf_event_period(event, (u64 __user *)arg);
case PERF_EVENT_IOC_ID:
{
u64 id = primary_event_id(event);
if (copy_to_user((void __user *)arg, &id, sizeof(id)))
return -EFAULT;
return 0;
}
case PERF_EVENT_IOC_SET_OUTPUT:
{
int ret;
if (arg != -1) {
struct perf_event *output_event;
struct fd output;
ret = perf_fget_light(arg, &output);
if (ret)
return ret;
output_event = output.file->private_data;
ret = perf_event_set_output(event, output_event);
fdput(output);
} else {
ret = perf_event_set_output(event, NULL);
}
return ret;
}
case PERF_EVENT_IOC_SET_FILTER:
return perf_event_set_filter(event, (void __user *)arg);
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
default:
return -ENOTTY;
}
if (flags & PERF_IOC_FLAG_GROUP)
perf_event_for_each(event, func);
else
perf_event_for_each_child(event, func);
return 0;
}
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_event *event = file->private_data;
struct perf_event_context *ctx;
long ret;
ctx = perf_event_ctx_lock(event);
ret = _perf_ioctl(event, cmd, arg);
perf_event_ctx_unlock(event, ctx);
return ret;
}
#ifdef CONFIG_COMPAT
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
/* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
cmd |= sizeof(void *) << IOCSIZE_SHIFT;
}
break;
}
return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif
int perf_event_task_enable(void)
{
struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
ctx = perf_event_ctx_lock(event);
perf_event_for_each_child(event, _perf_event_enable);
perf_event_ctx_unlock(event, ctx);
}
mutex_unlock(&current->perf_event_mutex);
return 0;
}
int perf_event_task_disable(void)
{
struct perf_event_context *ctx;
struct perf_event *event;
mutex_lock(&current->perf_event_mutex);
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
ctx = perf_event_ctx_lock(event);
perf_event_for_each_child(event, _perf_event_disable);
perf_event_ctx_unlock(event, ctx);
}
mutex_unlock(&current->perf_event_mutex);
return 0;
}
static int perf_event_index(struct perf_event *event)
{
if (event->hw.state & PERF_HES_STOPPED)
return 0;
if (event->state != PERF_EVENT_STATE_ACTIVE)
return 0;
return event->pmu->event_idx(event);
}
static void calc_timer_values(struct perf_event *event,
u64 *now,
u64 *enabled,
u64 *running)
{
u64 ctx_time;
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
*enabled = ctx_time - event->tstamp_enabled;
*running = ctx_time - event->tstamp_running;
}
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
struct ring_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (!rb)
goto unlock;
userpg = rb->user_page;
/* Allow new userspace to detect that bit 0 is deprecated */
userpg->cap_bit0_is_deprecated = 1;
userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
userpg->data_offset = PAGE_SIZE;
userpg->data_size = perf_data_size(rb);
unlock:
rcu_read_unlock();
}
void __weak arch_perf_update_userpage(
struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}
/*
* Callers need to ensure there can be no nesting of this function, otherwise
* the seqlock logic goes bad. We can not serialize this because the arch
* code calls this from NMI context.
*/
void perf_event_update_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
struct ring_buffer *rb;
u64 enabled, running, now;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (!rb)
goto unlock;
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
* was last scheduled in.
*
* we cannot simply called update_context_time()
* because of locking issue as we can be called in
* NMI context
*/
calc_timer_values(event, &now, &enabled, &running);
userpg = rb->user_page;
/*
* Disable preemption so as to not let the corresponding user-space
* spin too long if we get preempted.
*/
preempt_disable();
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
userpg->offset = perf_event_count(event);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
userpg->time_enabled = enabled +
atomic64_read(&event->child_total_time_enabled);
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
arch_perf_update_userpage(event, userpg, now);
barrier();
++userpg->lock;
preempt_enable();
unlock:
rcu_read_unlock();
}
static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct perf_event *event = vma->vm_file->private_data;
struct ring_buffer *rb;
int ret = VM_FAULT_SIGBUS;
if (vmf->flags & FAULT_FLAG_MKWRITE) {
if (vmf->pgoff == 0)
ret = 0;
return ret;
}
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (!rb)
goto unlock;
if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
goto unlock;
vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
if (!vmf->page)
goto unlock;
get_page(vmf->page);
vmf->page->mapping = vma->vm_file->f_mapping;
vmf->page->index = vmf->pgoff;
ret = 0;
unlock:
rcu_read_unlock();
return ret;
}
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
struct ring_buffer *old_rb = NULL;
unsigned long flags;
if (event->rb) {
/*
* Should be impossible, we set this when removing
* event->rb_entry and wait/clear when adding event->rb_entry.
*/
WARN_ON_ONCE(event->rcu_pending);
old_rb = event->rb;
event->rcu_batches = get_state_synchronize_rcu();
event->rcu_pending = 1;
spin_lock_irqsave(&old_rb->event_lock, flags);
list_del_rcu(&event->rb_entry);
spin_unlock_irqrestore(&old_rb->event_lock, flags);
}
if (event->rcu_pending && rb) {
cond_synchronize_rcu(event->rcu_batches);
event->rcu_pending = 0;
}
if (rb) {
spin_lock_irqsave(&rb->event_lock, flags);
list_add_rcu(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
}
rcu_assign_pointer(event->rb, rb);
if (old_rb) {
ring_buffer_put(old_rb);
/*
* Since we detached before setting the new rb, so that we
* could attach the new rb, we could have missed a wakeup.
* Provide it now.
*/
wake_up_all(&event->waitq);
}
}
static void ring_buffer_wakeup(struct perf_event *event)
{
struct ring_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
wake_up_all(&event->waitq);
}
rcu_read_unlock();
}
static void rb_free_rcu(struct rcu_head *rcu_head)
{
struct ring_buffer *rb;
rb = container_of(rcu_head, struct ring_buffer, rcu_head);
rb_free(rb);
}
struct ring_buffer *ring_buffer_get(struct perf_event *event)
{
struct ring_buffer *rb;
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
if (!atomic_inc_not_zero(&rb->refcount))
rb = NULL;
}
rcu_read_unlock();
return rb;
}
void ring_buffer_put(struct ring_buffer *rb)
{
if (!atomic_dec_and_test(&rb->refcount))
return;
WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
static void perf_mmap_open(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
}
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
*
* In order to undo the VM accounting done by perf_mmap() we need to destroy
* the buffer here, where we still have a VM context. This means we need
* to detach all events redirecting to us.
*/
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
* event->mmap_count, so it is ok to use event->mmap_mutex to
* serialize with perf_mmap here.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
rb_free_aux(rb);
mutex_unlock(&event->mmap_mutex);
}
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
if (atomic_read(&rb->mmap_count))
goto out_put;
/*
* No other mmap()s, detach from all other events that might redirect
* into the now unreachable buffer. Somewhat complicated by the
* fact that rb::event_lock otherwise nests inside mmap_mutex.
*/
again:
rcu_read_lock();
list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
if (!atomic_long_inc_not_zero(&event->refcount)) {
/*
* This event is en-route to free_event() which will
* detach it and remove it from the list.
*/
continue;
}
rcu_read_unlock();
mutex_lock(&event->mmap_mutex);
/*
* Check we didn't race with perf_event_set_output() which can
* swizzle the rb from under us while we were waiting to
* acquire mmap_mutex.
*
* If we find a different rb; ignore this event, a next
* iteration will no longer find it on the list. We have to
* still restart the iteration to make sure we're not now
* iterating the wrong list.
*/
if (event->rb == rb)
ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
put_event(event);
/*
* Restart the iteration; either we're on the wrong list or
* destroyed its integrity by doing a deletion.
*/
goto again;
}
rcu_read_unlock();
/*
* It could be there's still a few 0-ref events on the list; they'll
* get cleaned up by free_event() -- they'll also still have their
* ref on the rb and will free it whenever they are done with it.
*
* Aside from that, this buffer is 'fully' detached and unmapped,
* undo the VM accounting.
*/
atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= mmap_locked;
free_uid(mmap_user);
out_put:
ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergable */
.fault = perf_mmap_fault,
.page_mkwrite = perf_mmap_fault,
};
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
unsigned long locked, lock_limit;
struct ring_buffer *rb = NULL;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret = 0, flags = 0;
/*
* Don't allow mmap() of inherited per-task counters. This would
* create a performance issue due to all children writing to the
* same rb.
*/
if (event->cpu == -1 && event->attr.inherit)
return -EINVAL;
if (!(vma->vm_flags & VM_SHARED))
return -EINVAL;
vma_size = vma->vm_end - vma->vm_start;
if (vma->vm_pgoff == 0) {
nr_pages = (vma_size / PAGE_SIZE) - 1;
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
u64 aux_offset, aux_size;
if (!event->rb)
return -EINVAL;
nr_pages = vma_size / PAGE_SIZE;
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
rb = event->rb;
if (!rb)
goto aux_unlock;
aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
aux_size = ACCESS_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
goto aux_unlock;
/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
goto aux_unlock;
if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
goto aux_unlock;
/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
goto aux_unlock;
if (!is_power_of_2(nr_pages))
goto aux_unlock;
if (!atomic_inc_not_zero(&rb->mmap_count))
goto aux_unlock;
if (rb_has_aux(rb)) {
atomic_inc(&rb->aux_mmap_count);
ret = 0;
goto unlock;
}
atomic_set(&rb->aux_mmap_count, 1);
user_extra = nr_pages;
goto accounting;
}
/*
* If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
return -EINVAL;
if (vma_size != PAGE_SIZE * (1 + nr_pages))
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
goto unlock;
}
if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
* Raced against perf_mmap_close() through
* perf_event_set_output(). Try again, hope for better
* luck.
*/
mutex_unlock(&event->mmap_mutex);
goto again;
}
goto unlock;
}
user_extra = nr_pages + 1;
accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
* Increase the limit linearly with more CPUs:
*/
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm) + user_extra;
if (user_locked > user_lock_limit)
extra = user_locked - user_lock_limit;
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = vma->vm_mm->pinned_vm + extra;
if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
if (!rb) {
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
if (!rb) {
ret = -ENOMEM;
goto unlock;
}
atomic_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
if (!ret)
rb->aux_mmap_locked = extra;
}
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
vma->vm_mm->pinned_vm += extra;
atomic_inc(&event->mmap_count);
} else if (rb) {
atomic_dec(&rb->mmap_count);
}
aux_unlock:
mutex_unlock(&event->mmap_mutex);
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
*/
vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
return ret;
}
static int perf_fasync(int fd, struct file *filp, int on)
{
struct inode *inode = file_inode(filp);
struct perf_event *event = filp->private_data;
int retval;
mutex_lock(&inode->i_mutex);
retval = fasync_helper(fd, filp, on, &event->fasync);
mutex_unlock(&inode->i_mutex);
if (retval < 0)
return retval;
return 0;
}
static const struct file_operations perf_fops = {
.llseek = no_llseek,
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
.unlocked_ioctl = perf_ioctl,
.compat_ioctl = perf_compat_ioctl,
.mmap = perf_mmap,
.fasync = perf_fasync,
};
/*
* Perf event wakeup
*
* If there's data, ensure we set the poll() state and publish everything
* to user-space before waking everybody up.
*/
void perf_event_wakeup(struct perf_event *event)
{
ring_buffer_wakeup(event);
if (event->pending_kill) {
kill_fasync(&event->fasync, SIGIO, event->pending_kill);
event->pending_kill = 0;
}
}
static void perf_pending_event(struct irq_work *entry)
{
struct perf_event *event = container_of(entry,
struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
/*
* If we 'fail' here, that's OK, it means recursion is already disabled
* and we won't recurse 'further'.
*/
if (event->pending_disable) {
event->pending_disable = 0;
__perf_event_disable(event);
}
if (event->pending_wakeup) {
event->pending_wakeup = 0;
perf_event_wakeup(event);
}
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
}
/*
* We assume there is only KVM supporting the callbacks.
* Later on, we might change it to a list if there is
* another virtualization implementation supporting the callbacks.
*/
struct perf_guest_info_callbacks *perf_guest_cbs;
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
perf_guest_cbs = cbs;
return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
perf_guest_cbs = NULL;
return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
static void
perf_output_sample_regs(struct perf_output_handle *handle,
struct pt_regs *regs, u64 mask)
{
int bit;
for_each_set_bit(bit, (const unsigned long *) &mask,
sizeof(mask) * BITS_PER_BYTE) {
u64 val;
val = perf_reg_value(regs, bit);
perf_output_put(handle, val);
}
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs,
struct pt_regs *regs_user_copy)
{
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (current->mm) {
perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
}
}
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
struct pt_regs *regs)
{
regs_intr->regs = regs;
regs_intr->abi = perf_reg_abi(current);
}
/*
* Get remaining task size from user stack pointer.
*
* It'd be better to take stack vma map and limit this more
* precisly, but there's no way to get it safely under interrupt,
* so using TASK_SIZE as limit.
*/
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
unsigned long addr = perf_user_stack_pointer(regs);
if (!addr || addr >= TASK_SIZE)
return 0;
return TASK_SIZE - addr;
}
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
struct pt_regs *regs)
{
u64 task_size;
/* No regs, no stack pointer, no dump. */
if (!regs)
return 0;
/*
* Check if we fit in with the requested stack size into the:
* - TASK_SIZE
* If we don't, we limit the size to the TASK_SIZE.
*
* - remaining sample size
* If we don't, we customize the stack size to
* fit in to the remaining sample size.
*/
task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
stack_size = min(stack_size, (u16) task_size);
/* Current header size plus static size and dynamic size. */
header_size += 2 * sizeof(u64);
/* Do we fit in with the current stack dump size? */
if ((u16) (header_size + stack_size) < header_size) {
/*
* If we overflow the maximum size for the sample,
* we customize the stack dump size to fit in.
*/
stack_size = USHRT_MAX - header_size - sizeof(u64);
stack_size = round_up(stack_size, sizeof(u64));
}
return stack_size;
}
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
struct pt_regs *regs)
{
/* Case of a kernel thread, nothing to dump */
if (!regs) {
u64 size = 0;
perf_output_put(handle, size);
} else {
unsigned long sp;
unsigned int rem;
u64 dyn_size;
/*
* We dump:
* static size
* - the size requested by user or the best one we can fit
* in to the sample max size
* data
* - user stack dump data
* dynamic size
* - the actual dumped size
*/
/* Static size. */
perf_output_put(handle, dump_size);
/* Data. */
sp = perf_user_stack_pointer(regs);
rem = __output_copy_user(handle, (void *) sp, dump_size);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
/* Dynamic size. */
perf_output_put(handle, dyn_size);
}
}
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
u64 sample_type = event->attr.sample_type;
data->type = sample_type;
header->size += event->id_header_size;
if (sample_type & PERF_SAMPLE_TID) {
/* namespace issues */
data->tid_entry.pid = perf_event_pid(event, current);
data->tid_entry.tid = perf_event_tid(event, current);
}
if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_event_clock(event);
if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
if (sample_type & PERF_SAMPLE_STREAM_ID)
data->stream_id = event->id;
if (sample_type & PERF_SAMPLE_CPU) {
data->cpu_entry.cpu = raw_smp_processor_id();
data->cpu_entry.reserved = 0;
}
}
void perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
if (event->attr.sample_id_all)
__perf_event_header__init_id(header, data, event);
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
struct perf_sample_data *data)
{
u64 sample_type = data->type;
if (sample_type & PERF_SAMPLE_TID)
perf_output_put(handle, data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
perf_output_put(handle, data->time);
if (sample_type & PERF_SAMPLE_ID)
perf_output_put(handle, data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
perf_output_put(handle, data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
perf_output_put(handle, data->id);
}
void perf_event__output_id_sample(struct perf_event *event,
struct perf_output_handle *handle,
struct perf_sample_data *sample)
{
if (event->attr.sample_id_all)
__perf_event__output_id_sample(handle, sample);
}
static void perf_output_read_one(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
u64 values[4];
int n = 0;
values[n++] = perf_event_count(event);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
values[n++] = running +
atomic64_read(&event->child_total_time_running);
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
__output_copy(handle, values, n * sizeof(u64));
}
/*
* XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
*/
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
u64 values[5];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
if (leader != event)
leader->pmu->read(leader);
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
__output_copy(handle, values, n * sizeof(u64));
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
n = 0;
if ((sub != event) &&
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
__output_copy(handle, values, n * sizeof(u64));
}
}
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
u64 enabled = 0, running = 0, now;
u64 read_format = event->attr.read_format;
/*
* compute total_time_enabled, total_time_running
* based on snapshot values taken when the event
* was last scheduled in.
*
* we cannot simply called update_context_time()
* because of locking issue as we are called in
* NMI context
*/
if (read_format & PERF_FORMAT_TOTAL_TIMES)
calc_timer_values(event, &now, &enabled, &running);
if (event->attr.read_format & PERF_FORMAT_GROUP)
perf_output_read_group(handle, event, enabled, running);
else
perf_output_read_one(handle, event, enabled, running);
}
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
{
u64 sample_type = data->type;
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
perf_output_put(handle, data->id);
if (sample_type & PERF_SAMPLE_IP)
perf_output_put(handle, data->ip);
if (sample_type & PERF_SAMPLE_TID)
perf_output_put(handle, data->tid_entry);
if (sample_type & PERF_SAMPLE_TIME)
perf_output_put(handle, data->time);
if (sample_type & PERF_SAMPLE_ADDR)
perf_output_put(handle, data->addr);
if (sample_type & PERF_SAMPLE_ID)
perf_output_put(handle, data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
perf_output_put(handle, data->stream_id);
if (sample_type & PERF_SAMPLE_CPU)
perf_output_put(handle, data->cpu_entry);
if (sample_type & PERF_SAMPLE_PERIOD)
perf_output_put(handle, data->period);
if (sample_type & PERF_SAMPLE_READ)
perf_output_read(handle, event);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
if (data->callchain) {
int size = 1;
if (data->callchain)
size += data->callchain->nr;
size *= sizeof(u64);
__output_copy(handle, data->callchain, size);
} else {
u64 nr = 0;
perf_output_put(handle, nr);
}
}
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
perf_output_put(handle, data->raw->size);
__output_copy(handle, data->raw->data,
data->raw->size);
} else {
struct {
u32 size;
u32 data;
} raw = {
.size = sizeof(u32),
.data = 0,
};
perf_output_put(handle, raw);
}
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
if (data->br_stack) {
size_t size;
size = data->br_stack->nr
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
/*
* we always store at least the value of nr
*/
u64 nr = 0;
perf_output_put(handle, nr);
}
}
if (sample_type & PERF_SAMPLE_REGS_USER) {
u64 abi = data->regs_user.abi;
/*
* If there are no regs to dump, notice it through
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
*/
perf_output_put(handle, abi);
if (abi) {
u64 mask = event->attr.sample_regs_user;
perf_output_sample_regs(handle,
data->regs_user.regs,
mask);
}
}
if (sample_type & PERF_SAMPLE_STACK_USER) {
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
}
if (sample_type & PERF_SAMPLE_WEIGHT)
perf_output_put(handle, data->weight);
if (sample_type & PERF_SAMPLE_DATA_SRC)
perf_output_put(handle, data->data_src.val);
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
if (sample_type & PERF_SAMPLE_REGS_INTR) {
u64 abi = data->regs_intr.abi;
/*
* If there are no regs to dump, notice it through
* first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
*/
perf_output_put(handle, abi);
if (abi) {
u64 mask = event->attr.sample_regs_intr;
perf_output_sample_regs(handle,
data->regs_intr.regs,
mask);
}
}
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
if (wakeup_events) {
struct ring_buffer *rb = handle->rb;
int events = local_inc_return(&rb->events);
if (events >= wakeup_events) {
local_sub(wakeup_events, &rb->events);
local_inc(&rb->wakeup);
}
}
}
}
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
header->misc = 0;
header->misc |= perf_misc_flags(regs);
__perf_event_header__init_id(header, data, event);
if (sample_type & PERF_SAMPLE_IP)
data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
data->callchain = perf_callchain(event, regs);
if (data->callchain)
size += data->callchain->nr;
header->size += size * sizeof(u64);
}
if (sample_type & PERF_SAMPLE_RAW) {
int size = sizeof(u32);
if (data->raw)
size += data->raw->size;
else
size += sizeof(u32);
WARN_ON_ONCE(size & (sizeof(u64)-1));
header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
if (data->br_stack) {
size += data->br_stack->nr
* sizeof(struct perf_branch_entry);
}
header->size += size;
}
if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
perf_sample_regs_user(&data->regs_user, regs,
&data->regs_user_copy);
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
if (data->regs_user.regs) {
u64 mask = event->attr.sample_regs_user;
size += hweight64(mask) * sizeof(u64);
}
header->size += size;
}
if (sample_type & PERF_SAMPLE_STACK_USER) {
/*
* Either we need PERF_SAMPLE_STACK_USER bit to be allways
* processed as the last one or have additional check added
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
stack_size = perf_sample_ustack_size(stack_size, header->size,
data->regs_user.regs);
/*
* If there is something to dump, add space for the dump
* itself and for the field that tells the dynamic size,
* which is how many have been actually dumped.
*/
if (stack_size)
size += sizeof(u64) + stack_size;
data->stack_user_size = stack_size;
header->size += size;
}
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
perf_sample_regs_intr(&data->regs_intr, regs);
if (data->regs_intr.regs) {
u64 mask = event->attr.sample_regs_intr;
size += hweight64(mask) * sizeof(u64);
}
header->size += size;
}
}
static void perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct perf_output_handle handle;
struct perf_event_header header;
/* protect the callchain buffers */
rcu_read_lock();
perf_prepare_sample(&header, data, event, regs);
if (perf_output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
perf_output_end(&handle);
exit:
rcu_read_unlock();
}
/*
* read event_id
*/
struct perf_read_event {
struct perf_event_header header;
u32 pid;
u32 tid;
};
static void
perf_event_read_event(struct perf_event *event,
struct task_struct *task)
{
struct perf_output_handle handle;
struct perf_sample_data sample;
struct perf_read_event read_event = {
.header = {
.type = PERF_RECORD_READ,
.misc = 0,
.size = sizeof(read_event) + event->read_size,
},
.pid = perf_event_pid(event, task),
.tid = perf_event_tid(event, task),
};
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
ret = perf_output_begin(&handle, event, read_event.header.size);
if (ret)
return;
perf_output_put(&handle, read_event);
perf_output_read(&handle, event);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
static void
perf_event_aux_ctx(struct perf_event_context *ctx,
perf_event_aux_output_cb output,
void *data)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state < PERF_EVENT_STATE_INACTIVE)
continue;
if (!event_filter_match(event))
continue;
output(event, data);
}
}
static void
perf_event_aux(perf_event_aux_output_cb output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
struct pmu *pmu;
int ctxn;
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_aux_ctx(&cpuctx->ctx, output, data);
if (task_ctx)
goto next;
ctxn = pmu->task_ctx_nr;
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
perf_event_aux_ctx(ctx, output, data);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
if (task_ctx) {
preempt_disable();
perf_event_aux_ctx(task_ctx, output, data);
preempt_enable();
}
rcu_read_unlock();
}
/*
* task tracking -- fork/exit
*
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
*/
struct perf_task_event {
struct task_struct *task;
struct perf_event_context *task_ctx;
struct {
struct perf_event_header header;
u32 pid;
u32 ppid;
u32 tid;
u32 ptid;
u64 time;
} event_id;
};
static int perf_event_task_match(struct perf_event *event)
{
return event->attr.comm || event->attr.mmap ||
event->attr.mmap2 || event->attr.mmap_data ||
event->attr.task;
}
static void perf_event_task_output(struct perf_event *event,
void *data)
{
struct perf_task_event *task_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
struct task_struct *task = task_event->task;
int ret, size = task_event->event_id.header.size;
if (!perf_event_task_match(event))
return;
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
task_event->event_id.header.size);
if (ret)
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
task_event->event_id.ppid = perf_event_pid(event, current);
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);
task_event->event_id.time = perf_event_clock(event);
perf_output_put(&handle, task_event->event_id);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
out:
task_event->event_id.header.size = size;
}
static void perf_event_task(struct task_struct *task,
struct perf_event_context *task_ctx,
int new)
{
struct perf_task_event task_event;
if (!atomic_read(&nr_comm_events) &&
!atomic_read(&nr_mmap_events) &&
!atomic_read(&nr_task_events))
return;
task_event = (struct perf_task_event){
.task = task,
.task_ctx = task_ctx,
.event_id = {
.header = {
.type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
.misc = 0,
.size = sizeof(task_event.event_id),
},
/* .pid */
/* .ppid */
/* .tid */
/* .ptid */
/* .time */
},
};
perf_event_aux(perf_event_task_output,
&task_event,
task_ctx);
}
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
}
/*
* comm tracking
*/
struct perf_comm_event {
struct task_struct *task;
char *comm;
int comm_size;
struct {
struct perf_event_header header;
u32 pid;
u32 tid;
} event_id;
};
static int perf_event_comm_match(struct perf_event *event)
{
return event->attr.comm;
}
static void perf_event_comm_output(struct perf_event *event,
void *data)
{
struct perf_comm_event *comm_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
int ret;
if (!perf_event_comm_match(event))
return;
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
comm_event->event_id.header.size);
if (ret)
goto out;
comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
perf_output_put(&handle, comm_event->event_id);
__output_copy(&handle, comm_event->comm,
comm_event->comm_size);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
out:
comm_event->event_id.header.size = size;
}
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
char comm[TASK_COMM_LEN];
unsigned int size;
memset(comm, 0, sizeof(comm));
strlcpy(comm, comm_event->task->comm, sizeof(comm));
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
perf_event_aux(perf_event_comm_output,
comm_event,
NULL);
}
void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
if (!atomic_read(&nr_comm_events))
return;
comm_event = (struct perf_comm_event){
.task = task,
/* .comm */
/* .comm_size */
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
.misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
/* .tid */
},
};
perf_event_comm_event(&comm_event);
}
/*
* mmap tracking
*/
struct perf_mmap_event {
struct vm_area_struct *vma;
const char *file_name;
int file_size;
int maj, min;
u64 ino;
u64 ino_generation;
u32 prot, flags;
struct {
struct perf_event_header header;
u32 pid;
u32 tid;
u64 start;
u64 len;
u64 pgoff;
} event_id;
};
static int perf_event_mmap_match(struct perf_event *event,
void *data)
{
struct perf_mmap_event *mmap_event = data;
struct vm_area_struct *vma = mmap_event->vma;
int executable = vma->vm_flags & VM_EXEC;
return (!executable && event->attr.mmap_data) ||
(executable && (event->attr.mmap || event->attr.mmap2));
}
static void perf_event_mmap_output(struct perf_event *event,
void *data)
{
struct perf_mmap_event *mmap_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
int ret;
if (!perf_event_mmap_match(event, data))
return;
if (event->attr.mmap2) {
mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
mmap_event->event_id.header.size += sizeof(mmap_event->maj);
mmap_event->event_id.header.size += sizeof(mmap_event->min);
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
mmap_event->event_id.header.size += sizeof(mmap_event->prot);
mmap_event->event_id.header.size += sizeof(mmap_event->flags);
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
ret = perf_output_begin(&handle, event,
mmap_event->event_id.header.size);
if (ret)
goto out;
mmap_event->event_id.pid = perf_event_pid(event, current);
mmap_event->event_id.tid = perf_event_tid(event, current);
perf_output_put(&handle, mmap_event->event_id);
if (event->attr.mmap2) {
perf_output_put(&handle, mmap_event->maj);
perf_output_put(&handle, mmap_event->min);
perf_output_put(&handle, mmap_event->ino);
perf_output_put(&handle, mmap_event->ino_generation);
perf_output_put(&handle, mmap_event->prot);
perf_output_put(&handle, mmap_event->flags);
}
__output_copy(&handle, mmap_event->file_name,
mmap_event->file_size);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
int maj = 0, min = 0;
u64 ino = 0, gen = 0;
u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
char *name;
if (file) {
struct inode *inode;
dev_t dev;
buf = kmalloc(PATH_MAX, GFP_KERNEL);
if (!buf) {
name = "//enomem";
goto cpy_name;
}
/*
* d_path() works from the end of the rb backwards, so we
* need to add enough zero bytes after the string to handle
* the 64bit alignment we do later.
*/
name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
if (IS_ERR(name)) {
name = "//toolong";
goto cpy_name;
}
inode = file_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
if (vma->vm_flags & VM_READ)
prot |= PROT_READ;
if (vma->vm_flags & VM_WRITE)
prot |= PROT_WRITE;
if (vma->vm_flags & VM_EXEC)
prot |= PROT_EXEC;
if (vma->vm_flags & VM_MAYSHARE)
flags = MAP_SHARED;
else
flags = MAP_PRIVATE;
if (vma->vm_flags & VM_DENYWRITE)
flags |= MAP_DENYWRITE;
if (vma->vm_flags & VM_MAYEXEC)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
if (vma->vm_flags & VM_HUGETLB)
flags |= MAP_HUGETLB;
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
name = (char *) vma->vm_ops->name(vma);
if (name)
goto cpy_name;
}
name = (char *)arch_vma_name(vma);
if (name)
goto cpy_name;
if (vma->vm_start <= vma->vm_mm->start_brk &&
vma->vm_end >= vma->vm_mm->brk) {
name = "[heap]";
goto cpy_name;
}
if (vma->vm_start <= vma->vm_mm->start_stack &&
vma->vm_end >= vma->vm_mm->start_stack) {
name = "[stack]";
goto cpy_name;
}
name = "//anon";
goto cpy_name;
}
cpy_name:
strlcpy(tmp, name, sizeof(tmp));
name = tmp;
got_name:
/*
* Since our buffer works in 8 byte units we need to align our string
* size to a multiple of 8. However, we must guarantee the tail end is
* zero'd out to avoid leaking random bits to userspace.
*/
size = strlen(name)+1;
while (!IS_ALIGNED(size, sizeof(u64)))
name[size++] = '\0';
mmap_event->file_name = name;
mmap_event->file_size = size;
mmap_event->maj = maj;
mmap_event->min = min;
mmap_event->ino = ino;
mmap_event->ino_generation = gen;
mmap_event->prot = prot;
mmap_event->flags = flags;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
perf_event_aux(perf_event_mmap_output,
mmap_event,
NULL);
kfree(buf);
}
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
if (!atomic_read(&nr_mmap_events))
return;
mmap_event = (struct perf_mmap_event){
.vma = vma,
/* .file_name */
/* .file_size */
.event_id = {
.header = {
.type = PERF_RECORD_MMAP,
.misc = PERF_RECORD_MISC_USER,
/* .size */
},
/* .pid */
/* .tid */
.start = vma->vm_start,
.len = vma->vm_end - vma->vm_start,
.pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
},
/* .maj (attr_mmap2 only) */
/* .min (attr_mmap2 only) */
/* .ino (attr_mmap2 only) */
/* .ino_generation (attr_mmap2 only) */
/* .prot (attr_mmap2 only) */
/* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
}
void perf_event_aux_event(struct perf_event *event, unsigned long head,
unsigned long size, u64 flags)
{
struct perf_output_handle handle;
struct perf_sample_data sample;
struct perf_aux_event {
struct perf_event_header header;
u64 offset;
u64 size;
u64 flags;
} rec = {
.header = {
.type = PERF_RECORD_AUX,
.misc = 0,
.size = sizeof(rec),
},
.offset = head,
.size = size,
.flags = flags,
};
int ret;
perf_event_header__init_id(&rec.header, &sample, event);
ret = perf_output_begin(&handle, event, rec.header.size);
if (ret)
return;
perf_output_put(&handle, rec);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
/*
* IRQ throttle logging
*/
static void perf_log_throttle(struct perf_event *event, int enable)
{
struct perf_output_handle handle;
struct perf_sample_data sample;
int ret;
struct {
struct perf_event_header header;
u64 time;
u64 id;
u64 stream_id;
} throttle_event = {
.header = {
.type = PERF_RECORD_THROTTLE,
.misc = 0,
.size = sizeof(throttle_event),
},
.time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
if (enable)
throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
perf_event_header__init_id(&throttle_event.header, &sample, event);
ret = perf_output_begin(&handle, event,
throttle_event.header.size);
if (ret)
return;
perf_output_put(&handle, throttle_event);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
struct perf_sample_data sample;
struct perf_aux_event {
struct perf_event_header header;
u32 pid;
u32 tid;
} rec;
int ret;
if (event->parent)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
event->hw.itrace_started)
return;
event->hw.itrace_started = 1;
rec.header.type = PERF_RECORD_ITRACE_START;
rec.header.misc = 0;
rec.header.size = sizeof(rec);
rec.pid = perf_event_pid(event, current);
rec.tid = perf_event_tid(event, current);
perf_event_header__init_id(&rec.header, &sample, event);
ret = perf_output_begin(&handle, event, rec.header.size);
if (ret)
return;
perf_output_put(&handle, rec);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
/*
* Generic event overflow handling, sampling.
*/
static int __perf_event_overflow(struct perf_event *event,
int throttle, struct perf_sample_data *data,
struct pt_regs *regs)
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
u64 seq;
int ret = 0;
/*
* Non-sampling counters might still use the PMI to fold short
* hardware counters, ignore those.
*/
if (unlikely(!is_sampling_event(event)))
return 0;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
hwc->interrupts_seq = seq;
hwc->interrupts = 1;
} else {
hwc->interrupts++;
if (unlikely(throttle
&& hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
tick_nohz_full_kick();
ret = 1;
}
}
if (event->attr.freq) {
u64 now = perf_clock();
s64 delta = now - hwc->freq_time_stamp;
hwc->freq_time_stamp = now;
if (delta > 0 && delta < 2*TICK_NSEC)
perf_adjust_period(event, delta, hwc->last_period, true);
}
/*
* XXX event_limit might not quite work as expected on inherited
* events
*/
event->pending_kill = POLL_IN;
if (events && atomic_dec_and_test(&event->event_limit)) {
ret = 1;
event->pending_kill = POLL_HUP;
event->pending_disable = 1;
irq_work_queue(&event->pending);
}
if (event->overflow_handler)
event->overflow_handler(event, data, regs);
else
perf_event_output(event, data, regs);
if (event->fasync && event->pending_kill) {
event->pending_wakeup = 1;
irq_work_queue(&event->pending);
}
return ret;
}
int perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
return __perf_event_overflow(event, 1, data, regs);
}
/*
* Generic software event infrastructure
*/
struct swevent_htable {
struct swevent_hlist *swevent_hlist;
struct mutex hlist_mutex;
int hlist_refcount;
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
/* Keeps track of cpu being initialized/exited */
bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
/*
* We directly increment event->count and keep a second value in
* event->hw.period_left to count intervals. This period event
* is kept in the range [-sample_period, 0] so that we can use the
* sign as trigger.
*/
u64 perf_swevent_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
u64 period = hwc->last_period;
u64 nr, offset;
s64 old, val;
hwc->last_period = hwc->sample_period;
again:
old = val = local64_read(&hwc->period_left);
if (val < 0)
return 0;
nr = div64_u64(period + val, period);
offset = nr * period;
val -= offset;
if (local64_cmpxchg(&hwc->period_left, old, val) != old)
goto again;
return nr;
}
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
int throttle = 0;
if (!overflow)
overflow = perf_swevent_set_period(event);
if (hwc->interrupts == MAX_INTERRUPTS)
return;
for (; overflow; overflow--) {
if (__perf_event_overflow(event, throttle,
data, regs)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
throttle = 1;
}
}
static void perf_swevent_event(struct perf_event *event, u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct hw_perf_event *hwc = &event->hw;
local64_add(nr, &event->count);
if (!regs)
return;
if (!is_sampling_event(event))
return;
if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
data->period = nr;
return perf_swevent_overflow(event, 1, data, regs);
} else
data->period = event->hw.last_period;
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, data, regs);
if (local64_add_negative(nr, &hwc->period_left))
return;
perf_swevent_overflow(event, 0, data, regs);
}
static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 1;
if (regs) {
if (event->attr.exclude_user && user_mode(regs))
return 1;
if (event->attr.exclude_kernel && !user_mode(regs))
return 1;
}
return 0;
}
static int perf_swevent_match(struct perf_event *event,
enum perf_type_id type,
u32 event_id,
struct perf_sample_data *data,
struct pt_regs *regs)
{
if (event->attr.type != type)
return 0;
if (event->attr.config != event_id)
return 0;
if (perf_exclude_event(event, regs))
return 0;
return 1;
}
static inline u64 swevent_hash(u64 type, u32 event_id)
{
u64 val = event_id | (type << 32);
return hash_64(val, SWEVENT_HLIST_BITS);
}
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
u64 hash = swevent_hash(type, event_id);
return &hlist->heads[hash];
}
/* For the read side: events when they trigger */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
struct swevent_hlist *hlist;
hlist = rcu_dereference(swhash->swevent_hlist);
if (!hlist)
return NULL;
return __find_swevent_head(hlist, type, event_id);
}
/* For the event head insertion and removal in the hlist */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
struct swevent_hlist *hlist;
u32 event_id = event->attr.config;
u64 type = event->attr.type;
/*
* Event scheduling is always serialized against hlist allocation
* and release. Which makes the protected version suitable here.
* The context lock guarantees that.
*/
hlist = rcu_dereference_protected(swhash->swevent_hlist,
lockdep_is_held(&event->ctx->lock));
if (!hlist)
return NULL;
return __find_swevent_head(hlist, type, event_id);
}
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
u64 nr,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct perf_event *event;
struct hlist_head *head;
rcu_read_lock();
head = find_swevent_head_rcu(swhash, type, event_id);
if (!head)
goto end;
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_swevent_match(event, type, event_id, data, regs))
perf_swevent_event(event, nr, data, regs);
}
end:
rcu_read_unlock();
}
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
int perf_swevent_get_recursion_context(void)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
return get_recursion_context(swhash->recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
inline void perf_swevent_put_recursion_context(int rctx)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
put_recursion_context(swhash->recursion, rctx);
}
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data;
if (WARN_ON_ONCE(!regs))
return;
perf_sample_data_init(&data, addr, 0);
do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
}
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
int rctx;
preempt_disable_notrace();
rctx = perf_swevent_get_recursion_context();
if (unlikely(rctx < 0))
goto fail;
___perf_sw_event(event_id, nr, regs, addr);
perf_swevent_put_recursion_context(rctx);
fail:
preempt_enable_notrace();
}
static void perf_swevent_read(struct perf_event *event)
{
}
static int perf_swevent_add(struct perf_event *event, int flags)
{
struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
struct hw_perf_event *hwc = &event->hw;
struct hlist_head *head;
if (is_sampling_event(event)) {
hwc->last_period = hwc->sample_period;
perf_swevent_set_period(event);
}
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
if (!head) {
/*
* We can race with cpu hotplug code. Do not
* WARN if the cpu just got unplugged.
*/
WARN_ON_ONCE(swhash->online);
return -EINVAL;
}
hlist_add_head_rcu(&event->hlist_entry, head);
perf_event_update_userpage(event);
return 0;
}
static void perf_swevent_del(struct perf_event *event, int flags)
{
hlist_del_rcu(&event->hlist_entry);
}
static void perf_swevent_start(struct perf_event *event, int flags)
{
event->hw.state = 0;
}
static void perf_swevent_stop(struct perf_event *event, int flags)
{
event->hw.state = PERF_HES_STOPPED;
}
/* Deref the hlist from the update side */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
return rcu_dereference_protected(swhash->swevent_hlist,
lockdep_is_held(&swhash->hlist_mutex));
}
static void swevent_hlist_release(struct swevent_htable *swhash)
{
struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
if (!hlist)
return;
RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
kfree_rcu(hlist, rcu_head);
}
static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
if (!--swhash->hlist_refcount)
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}
static void swevent_hlist_put(struct perf_event *event)
{
int cpu;
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
}
static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
int err = 0;
mutex_lock(&swhash->hlist_mutex);
if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
struct swevent_hlist *hlist;
hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
if (!hlist) {
err = -ENOMEM;
goto exit;
}
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
swhash->hlist_refcount++;
exit:
mutex_unlock(&swhash->hlist_mutex);
return err;
}
static int swevent_hlist_get(struct perf_event *event)
{
int err;
int cpu, failed_cpu;
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
if (err) {
failed_cpu = cpu;
goto fail;
}
}
put_online_cpus();
return 0;
fail:
for_each_possible_cpu(cpu) {
if (cpu == failed_cpu)
break;
swevent_hlist_put_cpu(event, cpu);
}
put_online_cpus();
return err;
}
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event)
{
u64 event_id = event->attr.config;
WARN_ON(event->parent);
static_key_slow_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
}
static int perf_swevent_init(struct perf_event *event)
{
u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
/*
* no branch sampling for software events
*/
if (has_branch_stack(event))
return -EOPNOTSUPP;
switch (event_id) {
case PERF_COUNT_SW_CPU_CLOCK:
case PERF_COUNT_SW_TASK_CLOCK:
return -ENOENT;
default:
break;
}
if (event_id >= PERF_COUNT_SW_MAX)
return -ENOENT;
if (!event->parent) {
int err;
err = swevent_hlist_get(event);
if (err)
return err;
static_key_slow_inc(&perf_swevent_enabled[event_id]);
event->destroy = sw_perf_event_destroy;
}
return 0;
}
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
};
#ifdef CONFIG_EVENT_TRACING
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
void *record = data->raw->data;
if (likely(!event->filter) || filter_match_preds(event->filter, record))
return 1;
return 0;
}
static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 0;
/*
* All tracepoints are from kernel-space.
*/
if (event->attr.exclude_kernel)
return 0;
if (!perf_tp_filter_match(event, data))
return 0;
return 1;
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
{
struct perf_sample_data data;
struct perf_event *event;
struct perf_raw_record raw = {
.size = entry_size,
.data = record,
};
perf_sample_data_init(&data, addr, 0);
data.raw = &raw;
hlist_for_each_entry_rcu(event, head, hlist_entry) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
/*
* If we got specified a target task, also iterate its context and
* deliver this event there too.
*/
if (task && task != current) {
struct perf_event_context *ctx;
struct trace_entry *entry = record;
rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
if (!ctx)
goto unlock;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->attr.type != PERF_TYPE_TRACEPOINT)
continue;
if (event->attr.config != entry->type)
continue;
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
unlock:
rcu_read_unlock();
}
perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
static void tp_perf_event_destroy(struct perf_event *event)
{
perf_trace_destroy(event);
}
static int perf_tp_event_init(struct perf_event *event)
{
int err;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -ENOENT;
/*
* no branch sampling for tracepoint events
*/
if (has_branch_stack(event))
return -EOPNOTSUPP;
err = perf_trace_init(event);
if (err)
return err;
event->destroy = tp_perf_event_destroy;
return 0;
}
static struct pmu perf_tracepoint = {
.task_ctx_nr = perf_sw_context,
.event_init = perf_tp_event_init,
.add = perf_trace_add,
.del = perf_trace_del,
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
};
static inline void perf_tp_register(void)
{
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
char *filter_str;
int ret;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
filter_str = strndup_user(arg, PAGE_SIZE);
if (IS_ERR(filter_str))
return PTR_ERR(filter_str);
ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
kfree(filter_str);
return ret;
}
static void perf_event_free_filter(struct perf_event *event)
{
ftrace_profile_free_filter(event);
}
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
if (event->tp_event->prog)
return -EEXIST;
if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
/* bpf programs can only be attached to kprobes */
return -EINVAL;
prog = bpf_prog_get(prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (prog->type != BPF_PROG_TYPE_KPROBE) {
/* valid fd, but invalid bpf program type */
bpf_prog_put(prog);
return -EINVAL;
}
event->tp_event->prog = prog;
return 0;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
struct bpf_prog *prog;
if (!event->tp_event)
return;
prog = event->tp_event->prog;
if (prog) {
event->tp_event->prog = NULL;
bpf_prog_put(prog);
}
}
#else
static inline void perf_tp_register(void)
{
}
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
return -ENOENT;
}
static void perf_event_free_filter(struct perf_event *event)
{
}
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
return -ENOENT;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
void perf_bp_event(struct perf_event *bp, void *data)
{
struct perf_sample_data sample;
struct pt_regs *regs = data;
perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
if (!bp->hw.state && !perf_exclude_event(bp, regs))
perf_swevent_event(bp, 1, &sample, regs);
}
#endif
/*
* hrtimer based swevent callback
*/
static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
struct pt_regs *regs;
struct perf_event *event;
u64 period;
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
if (event->state != PERF_EVENT_STATE_ACTIVE)
return HRTIMER_NORESTART;
event->pmu->read(event);
perf_sample_data_init(&data, 0, event->hw.last_period);
regs = get_irq_regs();
if (regs && !perf_exclude_event(event, regs)) {
if (!(event->attr.exclude_idle && is_idle_task(current)))
if (__perf_event_overflow(event, 1, &data, regs))
ret = HRTIMER_NORESTART;
}
period = max_t(u64, 10000, event->hw.sample_period);
hrtimer_forward_now(hrtimer, ns_to_ktime(period));
return ret;
}
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
s64 period;
if (!is_sampling_event(event))
return;
period = local64_read(&hwc->period_left);
if (period) {
if (period < 0)
period = 10000;
local64_set(&hwc->period_left, 0);
} else {
period = max_t(u64, 10000, hwc->sample_period);
}
__hrtimer_start_range_ns(&hwc->hrtimer,
ns_to_ktime(period), 0,
HRTIMER_MODE_REL_PINNED, 0);
}
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (is_sampling_event(event)) {
ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
local64_set(&hwc->period_left, ktime_to_ns(remaining));
hrtimer_cancel(&hwc->hrtimer);
}
}
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (!is_sampling_event(event))
return;
hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hwc->hrtimer.function = perf_swevent_hrtimer;
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
* mapping and avoid the whole period adjust feedback stuff.
*/
if (event->attr.freq) {
long freq = event->attr.sample_freq;
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
/*
* Software event: cpu wall time clock
*/
static void cpu_clock_event_update(struct perf_event *event)
{
s64 prev;
u64 now;
now = local_clock();
prev = local64_xchg(&event->hw.prev_count, now);
local64_add(now - prev, &event->count);
}
static void cpu_clock_event_start(struct perf_event *event, int flags)
{
local64_set(&event->hw.prev_count, local_clock());
perf_swevent_start_hrtimer(event);
}
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
cpu_clock_event_update(event);
}
static int cpu_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
cpu_clock_event_start(event, flags);
perf_event_update_userpage(event);
return 0;
}
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
cpu_clock_event_stop(event, flags);
}
static void cpu_clock_event_read(struct perf_event *event)
{
cpu_clock_event_update(event);
}
static int cpu_clock_event_init(struct perf_event *event)
{
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
/*
* no branch sampling for software events
*/
if (has_branch_stack(event))
return -EOPNOTSUPP;
perf_swevent_init_hrtimer(event);
return 0;
}
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
};
/*
* Software event: task time clock
*/
static void task_clock_event_update(struct perf_event *event, u64 now)
{
u64 prev;
s64 delta;
prev = local64_xchg(&event->hw.prev_count, now);
delta = now - prev;
local64_add(delta, &event->count);
}
static void task_clock_event_start(struct perf_event *event, int flags)
{
local64_set(&event->hw.prev_count, event->ctx->time);
perf_swevent_start_hrtimer(event);
}
static void task_clock_event_stop(struct perf_event *event, int flags)
{
perf_swevent_cancel_hrtimer(event);
task_clock_event_update(event, event->ctx->time);
}
static int task_clock_event_add(struct perf_event *event, int flags)
{
if (flags & PERF_EF_START)
task_clock_event_start(event, flags);
perf_event_update_userpage(event);
return 0;
}
static void task_clock_event_del(struct perf_event *event, int flags)
{
task_clock_event_stop(event, PERF_EF_UPDATE);
}
static void task_clock_event_read(struct perf_event *event)
{
u64 now = perf_clock();
u64 delta = now - event->ctx->timestamp;
u64 time = event->ctx->time + delta;
task_clock_event_update(event, time);
}
static int task_clock_event_init(struct perf_event *event)
{
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
/*
* no branch sampling for software events
*/
if (has_branch_stack(event))
return -EOPNOTSUPP;
perf_swevent_init_hrtimer(event);
return 0;
}
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,
.capabilities = PERF_PMU_CAP_NO_NMI,
.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
};
static void perf_pmu_nop_void(struct pmu *pmu)
{
}
static int perf_pmu_nop_int(struct pmu *pmu)
{
return 0;
}
static void perf_pmu_start_txn(struct pmu *pmu)
{
perf_pmu_disable(pmu);
}
static int perf_pmu_commit_txn(struct pmu *pmu)
{
perf_pmu_enable(pmu);
return 0;
}
static void perf_pmu_cancel_txn(struct pmu *pmu)
{
perf_pmu_enable(pmu);
}
static int perf_event_idx_default(struct perf_event *event)
{
return 0;
}
/*
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
if (ctxn < 0)
return NULL;
list_for_each_entry(pmu, &pmus, entry) {
if (pmu->task_ctx_nr == ctxn)
return pmu->pmu_cpu_context;
}
return NULL;
}
static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
{
int cpu;
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
if (cpuctx->unique_pmu == old_pmu)
cpuctx->unique_pmu = pmu;
}
}
static void free_pmu_context(struct pmu *pmu)
{
struct pmu *i;
mutex_lock(&pmus_lock);
/*
* Like a real lame refcount.
*/
list_for_each_entry(i, &pmus, entry) {
if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
update_pmu_context(i, pmu);
goto out;
}
}
free_percpu(pmu->pmu_cpu_context);
out:
mutex_unlock(&pmus_lock);
}
static struct idr pmu_idr;
static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
}
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct pmu *pmu = dev_get_drvdata(dev);
int timer, cpu, ret;
ret = kstrtoint(buf, 0, &timer);
if (ret)
return ret;
if (timer < 1)
return -EINVAL;
/* same value, noting to do */
if (timer == pmu->hrtimer_interval_ms)
return count;
pmu->hrtimer_interval_ms = timer;
/* update all cpuctx for this PMU */
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
if (hrtimer_active(&cpuctx->hrtimer))
hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
}
return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
NULL,
};
ATTRIBUTE_GROUPS(pmu_dev);
static int pmu_bus_running;
static struct bus_type pmu_bus = {
.name = "event_source",
.dev_groups = pmu_dev_groups,
};
static void pmu_dev_release(struct device *dev)
{
kfree(dev);
}
static int pmu_dev_alloc(struct pmu *pmu)
{
int ret = -ENOMEM;
pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
if (!pmu->dev)
goto out;
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
ret = dev_set_name(pmu->dev, "%s", pmu->name);
if (ret)
goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
out:
return ret;
free_dev:
put_device(pmu->dev);
goto out;
}
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;
int perf_pmu_register(struct pmu *pmu, const char *name, int type)
{
int cpu, ret;
mutex_lock(&pmus_lock);
ret = -ENOMEM;
pmu->pmu_disable_count = alloc_percpu(int);
if (!pmu->pmu_disable_count)
goto unlock;
pmu->type = -1;
if (!name)
goto skip_type;
pmu->name = name;
if (type < 0) {
type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
if (type < 0) {
ret = type;
goto free_pdc;
}
}
pmu->type = type;
if (pmu_bus_running) {
ret = pmu_dev_alloc(pmu);
if (ret)
goto free_idr;
}
skip_type:
pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
if (pmu->pmu_cpu_context)
goto got_cpu_context;
ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
struct perf_cpu_context *cpuctx;
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
__perf_event_init_context(&cpuctx->ctx);
lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
cpuctx->ctx.pmu = pmu;
__perf_cpu_hrtimer_init(cpuctx, cpu);
cpuctx->unique_pmu = pmu;
}
got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
* If we have pmu_enable/pmu_disable calls, install
* transaction stubs that use that to try and batch
* hardware accesses.
*/
pmu->start_txn = perf_pmu_start_txn;
pmu->commit_txn = perf_pmu_commit_txn;
pmu->cancel_txn = perf_pmu_cancel_txn;
} else {
pmu->start_txn = perf_pmu_nop_void;
pmu->commit_txn = perf_pmu_nop_int;
pmu->cancel_txn = perf_pmu_nop_void;
}
}
if (!pmu->pmu_enable) {
pmu->pmu_enable = perf_pmu_nop_void;
pmu->pmu_disable = perf_pmu_nop_void;
}
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
list_add_rcu(&pmu->entry, &pmus);
atomic_set(&pmu->exclusive_cnt, 0);
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
return ret;
free_dev:
device_del(pmu->dev);
put_device(pmu->dev);
free_idr:
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
free_pdc:
free_percpu(pmu->pmu_disable_count);
goto unlock;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
mutex_lock(&pmus_lock);
list_del_rcu(&pmu->entry);
mutex_unlock(&pmus_lock);
/*
* We dereference the pmu list under both SRCU and regular RCU, so
* synchronize against both of those.
*/
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
free_percpu(pmu->pmu_disable_count);
if (pmu->type >= PERF_TYPE_MAX)
idr_remove(&pmu_idr, pmu->type);
device_del(pmu->dev);
put_device(pmu->dev);
free_pmu_context(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
int ret;
if (!try_module_get(pmu->module))
return -ENODEV;
if (event->group_leader != event) {
ctx = perf_event_ctx_lock(event->group_leader);
BUG_ON(!ctx);
}
event->pmu = pmu;
ret = pmu->event_init(event);
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
if (ret)
module_put(pmu->module);
return ret;
}
struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
int idx;
int ret;
idx = srcu_read_lock(&pmus_srcu);
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
ret = perf_try_init_event(pmu, event);
if (ret)
pmu = ERR_PTR(ret);
goto unlock;
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
if (ret != -ENOENT) {
pmu = ERR_PTR(ret);
goto unlock;
}
}
pmu = ERR_PTR(-ENOENT);
unlock:
srcu_read_unlock(&pmus_srcu, idx);
return pmu;
}
static void account_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
return;
if (is_cgroup_event(event))
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
static void account_event(struct perf_event *event)
{
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
static_key_slow_inc(&perf_sched_events.key);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
atomic_inc(&nr_comm_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq) {
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
if (has_branch_stack(event))
static_key_slow_inc(&perf_sched_events.key);
if (is_cgroup_event(event))
static_key_slow_inc(&perf_sched_events.key);
account_event_cpu(event, event->cpu);
}
/*
* Allocate and initialize a event structure
*/
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
struct perf_event *group_leader,
struct perf_event *parent_event,
perf_overflow_handler_t overflow_handler,
void *context, int cgroup_fd)
{
struct pmu *pmu;
struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
if ((unsigned)cpu >= nr_cpu_ids) {
if (!task || cpu != -1)
return ERR_PTR(-EINVAL);
}
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return ERR_PTR(-ENOMEM);
/*
* Single events are their own group leaders, with an
* empty sibling list:
*/
if (!group_leader)
group_leader = event;
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
INIT_HLIST_NODE(&event->hlist_entry);
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
event->pmu = NULL;
event->oncpu = -1;
event->parent = parent_event;
event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);
event->state = PERF_EVENT_STATE_INACTIVE;
if (task) {
event->attach_state = PERF_ATTACH_TASK;
/*
* XXX pmu::event_init needs to know what task to account to
* and we cannot use the ctx information because we need the
* pmu before we get a ctx.
*/
event->hw.target = task;
}
event->clock = &local_clock;
if (parent_event)
event->clock = parent_event->clock;
if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
}
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
perf_event__state_init(event);
pmu = NULL;
hwc = &event->hw;
hwc->sample_period = attr->sample_period;
if (attr->freq && attr->sample_freq)
hwc->sample_period = 1;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
/*
* we currently do not support PERF_FORMAT_GROUP on inherited events
*/
if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
goto err_ns;
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
goto err_ns;
}
pmu = perf_init_event(event);
if (!pmu)
goto err_ns;
else if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
goto err_ns;
}
err = exclusive_event_init(event);
if (err)
goto err_pmu;
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers();
if (err)
goto err_per_task;
}
}
return event;
err_per_task:
exclusive_event_destroy(event);
err_pmu:
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
if (is_cgroup_event(event))
perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
kfree(event);
return ERR_PTR(err);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
struct perf_event_attr *attr)
{
u32 size;
int ret;
if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
return -EFAULT;
/*
* zero the full structure, so that a short copy will be nice.
*/
memset(attr, 0, sizeof(*attr));
ret = get_user(size, &uattr->size);
if (ret)
return ret;
if (size > PAGE_SIZE) /* silly large */
goto err_size;
if (!size) /* abi compat */
size = PERF_ATTR_SIZE_VER0;
if (size < PERF_ATTR_SIZE_VER0)
goto err_size;
/*
* If we're handed a bigger struct than we know of,
* ensure all the unknown bits are 0 - i.e. new
* user-space does not rely on any kernel feature
* extensions we dont know about yet.
*/
if (size > sizeof(*attr)) {
unsigned char __user *addr;
unsigned char __user *end;
unsigned char val;
addr = (void __user *)uattr + sizeof(*attr);
end = (void __user *)uattr + size;
for (; addr < end; addr++) {
ret = get_user(val, addr);
if (ret)
return ret;
if (val)
goto err_size;
}
size = sizeof(*attr);
}
ret = copy_from_user(attr, uattr, size);
if (ret)
return -EFAULT;
if (attr->__reserved_1)
return -EINVAL;
if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
return -EINVAL;
if (attr->read_format & ~(PERF_FORMAT_MAX-1))
return -EINVAL;
if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
u64 mask = attr->branch_sample_type;
/* only using defined bits */
if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
return -EINVAL;
/* at least one branch bit must be set */
if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
return -EINVAL;
/* propagate priv level, when not set for branch */
if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
/* exclude_kernel checked on syscall entry */
if (!attr->exclude_kernel)
mask |= PERF_SAMPLE_BRANCH_KERNEL;
if (!attr->exclude_user)
mask |= PERF_SAMPLE_BRANCH_USER;
if (!attr->exclude_hv)
mask |= PERF_SAMPLE_BRANCH_HV;
/*
* adjust user setting (for HW filter setup)
*/
attr->branch_sample_type = mask;
}
/* privileged levels capture (kernel, hv): check permissions */
if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
&& perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
}
if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
ret = perf_reg_validate(attr->sample_regs_user);
if (ret)
return ret;
}
if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
if (!arch_perf_have_user_stack_dump())
return -ENOSYS;
/*
* We have __u32 type for the size, but so far
* we can only use __u16 as maximum due to the
* __u16 sample size limit.
*/
if (attr->sample_stack_user >= USHRT_MAX)
ret = -EINVAL;
else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
ret = -EINVAL;
}
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
out:
return ret;
err_size:
put_user(sizeof(*attr), &uattr->size);
ret = -E2BIG;
goto out;
}
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
goto set;
/* don't allow circular references */
if (event == output_event)
goto out;
/*
* Don't allow cross-cpu buffers
*/
if (output_event->cpu != event->cpu)
goto out;
/*
* If its not a per-cpu rb, it must be the same task.
*/
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;
/*
* Mixing clocks in the same buffer is trouble you don't need.
*/
if (output_event->clock != event->clock)
goto out;
/*
* If both events generate aux data, they must be on the same PMU
*/
if (has_aux(event) && has_aux(output_event) &&
event->pmu != output_event->pmu)
goto out;
set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
goto unlock;
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
if (!rb)
goto unlock;
}
ring_buffer_attach(event, rb);
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
out:
return ret;
}
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
if (b < a)
swap(a, b);
mutex_lock(a);
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
bool nmi_safe = false;
switch (clk_id) {
case CLOCK_MONOTONIC:
event->clock = &ktime_get_mono_fast_ns;
nmi_safe = true;
break;
case CLOCK_MONOTONIC_RAW:
event->clock = &ktime_get_raw_fast_ns;
nmi_safe = true;
break;
case CLOCK_REALTIME:
event->clock = &ktime_get_real_ns;
break;
case CLOCK_BOOTTIME:
event->clock = &ktime_get_boot_ns;
break;
case CLOCK_TAI:
event->clock = &ktime_get_tai_ns;
break;
default:
return -EINVAL;
}
if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
return -EINVAL;
return 0;
}
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
* @attr_uptr: event_id type attributes for monitoring/sampling
* @pid: target pid
* @cpu: target cpu
* @group_fd: group leader event fd
*/
SYSCALL_DEFINE5(perf_event_open,
struct perf_event_attr __user *, attr_uptr,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
struct perf_event_context *ctx, *uninitialized_var(gctx);
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
struct pmu *pmu;
int event_fd;
int move_group = 0;
int err;
int f_flags = O_RDWR;
int cgroup_fd = -1;
/* for future expandability... */
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
}
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
} else {
if (attr.sample_period & (1ULL << 63))
return -EINVAL;
}
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
* designates the cpu on which to monitor threads from that
* cgroup.
*/
if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
return -EINVAL;
if (flags & PERF_FLAG_FD_CLOEXEC)
f_flags |= O_CLOEXEC;
event_fd = get_unused_fd_flags(f_flags);
if (event_fd < 0)
return event_fd;
if (group_fd != -1) {
err = perf_fget_light(group_fd, &group);
if (err)
goto err_fd;
group_leader = group.file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
group_leader = NULL;
}
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
goto err_group_fd;
}
}
if (task && group_leader &&
group_leader->attr.inherit != attr.inherit) {
err = -EINVAL;
goto err_task;
}
get_online_cpus();
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err_cpus;
}
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
err = -ENOTSUPP;
goto err_alloc;
}
}
account_event(event);
/*
* Special case software events and allow them to be part of
* any hardware group.
*/
pmu = event->pmu;
if (attr.use_clockid) {
err = perf_event_set_clock(event, attr.clockid);
if (err)
goto err_alloc;
}
if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
/*
* If event and group_leader are not both a software
* event, and event is, then group leader is not.
*
* Allow the addition of software events to !software
* groups, this is safe because software events never
* fail to schedule.
*/
pmu = group_leader->pmu;
} else if (is_software_event(group_leader) &&
(group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
/*
* In case the group is a pure software group, and we
* try to add a hardware event, move the whole group to
* the hardware context.
*/
move_group = 1;
}
}
/*
* Get the target context (task or percpu):
*/
ctx = find_get_context(pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_alloc;
}
if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
err = -EBUSY;
goto err_context;
}
if (task) {
put_task_struct(task);
task = NULL;
}
/*
* Look up the group leader (we will attach this event to it):
*/
if (group_leader) {
err = -EINVAL;
/*
* Do not allow a recursive hierarchy (this new sibling
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
goto err_context;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
goto err_context;
/*
* Do not allow to attach to a group in a different
* task or CPU context:
*/
if (move_group) {
/*
* Make sure we're both on the same task, or both
* per-cpu events.
*/
if (group_leader->ctx->task != ctx->task)
goto err_context;
/*
* Make sure we're both events for the same CPU;
* grouping events for different CPUs is broken; since
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
goto err_context;
} else {
if (group_leader->ctx != ctx)
goto err_context;
}
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
goto err_context;
}
if (output_event) {
err = perf_event_set_output(event, output_event);
if (err)
goto err_context;
}
event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
goto err_context;
}
if (move_group) {
gctx = group_leader->ctx;
/*
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
mutex_lock_double(&gctx->mutex, &ctx->mutex);
perf_remove_from_context(group_leader, false);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling, false);
put_ctx(gctx);
}
} else {
mutex_lock(&ctx->mutex);
}
WARN_ON_ONCE(ctx->parent_ctx);
if (move_group) {
/*
* Wait for everybody to stop referencing the events through
* the old lists, before installing it on new lists.
*/
synchronize_rcu();
/*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
* (through the sibling list, which is still in-tact), we can
* end up with siblings installed in the wrong context.
*
* By installing siblings first we NO-OP because they're not
* reachable through the group lists.
*/
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
get_ctx(ctx);
}
/*
* Removing from the context ends up with disabled
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
get_ctx(ctx);
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
mutex_unlock(&ctx->mutex);
fput(event_file);
goto err_context;
}
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
if (move_group) {
mutex_unlock(&gctx->mutex);
put_ctx(gctx);
}
mutex_unlock(&ctx->mutex);
put_online_cpus();
event->owner = current;
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
/*
* Precalculate sample_data sizes
*/
perf_event__header_size(event);
perf_event__id_header_size(event);
/*
* Drop the reference on the group_event after placing the
* new event on the sibling_list. This ensures destruction
* of the group leader will find the pointer to itself in
* perf_group_detach().
*/
fdput(group);
fd_install(event_fd, event_file);
return event_fd;
err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
free_event(event);
err_cpus:
put_online_cpus();
err_task:
if (task)
put_task_struct(task);
err_group_fd:
fdput(group);
err_fd:
put_unused_fd(event_fd);
return err;
}
/**
* perf_event_create_kernel_counter
*
* @attr: attributes of the counter to create
* @cpu: cpu in which the counter is bound
* @task: task to profile (NULL for percpu)
*/
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
struct task_struct *task,
perf_overflow_handler_t overflow_handler,
void *context)
{
struct perf_event_context *ctx;
struct perf_event *event;
int err;
/*
* Get the target context (task or percpu):
*/
event = perf_event_alloc(attr, cpu, task, NULL, NULL,
overflow_handler, context, -1);
if (IS_ERR(event)) {
err = PTR_ERR(event);
goto err;
}
/* Mark owner so we could distinguish it from user events. */
event->owner = EVENT_OWNER_KERNEL;
account_event(event);
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
goto err_free;
}
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
if (!exclusive_event_installable(event, ctx)) {
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
err = -EBUSY;
goto err_free;
}
perf_install_in_context(ctx, event, cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
return event;
err_free:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
struct perf_event_context *src_ctx;
struct perf_event_context *dst_ctx;
struct perf_event *event, *tmp;
LIST_HEAD(events);
src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
/*
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
perf_remove_from_context(event, false);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
}
/*
* Wait for the events to quiesce before re-instating them.
*/
synchronize_rcu();
/*
* Re-instate events in 2 passes.
*
* Skip over group leaders and only install siblings on this first
* pass, siblings will not get enabled without a leader, however a
* leader will enable its siblings, even if those are still on the old
* context.
*/
list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
account_event_cpu(event, dst_cpu);
perf_install_in_context(dst_ctx, event, dst_cpu);
get_ctx(dst_ctx);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
list_del(&event->migrate_entry);
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
account_event_cpu(event, dst_cpu);
perf_install_in_context(dst_ctx, event, dst_cpu);
get_ctx(dst_ctx);
}
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
static void sync_child_event(struct perf_event *child_event,
struct task_struct *child)
{
struct perf_event *parent_event = child_event->parent;
u64 child_val;
if (child_event->attr.inherit_stat)
perf_event_read_event(child_event, child);
child_val = perf_event_count(child_event);
/*
* Add back the child's count to the parent's count:
*/
atomic64_add(child_val, &parent_event->child_count);
atomic64_add(child_event->total_time_enabled,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
/*
* Remove this event from the parent's list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
mutex_lock(&parent_event->child_mutex);
list_del_init(&child_event->child_list);
mutex_unlock(&parent_event->child_mutex);
/*
* Make sure user/parent get notified, that we just
* lost one event.
*/
perf_event_wakeup(parent_event);
/*
* Release the parent event, if this was the last
* reference to it.
*/
put_event(parent_event);
}
static void
__perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
* random child task.
*
* If we were to destroy the original group, all group related
* operations would cease to function properly after this random
* child dies.
*
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
perf_remove_from_context(child_event, !!child_event->parent);
/*
* It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
* events need to be zapped.
*/
if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
} else {
child_event->state = PERF_EVENT_STATE_EXIT;
perf_event_wakeup(child_event);
}
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
struct perf_event *child_event, *next;
struct perf_event_context *child_ctx, *clone_ctx = NULL;
unsigned long flags;
if (likely(!child->perf_event_ctxp[ctxn])) {
perf_event_task(child, NULL, 0);
return;
}
local_irq_save(flags);
/*
* We can't reschedule here because interrupts are disabled,
* and either child is current or it is a task that can't be
* scheduled, so we are now safe from rescheduling changing
* our context.
*/
child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
/*
* Take the context lock here so that if find_get_context is
* reading child->perf_event_ctxp, we wait until it has
* incremented the context's refcount before we do put_ctx below.
*/
raw_spin_lock(&child_ctx->lock);
task_ctx_sched_out(child_ctx);
child->perf_event_ctxp[ctxn] = NULL;
/*
* If this context is a clone; unclone it so it can't get
* swapped to another process while we're removing all
* the events from it.
*/
clone_ctx = unclone_ctx(child_ctx);
update_context_time(child_ctx);
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
/*
* Report the task dead after unscheduling the events so that we
* won't get any samples after PERF_RECORD_EXIT. We can however still
* get a few PERF_RECORD_READ events.
*/
perf_event_task(child, child_ctx, 0);
/*
* We can recurse on the same lock type through:
*
* __perf_event_exit_task()
* sync_child_event()
* put_event()
* mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
mutex_lock(&child_ctx->mutex);
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
put_ctx(child_ctx);
}
/*
* When a child task exits, feed back event values to parent events.
*/
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
owner_entry) {
list_del_init(&event->owner_entry);
/*
* Ensure the list deletion is visible before we clear
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
smp_wmb();
event->owner = NULL;
}
mutex_unlock(&child->perf_event_mutex);
for_each_task_context_nr(ctxn)
perf_event_exit_task_context(child, ctxn);
}
static void perf_free_event(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *parent = event->parent;
if (WARN_ON_ONCE(!parent))
return;
mutex_lock(&parent->child_mutex);
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
put_event(parent);
raw_spin_lock_irq(&ctx->lock);
perf_group_detach(event);
list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
free_event(event);
}
/*
* Free an unexposed, unused context as created by inheritance by
* perf_event_init_task below, used by fork() in case of fail.
*
* Not all locks are strictly required, but take them anyway to be nice and
* help out with the lockdep assertions.
*/
void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
int ctxn;
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
continue;
mutex_lock(&ctx->mutex);
again:
list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
group_entry)
perf_free_event(event, ctx);
list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
group_entry)
perf_free_event(event, ctx);
if (!list_empty(&ctx->pinned_groups) ||
!list_empty(&ctx->flexible_groups))
goto again;
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
}
}
void perf_event_delayed_put(struct task_struct *task)
{
int ctxn;
for_each_task_context_nr(ctxn)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
/*
* inherit a event from parent task to child task:
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
struct task_struct *child,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
enum perf_event_active_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
/*
* Instead of creating recursive hierarchies of events,
* we link inherited events back to the original parent,
* which has a filp for sure, which we use as the reference
* count:
*/
if (parent_event->parent)
parent_event = parent_event->parent;
child_event = perf_event_alloc(&parent_event->attr,
parent_event->cpu,
child,
group_leader, parent_event,
NULL, NULL, -1);
if (IS_ERR(child_event))
return child_event;
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
free_event(child_event);
return NULL;
}
get_ctx(child_ctx);
/*
* Make the child state follow the state of the parent event,
* not its attr.disabled bit. We hold the parent's mutex,
* so we won't race with perf_event_{en, dis}able_family.
*/
if (parent_state >= PERF_EVENT_STATE_INACTIVE)
child_event->state = PERF_EVENT_STATE_INACTIVE;
else
child_event->state = PERF_EVENT_STATE_OFF;
if (parent_event->attr.freq) {
u64 sample_period = parent_event->hw.sample_period;
struct hw_perf_event *hwc = &child_event->hw;
hwc->sample_period = sample_period;
hwc->last_period = sample_period;
local64_set(&hwc->period_left, sample_period);
}
child_event->ctx = child_ctx;
child_event->overflow_handler = parent_event->overflow_handler;
child_event->overflow_handler_context
= parent_event->overflow_handler_context;
/*
* Precalculate sample_data sizes
*/
perf_event__header_size(child_event);
perf_event__id_header_size(child_event);
/*
* Link it up in the child's context:
*/
raw_spin_lock_irqsave(&child_ctx->lock, flags);
add_event_to_ctx(child_event, child_ctx);
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
return child_event;
}
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
struct task_struct *child,
struct perf_event_context *child_ctx)
{
struct perf_event *leader;
struct perf_event *sub;
struct perf_event *child_ctr;
leader = inherit_event(parent_event, parent, parent_ctx,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
if (IS_ERR(child_ctr))
return PTR_ERR(child_ctr);
}
return 0;
}
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
struct task_struct *child, int ctxn,
int *inherited_all)
{
int ret;
struct perf_event_context *child_ctx;
if (!event->attr.inherit) {
*inherited_all = 0;
return 0;
}
child_ctx = child->perf_event_ctxp[ctxn];
if (!child_ctx) {
/*
* This is executed from the parent task context, so
* inherit events that have been marked for cloning.
* First allocate and initialize a context for the
* child.
*/
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
child->perf_event_ctxp[ctxn] = child_ctx;
}
ret = inherit_group(event, parent, parent_ctx,
child, child_ctx);
if (ret)
*inherited_all = 0;
return ret;
}
/*
* Initialize the perf_event context in task_struct
*/
static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
struct perf_event *event;
struct task_struct *parent = current;
int inherited_all = 1;
unsigned long flags;
int ret = 0;
if (likely(!parent->perf_event_ctxp[ctxn]))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
parent_ctx = perf_pin_task_context(parent, ctxn);
if (!parent_ctx)
return 0;
/*
* No need to check if parent_ctx != NULL here; since we saw
* it non-NULL earlier, the only reason for it to become NULL
* is if we exit, and since we're currently in the middle of
* a fork we can't be exiting at the same time.
*/
/*
* Lock the parent list. No need to lock the child - not PID
* hashed yet and not running, so nobody can access it.
*/
mutex_lock(&parent_ctx->mutex);
/*
* We dont have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
break;
}
/*
* We can't hold ctx->lock when iterating the ->flexible_group list due
* to allocations, but we need to prevent rotation because
* rotate_ctx() will change the list from interrupt context.
*/
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 1;
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
break;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
child_ctx = child->perf_event_ctxp[ctxn];
if (child_ctx && inherited_all) {
/*
* Mark the child context as a clone of the parent
* context, or of whatever the parent is a clone of.
*
* Note that if the parent is a clone, the holding of
* parent_ctx->lock avoids it from being uncloned.
*/
cloned_ctx = parent_ctx->parent_ctx;
if (cloned_ctx) {
child_ctx->parent_ctx = cloned_ctx;
child_ctx->parent_gen = parent_ctx->parent_gen;
} else {
child_ctx->parent_ctx = parent_ctx;
child_ctx->parent_gen = parent_ctx->generation;
}
get_ctx(child_ctx->parent_ctx);
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
put_ctx(parent_ctx);
return ret;
}
/*
* Initialize the perf_event context in task_struct
*/
int perf_event_init_task(struct task_struct *child)
{
int ctxn, ret;
memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
if (ret) {
perf_event_free_task(child);
return ret;
}
}
return 0;
}
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
int cpu;
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
}
}
static void perf_event_init_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
WARN_ON(!hlist);
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
static void __perf_event_exit_context(void *__info)
{
struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
rcu_read_lock();
list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
__perf_remove_from_context(&re);
rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_event_context *ctx;
struct pmu *pmu;
int idx;
idx = srcu_read_lock(&pmus_srcu);
list_for_each_entry_rcu(pmu, &pmus, entry) {
ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
mutex_unlock(&ctx->mutex);
}
srcu_read_unlock(&pmus_srcu, idx);
}
static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
perf_event_exit_cpu_context(cpu);
mutex_lock(&swhash->hlist_mutex);
swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
#endif
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
int cpu;
for_each_online_cpu(cpu)
perf_event_exit_cpu(cpu);
return NOTIFY_OK;
}
/*
* Run the perf reboot notifier at the very last possible moment so that
* the generic watchdog code runs as long as possible.
*/
static struct notifier_block perf_reboot_notifier = {
.notifier_call = perf_reboot,
.priority = INT_MIN,
};
static int
perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
default:
break;
}
return NOTIFY_OK;
}
void __init perf_event_init(void)
{
int ret;
idr_init(&pmu_idr);
perf_event_init_all_cpus();
init_srcu_struct(&pmus_srcu);
perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
/* do not patch jump label more than once per second */
jump_label_rate_limit(&perf_sched_events, HZ);
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
*/
BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
!= 1024);
}
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page)
{
struct perf_pmu_events_attr *pmu_attr =
container_of(attr, struct perf_pmu_events_attr, attr);
if (pmu_attr->event_str)
return sprintf(page, "%s\n", pmu_attr->event_str);
return 0;
}
static int __init perf_event_sysfs_init(void)
{
struct pmu *pmu;
int ret;
mutex_lock(&pmus_lock);
ret = bus_register(&pmu_bus);
if (ret)
goto unlock;
list_for_each_entry(pmu, &pmus, entry) {
if (!pmu->name || pmu->type < 0)
continue;
ret = pmu_dev_alloc(pmu);
WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
}
pmu_bus_running = 1;
ret = 0;
unlock:
mutex_unlock(&pmus_lock);
return ret;
}
device_initcall(perf_event_sysfs_init);
#ifdef CONFIG_CGROUP_PERF
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct perf_cgroup *jc;
jc = kzalloc(sizeof(*jc), GFP_KERNEL);
if (!jc)
return ERR_PTR(-ENOMEM);
jc->info = alloc_percpu(struct perf_cgroup_info);
if (!jc->info) {
kfree(jc);
return ERR_PTR(-ENOMEM);
}
return &jc->css;
}
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
free_percpu(jc->info);
kfree(jc);
}
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
return 0;
}
static void perf_cgroup_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
struct task_struct *task;
cgroup_taskset_for_each(task, tset)
task_function_call(task, __perf_cgroup_move, task);
}
static void perf_cgroup_exit(struct cgroup_subsys_state *css,
struct cgroup_subsys_state *old_css,
struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
* Ignore this case since the task hasn't ran yet, this avoids
* trying to poke a half freed task state from generic code.
*/
if (!(task->flags & PF_EXITING))
return;
task_function_call(task, __perf_cgroup_move, task);
}
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
.exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */