diff options
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 1012 |
1 files changed, 327 insertions, 685 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index 9efe7108ccaf..0e8457da6f95 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -25,6 +25,7 @@ #include <linux/reboot.h> #include <linux/vmstat.h> #include <linux/device.h> +#include <linux/export.h> #include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> @@ -36,6 +37,8 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> +#include "internal.h" + #include <asm/irq_regs.h> struct remote_function_call { @@ -200,6 +203,22 @@ __get_cpu_context(struct perf_event_context *ctx) return this_cpu_ptr(ctx->pmu->pmu_cpu_context); } +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + #ifdef CONFIG_CGROUP_PERF /* @@ -340,11 +359,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - perf_pmu_disable(cpuctx->ctx.pmu); - /* * perf_cgroup_events says at least one * context on this CPU has cgroup events. @@ -353,6 +369,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) * events for a context. */ if (cpuctx->ctx.nr_cgroups > 0) { + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); if (mode & PERF_CGROUP_SWOUT) { cpu_ctx_sched_out(cpuctx, EVENT_ALL); @@ -372,9 +390,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode) cpuctx->cgrp = perf_cgroup_from_task(task); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - perf_pmu_enable(cpuctx->ctx.pmu); } rcu_read_unlock(); @@ -382,14 +400,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* + * next is NULL when called from perf_event_enable_on_exec() + * that will systematically cause a cgroup_switch() + */ + if (next) + cgrp2 = perf_cgroup_from_task(next); + + /* + * only schedule out current cgroup events if we know + * that we are switching to a different cgroup. Otherwise, + * do no touch the cgroup events. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { - perf_cgroup_switch(task, PERF_CGROUP_SWIN); + struct perf_cgroup *cgrp1; + struct perf_cgroup *cgrp2 = NULL; + + /* + * we come here when we know perf_cgroup_events > 0 + */ + cgrp1 = perf_cgroup_from_task(task); + + /* prev can never be NULL */ + cgrp2 = perf_cgroup_from_task(prev); + + /* + * only need to schedule in cgroup events if we are changing + * cgroup during ctxsw. Cgroup events were not scheduled + * out of ctxsw out if that was not the case. + */ + if (cgrp1 != cgrp2) + perf_cgroup_switch(task, PERF_CGROUP_SWIN); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -501,11 +559,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { } -static inline void perf_cgroup_sched_out(struct task_struct *task) +static inline void perf_cgroup_sched_out(struct task_struct *task, + struct task_struct *next) { } -static inline void perf_cgroup_sched_in(struct task_struct *task) +static inline void perf_cgroup_sched_in(struct task_struct *prev, + struct task_struct *task) { } @@ -731,6 +791,7 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. + * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { @@ -1105,6 +1166,10 @@ static int __perf_remove_from_context(void *info) raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); + if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + ctx->is_active = 0; + cpuctx->task_ctx = NULL; + } raw_spin_unlock(&ctx->lock); return 0; @@ -1454,8 +1519,24 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *tsk); +static void task_ctx_sched_out(struct perf_event_context *ctx); +static void +ctx_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); + +static void perf_event_sched_in(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + struct task_struct *task) +{ + cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + if (ctx) + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); +} /* * Cross CPU call to install and enable a performance event @@ -1466,20 +1547,37 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_event *leader = event->group_leader; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx = cpuctx->task_ctx; + struct task_struct *task = current; + + perf_ctx_lock(cpuctx, task_ctx); + perf_pmu_disable(cpuctx->ctx.pmu); /* - * In case we're installing a new context to an already running task, - * could also happen before perf_event_task_sched_in() on architectures - * which do context switches with IRQs enabled. + * If there was an active task_ctx schedule it out. */ - if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx, ctx->task); + if (task_ctx) + task_ctx_sched_out(task_ctx); + + /* + * If the context we're installing events in is not the + * active task_ctx, flip them. + */ + if (ctx->task && task_ctx != ctx) { + if (task_ctx) + raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&ctx->lock); + task_ctx = ctx; + } + + if (task_ctx) { + cpuctx->task_ctx = task_ctx; + task = task_ctx->task; + } + + cpu_ctx_sched_out(cpuctx, EVENT_ALL); - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); /* * update cgrp time only if current cgrp @@ -1490,43 +1588,13 @@ static int __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (!event_filter_match(event)) - goto unlock; - /* - * Don't put the event on if it is disabled or if - * it is in a group and the group isn't on. + * Schedule everything back in */ - if (event->state != PERF_EVENT_STATE_INACTIVE || - (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) - goto unlock; + perf_event_sched_in(cpuctx, task_ctx, task); - /* - * An exclusive event can't go on if there are already active - * hardware events, and no hardware event can go on if there - * is already an exclusive event on. - */ - if (!group_can_go_on(event, cpuctx, 1)) - err = -EEXIST; - else - err = event_sched_in(event, cpuctx, ctx); - - if (err) { - /* - * This event couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the event group is pinned then put it in error state. - */ - if (leader != event) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } - } - -unlock: - raw_spin_unlock(&ctx->lock); + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, task_ctx); return 0; } @@ -1739,7 +1807,7 @@ out: raw_spin_unlock_irq(&ctx->lock); } -static int perf_event_refresh(struct perf_event *event, int refresh) +int perf_event_refresh(struct perf_event *event, int refresh) { /* * not supported on inherited events @@ -1752,36 +1820,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } +EXPORT_SYMBOL_GPL(perf_event_refresh); static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { struct perf_event *event; + int is_active = ctx->is_active; - raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); - ctx->is_active = 0; + ctx->is_active &= ~event_type; if (likely(!ctx->nr_events)) - goto out; + return; + update_context_time(ctx); update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) - goto out; + return; - if (event_type & EVENT_PINNED) { + perf_pmu_disable(ctx->pmu); + if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if (event_type & EVENT_FLEXIBLE) { + if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } -out: perf_pmu_enable(ctx->pmu); - raw_spin_unlock(&ctx->lock); } /* @@ -1929,8 +1996,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, rcu_read_unlock(); if (do_switch) { + raw_spin_lock(&ctx->lock); ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; + raw_spin_unlock(&ctx->lock); } } @@ -1962,11 +2031,10 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_out(task); + perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); @@ -1976,7 +2044,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, cpuctx, EVENT_ALL); cpuctx->task_ctx = NULL; } @@ -2055,11 +2123,11 @@ ctx_sched_in(struct perf_event_context *ctx, struct task_struct *task) { u64 now; + int is_active = ctx->is_active; - raw_spin_lock(&ctx->lock); - ctx->is_active = 1; + ctx->is_active |= event_type; if (likely(!ctx->nr_events)) - goto out; + return; now = perf_clock(); ctx->timestamp = now; @@ -2068,15 +2136,12 @@ ctx_sched_in(struct perf_event_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (event_type & EVENT_PINNED) + if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (event_type & EVENT_FLEXIBLE) + if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) ctx_flexible_sched_in(ctx, cpuctx); - -out: - raw_spin_unlock(&ctx->lock); } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, @@ -2088,19 +2153,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type, task); } -static void task_ctx_sched_in(struct perf_event_context *ctx, - enum event_type_t event_type) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = __get_cpu_context(ctx); - if (cpuctx->task_ctx == ctx) - return; - - ctx_sched_in(ctx, cpuctx, event_type, NULL); - cpuctx->task_ctx = ctx; -} - static void perf_event_context_sched_in(struct perf_event_context *ctx, struct task_struct *task) { @@ -2110,6 +2162,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; + perf_ctx_lock(cpuctx, ctx); perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -2118,18 +2171,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + perf_event_sched_in(cpuctx, ctx, task); cpuctx->task_ctx = ctx; + perf_pmu_enable(ctx->pmu); + perf_ctx_unlock(cpuctx, ctx); + /* * Since these rotations are per-cpu, we need to ensure the * cpu-context we got scheduled on is actually rotating. */ perf_pmu_rotate_start(ctx->pmu); - perf_pmu_enable(ctx->pmu); } /* @@ -2143,7 +2196,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * accessing the event control register. If a NMI hits, then it will * keep the event running. */ -void __perf_event_task_sched_in(struct task_struct *task) +void __perf_event_task_sched_in(struct task_struct *prev, + struct task_struct *task) { struct perf_event_context *ctx; int ctxn; @@ -2161,7 +2215,7 @@ void __perf_event_task_sched_in(struct task_struct *task) * cgroup event are system-wide mode only */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) - perf_cgroup_sched_in(task); + perf_cgroup_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2269,7 +2323,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; - raw_spin_lock(&ctx->lock); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2301,7 +2354,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (delta > 0) perf_adjust_period(event, period, delta); } - raw_spin_unlock(&ctx->lock); } /* @@ -2309,16 +2361,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) */ static void rotate_ctx(struct perf_event_context *ctx) { - raw_spin_lock(&ctx->lock); - /* * Rotate the first entry last of non-pinned groups. Rotation might be * disabled by the inheritance code. */ if (!ctx->rotate_disable) list_rotate_left(&ctx->flexible_groups); - - raw_spin_unlock(&ctx->lock); } /* @@ -2345,6 +2393,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) rotate = 1; } + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) @@ -2355,21 +2404,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_out(ctx, EVENT_FLEXIBLE); + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); - if (ctx) - task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + perf_event_sched_in(cpuctx, ctx, current); done: if (remove) list_del_init(&cpuctx->rotation_list); perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2423,10 +2471,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) * ctxswin cgroup events which are already scheduled * in. */ - perf_cgroup_sched_out(current); - task_ctx_sched_out(ctx, EVENT_ALL); + perf_cgroup_sched_out(current, NULL); raw_spin_lock(&ctx->lock); + task_ctx_sched_out(ctx); list_for_each_entry(event, &ctx->pinned_groups, group_entry) { ret = event_enable_on_exec(event, ctx); @@ -2835,16 +2883,12 @@ retry: unclone_ctx(ctx); ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { + } else { ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - get_ctx(ctx); - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -2856,14 +2900,14 @@ retry: else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; else { + get_ctx(ctx); ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { - put_task_struct(task); - kfree(ctx); + put_ctx(ctx); if (err == -EAGAIN) goto retry; @@ -2890,7 +2934,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_buffer_put(struct perf_buffer *buffer); +static void ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2913,9 +2957,9 @@ static void free_event(struct perf_event *event) } } - if (event->buffer) { - perf_buffer_put(event->buffer); - event->buffer = NULL; + if (event->rb) { + ring_buffer_put(event->rb); + event->rb = NULL; } if (is_cgroup_event(event)) @@ -2934,12 +2978,6 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - /* - * Remove from the PMU, can't get re-enabled since we got - * here because the last ref went. - */ - perf_event_disable(event); - WARN_ON_ONCE(ctx->parent_ctx); /* * There are two ways this annotation is useful: @@ -2956,8 +2994,8 @@ int perf_event_release_kernel(struct perf_event *event) mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); perf_group_detach(event); - list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event); mutex_unlock(&ctx->mutex); free_event(event); @@ -3149,13 +3187,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned int events = POLL_HUP; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) - events = atomic_xchg(&buffer->poll, 0); + rb = rcu_dereference(event->rb); + if (rb) + events = atomic_xchg(&rb->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -3358,6 +3396,18 @@ static int perf_event_index(struct perf_event *event) return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; } +static void calc_timer_values(struct perf_event *event, + u64 *enabled, + u64 *running) +{ + u64 now, ctx_time; + + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + *enabled = ctx_time - event->tstamp_enabled; + *running = ctx_time - event->tstamp_running; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3366,14 +3416,25 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_buffer *buffer; + struct ring_buffer *rb; + u64 enabled, running; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. + * + * we cannot simply called update_context_time() + * because of locking issue as we can be called in + * NMI context + */ + calc_timer_values(event, &enabled, &running); + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; - userpg = buffer->user_page; + userpg = rb->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -3387,10 +3448,10 @@ void perf_event_update_userpage(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ACTIVE) userpg->offset -= local64_read(&event->hw.prev_count); - userpg->time_enabled = event->total_time_enabled + + userpg->time_enabled = enabled + atomic64_read(&event->child_total_time_enabled); - userpg->time_running = event->total_time_running + + userpg->time_running = running + atomic64_read(&event->child_total_time_running); barrier(); @@ -3400,220 +3461,10 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_buffer *buffer); - -static void -perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) -{ - long max_size = perf_data_size(buffer); - - if (watermark) - buffer->watermark = min(max_size, watermark); - - if (!buffer->watermark) - buffer->watermark = max_size / 2; - - if (flags & PERF_BUFFER_WRITABLE) - buffer->writable = 1; - - atomic_set(&buffer->refcount, 1); -} - -#ifndef CONFIG_PERF_USE_VMALLOC - -/* - * Back perf_mmap() with regular GFP_KERNEL-0 pages. - */ - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > buffer->nr_pages) - return NULL; - - if (pgoff == 0) - return virt_to_page(buffer->user_page); - - return virt_to_page(buffer->data_pages[pgoff - 1]); -} - -static void *perf_mmap_alloc_page(int cpu) -{ - struct page *page; - int node; - - node = (cpu == -1) ? cpu : cpu_to_node(cpu); - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); - if (!page) - return NULL; - - return page_address(page); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - int i; - - size = sizeof(struct perf_buffer); - size += nr_pages * sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - buffer->user_page = perf_mmap_alloc_page(cpu); - if (!buffer->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - buffer->data_pages[i] = perf_mmap_alloc_page(cpu); - if (!buffer->data_pages[i]) - goto fail_data_pages; - } - - buffer->nr_pages = nr_pages; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)buffer->data_pages[i]); - - free_page((unsigned long)buffer->user_page); - -fail_user_page: - kfree(buffer); - -fail: - return NULL; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - int i; - - perf_mmap_free_page((unsigned long)buffer->user_page); - for (i = 0; i < buffer->nr_pages; i++) - perf_mmap_free_page((unsigned long)buffer->data_pages[i]); - kfree(buffer); -} - -static inline int page_order(struct perf_buffer *buffer) -{ - return 0; -} - -#else - -/* - * Back perf_mmap() with vmalloc memory. - * - * Required for architectures that have d-cache aliasing issues. - */ - -static inline int page_order(struct perf_buffer *buffer) -{ - return buffer->page_order; -} - -static struct page * -perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) -{ - if (pgoff > (1UL << page_order(buffer))) - return NULL; - - return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); -} - -static void perf_mmap_unmark_page(void *addr) -{ - struct page *page = vmalloc_to_page(addr); - - page->mapping = NULL; -} - -static void perf_buffer_free_work(struct work_struct *work) -{ - struct perf_buffer *buffer; - void *base; - int i, nr; - - buffer = container_of(work, struct perf_buffer, work); - nr = 1 << page_order(buffer); - - base = buffer->user_page; - for (i = 0; i < nr + 1; i++) - perf_mmap_unmark_page(base + (i * PAGE_SIZE)); - - vfree(base); - kfree(buffer); -} - -static void perf_buffer_free(struct perf_buffer *buffer) -{ - schedule_work(&buffer->work); -} - -static struct perf_buffer * -perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) -{ - struct perf_buffer *buffer; - unsigned long size; - void *all_buf; - - size = sizeof(struct perf_buffer); - size += sizeof(void *); - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto fail; - - INIT_WORK(&buffer->work, perf_buffer_free_work); - - all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); - if (!all_buf) - goto fail_all_buf; - - buffer->user_page = all_buf; - buffer->data_pages[0] = all_buf + PAGE_SIZE; - buffer->page_order = ilog2(nr_pages); - buffer->nr_pages = 1; - - perf_buffer_init(buffer, watermark, flags); - - return buffer; - -fail_all_buf: - kfree(buffer); - -fail: - return NULL; -} - -#endif - -static unsigned long perf_data_size(struct perf_buffer *buffer) -{ - return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); -} - static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_buffer *buffer; + struct ring_buffer *rb; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -3623,14 +3474,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (!buffer) + rb = rcu_dereference(event->rb); + if (!rb) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + vmf->page = perf_mmap_to_page(rb, vmf->pgoff); if (!vmf->page) goto unlock; @@ -3645,35 +3496,35 @@ unlock: return ret; } -static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +static void rb_free_rcu(struct rcu_head *rcu_head) { - struct perf_buffer *buffer; + struct ring_buffer *rb; - buffer = container_of(rcu_head, struct perf_buffer, rcu_head); - perf_buffer_free(buffer); + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); } -static struct perf_buffer *perf_buffer_get(struct perf_event *event) +static struct ring_buffer *ring_buffer_get(struct perf_event *event) { - struct perf_buffer *buffer; + struct ring_buffer *rb; rcu_read_lock(); - buffer = rcu_dereference(event->buffer); - if (buffer) { - if (!atomic_inc_not_zero(&buffer->refcount)) - buffer = NULL; + rb = rcu_dereference(event->rb); + if (rb) { + if (!atomic_inc_not_zero(&rb->refcount)) + rb = NULL; } rcu_read_unlock(); - return buffer; + return rb; } -static void perf_buffer_put(struct perf_buffer *buffer) +static void ring_buffer_put(struct ring_buffer *rb) { - if (!atomic_dec_and_test(&buffer->refcount)) + if (!atomic_dec_and_test(&rb->refcount)) return; - call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); + call_rcu(&rb->rcu_head, rb_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3688,16 +3539,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->buffer); + unsigned long size = perf_data_size(event->rb); struct user_struct *user = event->mmap_user; - struct perf_buffer *buffer = event->buffer; + struct ring_buffer *rb = event->rb; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->buffer, NULL); + vma->vm_mm->pinned_vm -= event->mmap_locked; + rcu_assign_pointer(event->rb, NULL); mutex_unlock(&event->mmap_mutex); - perf_buffer_put(buffer); + ring_buffer_put(rb); free_uid(user); } } @@ -3715,7 +3566,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_buffer *buffer; + struct ring_buffer *rb; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -3724,7 +3575,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) /* * Don't allow mmap() of inherited per-task counters. This would * create a performance issue due to all children writing to the - * same buffer. + * same rb. */ if (event->cpu == -1 && event->attr.inherit) return -EINVAL; @@ -3736,7 +3587,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have buffer pages ensure they're a power-of-two number, so we + * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -3750,9 +3601,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->buffer) { - if (event->buffer->nr_pages == nr_pages) - atomic_inc(&event->buffer->refcount); + if (event->rb) { + if (event->rb->nr_pages == nr_pages) + atomic_inc(&event->rb->refcount); else ret = -EINVAL; goto unlock; @@ -3774,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; + locked = vma->vm_mm->pinned_vm + extra; if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && !capable(CAP_IPC_LOCK)) { @@ -3782,23 +3633,25 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->buffer); + WARN_ON(event->rb); if (vma->vm_flags & VM_WRITE) - flags |= PERF_BUFFER_WRITABLE; + flags |= RING_BUFFER_WRITABLE; - buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, - event->cpu, flags); - if (!buffer) { + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); + + if (!rb) { ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->buffer, buffer); + rcu_assign_pointer(event->rb, rb); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; event->mmap_user = get_current_user(); - vma->vm_mm->locked_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += event->mmap_locked; unlock: if (!ret) @@ -3892,117 +3745,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -/* - * Output - */ -static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!buffer->writable) - return true; - - mask = perf_data_size(buffer) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->buffer->poll, POLL_IN); - - if (handle->nmi) { - handle->event->pending_wakeup = 1; - irq_work_queue(&handle->event->pending); - } else - perf_event_wakeup(handle->event); -} - -/* - * We need to ensure a later event_id doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_get_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - - preempt_disable(); - local_inc(&buffer->nest); - handle->wakeup = local_read(&buffer->wakeup); -} - -static void perf_output_put_handle(struct perf_output_handle *handle) -{ - struct perf_buffer *buffer = handle->buffer; - unsigned long head; - -again: - head = local_read(&buffer->head); - - /* - * IRQ/NMI can happen here, which means we can miss a head update. - */ - - if (!local_dec_and_test(&buffer->nest)) - goto out; - - /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the buffer->head read and this - * write. - */ - buffer->user_page->data_head = head; - - /* - * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read buffer->head. - */ - if (unlikely(head != local_read(&buffer->head))) { - local_inc(&buffer->nest); - goto again; - } - - if (handle->wakeup != local_read(&buffer->wakeup)) - perf_output_wakeup(handle); - -out: - preempt_enable(); -} - -__always_inline void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct perf_buffer *buffer = handle->buffer; - - handle->page++; - handle->page &= buffer->nr_pages - 1; - handle->addr = buffer->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(buffer); - } - } while (len); -} - static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4033,9 +3775,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } } -static void perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_event *event) +void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) { if (event->attr.sample_id_all) __perf_event_header__init_id(header, data, event); @@ -4062,121 +3804,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, perf_output_put(handle, data->cpu_entry); } -static void perf_event__output_id_sample(struct perf_event *event, - struct perf_output_handle *handle, - struct perf_sample_data *sample) +void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) { if (event->attr.sample_id_all) __perf_event__output_id_sample(handle, sample); } -int perf_output_begin(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size, - int nmi, int sample) -{ - struct perf_buffer *buffer; - unsigned long tail, offset, head; - int have_lost; - struct perf_sample_data sample_data; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited events we send all the output towards the parent. - */ - if (event->parent) - event = event->parent; - - buffer = rcu_dereference(event->buffer); - if (!buffer) - goto out; - - handle->buffer = buffer; - handle->event = event; - handle->nmi = nmi; - handle->sample = sample; - - if (!buffer->nr_pages) - goto out; - - have_lost = local_read(&buffer->lost); - if (have_lost) { - lost_event.header.size = sizeof(lost_event); - perf_event_header__init_id(&lost_event.header, &sample_data, - event); - size += lost_event.header.size; - } - - perf_output_get_handle(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(buffer->user_page->data_tail); - smp_rmb(); - offset = head = local_read(&buffer->head); - head += size; - if (unlikely(!perf_output_space(buffer, tail, offset, head))) - goto fail; - } while (local_cmpxchg(&buffer->head, offset, head) != offset); - - if (head - local_read(&buffer->wakeup) > buffer->watermark) - local_add(buffer->watermark, &buffer->wakeup); - - handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); - handle->page &= buffer->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); - handle->addr = buffer->data_pages[handle->page]; - handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; - - if (have_lost) { - lost_event.header.type = PERF_RECORD_LOST; - lost_event.header.misc = 0; - lost_event.id = event->id; - lost_event.lost = local_xchg(&buffer->lost, 0); - - perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); - } - - return 0; - -fail: - local_inc(&buffer->lost); - perf_output_put_handle(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_event *event = handle->event; - struct perf_buffer *buffer = handle->buffer; - - int wakeup_events = event->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = local_inc_return(&buffer->events); - if (events >= wakeup_events) { - local_sub(wakeup_events, &buffer->events); - local_inc(&buffer->wakeup); - } - } - - perf_output_put_handle(handle); - rcu_read_unlock(); -} - static void perf_output_read_one(struct perf_output_handle *handle, struct perf_event *event, u64 enabled, u64 running) @@ -4197,7 +3832,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(event); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } /* @@ -4227,7 +3862,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; @@ -4239,7 +3874,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); - perf_output_copy(handle, values, n * sizeof(u64)); + __output_copy(handle, values, n * sizeof(u64)); } } @@ -4249,7 +3884,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0, now, ctx_time; + u64 enabled = 0, running = 0; u64 read_format = event->attr.read_format; /* @@ -4261,12 +3896,8 @@ static void perf_output_read(struct perf_output_handle *handle, * because of locking issue as we are called in * NMI context */ - if (read_format & PERF_FORMAT_TOTAL_TIMES) { - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; - enabled = ctx_time - event->tstamp_enabled; - running = ctx_time - event->tstamp_running; - } + if (read_format & PERF_FORMAT_TOTAL_TIMES) + calc_timer_values(event, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); @@ -4319,7 +3950,7 @@ void perf_output_sample(struct perf_output_handle *handle, size *= sizeof(u64); - perf_output_copy(handle, data->callchain, size); + __output_copy(handle, data->callchain, size); } else { u64 nr = 0; perf_output_put(handle, nr); @@ -4329,8 +3960,8 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_RAW) { if (data->raw) { perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); + __output_copy(handle, data->raw->data, + data->raw->size); } else { struct { u32 size; @@ -4342,6 +3973,20 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, raw); } } + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -4386,7 +4031,7 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_event_output(struct perf_event *event, int nmi, +static void perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4398,7 +4043,7 @@ static void perf_event_output(struct perf_event *event, int nmi, perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size, nmi, 1)) + if (perf_output_begin(&handle, event, header.size)) goto exit; perf_output_sample(&handle, &header, data, event); @@ -4438,7 +4083,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + ret = perf_output_begin(&handle, event, read_event.header.size); if (ret) return; @@ -4481,7 +4126,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - task_event->event_id.header.size, 0, 0); + task_event->event_id.header.size); if (ret) goto out; @@ -4618,7 +4263,7 @@ static void perf_event_comm_output(struct perf_event *event, perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - comm_event->event_id.header.size, 0, 0); + comm_event->event_id.header.size); if (ret) goto out; @@ -4627,7 +4272,7 @@ static void perf_event_comm_output(struct perf_event *event, comm_event->event_id.tid = perf_event_tid(event, comm_event->task); perf_output_put(&handle, comm_event->event_id); - perf_output_copy(&handle, comm_event->comm, + __output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_event__output_id_sample(event, &handle, &sample); @@ -4765,7 +4410,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, - mmap_event->event_id.header.size, 0, 0); + mmap_event->event_id.header.size); if (ret) goto out; @@ -4773,7 +4418,7 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); - perf_output_copy(&handle, mmap_event->file_name, + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_event__output_id_sample(event, &handle, &sample); @@ -4829,7 +4474,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (file) { /* - * d_path works from the end of the buffer backwards, so we + * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle * the 64bit alignment we do later. */ @@ -4960,7 +4605,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); ret = perf_output_begin(&handle, event, - throttle_event.header.size, 1, 0); + throttle_event.header.size); if (ret) return; @@ -4973,7 +4618,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) * Generic event overflow handling, sampling. */ -static int __perf_event_overflow(struct perf_event *event, int nmi, +static int __perf_event_overflow(struct perf_event *event, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { @@ -5016,34 +4661,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); + event->pending_disable = 1; + irq_work_queue(&event->pending); } if (event->overflow_handler) - event->overflow_handler(event, nmi, data, regs); + event->overflow_handler(event, data, regs); else - perf_event_output(event, nmi, data, regs); + perf_event_output(event, data, regs); if (event->fasync && event->pending_kill) { - if (nmi) { - event->pending_wakeup = 1; - irq_work_queue(&event->pending); - } else - perf_event_wakeup(event); + event->pending_wakeup = 1; + irq_work_queue(&event->pending); } return ret; } -int perf_event_overflow(struct perf_event *event, int nmi, +int perf_event_overflow(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - return __perf_event_overflow(event, nmi, 1, data, regs); + return __perf_event_overflow(event, 1, data, regs); } /* @@ -5092,7 +4731,7 @@ again: } static void perf_swevent_overflow(struct perf_event *event, u64 overflow, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -5106,7 +4745,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, return; for (; overflow; overflow--) { - if (__perf_event_overflow(event, nmi, throttle, + if (__perf_event_overflow(event, throttle, data, regs)) { /* * We inhibit the overflow from happening when @@ -5119,7 +4758,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } static void perf_swevent_event(struct perf_event *event, u64 nr, - int nmi, struct perf_sample_data *data, + struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; @@ -5133,12 +4772,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, return; if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) - return perf_swevent_overflow(event, 1, nmi, data, regs); + return perf_swevent_overflow(event, 1, data, regs); if (local64_add_negative(nr, &hwc->period_left)) return; - perf_swevent_overflow(event, 0, nmi, data, regs); + perf_swevent_overflow(event, 0, data, regs); } static int perf_exclude_event(struct perf_event *event, @@ -5226,7 +4865,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, - u64 nr, int nmi, + u64 nr, struct perf_sample_data *data, struct pt_regs *regs) { @@ -5242,7 +4881,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_event(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, data, regs); } end: rcu_read_unlock(); @@ -5263,8 +4902,7 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; int rctx; @@ -5276,7 +4914,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, perf_sample_data_init(&data, addr); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); @@ -5524,7 +5162,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, 1, &data, regs); + perf_swevent_event(event, count, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -5617,7 +5255,7 @@ void perf_bp_event(struct perf_event *bp, void *data) perf_sample_data_init(&sample, bp->attr.bp_addr); if (!bp->hw.state && !perf_exclude_event(bp, regs)) - perf_swevent_event(bp, 1, 1, &sample, regs); + perf_swevent_event(bp, 1, &sample, regs); } #endif @@ -5646,7 +5284,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) if (regs && !perf_exclude_event(event, regs)) { if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) + if (perf_event_overflow(event, &data, regs)) ret = HRTIMER_NORESTART; } @@ -5986,6 +5624,7 @@ free_dev: } static struct lock_class_key cpuctx_mutex; +static struct lock_class_key cpuctx_lock; int perf_pmu_register(struct pmu *pmu, char *name, int type) { @@ -6036,6 +5675,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; @@ -6119,6 +5759,7 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + event->pmu = pmu; ret = pmu->event_init(event); if (ret) pmu = ERR_PTR(ret); @@ -6126,6 +5767,7 @@ struct pmu *perf_init_event(struct perf_event *event) } list_for_each_entry_rcu(pmu, &pmus, entry) { + event->pmu = pmu; ret = pmu->event_init(event); if (!ret) goto unlock; @@ -6150,7 +5792,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct task_struct *task, struct perf_event *group_leader, struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct pmu *pmu; struct perf_event *event; @@ -6208,10 +5851,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, #endif } - if (!overflow_handler && parent_event) + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; + context = parent_event->overflow_handler_context; + } event->overflow_handler = overflow_handler; + event->overflow_handler_context = context; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -6248,8 +5894,6 @@ done: return ERR_PTR(err); } - event->pmu = pmu; - if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) jump_label_inc(&perf_sched_events); @@ -6326,13 +5970,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - if (attr->__reserved_1) return -EINVAL; @@ -6354,7 +5991,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_buffer *buffer = NULL, *old_buffer = NULL; + struct ring_buffer *rb = NULL, *old_rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6371,7 +6008,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) goto out; /* - * If its not a per-cpu buffer, it must be the same task. + * If its not a per-cpu rb, it must be the same task. */ if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; @@ -6383,20 +6020,20 @@ set: goto unlock; if (output_event) { - /* get the buffer we want to redirect to */ - buffer = perf_buffer_get(output_event); - if (!buffer) + /* get the rb we want to redirect to */ + rb = ring_buffer_get(output_event); + if (!rb) goto unlock; } - old_buffer = event->buffer; - rcu_assign_pointer(event->buffer, buffer); + old_rb = event->rb; + rcu_assign_pointer(event->rb, rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_buffer) - perf_buffer_put(old_buffer); + if (old_rb) + ring_buffer_put(old_rb); out: return ret; } @@ -6478,7 +6115,8 @@ SYSCALL_DEFINE5(perf_event_open, } } - event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, + NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_task; @@ -6663,7 +6301,8 @@ err_fd: struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, struct task_struct *task, - perf_overflow_handler_t overflow_handler) + perf_overflow_handler_t overflow_handler, + void *context) { struct perf_event_context *ctx; struct perf_event *event; @@ -6673,7 +6312,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Get the target context (task or percpu): */ - event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, + overflow_handler, context); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -6780,7 +6420,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * our context. */ child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); - task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -6788,6 +6427,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); + task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get @@ -6957,7 +6597,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL); + NULL, NULL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -6984,6 +6624,8 @@ inherit_event(struct perf_event *parent_event, child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; + child_event->overflow_handler_context + = parent_event->overflow_handler_context; /* * Precalculate sample_data sizes |