From ac9721f3f54b27a16c7e1afb2481e7ee95a70318 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 12:54:41 +0200 Subject: perf_events: Fix races and clean up perf_event and perf_mmap_data interaction In order to move toward separate buffer objects, rework the whole perf_mmap_data construct to be a more self-sufficient entity, one with its own lifetime rules. This greatly sanitizes the whole output redirection code, which was riddled with bugs and races. Signed-off-by: Peter Zijlstra Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fb6c91eac7e3..490698590d6e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -585,6 +585,7 @@ enum perf_event_active_state { struct file; struct perf_mmap_data { + atomic_t refcount; struct rcu_head rcu_head; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; @@ -592,7 +593,6 @@ struct perf_mmap_data { #endif int nr_pages; /* nr of data pages */ int writable; /* are we writable */ - int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ @@ -643,7 +643,6 @@ struct perf_event { int nr_siblings; int group_flags; struct perf_event *group_leader; - struct perf_event *output; const struct pmu *pmu; enum perf_event_active_state state; @@ -704,6 +703,8 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; + int mmap_locked; + struct user_struct *mmap_user; struct perf_mmap_data *data; /* poll related */ -- cgit v1.2.3 From 8a49542c0554af7d0073aac0ee73ee65b807ef34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 May 2010 15:47:49 +0200 Subject: perf_events: Fix races in group composition Group siblings don't pin each-other or the parent, so when we destroy events we must make sure to clean up all cross referencing pointers. In particular, for destruction of a group leader we must be able to find all its siblings and remove their reference to it. This means that detaching an event from its context must not detach it from the group, otherwise we can end up failing to clear all pointers. Solve this by clearly separating the attachment to a context and attachment to a group, and keep the group composed until we destroy the events. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++ kernel/perf_event.c | 91 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 71 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 490698590d6e..5d0266d94985 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -631,6 +631,9 @@ struct swevent_hlist { struct rcu_head rcu_head; }; +#define PERF_ATTACH_CONTEXT 0x01 +#define PERF_ATTACH_GROUP 0x02 + /** * struct perf_event - performance event kernel representation: */ @@ -646,6 +649,7 @@ struct perf_event { const struct pmu *pmu; enum perf_event_active_state state; + unsigned int attach_state; atomic64_t count; /* diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 848d49a043e9..10a1aee2309e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) { + if (event->group_leader == event) { struct list_head *list; if (is_software_event(event)) @@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); - } else { - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); update_group_times(event); @@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->state = PERF_EVENT_STATE_OFF; } -static void -perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - struct list_head *list; - - list = ctx_group_list(event, ctx); - list_move_tail(&sibling->group_entry, list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -727,6 +762,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -1894,8 +1930,8 @@ int perf_event_release_kernel(struct perf_event *event) */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); list_del_event(event, ctx); - perf_destroy_group(event, ctx); raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); @@ -5127,6 +5163,12 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ fput_light(group_file, fput_needed); fd_install(event_fd, event_file); return event_fd; @@ -5448,6 +5490,7 @@ static void perf_free_event(struct perf_event *event, fput(parent->filp); + perf_group_detach(event); list_del_event(event, ctx); free_event(event); } -- cgit v1.2.3 From 3771f0771154675d4a0ca780be2411f3cc357208 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 May 2010 12:31:09 +0200 Subject: perf_events, trace: Fix probe unregister race tracepoint_probe_unregister() does not synchronize against the probe callbacks, so do that explicitly. This properly serializes the callbacks and the free of the data used therein. Also, use this_cpu_ptr() where possible. Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1274438476.1674.1702.camel@laptop> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 2 +- kernel/trace/trace_event_perf.c | 10 ++++++++-- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3d685d1f2a03..5a64905d7278 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -725,7 +725,7 @@ perf_trace_##call(void *__data, proto) \ \ { assign; } \ \ - head = per_cpu_ptr(event_call->perf_events, smp_processor_id());\ + head = this_cpu_ptr(event_call->perf_events); \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ __count, &__regs, head); \ } diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cb6f365016e4..49c7abf2ba5c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -116,7 +116,7 @@ int perf_trace_enable(struct perf_event *p_event) if (WARN_ON_ONCE(!list)) return -EINVAL; - list = per_cpu_ptr(list, smp_processor_id()); + list = this_cpu_ptr(list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; @@ -142,6 +142,12 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->class->perf_probe, tp_event); + /* + * Ensure our callback won't be called anymore. See + * tracepoint_probe_unregister() and __DO_TRACE(). + */ + synchronize_sched(); + free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; @@ -169,7 +175,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; - raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id()); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index faf7cefd15da..f52b5f50299d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1359,7 +1359,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } @@ -1392,7 +1392,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - head = per_cpu_ptr(call->perf_events, smp_processor_id()); + head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d2c859cec9ea..34e35804304b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -519,7 +519,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->enter_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } @@ -595,7 +595,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id()); + head = this_cpu_ptr(sys_data->exit_event->perf_events); perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } -- cgit v1.2.3