From fe4b04fa31a6dcf4358aa84cf81e5a7fd079469b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Feb 2011 13:19:09 +0100 Subject: perf: Cure task_oncpu_function_call() races Oleg reported that on architectures with __ARCH_WANT_INTERRUPTS_ON_CTXSW the IPI from task_oncpu_function_call() can land before perf_event_task_sched_in() and cause interesting situations for eg. perf_install_in_context(). This patch reworks the task_oncpu_function_call() interface to give a more usable primitive as well as rework all its users to hopefully be more obvious as well as remove the races. While looking at the code I also found a number of races against perf_event_task_sched_out() which can flip contexts between tasks so plug those too. Reported-and-reviewed-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 260 +++++++++++++++++++++++++++++++++------------------- kernel/sched.c | 29 +----- 2 files changed, 172 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 126a302c481c..7d3faa25e136 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,79 @@ #include +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. + * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -254,7 +327,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) raw_spin_lock_irqsave(&ctx->lock, flags); --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); } /* @@ -618,35 +690,24 @@ __get_cpu_context(struct perf_event_context *ctx) * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_event_remove_from_context(void *info) +static int __perf_remove_from_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - raw_spin_lock(&ctx->lock); - event_sched_out(event, cpuctx, ctx); - list_del_event(event, ctx); - raw_spin_unlock(&ctx->lock); + + return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * Must be called with ctx->mutex held. - * * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * @@ -657,49 +718,48 @@ static void __perf_event_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_event_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + if (!task) { /* * Per cpu events are removed via an smp call and * the removal is always successful. */ - smp_call_function_single(event->cpu, - __perf_event_remove_from_context, - event, 1); + cpu_function_call(event->cpu, __perf_remove_from_context, event); return; } retry: - task_oncpu_function_call(task, __perf_event_remove_from_context, - event); + if (!task_function_call(task, __perf_remove_from_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * If the context is active we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->nr_active && !list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can remove the event safely, if the call above did not - * succeed. + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. */ - if (!list_empty(&event->group_entry)) - list_del_event(event, ctx); + list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } /* * Cross CPU call to disable a performance event */ -static void __perf_event_disable(void *info) +static int __perf_event_disable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -708,9 +768,12 @@ static void __perf_event_disable(void *info) /* * If this is a per-task event, need to check whether this * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. */ if (ctx->task && cpuctx->task_ctx != ctx) - return; + return -EINVAL; raw_spin_lock(&ctx->lock); @@ -729,6 +792,8 @@ static void __perf_event_disable(void *info) } raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -753,13 +818,13 @@ void perf_event_disable(struct perf_event *event) /* * Disable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_disable, - event, 1); + cpu_function_call(event->cpu, __perf_event_disable, event); return; } retry: - task_oncpu_function_call(task, __perf_event_disable, event); + if (!task_function_call(task, __perf_event_disable, event)) + return; raw_spin_lock_irq(&ctx->lock); /* @@ -767,6 +832,11 @@ retry: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; goto retry; } @@ -778,7 +848,6 @@ retry: update_group_times(event); event->state = PERF_EVENT_STATE_OFF; } - raw_spin_unlock_irq(&ctx->lock); } @@ -928,12 +997,14 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } +static void perf_event_context_sched_in(struct perf_event_context *ctx); + /* * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ -static void __perf_install_in_context(void *info) +static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -942,17 +1013,12 @@ static void __perf_install_in_context(void *info) int err; /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no events. + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx); raw_spin_lock(&ctx->lock); ctx->is_active = 1; @@ -997,6 +1063,8 @@ static void __perf_install_in_context(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1008,8 +1076,6 @@ unlock: * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_event_context *ctx, @@ -1018,6 +1084,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + lockdep_assert_held(&ctx->mutex); + event->ctx = ctx; if (!task) { @@ -1025,31 +1093,29 @@ perf_install_in_context(struct perf_event_context *ctx, * Per cpu events are installed via an smp call and * the install is always successful. */ - smp_call_function_single(cpu, __perf_install_in_context, - event, 1); + cpu_function_call(cpu, __perf_install_in_context, event); return; } retry: - task_oncpu_function_call(task, __perf_install_in_context, - event); + if (!task_function_call(task, __perf_install_in_context, event)) + return; raw_spin_lock_irq(&ctx->lock); /* - * we need to retry the smp call. + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. */ - if (ctx->is_active && list_empty(&event->group_entry)) { + if (ctx->is_active) { raw_spin_unlock_irq(&ctx->lock); goto retry; } /* - * The lock prevents that this context is scheduled in so we - * can add the event safely, if it the call above did not - * succeed. + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. */ - if (list_empty(&event->group_entry)) - add_event_to_ctx(event, ctx); + add_event_to_ctx(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1078,7 +1144,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, /* * Cross CPU call to enable a performance event */ -static void __perf_event_enable(void *info) +static int __perf_event_enable(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; @@ -1086,18 +1152,10 @@ static void __perf_event_enable(void *info) struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; raw_spin_lock(&ctx->lock); - ctx->is_active = 1; update_context_time(ctx); if (event->state >= PERF_EVENT_STATE_INACTIVE) @@ -1138,6 +1196,8 @@ static void __perf_event_enable(void *info) unlock: raw_spin_unlock(&ctx->lock); + + return 0; } /* @@ -1158,8 +1218,7 @@ void perf_event_enable(struct perf_event *event) /* * Enable the event on the cpu that it's on */ - smp_call_function_single(event->cpu, __perf_event_enable, - event, 1); + cpu_function_call(event->cpu, __perf_event_enable, event); return; } @@ -1178,8 +1237,15 @@ void perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + raw_spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_event_enable, event); + + if (!task_function_call(task, __perf_event_enable, event)) + return; raw_spin_lock_irq(&ctx->lock); @@ -1187,15 +1253,14 @@ retry: * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_OFF) - __perf_event_mark_enabled(event, ctx); + } out: raw_spin_unlock_irq(&ctx->lock); @@ -1339,8 +1404,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +static void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; @@ -1533,7 +1598,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, { struct perf_cpu_context *cpuctx; - cpuctx = __get_cpu_context(ctx); + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1541,7 +1606,7 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, cpuctx->task_ctx = ctx; } -void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1627,7 +1692,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) * Reduce accuracy by one bit such that @a and @b converge * to a similar magnitude. */ -#define REDUCE_FLS(a, b) \ +#define REDUCE_FLS(a, b) \ do { \ if (a##_fls > b##_fls) { \ a >>= 1; \ @@ -2213,6 +2278,9 @@ errout: } +/* + * Returns a matching context with refcount and pincount. + */ static struct perf_event_context * find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { @@ -2237,6 +2305,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + ++ctx->pin_count; return ctx; } @@ -2250,6 +2319,7 @@ retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); + ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2271,8 +2341,10 @@ retry: err = -ESRCH; else if (task->perf_event_ctxp[ctxn]) err = -EAGAIN; - else + else { + ++ctx->pin_count; rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + } mutex_unlock(&task->perf_event_mutex); if (unlikely(err)) { @@ -5950,10 +6022,10 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_event_remove_from_context(group_leader); + perf_remove_from_context(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_event_remove_from_context(sibling); + perf_remove_from_context(sibling); put_ctx(gctx); } mutex_unlock(&gctx->mutex); @@ -5976,6 +6048,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); event->owner = current; @@ -6001,6 +6074,7 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: + perf_unpin_context(ctx); put_ctx(ctx); err_alloc: free_event(event); @@ -6051,6 +6125,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); ++ctx->generation; + perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); return event; @@ -6104,7 +6179,7 @@ __perf_event_exit_task(struct perf_event *child_event, { struct perf_event *parent_event; - perf_event_remove_from_context(child_event); + perf_remove_from_context(child_event); parent_event = child_event->parent; /* @@ -6411,7 +6486,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -6526,6 +6601,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); return ret; } @@ -6595,9 +6671,9 @@ static void __perf_event_exit_context(void *__info) perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) - __perf_event_remove_from_context(event); + __perf_remove_from_context(event); } static void perf_event_exit_cpu_context(int cpu) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..31cb5d5e1aac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2265,27 +2265,6 @@ void kick_process(struct task_struct *p) EXPORT_SYMBOL_GPL(kick_process); #endif /* CONFIG_SMP */ -/** - * task_oncpu_function_call - call a function on the cpu on which a task runs - * @p: the task to evaluate - * @func: the function to be called - * @info: the function call argument - * - * Calls the function @func when the task is currently running. This might - * be on the current CPU, which just calls the function directly - */ -void task_oncpu_function_call(struct task_struct *p, - void (*func) (void *info), void *info) -{ - int cpu; - - preempt_disable(); - cpu = task_cpu(p); - if (task_curr(p)) - smp_call_function_single(cpu, func, info, 1); - preempt_enable(); -} - #ifdef CONFIG_SMP /* * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. @@ -2776,9 +2755,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); + trace_sched_switch(prev, next); } /** @@ -2911,7 +2893,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_sched_switch(prev, next); + mm = next->mm; oldmm = prev->active_mm; /* @@ -3989,9 +3971,6 @@ need_resched_nonpreemptible: rq->skip_clock_update = 0; if (likely(prev != next)) { - sched_info_switch(prev, next); - perf_event_task_sched_out(prev, next); - rq->nr_switches++; rq->curr = next; ++*switch_count; -- cgit v1.2.3 From 76022db323dd6d7c6958df3d595f7dedf7a14778 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:53 +0900 Subject: tracing/kprobes: Cleanup strict_strtol() using code Since strict_strtol() accepts minus digits started with '-', it doesn't need to invert after converting. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125153.9507.49335.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2dec9bcde8b4..2088893c049e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -767,16 +767,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, } break; case '+': /* deref memory */ + arg++; /* Skip '+', because strict_strtol() rejects it. */ case '-': tmp = strchr(arg, '('); if (!tmp) break; *tmp = '\0'; - ret = strict_strtol(arg + 1, 0, &offset); + ret = strict_strtol(arg, 0, &offset); if (ret) break; - if (arg[0] == '-') - offset = -offset; arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { -- cgit v1.2.3 From e3745369986ddcdaa19f70e2d24e658876b97e84 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:51:59 +0900 Subject: tracing/kprobes: Support longer (>128 bytes) command Expand command line buffer of kprobe-tracer to 4096 bytes. Reported-by: Arnaldo Carvalho de Melo Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125159.9507.20895.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2088893c049e..c6ed88660856 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1129,7 +1129,7 @@ static int command_trace_probe(const char *buf) return ret; } -#define WRITE_BUFSIZE 128 +#define WRITE_BUFSIZE 4096 static ssize_t probes_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) -- cgit v1.2.3 From 1ff511e35ed87cc2ebade9e678e4a2fe39b6f9c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 4 Feb 2011 21:52:05 +0900 Subject: tracing/kprobes: Add bitfield type Add bitfield type for tracing arguments on kprobe-tracer. The syntax of a bitfield type is: b@/ e.g. Accessing 2 bits-width field with 4 bits-offset in 32 bits-width data at 4 bytes offseted from the address pointed by AX register: +4(%ax):b2@4/32 Since the width of container data depends on the arch, so I just added the container-size at the end. Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Srikar Dronamraju Cc: Steven Rostedt LKML-Reference: <20110204125205.9507.11363.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/trace/kprobetrace.txt | 16 +++++- kernel/trace/trace_kprobe.c | 104 +++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 5f77d94598dd..6d27ab8d6e9f 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -42,11 +42,25 @@ Synopsis of kprobe_events +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types - (u8/u16/u32/u64/s8/s16/s32/s64) and string are supported. + (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield + are supported. (*) only for return probe. (**) this is useful for fetching a field of data structures. +Types +----- +Several types are supported for fetch-args. Kprobe tracer will access memory +by given type. Prefix 's' and 'u' means those types are signed and unsigned +respectively. Traced arguments are shown in decimal (signed) or hex (unsigned). +String type is a special type, which fetches a "null-terminated" string from +kernel space. This means it will fail and store NULL if the string container +has been paged out. +Bitfield is another special type, which takes 3 parameters, bit-width, bit- +offset, and container-size (usually 32). The syntax is; + + b@/ + Per-Probe Event Filtering ------------------------- diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c6ed88660856..ccdc542022c3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -353,6 +353,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) kfree(data); } +/* Bitfield fetch function */ +struct bitfield_fetch_param { + struct fetch_param orig; + unsigned char hi_shift; + unsigned char low_shift; +}; + +#define DEFINE_FETCH_bitfield(type) \ +static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct bitfield_fetch_param *bprm = data; \ + type buf = 0; \ + call_fetch(&bprm->orig, regs, &buf); \ + if (buf) { \ + buf <<= bprm->hi_shift; \ + buf >>= bprm->low_shift; \ + } \ + *(type *)dest = buf; \ +} +DEFINE_BASIC_FETCH_FUNCS(bitfield) +#define fetch_bitfield_string NULL +#define fetch_bitfield_string_size NULL + +static __kprobes void +free_bitfield_fetch_param(struct bitfield_fetch_param *data) +{ + /* + * Don't check the bitfield itself, because this must be the + * last fetch function. + */ + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) + free_symbol_cache(data->orig.data); + kfree(data); +} /* Default (unsigned long) fetch type */ #define __DEFAULT_FETCH_TYPE(t) u##t #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) @@ -367,6 +404,7 @@ enum { FETCH_MTD_memory, FETCH_MTD_symbol, FETCH_MTD_deref, + FETCH_MTD_bitfield, FETCH_MTD_END, }; @@ -387,6 +425,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \ ASSIGN_FETCH_FUNC(memory, ftype), \ ASSIGN_FETCH_FUNC(symbol, ftype), \ ASSIGN_FETCH_FUNC(deref, ftype), \ +ASSIGN_FETCH_FUNC(bitfield, ftype), \ } \ } @@ -430,9 +469,33 @@ static const struct fetch_type *find_fetch_type(const char *type) if (!type) type = DEFAULT_FETCH_TYPE_STR; + /* Special case: bitfield */ + if (*type == 'b') { + unsigned long bs; + type = strchr(type, '/'); + if (!type) + goto fail; + type++; + if (strict_strtoul(type, 0, &bs)) + goto fail; + switch (bs) { + case 8: + return find_fetch_type("u8"); + case 16: + return find_fetch_type("u16"); + case 32: + return find_fetch_type("u32"); + case 64: + return find_fetch_type("u64"); + default: + goto fail; + } + } + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) if (strcmp(type, fetch_type_table[i].name) == 0) return &fetch_type_table[i]; +fail: return NULL; } @@ -586,7 +649,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) + free_bitfield_fetch_param(arg->fetch.data); + else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); @@ -806,6 +871,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, return ret; } +#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long)) + +/* Bitfield type needs to be parsed into a fetch function */ +static int __parse_bitfield_probe_arg(const char *bf, + const struct fetch_type *t, + struct fetch_param *f) +{ + struct bitfield_fetch_param *bprm; + unsigned long bw, bo; + char *tail; + + if (*bf != 'b') + return 0; + + bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); + if (!bprm) + return -ENOMEM; + bprm->orig = *f; + f->fn = t->fetch[FETCH_MTD_bitfield]; + f->data = (void *)bprm; + + bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */ + if (bw == 0 || *tail != '@') + return -EINVAL; + + bf = tail + 1; + bo = simple_strtoul(bf, &tail, 0); + if (tail == bf || *tail != '/') + return -EINVAL; + + bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo); + bprm->low_shift = bprm->hi_shift + bo; + return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0; +} + /* String length checking wrapper */ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) @@ -835,6 +935,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) + ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, parg->fetch.fn); -- cgit v1.2.3 From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:33:26 -0500 Subject: tracing/filter: Have no filter return a match The n_preds field of a file can change at anytime, and even can become zero, just as the filter is about to be processed by an event. In the case that is zero on entering the filter, return 1, telling the caller the event matchs and should be trace. Also use a variable and assign it with ACCESS_ONCE() such that the count stays consistent within the function. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17f..7275f0310ed8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec) int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int n_preds = ACCESS_ONCE(filter->n_preds); int i; - for (i = 0; i < filter->n_preds; i++) { + /* no filter is considered a match */ + if (!n_preds) + return 1; + + for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec, val1, val2); -- cgit v1.2.3 From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:37:09 -0500 Subject: tracing/filter: Move OR and AND logic out of fn() method The ops OR and AND act different from the other ops, as they are the only ones to take other ops as their arguements. These ops als change the logic of the filter_match_preds. By removing the OR and AND fn's we can also remove the val1 and val2 that is passed to all other fn's and are unused. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +-- kernel/trace/trace_events_filter.c | 51 ++++++++++++++------------------------ 2 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c3..1597bc0749c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,8 +677,7 @@ struct event_subsystem { struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7275f0310ed8..5d719b340a2b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -124,8 +124,7 @@ struct filter_parse_state { }; #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; + int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); @@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); + match = pred->fn(pred, rec); stack[top++] = match; continue; } @@ -403,7 +383,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) } val1 = stack[--top]; val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); + switch (pred->op) { + case OP_AND: + match = val1 && val2; + break; + case OP_OR: + match = val1 || val2; + break; + default: + WARN_ONCE(1, "filter op is not AND or OR"); + } stack[top++] = match; } @@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - fn = filter_pred_and; goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - fn = filter_pred_or; goto add_pred_fn; } -- cgit v1.2.3 From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:42:43 -0500 Subject: tracing/filter: Dynamically allocate preds For every filter that is made, we create predicates to hold every operation within the filter. We have a max of 32 predicates that we can hold. Currently, we allocate all 32 even if we only need to use one. Part of the reason we do this is that the filter can be used at any moment by any event. Fortunately, the filter is only used with preemption disabled. By reseting the count of preds used "n_preds" to zero, then performing a synchronize_sched(), we can safely free and reallocate a new array of preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 143 ++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1597bc0749c1..441fc1bc85d6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,7 +661,8 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ struct filter_pred **preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5d719b340a2b..aac6a6183e6a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; + struct filter_pred **preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!n_preds) return 1; + /* + * n_preds and filter->preds is protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; + pred = preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + filter->preds[i]->fn = filter_pred_none; + } filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void __free_filter(struct event_filter *filter) { - int i; - if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; - struct filter_pred *pred; - int i; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) @@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void) filter->n_preds = 0; - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) { + if (filter->a_preds < n_preds) { + /* We need to reallocate */ + filter->n_preds = 0; + /* + * It is possible that the filter is currently + * being used. We need to zero out the number + * of preds, wait on preemption and then free + * the preds. + */ + synchronize_sched(); + __free_preds(filter); + } + } + + if (!filter->preds) { + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->a_preds = n_preds; + } if (!filter->preds) - goto oom; + return -ENOMEM; + + if (WARN_ON(filter->a_preds < n_preds)) + return -EINVAL; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + if (!pred) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; filter->preds[i] = pred; } - return filter; - -oom: + return 0; + oom: __free_preds(filter); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static int init_preds(struct ftrace_event_call *call) +static int init_filter(struct ftrace_event_call *call) { if (call->filter) return 0; call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); + call->filter = __alloc_filter(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); + err = init_filter(call); if (err) return err; } @@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call, int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_preds(filter, n_preds); + if (err) + return err; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call, continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); - err = init_preds(call); + err = init_filter(call); if (err) goto out_unlock; @@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); + filter = __alloc_filter(); if (IS_ERR(filter)) { err = PTR_ERR(filter); goto out_unlock; @@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1427,9 +1500,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); -- cgit v1.2.3 From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:46:46 -0500 Subject: tracing/filter: Call synchronize_sched() just once for system filters By separating out the reseting of the filter->n_preds to zero from the reallocation of preds for the filter, we can reset groups of filters first, call synchronize_sched() just once, and then reallocate each of the filters in the system group. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 ++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index aac6a6183e6a..8f00a11ce778 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } +static void reset_preds(struct event_filter *filter) +{ + struct filter_pred *pred; + int n_preds = filter->n_preds; + int i; + + filter->n_preds = 0; + if (!filter->preds) + return; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + pred->fn = filter_pred_none; + } +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; - int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - filter->preds[i]->fn = filter_pred_none; - } - filter->n_preds = 0; + reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) { if (filter->a_preds < n_preds) { - /* We need to reallocate */ - filter->n_preds = 0; /* - * It is possible that the filter is currently - * being used. We need to zero out the number - * of preds, wait on preemption and then free - * the preds. + * We need to reallocate. + * We should have already have zeroed out + * the pred count and called synchronized_sched() + * to make sure no one is using the preds. */ - synchronize_sched(); + if (WARN_ON_ONCE(filter->n_preds)) { + /* We need to reset it now */ + filter->n_preds = 0; + synchronize_sched(); + } __free_preds(filter); } } @@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system, /* try to see if the filter can be applied */ err = replace_preds(call, filter, ps, filter_string, true); if (err) + goto fail; + } + + /* set all filter pred counts to zero */ + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + reset_preds(filter); + } + + /* + * Since some of the preds may be used under preemption + * we need to wait for them to finish before we may + * reallocate them. + */ + synchronize_sched(); + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) continue; /* really apply the filter */ @@ -1342,11 +1379,13 @@ static int replace_system_preds(struct event_subsystem *system, fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - } + if (fail) + goto fail; + return 0; + fail: + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) @@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } + /* + * Make sure all the pred counts are zero so that + * no task is using it when we reallocate the preds array. + */ + reset_preds(call->filter); + synchronize_sched(); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); -- cgit v1.2.3 From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:49:48 -0500 Subject: tracing/filter: Allocate the preds in an array Currently we allocate an array of pointers to filter_preds, and then allocate a separate filter_pred for each item in the array. This adds slight overhead in the filters as it needs to derefernce twice to get to the op condition. Allocating the preds themselves in a single array removes a dereference as well as helps on the cache footprint. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_events_filter.c | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 441fc1bc85d6..254d04a84ec3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -663,7 +663,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred **preds; + struct filter_pred *preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8f00a11ce778..b6c910642a1e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; - struct filter_pred **preds; + struct filter_pred *preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -377,7 +377,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) preds = rcu_dereference_sched(filter->preds); for (i = 0; i < n_preds; i++) { - pred = preds[i]; + pred = &preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter) int i; if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -572,7 +570,6 @@ static void __free_preds(struct event_filter *filter) static void reset_preds(struct event_filter *filter) { - struct filter_pred *pred; int n_preds = filter->n_preds; int i; @@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter) if (!filter->preds) return; - for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - pred->fn = filter_pred_none; - } + for (i = 0; i < n_preds; i++) + filter->preds[i].fn = filter_pred_none; } static void filter_disable_preds(struct ftrace_event_call *call) @@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return -EINVAL; for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - if (!pred) - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } return 0; - oom: - __free_preds(filter); - return -ENOMEM; } static int init_filter(struct ftrace_event_call *call) @@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(&filter->preds[idx], pred, fn); if (err) return err; -- cgit v1.2.3 From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:53:06 -0500 Subject: tracing/filter: Free pred array on disabling of filter When a filter is disabled, free the preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b6c910642a1e..2f5458e244a3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); + reset_preds(call->filter); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_preds(call->filter); remove_filter_string(call->filter); goto out_unlock; } -- cgit v1.2.3 From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:54:33 -0500 Subject: tracing/filter: Use a tree instead of stack for filter_match_preds() Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++++++++-------- 2 files changed, 194 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec3..bba34a72c780 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2f5458e244a3..10390491b6d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. */ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, -- cgit v1.2.3 From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:12:05 -0500 Subject: tracing/filter: Optimize short ciruit check The test if we should break out early for OR and AND operations can be optimized by comparing the current result with (pred->op == OP_OR) That is if the result is true and the op is an OP_OR, or if the result is false and the op is not an OP_OR (thus an OP_AND) we can break out early in either case. Otherwise we continue processing. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10390491b6d0..0a3e0502b507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) pred->parent, &move); continue; case MOVE_UP_FROM_LEFT: - /* Check for short circuits */ - if ((match && pred->op == OP_OR) || - (!match && pred->op == OP_AND)) { + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { if (pred == root) break; pred = get_pred_parent(pred, preds, -- cgit v1.2.3 From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:14:25 -0500 Subject: tracing/filter: Check the created pred tree Since the filter walks a tree to determine if a match is made or not, if the tree was incorrectly created, it could cause an infinite loop. Add a check to walk the entire tree before assigning it as a filter to make sure the tree is correct. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0a3e0502b507..91c9cdcb040b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; struct pred_stack stack = { }; /* init to NULL */ int err; @@ -1442,7 +1505,7 @@ add_pred: if (!pred) return -EINVAL; /* This item is where we start from in matching */ - filter->root = pred; + root = pred; /* Make sure the stack is empty */ pred = __pop_pred_stack(&stack); if (WARN_ON(pred)) { @@ -1450,6 +1513,13 @@ add_pred: filter->root = NULL; goto fail; } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; } err = 0; -- cgit v1.2.3 From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:16:51 -0500 Subject: tracing/filter: Optimize filter by folding the tree There are many cases that a filter will contain multiple ORs or ANDs together near the leafs. Walking up and down the tree to get to the next compare can be a waste. If there are several ORs or ANDs together, fold them into a single pred and allocate an array of the conditions that they check. This will speed up the filter by linearly walking an array and can still break out if a short circuit condition is met. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 +- kernel/trace/trace_events_filter.c | 233 +++++++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bba34a72c780..d754330934bb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -678,6 +678,7 @@ struct event_subsystem { #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) struct filter_pred; struct regex; @@ -704,7 +705,16 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 91c9cdcb040b..2403ce5b6507 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) case MOVE_DOWN: /* only AND and OR have children */ if (pred->left != FILTER_PRED_INVALID) { - /* keep going to leaf node */ - pred = &preds[pred->left]; - continue; - } - match = pred->fn(pred, rec); + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); /* If this pred is the only pred */ if (pred == root) break; @@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter, left = __pop_pred_stack(stack); if (!left || !right) return -EINVAL; - dest->left = left->index; - dest->right = right->index; - left->parent = dest->index; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else + } else { /* * Make dest->left invalid to be used as a quick * way to know this is a leaf node. */ dest->left = FILTER_PRED_INVALID; + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + return __push_pred_stack(stack, dest); } @@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter, return 0; } +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1517,6 +1727,11 @@ add_pred: if (err) goto fail; + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + /* We don't set root until we know it works */ barrier(); filter->root = root; -- cgit v1.2.3 From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:19:49 -0500 Subject: tracing/filter: Move MAX_FILTER_PRED to local tracing directory The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files. Move it to kernel/trace/trace.h. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 47e3997f7b5c..1a99e7939c2b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -208,7 +208,6 @@ struct ftrace_event_call { #define PERF_MAX_TRACE_SIZE 2048 -#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ extern void destroy_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d754330934bb..fbff872f8db1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,8 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) +#define MAX_FILTER_PRED 32 + struct filter_pred; struct regex; -- cgit v1.2.3 From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:21:34 -0500 Subject: tracing/filter: Increase the max preds to 2^14 Now that the filter logic does not require to save the pred results on the stack, we can increase the max number of preds we allow. As the preds are index by a short value, and we use the MSBs as flags we can increase the max preds to 2^14 (16384) which should be way more than enough. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fbff872f8db1..856e73cc1d3f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,7 +680,14 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -#define MAX_FILTER_PRED 32 +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; -- cgit v1.2.3 From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:25:46 -0500 Subject: tracing/filter: Swap entire filter of events When creating a new filter, instead of allocating the filter to the event call first and then processing the filter, it is easier to process a temporary filter and then just swap it with the call filter. By doing this, it simplifies the code. A filter is allocated and processed, when it is done, it is swapped with the call filter, synchronize_sched() is called to make sure all callers are done with the old filter (filters are called with premption disabled), and then the old filter is freed. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 251 +++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2403ce5b6507..f5d335d28d0b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) struct filter_pred *preds; struct filter_pred *pred; struct filter_pred *root; - int n_preds = ACCESS_ONCE(filter->n_preds); + int n_preds; int done = 0; /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) return 1; @@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void reset_preds(struct event_filter *filter) -{ - int n_preds = filter->n_preds; - int i; - - filter->n_preds = 0; - filter->root = NULL; - if (!filter->preds) - return; - - for (i = 0; i < n_preds; i++) - filter->preds[i].fn = filter_pred_none; -} - -static void filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable(struct ftrace_event_call *call) { - struct event_filter *filter = call->filter; - call->flags &= ~TRACE_EVENT_FL_FILTERED; - reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_filter(void) @@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void) struct event_filter *filter; filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); - - filter->n_preds = 0; - return filter; } @@ -838,46 +831,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static int init_filter(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_filter(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - - return 0; -} - -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_filter(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -1743,88 +1718,129 @@ fail: return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) goto fail; } - /* set all filter pred counts to zero */ list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + struct event_filter *filter; if (strcmp(call->class->system, system->name) != 0) continue; - reset_preds(filter); - } + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; - /* - * Since some of the preds may be used under preemption - * we need to wait for them to finish before we may - * reallocate them. - */ - synchronize_sched(); + list_add_tail(&filter_item->list, &filter_list); - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; - if (strcmp(call->class->system, system->name) != 0) - continue; + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; - /* really apply the filter */ - filter_disable_preds(call); err = replace_preds(call, filter, ps, filter_string, false); - if (err) - filter_disable_preds(call); - else { + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } if (fail) goto fail; + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } return 0; fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_filter(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - reset_preds(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; /* Make sure the filter is not being used */ synchronize_sched(); - __free_preds(call->filter); - remove_filter_string(call->filter); + __free_filter(filter); goto out_unlock; } @@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - /* - * Make sure all the pred counts are zero so that - * no task is using it when we reallocate the preds array. - */ - reset_preds(call->filter); - synchronize_sched(); - - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1868,18 +1896,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; filter = __alloc_filter(); - if (IS_ERR(filter)) { + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } -- cgit v1.2.3 From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:29:06 -0500 Subject: tracing/filter: Remove synchronize_sched() from __alloc_preds() Because the filters are processed first and then activated (added to the call), we no longer need to worry about the preds of the filter in __alloc_preds() being used. As the filter that is allocating preds is not activated yet. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f5d335d28d0b..3249b4f77ef0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) struct filter_pred *pred; int i; - if (filter->preds) { - if (filter->a_preds < n_preds) { - /* - * We need to reallocate. - * We should have already have zeroed out - * the pred count and called synchronized_sched() - * to make sure no one is using the preds. - */ - if (WARN_ON_ONCE(filter->n_preds)) { - /* We need to reset it now */ - filter->n_preds = 0; - synchronize_sched(); - } - __free_preds(filter); - } - } + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - if (!filter->preds) { - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->a_preds = n_preds; - } if (!filter->preds) return -ENOMEM; - if (WARN_ON(filter->a_preds < n_preds)) - return -EINVAL; + filter->a_preds = n_preds; + filter->n_preds = 0; for (i = 0; i < n_preds; i++) { pred = &filter->preds[i]; -- cgit v1.2.3 From ba976970c79fd2fbfe1a4b3b6766a318f4eb9d4c Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:20 +1100 Subject: tracing/syscalls: Don't add events for unmapped syscalls FTRACE_SYSCALLS would create events for each and every system call, even if it had failed to map the system call's name with it's number. This resulted in a number of events being created that would not behave as expected. This could happen, for example, on architectures who's symbol names are unusual and will not match the system call name. It could also happen with system calls which were mapped to sys_ni_syscall. This patch changes the default system call number in the metadata to -1. If the system call name from the metadata is not successfully mapped to a system call number during boot, than the event initialisation routine will now return an error, preventing the event from being created. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-2-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 2 ++ kernel/trace/trace_syscalls.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 98664db1be47..8e8968e74544 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -158,6 +158,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __syscall_meta_##sname = { \ .name = "sys"#sname, \ + .syscall_nr = -1, /* Filled in at boot */ \ .nb_args = nb, \ .types = types_##sname, \ .args = args_##sname, \ @@ -175,6 +176,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __syscall_meta__##sname = { \ .name = "sys_"#sname, \ + .syscall_nr = -1, /* Filled in at boot */ \ .nb_args = 0, \ .enter_event = &event_enter__##sname, \ .exit_event = &event_exit__##sname, \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5c9fe08d2093..a9ceabd52247 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -424,6 +424,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int init_syscall_trace(struct ftrace_event_call *call) { int id; + int num; + + num = ((struct syscall_metadata *)call->data)->syscall_nr; + if (num < 0 || num >= NR_syscalls) { + pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", + ((struct syscall_metadata *)call->data)->name); + return -ENOSYS; + } if (set_syscall_print_fmt(call) < 0) return -ENOMEM; -- cgit v1.2.3 From 3773b389b6927595512558594d040c1edba46f36 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:21 +1100 Subject: tracing/syscalls: Convert redundant syscall_nr checks into WARN_ON With the ftrace events now checking if the syscall_nr is valid upon initialisation it should no longer be possible to register or unregister a syscall event without a valid syscall_nr since they should not be created. This adds a WARN_ON_ONCE in the register and unregister functions to locate potential regressions in the future. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-3-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a9ceabd52247..423094288fb5 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -359,7 +359,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) @@ -377,7 +377,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_enter--; @@ -393,7 +393,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) @@ -411,7 +411,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; - if (num < 0 || num >= NR_syscalls) + if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); sys_refcount_exit--; -- cgit v1.2.3 From c763ba06bd9b5db2c46c36276c89103d92d2c604 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:22 +1100 Subject: tracing/syscalls: Make arch_syscall_addr weak Some architectures use non-trivial system call tables and will not work with the generic arch_syscall_addr code. For example, PowerPC64 uses a table of twin long longs. This patch makes the generic arch_syscall_addr weak to allow architectures with non-trivial system call tables to override it. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-4-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 3 +++ kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index dc52bd442c92..6fca17beee2f 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -247,6 +247,9 @@ You need very few things to get the syscalls tracing in an arch. - Support the TIF_SYSCALL_TRACEPOINT thread flags. - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace in the ptrace syscalls tracing path. +- If the system call table on this arch is more complicated than a simple array + of addresses of the system calls, implement an arch_syscall_addr to return + the address of a given system call. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 423094288fb5..af831545f656 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -446,7 +446,7 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr) { return (unsigned long)sys_call_table[nr]; } -- cgit v1.2.3 From b2d55496818d64310b9f5486d4eea76ea614d7f8 Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:23 +1100 Subject: tracing/syscalls: Allow arch specific syscall symbol matching Some architectures have unusual symbol names and the generic code to match the symbol name with the function name for the syscall metadata will fail. For example, symbols on PPC64 start with a period and the generic code will fail to match them. This patch moves the match logic out into a separate function which an arch can override by defining ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and implementing arch_syscall_match_sym_name. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-5-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 4 ++++ kernel/trace/trace_syscalls.c | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 6fca17beee2f..79fcafc7fd64 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -250,6 +250,10 @@ You need very few things to get the syscalls tracing in an arch. - If the system call table on this arch is more complicated than a simple array of addresses of the system calls, implement an arch_syscall_addr to return the address of a given system call. +- If the symbol names of the system calls do not match the function names on + this arch, define ARCH_HAS_SYSCALL_MATCH_SYM_NAME in asm/ftrace.h and + implement arch_syscall_match_sym_name with the appropriate logic to return + true if the function name corresponds with the symbol name. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index af831545f656..86a23e7de031 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) { @@ -73,13 +86,7 @@ find_syscall_meta(unsigned long syscall) kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - /* - * Only compare after the "sys" prefix. Archs that use - * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted - * mismatch. - */ - if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; } return NULL; -- cgit v1.2.3 From ae07f551c42d6e4162436ca452a199deac9dab4d Mon Sep 17 00:00:00 2001 From: Ian Munsie Date: Thu, 3 Feb 2011 14:27:25 +1100 Subject: tracing/syscalls: Early terminate search for sys_ni_syscall Many system calls are unimplemented and mapped to sys_ni_syscall, but at boot ftrace would still search through every syscall metadata entry for a match which wouldn't be there. This patch adds causes the search to terminate early if the system call is not mapped. Signed-off-by: Ian Munsie LKML-Reference: <1296703645-18718-7-git-send-email-imunsie@au1.ibm.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 86a23e7de031..ee7b5a0bb9f8 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -85,6 +85,9 @@ find_syscall_meta(unsigned long syscall) stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + for ( ; start < stop; start++) { if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) return *start; -- cgit v1.2.3 From 5e38ca8f3ea423442eaafe1b7e206084aa38120a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 2 Feb 2011 13:28:18 +0100 Subject: tracing: Add unstable sched clock note to the warning The warning "Delta way too big" warning might appear on a system with unstable shed clock right after the system is resumed and tracing was enabled during the suspend. Since it's not realy bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. Signed-off-by: Jiri Olsa LKML-Reference: <1296649698-6003-1-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..7739893a1d0a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,10 +2163,14 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + sched_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.3 From 87d80de2800d087ea833cb79bc13f85ff34ed49f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:19:49 -0500 Subject: tracing: Remove obsolete sched_switch tracer The trace events sched_switch and sched_wakeup do the same thing as the stand alone sched_switch tracer does. It is no longer needed. Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 110 -------------------------------------- kernel/trace/trace_sched_switch.c | 48 ----------------- 2 files changed, 158 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 557c1edeccaf..65eddb7cfa02 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -202,10 +202,6 @@ Here is the list of current tracers that may be configured. to draw a graph of function calls similar to C code source. - "sched_switch" - - Traces the context switches and wakeups between tasks. - "irqsoff" Traces the areas that disable interrupts and saves @@ -273,39 +269,6 @@ format, the function name that was traced "path_put" and the parent function that called this function "path_walk". The timestamp is the time at which the function was entered. -The sched_switch tracer also includes tracing of task wakeups -and context switches. - - ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 2916:115:S - ksoftirqd/1-7 [01] 1453.070013: 7:115:R + 10:115:S - ksoftirqd/1-7 [01] 1453.070013: 7:115:R ==> 10:115:R - events/1-10 [01] 1453.070013: 10:115:S ==> 2916:115:R - kondemand/1-2916 [01] 1453.070013: 2916:115:S ==> 7:115:R - ksoftirqd/1-7 [01] 1453.070013: 7:115:S ==> 0:140:R - -Wake ups are represented by a "+" and the context switches are -shown as "==>". The format is: - - Context switches: - - Previous task Next Task - - :: ==> :: - - Wake ups: - - Current task Task waking up - - :: + :: - -The prio is the internal kernel priority, which is the inverse -of the priority that is usually displayed by user-space tools. -Zero represents the highest priority (99). Prio 100 starts the -"nice" priorities with 100 being equal to nice -20 and 139 being -nice 19. The prio "140" is reserved for the idle task which is -the lowest priority thread (pid 0). - - Latency trace format -------------------- @@ -491,79 +454,6 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] latencies, as described in "Latency trace format". -sched_switch ------------- - -This tracer simply records schedule switches. Here is an example -of how to use it. - - # echo sched_switch > current_tracer - # echo 1 > tracing_enabled - # sleep 1 - # echo 0 > tracing_enabled - # cat trace - -# tracer: sched_switch -# -# TASK-PID CPU# TIMESTAMP FUNCTION -# | | | | | - bash-3997 [01] 240.132281: 3997:120:R + 4055:120:R - bash-3997 [01] 240.132284: 3997:120:R ==> 4055:120:R - sleep-4055 [01] 240.132371: 4055:120:S ==> 3997:120:R - bash-3997 [01] 240.132454: 3997:120:R + 4055:120:S - bash-3997 [01] 240.132457: 3997:120:R ==> 4055:120:R - sleep-4055 [01] 240.132460: 4055:120:D ==> 3997:120:R - bash-3997 [01] 240.132463: 3997:120:R + 4055:120:D - bash-3997 [01] 240.132465: 3997:120:R ==> 4055:120:R - -0 [00] 240.132589: 0:140:R + 4:115:S - -0 [00] 240.132591: 0:140:R ==> 4:115:R - ksoftirqd/0-4 [00] 240.132595: 4:115:S ==> 0:140:R - -0 [00] 240.132598: 0:140:R + 4:115:S - -0 [00] 240.132599: 0:140:R ==> 4:115:R - ksoftirqd/0-4 [00] 240.132603: 4:115:S ==> 0:140:R - sleep-4055 [01] 240.133058: 4055:120:S ==> 3997:120:R - [...] - - -As we have discussed previously about this format, the header -shows the name of the trace and points to the options. The -"FUNCTION" is a misnomer since here it represents the wake ups -and context switches. - -The sched_switch file only lists the wake ups (represented with -'+') and context switches ('==>') with the previous task or -current task first followed by the next task or task waking up. -The format for both of these is PID:KERNEL-PRIO:TASK-STATE. -Remember that the KERNEL-PRIO is the inverse of the actual -priority with zero (0) being the highest priority and the nice -values starting at 100 (nice -20). Below is a quick chart to map -the kernel priority to user land priorities. - - Kernel Space User Space - =============================================================== - 0(high) to 98(low) user RT priority 99(high) to 1(low) - with SCHED_RR or SCHED_FIFO - --------------------------------------------------------------- - 99 sched_priority is not used in scheduling - decisions(it must be specified as 0) - --------------------------------------------------------------- - 100(high) to 139(low) user nice -20(high) to 19(low) - --------------------------------------------------------------- - 140 idle task priority - --------------------------------------------------------------- - -The task states are: - - R - running : wants to run, may not actually be running - S - sleep : process is waiting to be woken up (handles signals) - D - disk sleep (uninterruptible sleep) : process must be woken up - (ignores signals) - T - stopped : process suspended - t - traced : process is being traced (with something like gdb) - Z - zombie : process waiting to be cleaned up - X - unknown - - ftrace_enabled -------------- diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 8f758d070c43..7e62c0a18456 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr) ctx_trace = tr; } -static void stop_sched_trace(struct trace_array *tr) -{ - tracing_stop_sched_switch_record(); -} - -static int sched_switch_trace_init(struct trace_array *tr) -{ - ctx_trace = tr; - tracing_reset_online_cpus(tr); - tracing_start_sched_switch_record(); - return 0; -} - -static void sched_switch_trace_reset(struct trace_array *tr) -{ - if (sched_ref) - stop_sched_trace(tr); -} - -static void sched_switch_trace_start(struct trace_array *tr) -{ - sched_stopped = 0; -} - -static void sched_switch_trace_stop(struct trace_array *tr) -{ - sched_stopped = 1; -} - -static struct tracer sched_switch_trace __read_mostly = -{ - .name = "sched_switch", - .init = sched_switch_trace_init, - .reset = sched_switch_trace_reset, - .start = sched_switch_trace_start, - .stop = sched_switch_trace_stop, - .wait_pipe = poll_wait_pipe, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sched_switch, -#endif -}; - -__init static int init_sched_switch_trace(void) -{ - return register_tracer(&sched_switch_trace); -} -device_initcall(init_sched_switch_trace); - -- cgit v1.2.3 From 6752ab4a9c30d5411b2dfdb251a3f1cb18aae487 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 8 Feb 2011 13:54:06 -0500 Subject: tracing: Deprecate tracing_enabled for tracing_on tracing_enabled should not be used, it is heavy weight and does not do much in helping lower the overhead. tracing_on should be used instead. Warn users to use tracing_on when tracing_enabled is used as it will soon be removed from the tracing directory. Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 38 +++++++++++++++++++------------------- kernel/trace/trace.c | 4 ++++ 2 files changed, 23 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 65eddb7cfa02..67f1cc473257 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -80,11 +80,11 @@ of ftrace. Here is a list of some of the key files: tracers listed here can be configured by echoing their name into current_tracer. - tracing_enabled: + tracing_on: - This sets or displays whether the current_tracer - is activated and tracing or not. Echo 0 into this - file to disable the tracer or 1 to enable it. + This sets or displays whether writing to the trace + ring buffer is enabled. Echo 0 into this file to disable + the tracer or 1 to enable it. trace: @@ -497,10 +497,10 @@ an example: # echo irqsoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: irqsoff # @@ -605,10 +605,10 @@ is much like the irqsoff tracer. # echo preemptoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: preemptoff # @@ -753,10 +753,10 @@ tracers. # echo preemptirqsoff > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # ls -ltr [...] - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: preemptirqsoff # @@ -916,9 +916,9 @@ Instead of performing an 'ls', we will run 'sleep 1' under # echo wakeup > current_tracer # echo latency-format > trace_options # echo 0 > tracing_max_latency - # echo 1 > tracing_enabled + # echo 1 > tracing_on # chrt -f 5 sleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: wakeup # @@ -1030,9 +1030,9 @@ ftrace_enabled is set; otherwise this tracer is a nop. # sysctl kernel.ftrace_enabled=1 # echo function > current_tracer - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: function # @@ -1070,7 +1070,7 @@ int trace_fd; [...] int main(int argc, char *argv[]) { [...] - trace_fd = open(tracing_file("tracing_enabled"), O_WRONLY); + trace_fd = open(tracing_file("tracing_on"), O_WRONLY); [...] if (condition_hit()) { write(trace_fd, "0", 1); @@ -1521,9 +1521,9 @@ If I am only interested in sys_nanosleep and hrtimer_interrupt: # echo sys_nanosleep hrtimer_interrupt \ > set_ftrace_filter # echo function > current_tracer - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: ftrace # @@ -1769,9 +1769,9 @@ different. The trace is live. # echo function > current_tracer # cat trace_pipe > /tmp/trace.out & [1] 4153 - # echo 1 > tracing_enabled + # echo 1 > tracing_on # usleep 1 - # echo 0 > tracing_enabled + # echo 0 > tracing_on # cat trace # tracer: function # diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dc53ecb80589..8dc8da6733f9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2710,6 +2710,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + if (val) { tracer_enabled = 1; if (current_trace->start) -- cgit v1.2.3 From 868baf07b1a259f5f3803c1dc2777b6c358f83cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Feb 2011 21:26:13 -0500 Subject: ftrace: Fix memory leak with function graph and cpu hotplug When the fuction graph tracer starts, it needs to make a special stack for each task to save the real return values of the tasks. All running tasks have this stack created, as well as any new tasks. On CPU hot plug, the new idle task will allocate a stack as well when init_idle() is called. The problem is that cpu hotplug does not create a new idle_task. Instead it uses the idle task that existed when the cpu went down. ftrace_graph_init_task() will add a new ret_stack to the task that is given to it. Because a clone will make the task have a stack of its parent it does not check if the task's ret_stack is already NULL or not. When the CPU hotplug code starts a CPU up again, it will allocate a new stack even though one already existed for it. The solution is to treat the idle_task specially. In fact, the function_graph code already does, just not at init_idle(). Instead of using the ftrace_graph_init_task() for the idle task, which that function expects the task to be a clone, have a separate ftrace_graph_init_idle_task(). Also, we will create a per_cpu ret_stack that is used by the idle task. When we call ftrace_graph_init_idle_task() it will check if the idle task's ret_stack is NULL, if it is, then it will assign it the per_cpu ret_stack. Reported-by: Benjamin Herrenschmidt Suggested-by: Peter Zijlstra Cc: Stable Tree Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/sched.c | 2 +- kernel/trace/ftrace.c | 52 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 48 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index dcd6a7c3a435..ca29e03c1fac 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -428,6 +428,7 @@ extern void unregister_ftrace_graph(void); extern void ftrace_graph_init_task(struct task_struct *t); extern void ftrace_graph_exit_task(struct task_struct *t); +extern void ftrace_graph_init_idle_task(struct task_struct *t, int cpu); static inline int task_curr_ret_stack(struct task_struct *t) { @@ -451,6 +452,7 @@ static inline void unpause_graph_tracing(void) static inline void ftrace_graph_init_task(struct task_struct *t) { } static inline void ftrace_graph_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) { } static inline int register_ftrace_graph(trace_func_graph_ret_t retfunc, trace_func_graph_ent_t entryfunc) diff --git a/kernel/sched.c b/kernel/sched.c index 18d38e4ec7ba..fbe86cb04b61 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5571,7 +5571,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae83883..888b611897d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } -- cgit v1.2.3 From 0de4b34d466bae571b50f41c7296b85248205e35 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 14 Feb 2011 14:48:07 +0900 Subject: tracing/kprobe: Fix NULL pointer deref check Add NULL check for avoiding NULL pointer deref. This bug has been introduced by: 1ff511e35ed8: tracing/kprobes: Add bitfield type which causes a null pointer dereference bug when kprobe-tracer parses an argument without type. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Masami Hiramatsu Cc: 2nddept-manager@sdl.hitachi.co.jp Cc: Peter Zijlstra LKML-Reference: <20110214054807.8919.69740.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar Reported-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index ccdc542022c3..8435b43b1782 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -935,7 +935,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, parg->offset = tp->size; tp->size += parg->type->size; ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); - if (ret >= 0) + if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); if (ret >= 0) { parg->fetch_size.fn = get_fetch_size_function(parg->type, -- cgit v1.2.3 From d41d5a01631af821d3a3447e6613a316f5ee6c25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Feb 2011 17:02:20 +0100 Subject: cgroup: Fix cgroup_subsys::exit callback Make the ::exit method act like ::attach, it is after all very nearly the same thing. The bug had no effect on correctness - fixing it is an optimization for the scheduler. Also, later perf-cgroups patches rely on it. Signed-off-by: Peter Zijlstra Acked-by: Paul Menage LKML-Reference: <1297160655.13327.92.camel@laptop> Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 3 ++- kernel/cgroup.c | 31 ++++++++++++++++++------------- kernel/sched.c | 6 ++---- 3 files changed, 22 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ce104e33cd22..38117d937332 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -474,7 +474,8 @@ struct cgroup_subsys { struct cgroup *old_cgrp, struct task_struct *tsk, bool threadgroup); void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); - void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); + void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task); int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp); void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b24d7027b83c..f6495f33a355 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4230,20 +4230,8 @@ void cgroup_post_fork(struct task_struct *child) */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { - int i; struct css_set *cg; - - if (run_callbacks && need_forkexit_callback) { - /* - * modular subsystems can't use callbacks, so no need to lock - * the subsys array - */ - for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - if (ss->exit) - ss->exit(ss, tsk); - } - } + int i; /* * Unlink from the css_set task list if necessary. @@ -4261,7 +4249,24 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_lock(tsk); cg = tsk->cgroups; tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } task_unlock(tsk); + if (cg) put_css_set_taskexit(cg); } diff --git a/kernel/sched.c b/kernel/sched.c index e142e92f38da..79e611cd83dd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,9 +606,6 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; - if (p->flags & PF_EXITING) - return &root_task_group; - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8863,7 +8860,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. -- cgit v1.2.3 From e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 14 Feb 2011 11:20:01 +0200 Subject: perf: Add cgroup support This kernel patch adds the ability to filter monitoring based on container groups (cgroups). This is for use in per-cpu mode only. The cgroup to monitor is passed as a file descriptor in the pid argument to the syscall. The file descriptor must be opened to the cgroup name in the cgroup filesystem. For instance, if the cgroup name is foo and cgroupfs is mounted in /cgroup, then the file descriptor is opened to /cgroup/foo. Cgroup mode is activated by passing PERF_FLAG_PID_CGROUP in the flags argument to the syscall. For instance to measure in cgroup foo on CPU1 assuming cgroupfs is mounted under /cgroup: struct perf_event_attr attr; int cgroup_fd, fd; cgroup_fd = open("/cgroup/foo", O_RDONLY); fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP); close(cgroup_fd); Signed-off-by: Stephane Eranian [ added perf_cgroup_{exit,attach} ] Signed-off-by: Peter Zijlstra LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com> Signed-off-by: Ingo Molnar --- include/linux/cgroup.h | 1 + include/linux/cgroup_subsys.h | 4 + include/linux/perf_event.h | 33 ++- init/Kconfig | 10 + kernel/cgroup.c | 23 ++ kernel/perf_event.c | 638 +++++++++++++++++++++++++++++++++++++++--- 6 files changed, 671 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 38117d937332..e654fa239916 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); unsigned short css_depth(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); #else /* !CONFIG_CGROUPS */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ccefff02b6cb..cdbfcb8780ec 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -65,4 +65,8 @@ SUBSYS(net_cls) SUBSYS(blkio) #endif +#ifdef CONFIG_CGROUP_PERF +SUBSYS(perf) +#endif + /* */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dda5b0a3ff60..38c8b2554842 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -464,6 +464,7 @@ enum perf_callchain_context { #define PERF_FLAG_FD_NO_GROUP (1U << 0) #define PERF_FLAG_FD_OUTPUT (1U << 1) +#define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ #ifdef __KERNEL__ /* @@ -471,6 +472,7 @@ enum perf_callchain_context { */ #ifdef CONFIG_PERF_EVENTS +# include # include # include #endif @@ -716,6 +718,22 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#ifdef CONFIG_CGROUP_PERF +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; /* timing info, one per cpu */ +}; +#endif + /** * struct perf_event - performance event kernel representation: */ @@ -832,6 +850,11 @@ struct perf_event { struct event_filter *filter; #endif +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; /* cgroup event is attach to */ + int cgrp_defer_enabled; +#endif + #endif /* CONFIG_PERF_EVENTS */ }; @@ -886,6 +909,7 @@ struct perf_event_context { u64 generation; int pin_count; struct rcu_head rcu_head; + int nr_cgroups; /* cgroup events present */ }; /* @@ -905,6 +929,9 @@ struct perf_cpu_context { struct list_head rotation_list; int jiffies_interval; struct pmu *active_pmu; +#ifdef CONFIG_CGROUP_PERF + struct perf_cgroup *cgrp; +#endif }; struct perf_output_handle { @@ -1040,11 +1067,11 @@ have_event: __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_task_events; +extern atomic_t perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); } static inline @@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); + COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); } extern void perf_event_mmap(struct vm_area_struct *vma); diff --git a/init/Kconfig b/init/Kconfig index be788c0957d4..20d6bd919b8d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED select this option (if, for some reason, they need to disable it then noswapaccount does the trick). +config CGROUP_PERF + bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" + depends on PERF_EVENTS && CGROUPS + help + This option extends the per-cpu mode to restrict monitoring to + threads which belong to the cgroup specificied and run on the + designated cpu. + + Say N if unsure. + menuconfig CGROUP_SCHED bool "Group CPU scheduler" depends on EXPERIMENTAL diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f6495f33a355..95362d15128c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3d3f282fa50e..65dcdc76d709 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) return data.ret; } +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; -atomic_t perf_task_events __read_mostly; +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +atomic_t perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; @@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); void __weak perf_event_print_debug(void) { } @@ -162,6 +176,338 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + /* + * do not update time when cgroup is not active + */ + if (!event->cgrp || cgrp != event->cgrp) + return; + + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ + struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + if (!task) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = now; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. + */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) + return PTR_ERR(css); + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } else { + /* must be done before we fput() the file */ + perf_get_cgroup(event); + } + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx) static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + return ctx ? ctx->time : 0; } @@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - - if (ctx->is_active) + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. + */ + if (is_cgroup_event(event)) run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; else run_end = event->tstamp_stopped; @@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event) run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; + } /* @@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } + if (is_cgroup_event(event)) { + ctx->nr_cgroups++; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -616,7 +995,8 @@ out: static inline int event_filter_match(struct perf_event *event) { - return event->cpu == -1 || event->cpu == smp_processor_id(); + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); } static void @@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event, */ if (event->state == PERF_EVENT_STATE_INACTIVE && !event_filter_match(event)) { - delta = ctx->time - event->tstamp_stopped; + delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; } @@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - /* * Cross CPU call to remove a performance event * @@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info) */ if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); + update_cgrp_time_from_event(event); update_group_times(event); if (event == event->group_leader) group_sched_out(event, cpuctx, ctx); @@ -851,6 +1226,41 @@ retry: raw_spin_unlock_irq(&ctx->lock); } +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamp + * are identical (or very close). Given that tstamp is, + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the time time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event, event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = tstamp - ctx->timestamp; + perf_set_shadow_time(event, ctx, tstamp); if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void perf_event_context_sched_in(struct perf_event_context *ctx); +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); /* * Cross CPU call to install and enable a performance event @@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info) * which do context switches with IRQs enabled. */ if (ctx->task && !cpuctx->task_ctx) - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); raw_spin_lock(&ctx->lock); ctx->is_active = 1; update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); add_event_to_ctx(event, ctx); @@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info) if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, perf_clock()); + __perf_event_mark_enabled(event, ctx); - if (!event_filter_match(event)) + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); goto unlock; + } /* * If the event is in a group and isn't the group leader, @@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); if (!ctx->nr_active) goto out; @@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task, for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); + + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch out PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_out(task); } static void task_ctx_sched_out(struct perf_event_context *ctx, @@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, 1)) group_sched_in(event, cpuctx, ctx); @@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (!event_filter_match(event)) continue; + /* may need to reset tstamp_enabled */ + if (is_cgroup_event(event)) + perf_cgroup_mark_enabled(event, ctx); + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; @@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { + u64 now; + raw_spin_lock(&ctx->lock); ctx->is_active = 1; if (likely(!ctx->nr_events)) goto out; - ctx->timestamp = perf_clock(); - + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, now); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1601,11 +2048,12 @@ out: } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) + enum event_type_t event_type, + struct task_struct *task) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, task); } static void task_ctx_sched_in(struct perf_event_context *ctx, @@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, if (cpuctx->task_ctx == ctx) return; - ctx_sched_in(ctx, cpuctx, event_type); + ctx_sched_in(ctx, cpuctx, event_type, NULL); cpuctx->task_ctx = ctx; } -static void perf_event_context_sched_in(struct perf_event_context *ctx) +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *task) { struct perf_cpu_context *cpuctx; @@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx) */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); cpuctx->task_ctx = ctx; @@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task) if (likely(!ctx)) continue; - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, task); } + /* + * if cgroup events exist on this CPU, then we need + * to check if we have to switch in PMU state. + * cgroup event are system-wide mode only + */ + if (atomic_read(&__get_cpu_var(perf_cgroup_events))) + perf_cgroup_sched_in(task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) if (ctx) rotate_ctx(ctx); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); if (ctx) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); @@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); - perf_event_context_sched_in(ctx); + perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); } @@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info) return; raw_spin_lock(&ctx->lock); - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); @@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event) * (e.g., thread is blocked), in that case * we cannot update context time */ - if (ctx->is_active) + if (ctx->is_active) { update_context_time(ctx); + update_cgrp_time_from_event(event); + } update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } @@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_task_events); + jump_label_dec(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event) event->buffer = NULL; } + if (is_cgroup_event(event)) + perf_detach_cgroup(event); + if (event->destroy) event->destroy(event); @@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); + update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); @@ -5725,7 +6189,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_task_events); + jump_label_inc(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open, int err; /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + if (flags & ~PERF_FLAG_ALL) return -EINVAL; err = perf_copy_attr(attr_uptr, &attr); @@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. + */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + event_fd = get_unused_fd_flags(O_RDWR); if (event_fd < 0) return event_fd; @@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } - if (pid != -1) { + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { task = find_lively_task_by_vpid(pid); if (IS_ERR(task)) { err = PTR_ERR(task); @@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -6808,3 +7287,92 @@ unlock: return ret; } device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + struct perf_cgroup_info *t; + int c; + + jc = kmalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + memset(jc, 0, sizeof(*jc)); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + for_each_possible_cpu(c) { + t = per_cpu_ptr(jc->info, c); + t->time = 0; + t->timestamp = 0; + } + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void perf_cgroup_move(struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task, + bool threadgroup) +{ + perf_cgroup_move(task); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + perf_cgroup_move(c); + } + rcu_read_unlock(); + } +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_move(task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach = perf_cgroup_attach, +}; +#endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From 163ec4354a5135c6c38c3f4a9b46a31900ebdf48 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Feb 2011 11:22:34 +0100 Subject: perf: Optimize throttling code By pre-computing the maximum number of samples per tick we can avoid a multiplication and a conditional since MAX_INTERRUPTS > max_samples_per_tick. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ kernel/perf_event.c | 43 ++++++++++++++++++++++++------------------- kernel/sysctl.c | 2 +- 3 files changed, 29 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 38c8b2554842..8ceb5a6fd9c9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1110,6 +1110,10 @@ extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +extern int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + static inline bool perf_paranoid_tracepoint_raw(void) { return sysctl_perf_event_paranoid > -1; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 65dcdc76d709..e03be08d0ddf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -150,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* * max perf event sample rate */ -int sysctl_perf_event_sample_rate __read_mostly = 100000; +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} static atomic64_t perf_event_id; @@ -4941,26 +4958,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (unlikely(!is_sampling_event(event))) return 0; - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_event_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(event, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling events even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the event got enabled again: - */ + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); ret = 1; } - } + } else + hwc->interrupts++; if (event->attr.freq) { u64 now = perf_clock(); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0f1bd83db985..daef911cbadb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -948,7 +948,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_perf_event_sample_rate, .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = perf_proc_update_handler, }, #endif #ifdef CONFIG_KMEMCHECK -- cgit v1.2.3 From ba3dd36c6775264ee6e7354ba1aabcd6e86d7298 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Feb 2011 12:41:46 +0100 Subject: perf: Optimize hrtimer events There is no need to re-initialize the hrtimer every time we start it, so don't do that (shaves a few cycles). Also, since we know hrtimers run at a fixed rate (nanoseconds) we can pre-compute the desired frequency at which they tick. This avoids us having to go through the whole adaptive frequency feedback logic (shaves another few cycles). Signed-off-by: Peter Zijlstra LKML-Reference: <1297448589.5226.47.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e03be08d0ddf..a0a6987fabc4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5602,6 +5602,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) u64 period; event = container_of(hrtimer, struct perf_event, hw.hrtimer); + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return HRTIMER_NORESTART; + event->pmu->read(event); perf_sample_data_init(&data, 0); @@ -5628,9 +5632,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) if (!is_sampling_event(event)) return; - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - period = local64_read(&hwc->period_left); if (period) { if (period < 0) @@ -5657,6 +5658,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) } } +static void perf_swevent_init_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!is_sampling_event(event)) + return; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + + /* + * Since hrtimers have a fixed rate, we can do a static freq->period + * mapping and avoid the whole period adjust feedback stuff. + */ + if (event->attr.freq) { + long freq = event->attr.sample_freq; + + event->attr.sample_period = NSEC_PER_SEC / freq; + hwc->sample_period = event->attr.sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + event->attr.freq = 0; + } +} + /* * Software event: cpu wall time clock */ @@ -5709,6 +5734,8 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } @@ -5787,6 +5814,8 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + perf_swevent_init_hrtimer(event); + return 0; } -- cgit v1.2.3 From e9345aab675382176740bc8a2c6d3caf1510e46d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Feb 2011 08:09:49 +0100 Subject: Revert "tracing: Add unstable sched clock note to the warning" This reverts commit 5e38ca8f3ea423442eaafe1b7e206084aa38120a. Breaks the build of several !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures. Cc: Jiri Olsa Cc: Steven Rostedt Message-ID: <20110217171823.GB17058@elte.hu> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7739893a1d0a..bd1c35a4fbcc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2163,14 +2163,10 @@ rb_reserve_next_event(struct ring_buffer *buffer, delta = diff; if (unlikely(test_time_stamp(delta))) { WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp, - sched_clock_stable ? "" : - "If you just came from a suspend/resume,\n" - "please switch to the trace global clock:\n" - " echo global > /sys/kernel/debug/tracing/trace_clock\n"); + (unsigned long long)cpu_buffer->write_stamp); add_timestamp = 1; } } -- cgit v1.2.3 From 3f7cce3c18188a067d463749168bdda5abc5b0f7 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 18 Feb 2011 14:40:01 +0200 Subject: perf_events: Fix rcu and locking issues with cgroup support This patches ensures that we do not end up calling perf_cgroup_from_task() when there is no cgroup event. This avoids potential RCU and locking issues. The change in perf_cgroup_set_timestamp() ensures we check against ctx->nr_cgroups. It also avoids calling perf_clock() tiwce in a row. It also ensures we do need to grab ctx->lock before calling the function. We drop update_cgrp_time() from task_clock_event_read() because it is not needed. This also avoids having to deal with perf_cgroup_from_task(). Thanks to Peter Zijlstra for his help on this. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d5e76b8.815bdf0a.7ac3.774f@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a0a6987fabc4..dadeaea4b3fc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -201,6 +201,11 @@ __get_cpu_context(struct perf_event_context *ctx) #ifdef CONFIG_CGROUP_PERF +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { @@ -268,28 +273,41 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current); + struct perf_cgroup *cgrp; + /* - * do not update time when cgroup is not active + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) */ - if (!event->cgrp || cgrp != event->cgrp) + if (!is_cgroup_event(event)) return; - __update_cgrp_time(event->cgrp); + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; - if (!task) + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) return; cgrp = perf_cgroup_from_task(task); info = this_cpu_ptr(cgrp->info); - info->timestamp = now; + info->timestamp = ctx->timestamp; } #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ @@ -494,7 +512,8 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, u64 now) +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) { } @@ -1613,7 +1632,7 @@ static int __perf_event_enable(void *info) /* * set current task's cgroup time reference point */ - perf_cgroup_set_timestamp(current, perf_clock()); + perf_cgroup_set_timestamp(current, ctx); __perf_event_mark_enabled(event, ctx); @@ -2048,7 +2067,7 @@ ctx_sched_in(struct perf_event_context *ctx, now = perf_clock(); ctx->timestamp = now; - perf_cgroup_set_timestamp(task, now); + perf_cgroup_set_timestamp(task, ctx); /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -5795,7 +5814,6 @@ static void task_clock_event_read(struct perf_event *event) if (!in_nmi()) { update_context_time(event->ctx); - update_cgrp_time_from_event(event); time = event->ctx->time; } else { u64 now = perf_clock(); -- cgit v1.2.3 From 768a06e2ca49cdf72389208cfc056a36cf8bc5e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Feb 2011 16:52:24 +0100 Subject: perf: Simplify task_clock_event_read() There is no point in us having different code paths for nmi and !nmi here, so remove the !nmi one. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dadeaea4b3fc..64a018e94fca 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5810,16 +5810,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + u64 time = event->ctx->time + delta; task_clock_event_update(event, time); } -- cgit v1.2.3 From 940c5b2971de443df22eed0441bc74fb0116e9f5 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sun, 27 Feb 2011 21:13:31 +0800 Subject: perf: Fix the missing event initialization when pmu is found in idr Currently, the event is not initialized if pmu is found in idr. This never causes bug just because now no pmu is associated with the idr id. Signed-off-by: Lin Ming Signed-off-by: Peter Zijlstra LKML-Reference: <1298812411.2699.9.camel@localhost> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 64a018e94fca..821ce8221974 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6098,17 +6098,22 @@ struct pmu *perf_init_event(struct perf_event *event) { struct pmu *pmu = NULL; int idx; + int ret; idx = srcu_read_lock(&pmus_srcu); rcu_read_lock(); pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); - if (pmu) + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); goto unlock; + } list_for_each_entry_rcu(pmu, &pmus, entry) { - int ret = pmu->event_init(event); + ret = pmu->event_init(event); if (!ret) goto unlock; -- cgit v1.2.3 From 3db272c0494900fcb905a201180a78cae3addd6e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:37 +0800 Subject: perf cgroup: Fix leak of file reference count In perf_cgroup_connect(), fput_light() is missing in a failure path. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F3461.6060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 821ce8221974..7c999e809ba3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -404,8 +404,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, return -EBADF; css = cgroup_css_from_dir(file, perf_subsys_id); - if (IS_ERR(css)) - return PTR_ERR(css); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; @@ -422,6 +424,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, /* must be done before we fput() the file */ perf_get_cgroup(event); } +out: fput_light(file, fput_needed); return ret; } -- cgit v1.2.3 From f75e18cb9627b1d3d752b83a0b5563da0042c50a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:25:50 +0800 Subject: perf cgroup: Fix unmatched call to perf_detach_cgroup() In the failure path, we call perf_detach_cgroup(), but we didn't call perf_get_cgroup() prio to it. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F346E.9070606@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7c999e809ba3..b00209544d57 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -412,6 +412,9 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; + /* must be done before we fput() the file */ + perf_get_cgroup(event); + /* * all events in a group must monitor * the same cgroup because a task belongs @@ -420,9 +423,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (group_leader && group_leader->cgrp != cgrp) { perf_detach_cgroup(event); ret = -EINVAL; - } else { - /* must be done before we fput() the file */ - perf_get_cgroup(event); } out: fput_light(file, fput_needed); -- cgit v1.2.3 From 1b15d0558e82df9b3659804ceb44187b98eda354 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 3 Mar 2011 14:26:06 +0800 Subject: perf cgroup: Clean up perf_cgroup_create() - Use kzalloc() to replace kmalloc() + memset(). - Remove redundant initialization, since alloc_percpu() returns zero-filled percpu memory. Signed-off-by: Li Zefan Acked-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4D6F347E.2010806@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b00209544d57..193b1900e64f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -7346,26 +7346,17 @@ static struct cgroup_subsys_state *perf_cgroup_create( struct cgroup_subsys *ss, struct cgroup *cont) { struct perf_cgroup *jc; - struct perf_cgroup_info *t; - int c; - jc = kmalloc(sizeof(*jc), GFP_KERNEL); + jc = kzalloc(sizeof(*jc), GFP_KERNEL); if (!jc) return ERR_PTR(-ENOMEM); - memset(jc, 0, sizeof(*jc)); - jc->info = alloc_percpu(struct perf_cgroup_info); if (!jc->info) { kfree(jc); return ERR_PTR(-ENOMEM); } - for_each_possible_cpu(c) { - t = per_cpu_ptr(jc->info, c); - t->time = 0; - t->timestamp = 0; - } return &jc->css; } -- cgit v1.2.3 From 08309379b7083a9ceec0f9bb96a629058fb623c4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Mar 2011 11:31:20 +0100 Subject: perf: Fix cgroup vs jump_label problem Li Zefan reported that the jump label code sleeps and we're calling it under a spinlock, *fail* ;-) Reported-by: Li Zefan Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 193b1900e64f..ed253aa24ba4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -820,16 +820,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups++; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); - } list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) @@ -957,11 +949,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { + if (is_cgroup_event(event)) ctx->nr_cgroups--; - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); - } ctx->nr_events--; if (event->attr.inherit_stat) @@ -2903,6 +2892,10 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) put_callchain_buffers(); + if (is_cgroup_event(event)) { + atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_dec(&perf_sched_events); + } } if (event->buffer) { @@ -6478,6 +6471,13 @@ SYSCALL_DEFINE5(perf_event_open, err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); } /* -- cgit v1.2.3 From 750912fa366312e9c5bc83eab352898a26750401 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Wed, 8 Dec 2010 13:46:47 -0800 Subject: tracing: Add an 'overwrite' trace_option. Add an "overwrite" trace_option for ftrace to control whether the buffer should be overwritten on overflow or not. The default remains to overwrite old events when the buffer is full. This patch adds the option to instead discard newest events when the buffer is full. This is useful to get a snapshot of traces just after enabling traces. Dropping the current event is also a simpler code path. Signed-off-by: David Sharp LKML-Reference: <1291844807-15481-1-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 5 +++++ include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 11 +++++++++++ kernel/trace/trace.c | 17 +++++++++++------ kernel/trace/trace.h | 1 + 5 files changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 67f1cc473257..1ebc24cf9a55 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -454,6 +454,11 @@ x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6] latencies, as described in "Latency trace format". + overwrite - This controls what happens when the trace buffer is + full. If "1" (default), the oldest events are + discarded and overwritten. If "0", then the newest + events are discarded. + ftrace_enabled -------------- diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 8d3a2486544d..ab38ac80b0f9 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -100,6 +100,8 @@ void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val); + struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length); int ring_buffer_unlock_commit(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bd1c35a4fbcc..269db80a961e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1429,6 +1429,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) } EXPORT_SYMBOL_GPL(ring_buffer_resize); +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + static inline void * __rb_data_page_index(struct buffer_data_page *bpage, unsigned index) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8dc8da6733f9..85e3ee1e474e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -41,8 +41,6 @@ #include "trace.h" #include "trace_output.h" -#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) - /* * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. @@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -425,6 +423,7 @@ static const char *trace_options[] = { "sleep-time", "graph-time", "record-cmd", + "overwrite", NULL }; @@ -2529,6 +2528,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); } static ssize_t @@ -4555,9 +4557,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) __init static int tracer_alloc_buffers(void) { int ring_buf_size; + enum ring_buffer_flags rb_flags; int i; int ret = -ENOMEM; + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; @@ -4570,12 +4574,13 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); @@ -4585,7 +4590,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, rb_flags); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 856e73cc1d3f..951d0b7e7062 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -606,6 +606,7 @@ enum trace_iterator_flags { TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, }; /* -- cgit v1.2.3 From de29be5e712dc8b7eef2bef9417af3bb6a88e47a Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:16 -0800 Subject: ring-buffer: Remove unused #include Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-3-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 269db80a961e..3237d961d905 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5,7 +5,6 @@ */ #include #include -#include #include #include #include -- cgit v1.2.3 From e6e1e2593592a8f6f6380496655d8c6f67431266 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 10:41:56 -0500 Subject: tracing: Remove lock_depth from event entry The lock_depth field in the event headers was added as a temporary data point for help in removing the BKL. Now that the BKL is pretty much been removed, we can remove this field. This in turn changes the header from 12 bytes to 8 bytes, removing the 4 byte buffer that gcc would insert if the first field in the data load was 8 bytes in size. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.c | 8 +++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_output.c | 10 ++-------- 4 files changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 1a99e7939c2b..22b32af1b5ec 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -37,7 +37,6 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; - int lock_depth; }; #define FTRACE_MAX_EVENT \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85e3ee1e474e..fd6e1b906b3c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1101,7 +1101,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; - entry->lock_depth = (tsk) ? tsk->lock_depth : 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1748,10 +1747,9 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# | / _----=> need-resched \n"); seq_puts(m, "# || / _---=> hardirq/softirq \n"); seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| /_--=> lock-depth \n"); - seq_puts(m, "# |||||/ delay \n"); - seq_puts(m, "# cmd pid |||||| time | caller \n"); - seq_puts(m, "# \\ / |||||| \\ | / \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5f499e0438a4..e1d579b19834 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,7 +116,6 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); - __common_field(int, lock_depth); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02272baa2206..151f32e5f2c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -529,7 +529,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) * @entry: The trace entry field from the ring buffer * * Prints the generic fields of irqs off, in hard or softirq, preempt - * count and lock depth. + * count. */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { @@ -554,13 +554,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) else ret = trace_seq_putc(s, '.'); - if (!ret) - return 0; - - if (entry->lock_depth < 0) - return trace_seq_putc(s, '.'); - - return trace_seq_printf(s, "%d", entry->lock_depth); + return ret; } static int -- cgit v1.2.3 From 140e4f2d1cd816aed196705c036763313c0e4bd3 Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:19 -0800 Subject: tracing: Fix event alignment: ftrace:context_switch and ftrace:wakeup Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-6-git-send-email-dhsharp@google.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 6cf223764be8..1516cb3ec549 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, */ #define FTRACE_CTX_FIELDS \ __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ __field( unsigned char, prev_prio ) \ __field( unsigned char, prev_state ) \ - __field( unsigned int, next_pid ) \ __field( unsigned char, next_prio ) \ - __field( unsigned char, next_state ) \ - __field( unsigned int, next_cpu ) + __field( unsigned char, next_state ) FTRACE_ENTRY(context_switch, ctx_switch_entry, -- cgit v1.2.3 From 10da37a645b5e915d8572cc2b1f5eb11ada3ea4f Mon Sep 17 00:00:00 2001 From: David Sharp Date: Fri, 3 Dec 2010 16:13:26 -0800 Subject: tracing: Adjust conditional expression latency formatting. Formatting change only to improve code readability. No code changes except to introduce intermediate variables. Signed-off-by: David Sharp LKML-Reference: <1291421609-14665-13-git-send-email-dhsharp@google.com> [ Keep variable declarations and assignment separate ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 151f32e5f2c6..456be9063c2d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -533,20 +533,30 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) */ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { - int hardirq, softirq; + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; int ret; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + if (!trace_seq_printf(s, "%c%c%c", - (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : - (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? - 'X' : '.', - (entry->flags & TRACE_FLAG_NEED_RESCHED) ? - 'N' : '.', - (hardirq && softirq) ? 'H' : - hardirq ? 'h' : softirq ? 's' : '.')) + irqs_off, need_resched, hardsoft_irq)) return 0; if (entry->preempt_count) -- cgit v1.2.3 From 31274d72f01604f4b02d933b4f3cac84d2c201fd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 18 Feb 2011 15:52:19 +0100 Subject: tracing: Explain about unstable clock on resume with ring buffer warning The "Delta way too big" warning might appear on a system with a unstable shed clock right after the system is resumed and tracing was enabled at time of suspend. Since it's not realy a bug, and the unstable sched clock is working fast and reliable otherwise, Steven suggested to keep using the sched clock in any case and just to make note in the warning itself. v2 changes: - added #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Signed-off-by: Jiri Olsa LKML-Reference: <20110218145219.GD2604@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3237d961d905..db7b439d23ee 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2172,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (likely(ts >= cpu_buffer->write_stamp)) { delta = diff; if (unlikely(test_time_stamp(delta))) { + int local_clock_stable = 1; +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + local_clock_stable = sched_clock_stable; +#endif WARN_ONCE(delta > (1ULL << 59), - KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n", + KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s", (unsigned long long)delta, (unsigned long long)ts, - (unsigned long long)cpu_buffer->write_stamp); + (unsigned long long)cpu_buffer->write_stamp, + local_clock_stable ? "" : + "If you just came from a suspend/resume,\n" + "please switch to the trace global clock:\n" + " echo global > /sys/kernel/debug/tracing/trace_clock\n"); add_timestamp = 1; } } -- cgit v1.2.3 From 56355b83e2a24ce7e1870c8479205e2cdd332225 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 8 Nov 2010 14:05:12 +0800 Subject: tracing: Export trace_set_clr_event() Trace events belonging to a module only exists when the module is loaded. Well, we can use trace_set_clr_event funtion to enable some trace event at the module init routine, so that we will not miss something while loading then module. So, Export the trace_set_clr_event function so that module can use it. Signed-off-by: Yuanhan Liu LKML-Reference: <1289196312-25323-1-git-send-email-yuanhan.liu@linux.intel.com> Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e1d579b19834..e88f74fe1d4c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -325,6 +325,7 @@ int trace_set_clr_event(const char *system, const char *event, int set) { return __ftrace_set_clr_event(NULL, system, event, set); } +EXPORT_SYMBOL_GPL(trace_set_clr_event); /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 9a24470b2826e4665b1484836c7ae6aba1ddea32 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 14:53:38 -0500 Subject: tracing: Align 4 byte ints together in struct tracer Move elements in struct tracer for better alignment. Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 951d0b7e7062..5e9dfc6286dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -272,8 +272,8 @@ struct tracer { /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); struct tracer *next; - int print_max; struct tracer_flags *flags; + int print_max; int use_max_tr; }; -- cgit v1.2.3 From 4a0b1665db09cf2da9ad7d0f12da386373c10bfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Mar 2011 20:09:26 -0500 Subject: tracing: Fix irqoff selftest expanding max buffer If the kernel command line declares a tracer "ftrace=sometracer" and that tracer is either not defined or is enabled after irqsoff, then the irqs off selftest will fail with the following error: Testing tracer irqsoff: ------------[ cut here ]------------ WARNING: at /home/rostedt/work/autotest/nobackup/linux-test.git/kernel/trace/tra ce.c:713 update_max_tr_single+0xfa/0x11b() Hardware name: Modules linked in: Pid: 1, comm: swapper Not tainted 2.6.38-rc8-test #1 Call Trace: [] ? warn_slowpath_common+0x65/0x7a [] ? update_max_tr_single+0xfa/0x11b [] ? warn_slowpath_null+0xf/0x13 [] ? update_max_tr_single+0xfa/0x11b [] ? stop_critical_timing+0x154/0x204 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? time_hardirqs_on+0x25/0x28 [] ? trace_hardirqs_on_caller+0x18/0x12f [] ? trace_hardirqs_on+0xb/0xd [] ? trace_selftest_startup_irqsoff+0x5b/0xc1 [] ? register_tracer+0xf8/0x1a3 [] ? init_irqsoff_tracer+0xd/0x11 [] ? do_one_initcall+0x71/0x121 [] ? init_irqsoff_tracer+0x0/0x11 [] ? kernel_init+0x13a/0x1b6 [] ? kernel_init+0x0/0x1b6 [] ? kernel_thread_helper+0x6/0x10 ---[ end trace e93713a9d40cd06c ]--- .. no entries found ..FAILED! What happens is the "ftrace=..." will expand the ring buffer to its default size (from its minimum size) but it will not expand the max ring buffer (the ring buffer to store maximum latencies). When the irqsoff test runs, it will call the ring buffer swap routine that checks if the max ring buffer is the same size as the normal ring buffer, and will fail if it is not. This causes the test to fail. The solution is to expand the max ring buffer before running the self test if the max ring buffer is used by that tracer and the normal ring buffer is expanded. The max ring buffer should be shrunk again after the test is done to save space. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fd6e1b906b3c..9541c27c1cf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -779,6 +779,11 @@ __acquires(kernel_lock) tracing_reset_online_cpus(tr); current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); @@ -791,6 +796,10 @@ __acquires(kernel_lock) /* Only reset on passing, to avoid touching corrupted buffers */ tracing_reset_online_cpus(tr); + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + printk(KERN_CONT "PASSED\n"); } #endif -- cgit v1.2.3