From 2f5f6ad9390c1ebbf738d130dbfe80b60eaa167e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Aug 2011 16:57:47 -0400 Subject: ftrace: Pass ftrace_ops as third parameter to function trace callback Currently the function trace callback receives only the ip and parent_ip of the function that it traced. It would be more powerful to also return the ops that registered the function as well. This allows the same function to act differently depending on what ftrace_ops registered it. Link: http://lkml.kernel.org/r/20120612225424.267254552@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 101 ++++++++++++++++++++++++++------------ kernel/trace/trace_event_perf.c | 3 +- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_functions.c | 9 ++-- kernel/trace/trace_irqsoff.c | 3 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 15 ++++-- kernel/trace/trace_stack.c | 2 +- 8 files changed, 94 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..4f2ab9352a68 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -64,12 +64,19 @@ #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) +static struct ftrace_ops ftrace_list_end __read_mostly = { + .func = ftrace_stub, +}; + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; /* Quick disabling of function tracer. */ -int function_trace_stop; +int function_trace_stop __read_mostly; + +/* Current function tracing op */ +struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; /* List for set_ftrace_pid's pids. */ LIST_HEAD(ftrace_pids); @@ -86,10 +93,6 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, -}; - static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; @@ -100,8 +103,14 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op); +#else +/* See comment below, where ftrace_ops_list_func is defined */ +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); +#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) +#endif /* * Traverse the ftrace_global_list, invoking all entries. The reason that we @@ -112,29 +121,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); * * Silly Alpha and silly pointer-speculation compiler optimizations! 
*/ -static void ftrace_global_list_func(unsigned long ip, - unsigned long parent_ip) +static void +ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } -static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip); + ftrace_pid_function(ip, parent_ip, op); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -163,12 +172,13 @@ void clear_ftrace_function(void) * For those archs that do not test ftrace_trace_stop in their * mcount call site, we need to do it from C. */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { if (function_trace_stop) return; - __ftrace_trace_function(ip, parent_ip); + __ftrace_trace_function(ip, parent_ip, op); } #endif @@ -230,15 +240,24 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic, then have the mcount trampoline call - * the function directly + * not dynamic and the arch supports passing ops, then have the + * mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && - !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + ARCH_SUPPORTS_FTRACE_OPS)) { + /* Set the ftrace_ops that the arch callback uses */ + if (ftrace_ops_list == &global_ops) + function_trace_op = ftrace_global_list; + else + function_trace_op = ftrace_ops_list; func = ftrace_ops_list->func; - else + } else { + /* Just use the default ftrace_ops */ + function_trace_op = &ftrace_list_end; func = ftrace_ops_list_func; + } #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; @@ -773,7 +792,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) } static void -function_profile_call(unsigned long ip, unsigned long parent_ip) +function_profile_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -803,7 +823,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0); + function_profile_call(trace->func, 0, NULL); return 1; } @@ -2790,8 +2810,8 @@ static int __init ftrace_mod_cmd_init(void) } device_initcall(ftrace_mod_cmd_init); -static void -function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3942,10 +3962,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) 
+ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { - struct ftrace_ops *op; - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3959,7 +3978,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); }; @@ -3971,8 +3990,9 @@ static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, }; -static void -ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +static inline void +__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ignored) { struct ftrace_ops *op; @@ -3988,13 +4008,32 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip); + op->func(ip, parent_ip, op); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); trace_recursion_clear(TRACE_INTERNAL_BIT); } +/* + * Some archs only support passing ip and parent_ip. Even though + * the list function ignores the op parameter, we do not want any + * C side effects, where a function is called without the caller + * sending a third parameter. + */ +#if ARCH_SUPPORTS_FTRACE_OPS +static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL); +} +#else +static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +{ + __ftrace_ops_list_func(ip, parent_ip, NULL); +} +#endif + static void clear_ftrace_swapper(void) { struct task_struct *p; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fee3752ae8f6..a872a9a298a0 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void -perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops) { struct ftrace_entry *entry; struct hlist_head *head; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..88daa5177bf4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1681,7 +1681,8 @@ static __init void event_trace_self_tests(void) static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void -function_test_events_call(unsigned long ip, unsigned long parent_ip) +function_test_events_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct ring_buffer_event *event; struct ring_buffer *buffer; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..fceb7a9aa06d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) } static void -function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -75,7 +76,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) } static void -function_trace_call(unsigned long ip, unsigned long parent_ip) 
+function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -106,7 +108,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) } static void -function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..2862c77f95d9 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, * irqsoff uses its own tracer function to keep the overhead down: */ static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..0caf4f5da569 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -108,7 +108,7 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..9ae40c823af8 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -103,35 +103,40 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe1_cnt++; } static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe2_cnt++; } static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_probe3_cnt++; } static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_global_cnt++; } static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, - unsigned long pip) + unsigned long pip, + struct ftrace_ops *op) { trace_selftest_test_dyn_cnt++; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..e20006d5fb6a 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,7 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip) +stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) { int cpu; -- cgit v1.2.3 From ccf3672d530170c98c734dfc5db07d64bcbad2ad Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jun 2012 09:44:25 -0400 Subject: ftrace: Consolidate arch dependent functions with 'list' function As the function 
tracer starts to get more features, the support for theses features will spread out throughout the different architectures over time. These features boil down to what each arch does in the mcount trampoline (the ftrace_caller). Currently there's two features that are not the same throughout the archs. 1) Support to stop function tracing before the callback 2) passing of the ftrace ops Both of these require placing an indirect function to support the features if the mcount trampoline does not. On a side note, for all architectures, when more than one callback is registered to the function tracer, an intermediate 'list' function is called by the mcount trampoline to iterate through the callbacks that are registered. Instead of making a separate function for each of these features, and requiring several indirect calls, just use the single 'list' function as the intermediate, to handle all cases. If an arch does not support the 'stop function tracing' or the passing of ftrace ops, just force it to use the list function that will handle the features required. This makes the code cleaner and simpler and removes a lot of #ifdefs in the code. Link: http://lkml.kernel.org/r/20120612225424.495625483@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 13 +++++++++++++ kernel/trace/ftrace.c | 45 ++++----------------------------------------- 2 files changed, 17 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 2d5964119885..3651fdc3bec9 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -27,6 +27,19 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +/* + * If the arch's mcount caller does not support all of ftrace's + * features, then it must call an indirect function that + * does. Or at least does enough to prevent any unwelcomed side effects. + */ +#if !defined(CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST) || \ + !ARCH_SUPPORTS_FTRACE_OPS +# define FTRACE_FORCE_LIST_FUNC 1 +#else +# define FTRACE_FORCE_LIST_FUNC 0 +#endif + + struct module; struct ftrace_hash; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f2ab9352a68..4cbca2e6eb70 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -97,8 +97,6 @@ static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; -ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; @@ -162,26 +160,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - __ftrace_trace_function = ftrace_stub; - __ftrace_trace_function_delay = ftrace_stub; ftrace_pid_function = ftrace_stub; } -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST -/* - * For those archs that do not test ftrace_trace_stop in their - * mcount call site, we need to do it from C. 
- */ -static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) -{ - if (function_trace_stop) - return; - - __ftrace_trace_function(ip, parent_ip, op); -} -#endif - static void control_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -246,7 +227,7 @@ static void update_ftrace_function(void) if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && - ARCH_SUPPORTS_FTRACE_OPS)) { + !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) function_trace_op = ftrace_global_list; @@ -259,18 +240,7 @@ static void update_ftrace_function(void) func = ftrace_ops_list_func; } -#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST ftrace_trace_function = func; -#else -#ifdef CONFIG_DYNAMIC_FTRACE - /* do not update till all functions have been modified */ - __ftrace_trace_function_delay = func; -#else - __ftrace_trace_function = func; -#endif - ftrace_trace_function = - (func == ftrace_stub) ? func : ftrace_test_stop_func; -#endif } static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) @@ -1902,16 +1872,6 @@ static void ftrace_run_update_code(int command) */ arch_ftrace_update_code(command); -#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST - /* - * For archs that call ftrace_test_stop_func(), we must - * wait till after we update all the function callers - * before we update the callback. This keeps different - * ops that record different functions from corrupting - * each other. - */ - __ftrace_trace_function = __ftrace_trace_function_delay; -#endif function_trace_stop--; ret = ftrace_arch_code_modify_post_process(); @@ -3996,6 +3956,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { struct ftrace_ops *op; + if (function_trace_stop) + return; + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) return; -- cgit v1.2.3 From a1e2e31d175a1349274eba3465d17616c6725f8c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Aug 2011 12:50:46 -0400 Subject: ftrace: Return pt_regs to function trace callback Return as the 4th paramater to the function tracer callback the pt_regs. Later patches that implement regs passing for the architectures will require having the ftrace_ops set the SAVE_REGS flag, which will tell the arch to take the time to pass a full set of pt_regs to the ftrace_ops callback function. If the arch does not support it then it should pass NULL. 
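As a concrete illustration of the new prototype, here is a minimal sketch of a callback written against this series. The names example_callback, example_ops, example_hits and example_last_ip are hypothetical and not part of the patch; the sketch relies only on what the changelog states, namely that op is the ftrace_ops that registered the callback and that regs may be NULL on an arch that does not pass registers.

#include <linux/ftrace.h>
#include <linux/ptrace.h>

static unsigned long example_hits;
static unsigned long example_last_ip;

static void notrace example_callback(unsigned long ip, unsigned long parent_ip,
                                     struct ftrace_ops *op, struct pt_regs *regs)
{
        /* op identifies which ftrace_ops registered this function */
        example_hits++;

        /* regs is NULL unless the architecture saved a full pt_regs */
        if (regs)
                example_last_ip = instruction_pointer(regs);
}

static struct ftrace_ops example_ops = {
        .func = example_callback,
};

Registering example_ops with register_ftrace_function() works as before; only the callback signature grows the extra parameters.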
If an arch can pass full regs, then it should define: ARCH_SUPPORTS_FTRACE_SAVE_REGS to 1 Link: http://lkml.kernel.org/r/20120702201821.019966811@goodmis.org Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++-- kernel/trace/ftrace.c | 37 ++++++++++++++++++++++--------------- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 7 ++++--- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 3 ++- kernel/trace/trace_selftest.c | 15 ++++++++++----- kernel/trace/trace_stack.c | 3 ++- 9 files changed, 47 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3651fdc3bec9..e4202881fb00 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -54,7 +55,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op); + struct ftrace_ops *op, struct pt_regs *regs); /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are @@ -188,7 +189,8 @@ static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) return *this_cpu_ptr(ops->disabled); } -extern void ftrace_stub(unsigned long a0, unsigned long a1, struct ftrace_ops *op); +extern void ftrace_stub(unsigned long a0, unsigned long a1, + struct ftrace_ops *op, struct pt_regs *regs); #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4cbca2e6eb70..6ff07ad0ede3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -103,7 +103,7 @@ static struct ftrace_ops control_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op); + struct ftrace_ops *op, struct pt_regs *regs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -121,7 +121,7 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); */ static void ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) return; @@ -129,19 +129,19 @@ ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, trace_recursion_set(TRACE_GLOBAL_BIT); op = rcu_dereference_raw(ftrace_global_list); /*see above*/ while (op != &ftrace_list_end) { - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); /*see above*/ }; trace_recursion_clear(TRACE_GLOBAL_BIT); } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip, op); + ftrace_pid_function(ip, parent_ip, op, regs); } static void set_ftrace_pid_function(ftrace_func_t func) @@ -763,7 +763,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops) + struct ftrace_ops *ops, struct pt_regs *regs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -793,7 +793,7 @@ function_profile_call(unsigned long ip, unsigned long 
parent_ip, #ifdef CONFIG_FUNCTION_GRAPH_TRACER static int profile_graph_entry(struct ftrace_graph_ent *trace) { - function_profile_call(trace->func, 0, NULL); + function_profile_call(trace->func, 0, NULL, NULL); return 1; } @@ -2771,7 +2771,7 @@ static int __init ftrace_mod_cmd_init(void) device_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ftrace_func_probe *entry; struct hlist_head *hhd; @@ -3923,7 +3923,7 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) static void ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) return; @@ -3938,7 +3938,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, while (op != &ftrace_list_end) { if (!ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; @@ -3952,7 +3952,7 @@ static struct ftrace_ops control_ops = { static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored) + struct ftrace_ops *ignored, struct pt_regs *regs) { struct ftrace_ops *op; @@ -3971,7 +3971,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, op = rcu_dereference_raw(ftrace_ops_list); while (op != &ftrace_list_end) { if (ftrace_ops_test(op, ip)) - op->func(ip, parent_ip, op); + op->func(ip, parent_ip, op, regs); op = rcu_dereference_raw(op->next); }; preempt_enable_notrace(); @@ -3983,17 +3983,24 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * the list function ignores the op parameter, we do not want any * C side effects, where a function is called without the caller * sending a third parameter. + * Archs are to support both the regs and ftrace_ops at the same time. + * If they support ftrace_ops, it is assumed they support regs. + * If call backs want to use regs, they must either check for regs + * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. + * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. + * An architecture can pass partial regs with ftrace_ops and still + * set the ARCH_SUPPORT_FTARCE_OPS. 
*/ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *regs) { - __ftrace_ops_list_func(ip, parent_ip, NULL); + __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { - __ftrace_ops_list_func(ip, parent_ip, NULL); + __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } #endif diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a872a9a298a0..9824419c8404 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -259,7 +259,7 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops) + struct ftrace_ops *ops, struct pt_regs *pt_regs) { struct ftrace_entry *entry; struct hlist_head *head; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 88daa5177bf4..8c6696833686 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1682,7 +1682,7 @@ static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct ring_buffer_event *event; struct ring_buffer *buffer; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fceb7a9aa06d..5675ebd541f0 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -49,7 +49,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -77,7 +77,8 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) + { struct trace_array *tr = func_trace; struct trace_array_cpu *data; @@ -109,7 +110,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = func_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2862c77f95d9..c7a9ba936de6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -137,7 +137,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op) + struct ftrace_ops *op, struct pt_regs *pt_regs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0caf4f5da569..7547e36d483e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -108,7 +108,8 @@ out_enable: * wakeup uses its own tracer function to keep the overhead down: */ static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct 
pt_regs *pt_regs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9ae40c823af8..add37e019fd0 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -104,7 +104,8 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe1_cnt++; } @@ -112,7 +113,8 @@ static void trace_selftest_test_probe1_func(unsigned long ip, static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe2_cnt++; } @@ -120,7 +122,8 @@ static void trace_selftest_test_probe2_func(unsigned long ip, static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_probe3_cnt++; } @@ -128,7 +131,8 @@ static void trace_selftest_test_probe3_func(unsigned long ip, static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_global_cnt++; } @@ -136,7 +140,8 @@ static void trace_selftest_test_global_func(unsigned long ip, static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, - struct ftrace_ops *op) + struct ftrace_ops *op, + struct pt_regs *pt_regs) { trace_selftest_test_dyn_cnt++; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e20006d5fb6a..2fa5328e8893 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -111,7 +111,8 @@ static inline void check_stack(void) } static void -stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op) +stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs) { int cpu; -- cgit v1.2.3 From 08f6fba503111e0336f2b4d6915a4a18f9b60e51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Apr 2012 16:20:23 -0400 Subject: ftrace/x86: Add separate function to save regs Add a way to have different functions calling different trampolines. If a ftrace_ops wants regs saved on the return, then have only the functions with ops registered to save regs. Functions registered by other ops would not be affected, unless the functions overlap. If one ftrace_ops registered functions A, B and C and another ops registered fucntions to save regs on A, and D, then only functions A and D would be saving regs. Function B and C would work as normal. Although A is registered by both ops: normal and saves regs; this is fine as saving the regs is needed to satisfy one of the ops that calls it but the regs are ignored by the other ops function. x86_64 implements the full regs saving, and i386 just passes a NULL for regs to satisfy the ftrace_ops passing. Where an arch must supply both regs and ftrace_ops parameters, even if regs is just NULL. It is OK for an arch to pass NULL regs. 
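A sketch of the caller side (hypothetical names example_regs_handler, example_regs_ops and example_regs_hits, not from this patch), using the FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED flag this patch adds to include/linux/ftrace.h; the NULL check covers an arch such as i386 that registers fine but cannot supply a pt_regs.

#include <linux/ftrace.h>
#include <linux/ptrace.h>

static unsigned long example_regs_hits;

static void notrace example_regs_handler(unsigned long ip, unsigned long parent_ip,
                                         struct ftrace_ops *op, struct pt_regs *regs)
{
        /* NULL here means the arch could not (or was not asked to) save regs */
        if (!regs)
                return;

        example_regs_hits++;
        /* a full pt_regs is available, e.g. via instruction_pointer(regs) */
}

static struct ftrace_ops example_regs_ops = {
        .func   = example_regs_handler,
        /*
         * Plain SAVE_REGS would make register_ftrace_function() fail on an
         * arch without full regs support; the _IF_SUPPORTED variant always
         * registers and simply hands the handler regs == NULL instead.
         */
        .flags  = FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
};

A caller that cannot work without registers would set FTRACE_OPS_FL_SAVE_REGS instead and treat a registration error as fatal.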
All function trace users that require regs passing must add the flag FTRACE_OPS_FL_SAVE_REGS when registering the ftrace_ops. If the arch does not support saving regs then the ftrace_ops will fail to register. The flag FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED may be set that will prevent the ftrace_ops from failing to register. In this case, the handler may either check if regs is not NULL or check if ARCH_SUPPORTS_FTRACE_SAVE_REGS. If the arch supports passing regs it will set this macro and pass regs for ops that request them. All other archs will just pass NULL. Link: Link: http://lkml.kernel.org/r/20120711195745.107705970@goodmis.org Cc: Alexander van Heukelum Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 47 +++++++++++-------- arch/x86/kernel/entry_32.S | 4 +- arch/x86/kernel/entry_64.S | 94 +++++++++++++++++++++++++++++++++---- arch/x86/kernel/ftrace.c | 77 ++++++++++++++++++++++++++++-- include/linux/ftrace.h | 107 +++++++++++++++++++++++++++++++++++++++--- kernel/trace/ftrace.c | 91 +++++++++++++++++++++++++++++++---- 6 files changed, 373 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b3bb1f3f2773..a8475014c4ad 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,27 +3,33 @@ #ifdef __ASSEMBLY__ - .macro MCOUNT_SAVE_FRAME - /* taken from glibc */ - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) + /* skip is set if the stack was already partially adjusted */ + .macro MCOUNT_SAVE_FRAME skip=0 + /* + * We add enough stack to save all regs. + */ + subq $(SS+8-\skip), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* Move RIP to its proper location */ + movq SS+8(%rsp), %rdx + movq %rdx, RIP(%rsp) .endm - .macro MCOUNT_RESTORE_FRAME - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp + .macro MCOUNT_RESTORE_FRAME skip=0 + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + addq $(SS+8-\skip), %rsp .endm #endif @@ -34,6 +40,9 @@ #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 +#ifdef CONFIG_X86_64 +#define ARCH_SUPPORTS_FTRACE_SAVE_REGS +#endif #endif #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e3e17a0b7ff8..5da11d1b85ae 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,7 +1109,8 @@ ENTRY(ftrace_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax movl 0x4(%ebp), %edx leal function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax @@ -1118,6 +1119,7 @@ ENTRY(ftrace_caller) ftrace_call: call ftrace_stub + addl $4,%esp /* skip NULL pointer */ popl %edx popl %ecx popl %eax diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2b4f94c5dc60..52bda2e8f7aa 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -73,21 +73,34 @@ ENTRY(mcount) retq END(mcount) +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 + MCOUNT_SAVE_FRAME \skip + + /* Load 
the ftrace_ops into the 3rd parameter */ + leaq function_trace_op, %rdx + + /* Load ip into the first parameter */ + movq RIP(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi + /* Load the parent_ip into the second parameter */ + movq 8(%rbp), %rsi +.endm + ENTRY(ftrace_caller) + /* Check if tracing was disabled (quick check) */ cmpl $0, function_trace_stop jne ftrace_stub - MCOUNT_SAVE_FRAME - - leaq function_trace_op, %rdx - movq 0x38(%rsp), %rdi - movq 8(%rbp), %rsi - subq $MCOUNT_INSN_SIZE, %rdi + ftrace_caller_setup + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx GLOBAL(ftrace_call) call ftrace_stub MCOUNT_RESTORE_FRAME +ftrace_return: #ifdef CONFIG_FUNCTION_GRAPH_TRACER GLOBAL(ftrace_graph_call) @@ -98,6 +111,71 @@ GLOBAL(ftrace_stub) retq END(ftrace_caller) +ENTRY(ftrace_regs_caller) + /* Save the current flags before compare (in SS location)*/ + pushfq + + /* Check if tracing was disabled (quick check) */ + cmpl $0, function_trace_stop + jne ftrace_restore_flags + + /* skip=8 to skip flags saved in SS */ + ftrace_caller_setup 8 + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbp, RBP(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq SS(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address */ + leaq SS+16(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, SS(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBP(%rsp), %rbp + movq RBX(%rsp), %rbx + + /* skip=8 to skip flags saved in SS */ + MCOUNT_RESTORE_FRAME 8 + + /* Restore flags */ + popfq + + jmp ftrace_return +ftrace_restore_flags: + popfq + jmp ftrace_stub + +END(ftrace_regs_caller) + + #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpl $0, function_trace_stop @@ -120,7 +198,7 @@ GLOBAL(ftrace_stub) trace: MCOUNT_SAVE_FRAME - movq 0x38(%rsp), %rdi + movq RIP(%rsp), %rdi movq 8(%rbp), %rsi subq $MCOUNT_INSN_SIZE, %rdi @@ -141,7 +219,7 @@ ENTRY(ftrace_graph_caller) MCOUNT_SAVE_FRAME leaq 8(%rbp), %rdi - movq 0x38(%rsp), %rsi + movq RIP(%rsp), %rsi movq (%rbp), %rdx subq $MCOUNT_INSN_SIZE, %rsi diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c3a7cb4bf6e6..b90eb1a13071 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -206,6 +206,23 @@ static int ftrace_modify_code(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code); +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +/* + * Should never be called: + * As it is only called by __ftrace_replace_code() which is called by + * ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + * which is called to turn mcount into nops or nops into function calls + * but not to convert a function from not using regs to one that uses + * regs, which ftrace_modify_call() is for. 
+ */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + WARN_ON(1); + return -EINVAL; +} +#endif + int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); @@ -220,6 +237,16 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ret = ftrace_modify_code(ip, old, new); +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* Also update the regs callback function */ + if (!ret) { + ip = (unsigned long)(&ftrace_regs_call); + memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + } +#endif + atomic_dec(&modifying_ftrace_code); return ret; @@ -299,6 +326,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec) return add_break(rec->ip, old); } +/* + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + */ +static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + +/* + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + */ +static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) +{ + if (rec->flags & FTRACE_FL_REGS_EN) + return (unsigned long)FTRACE_REGS_ADDR; + else + return (unsigned long)FTRACE_ADDR; +} + static int add_breakpoints(struct dyn_ftrace *rec, int enable) { unsigned long ftrace_addr; @@ -306,7 +359,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: @@ -316,6 +369,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable) /* converting nop to call */ return add_brk_on_nop(rec); + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + ftrace_addr = get_ftrace_old_addr(rec); + /* fall through */ case FTRACE_UPDATE_MAKE_NOP: /* converting a call to a nop */ return add_brk_on_call(rec, ftrace_addr); @@ -360,13 +417,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec) * If not, don't touch the breakpoint, we make just create * a disaster. 
*/ - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); + nop = ftrace_call_replace(ip, ftrace_addr); + + if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) + goto update; + + /* Check both ftrace_addr and ftrace_old_addr */ + ftrace_addr = get_ftrace_old_addr(rec); nop = ftrace_call_replace(ip, ftrace_addr); if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) return -EINVAL; } + update: return probe_kernel_write((void *)ip, &nop[0], 1); } @@ -405,12 +470,14 @@ static int add_update(struct dyn_ftrace *rec, int enable) ret = ftrace_test_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return add_update_call(rec, ftrace_addr); @@ -455,12 +522,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); - ftrace_addr = (unsigned long)FTRACE_ADDR; + ftrace_addr = get_ftrace_addr(rec); switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: case FTRACE_UPDATE_MAKE_CALL: /* converting nop to call */ return finish_update_call(rec, ftrace_addr); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e4202881fb00..ab39990cc43f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -71,12 +71,28 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * could be controled by following calls: * ftrace_function_local_enable * ftrace_function_local_disable + * SAVE_REGS - The ftrace_ops wants regs saved at each function called + * and passed to the callback. If this flag is set, but the + * architecture does not support passing regs + * (ARCH_SUPPORTS_FTRACE_SAVE_REGS is not defined), then the + * ftrace_ops will fail to register, unless the next flag + * is set. + * SAVE_REGS_IF_SUPPORTED - This is the same as SAVE_REGS, but if the + * handler can handle an arch that does not save regs + * (the handler tests if regs == NULL), then it can set + * this flag instead. It will not fail registering the ftrace_ops + * but, the regs field will be NULL if the arch does not support + * passing regs to the handler. + * Note, if this flag is set, the SAVE_REGS flag will automatically + * get set upon registering the ftrace_ops, if the arch supports it. */ enum { - FTRACE_OPS_FL_ENABLED = 1 << 0, - FTRACE_OPS_FL_GLOBAL = 1 << 1, - FTRACE_OPS_FL_DYNAMIC = 1 << 2, - FTRACE_OPS_FL_CONTROL = 1 << 3, + FTRACE_OPS_FL_ENABLED = 1 << 0, + FTRACE_OPS_FL_GLOBAL = 1 << 1, + FTRACE_OPS_FL_DYNAMIC = 1 << 2, + FTRACE_OPS_FL_CONTROL = 1 << 3, + FTRACE_OPS_FL_SAVE_REGS = 1 << 4, + FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, }; struct ftrace_ops { @@ -254,12 +270,31 @@ extern void unregister_ftrace_function_probe_all(char *glob); extern int ftrace_text_reserved(void *start, void *end); +/* + * The dyn_ftrace record's flags field is split into two parts. + * the first part which is '0-FTRACE_REF_MAX' is a counter of + * the number of callbacks that have registered the function that + * the dyn_ftrace descriptor represents. + * + * The second part is a mask: + * ENABLED - the function is being traced + * REGS - the record wants the function to save regs + * REGS_EN - the function is set up to save regs. 
+ * + * When a new ftrace_ops is registered and wants a function to save + * pt_regs, the rec->flag REGS is set. When the function has been + * set up to save regs, the REG_EN flag is set. Once a function + * starts saving regs it will do so until all ftrace_ops are removed + * from tracing that function. + */ enum { - FTRACE_FL_ENABLED = (1 << 30), + FTRACE_FL_ENABLED = (1UL << 29), + FTRACE_FL_REGS = (1UL << 30), + FTRACE_FL_REGS_EN = (1UL << 31) }; -#define FTRACE_FL_MASK (0x3UL << 30) -#define FTRACE_REF_MAX ((1 << 30) - 1) +#define FTRACE_FL_MASK (0x7UL << 29) +#define FTRACE_REF_MAX ((1UL << 29) - 1) struct dyn_ftrace { union { @@ -290,9 +325,23 @@ enum { FTRACE_STOP_FUNC_RET = (1 << 4), }; +/* + * The FTRACE_UPDATE_* enum is used to pass information back + * from the ftrace_update_record() and ftrace_test_record() + * functions. These are called by the code update routines + * to find out what is to be done for a given function. + * + * IGNORE - The function is already what we want it to be + * MAKE_CALL - Start tracing the function + * MODIFY_CALL - Stop saving regs for the function + * MODIFY_CALL_REGS - Start saving regs for the function + * MAKE_NOP - Stop tracing the function + */ enum { FTRACE_UPDATE_IGNORE, FTRACE_UPDATE_MAKE_CALL, + FTRACE_UPDATE_MODIFY_CALL, + FTRACE_UPDATE_MODIFY_CALL_REGS, FTRACE_UPDATE_MAKE_NOP, }; @@ -344,7 +393,9 @@ extern int ftrace_dyn_arch_init(void *data); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); +extern void ftrace_regs_caller(void); extern void ftrace_call(void); +extern void ftrace_regs_call(void); extern void mcount_call(void); void ftrace_modify_all_code(int command); @@ -352,6 +403,15 @@ void ftrace_modify_all_code(int command); #ifndef FTRACE_ADDR #define FTRACE_ADDR ((unsigned long)ftrace_caller) #endif + +#ifndef FTRACE_REGS_ADDR +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +# define FTRACE_REGS_ADDR ((unsigned long)ftrace_regs_caller) +#else +# define FTRACE_REGS_ADDR FTRACE_ADDR +#endif +#endif + #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern void ftrace_graph_caller(void); extern int ftrace_enable_ftrace_graph_caller(void); @@ -407,6 +467,39 @@ extern int ftrace_make_nop(struct module *mod, */ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr); +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS +/** + * ftrace_modify_call - convert from one addr to another (no nop) + * @rec: the mcount call site record + * @old_addr: the address expected to be currently called to + * @addr: the address to change to + * + * This is a very sensitive operation and great care needs + * to be taken by the arch. The operation should carefully + * read the location, check to see if what is read is indeed + * what we expect it to be, and then on success of the compare, + * it should write to the location. + * + * The code segment at @rec->ip should be a caller to @old_addr + * + * Return must be: + * 0 on success + * -EFAULT on error reading the location + * -EINVAL on a failed compare of the contents + * -EPERM on error writing to the location + * Any other value will be considered a failure. 
+ */ +extern int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr); +#else +/* Should never be called */ +static inline int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, + unsigned long addr) +{ + return -EINVAL; +} +#endif + /* May be defined in arch */ extern int ftrace_arch_read_dyn_info(char *buf, int size); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ff07ad0ede3..c55f7e274613 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -314,6 +314,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) return -EINVAL; +#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS + /* + * If the ftrace_ops specifies SAVE_REGS, then it only can be used + * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. + * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && + !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) + ops->flags |= FTRACE_OPS_FL_SAVE_REGS; +#endif + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -1515,6 +1529,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags++; if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) return; + /* + * If any ops wants regs saved for this function + * then all ops will get saved regs. + */ + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) + rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) return; @@ -1606,18 +1626,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (enable && (rec->flags & ~FTRACE_FL_MASK)) flag = FTRACE_FL_ENABLED; + /* + * If enabling and the REGS flag does not match the REGS_EN, then + * do not ignore this record. Set flags to fail the compare against + * ENABLED. + */ + if (flag && + (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) + flag |= FTRACE_FL_REGS; + /* If the state of this record hasn't changed, then do nothing */ if ((rec->flags & FTRACE_FL_ENABLED) == flag) return FTRACE_UPDATE_IGNORE; if (flag) { - if (update) + /* Save off if rec is being enabled (for return value) */ + flag ^= rec->flags & FTRACE_FL_ENABLED; + + if (update) { rec->flags |= FTRACE_FL_ENABLED; - return FTRACE_UPDATE_MAKE_CALL; + if (flag & FTRACE_FL_REGS) { + if (rec->flags & FTRACE_FL_REGS) + rec->flags |= FTRACE_FL_REGS_EN; + else + rec->flags &= ~FTRACE_FL_REGS_EN; + } + } + + /* + * If this record is being updated from a nop, then + * return UPDATE_MAKE_CALL. + * Otherwise, if the EN flag is set, then return + * UPDATE_MODIFY_CALL_REGS to tell the caller to convert + * from the non-save regs, to a save regs function. + * Otherwise, + * return UPDATE_MODIFY_CALL to tell the caller to convert + * from the save regs, to a non-save regs function. 
+ */ + if (flag & FTRACE_FL_ENABLED) + return FTRACE_UPDATE_MAKE_CALL; + else if (rec->flags & FTRACE_FL_REGS_EN) + return FTRACE_UPDATE_MODIFY_CALL_REGS; + else + return FTRACE_UPDATE_MODIFY_CALL; } - if (update) - rec->flags &= ~FTRACE_FL_ENABLED; + if (update) { + /* If there's no more users, clear all flags */ + if (!(rec->flags & ~FTRACE_FL_MASK)) + rec->flags = 0; + else + /* Just disable the record (keep REGS state) */ + rec->flags &= ~FTRACE_FL_ENABLED; + } return FTRACE_UPDATE_MAKE_NOP; } @@ -1652,13 +1713,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) static int __ftrace_replace_code(struct dyn_ftrace *rec, int enable) { + unsigned long ftrace_old_addr; unsigned long ftrace_addr; int ret; - ftrace_addr = (unsigned long)FTRACE_ADDR; - ret = ftrace_update_record(rec, enable); + if (rec->flags & FTRACE_FL_REGS) + ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; + else + ftrace_addr = (unsigned long)FTRACE_ADDR; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; @@ -1668,6 +1733,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) case FTRACE_UPDATE_MAKE_NOP: return ftrace_make_nop(NULL, rec, ftrace_addr); + + case FTRACE_UPDATE_MODIFY_CALL_REGS: + case FTRACE_UPDATE_MODIFY_CALL: + if (rec->flags & FTRACE_FL_REGS) + ftrace_old_addr = (unsigned long)FTRACE_ADDR; + else + ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; + + return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } return -1; /* unknow ftrace bug */ @@ -2421,8 +2495,9 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) - seq_printf(m, " (%ld)", - rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, " (%ld)%s", + rec->flags & ~FTRACE_FL_MASK, + rec->flags & FTRACE_FL_REGS ? " R" : ""); seq_printf(m, "\n"); return 0; -- cgit v1.2.3 From 4740974a6844156c14d741b0080b59d275679a23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 11:04:44 -0400 Subject: ftrace: Add default recursion protection for function tracing As more users of the function tracer utility are being added, they do not always add the necessary recursion protection. To protect from function recursion due to tracing, if the callback ftrace_ops does not specifically specify that it protects against recursion (by setting the FTRACE_OPS_FL_RECURSION_SAFE flag), the list operation will be called by the mcount trampoline which adds recursion protection. If the flag is set, then the function will be called directly with no extra protection. Note, the list operation is called if more than one function callback is registered, or if the arch does not support all of the function tracer features. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 5 +++++ kernel/trace/ftrace.c | 10 ++++++++-- kernel/trace/trace_events.c | 1 + kernel/trace/trace_functions.c | 4 ++-- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 7 +++++-- kernel/trace/trace_stack.c | 1 + 8 files changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ab39990cc43f..65a14e47a0db 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -85,6 +85,10 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * passing regs to the handler. * Note, if this flag is set, the SAVE_REGS flag will automatically * get set upon registering the ftrace_ops, if the arch supports it. 
+ * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure + * that the call back has its own recursion protection. If it does + * not set this, then the ftrace infrastructure will add recursion + * protection for the caller. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -93,6 +97,7 @@ enum { FTRACE_OPS_FL_CONTROL = 1 << 3, FTRACE_OPS_FL_SAVE_REGS = 1 << 4, FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, + FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, }; struct ftrace_ops { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c55f7e274613..ad765b4ba426 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -221,12 +222,13 @@ static void update_ftrace_function(void) /* * If we are at the end of the list and this ops is - * not dynamic and the arch supports passing ops, then have the - * mcount trampoline call the function directly. + * recursion safe and not dynamic and the arch supports passing ops, + * then have the mcount trampoline call the function directly. */ if (ftrace_ops_list == &ftrace_list_end || (ftrace_ops_list->next == &ftrace_list_end && !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && + (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && !FTRACE_FORCE_LIST_FUNC)) { /* Set the ftrace_ops that the arch callback uses */ if (ftrace_ops_list == &global_ops) @@ -867,6 +869,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int register_ftrace_profiler(void) @@ -1049,6 +1052,7 @@ static struct ftrace_ops global_ops = { .func = ftrace_stub, .notrace_hash = EMPTY_HASH, .filter_hash = EMPTY_HASH, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static DEFINE_MUTEX(ftrace_regex_lock); @@ -3967,6 +3971,7 @@ void __init ftrace_init(void) static struct ftrace_ops global_ops = { .func = ftrace_stub, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int __init ftrace_nodyn_init(void) @@ -4023,6 +4028,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops control_ops = { .func = ftrace_ops_control_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static inline void diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8c6696833686..6825d833a257 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1721,6 +1721,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5675ebd541f0..fdff65dff1bb 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -153,13 +153,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops trace_stack_ops __read_mostly = { .func = function_stack_trace_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; 
/* Our two options */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index c7a9ba936de6..d98ee8283b29 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -154,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7547e36d483e..02170c00c413 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -130,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, - .flags = FTRACE_OPS_FL_GLOBAL, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index add37e019fd0..1fb6da85ff8b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -148,19 +148,22 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_global = { - .func = trace_selftest_test_global_func, - .flags = FTRACE_OPS_FL_GLOBAL, + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 2fa5328e8893..0c1b165778e5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -137,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t -- cgit v1.2.3 From 47239c4d8d6a24796039cada69d477a2b8cac9d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 11:13:07 -0400 Subject: ftrace: Only compile ftrace selftest if selftests are enabled No need to compile in the ftrace selftest helper file if selftests are not being executed. Signed-off-by: Steven Rostedt --- kernel/trace/Makefile | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..837090808aac 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) +ifdef CONFIG_FTRACE_SELFTEST # selftest needs instrumentation CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif +endif # If unlikely tracing is enabled, do not trace these files ifdef CONFIG_TRACING_BRANCHES -- cgit v1.2.3 From ea701f11da44b44907af226fe5a5f57d2f26eeb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 13:08:05 -0400 Subject: ftrace: Add selftest to test function trace recursion protection Add selftests to test the function tracing recursion protection actually does work. 
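For illustration only (not part of this patch), a registration along these lines is what the recursion selftest exercises: an ftrace_ops that does not set FTRACE_OPS_FL_RECURSION_SAFE, so the mcount trampoline is expected to route it through the protected list function. The module and callback names below are made up, and the callback signature assumes the ftrace_ops and pt_regs parameters introduced earlier in this series.

#include <linux/module.h>
#include <linux/ftrace.h>

static unsigned long my_hits;	/* simple counter; no locking in this sketch */

/* No recursion guard of its own; ftrace adds one because the
 * RECURSION_SAFE flag is left clear in the ops below. */
static void my_trace_callback(unsigned long ip, unsigned long parent_ip,
			      struct ftrace_ops *op, struct pt_regs *regs)
{
	my_hits++;
}

static struct ftrace_ops my_trace_ops = {
	.func = my_trace_callback,
	/* no FTRACE_OPS_FL_RECURSION_SAFE: rely on ftrace's own protection */
};

static int __init my_trace_init(void)
{
	return register_ftrace_function(&my_trace_ops);
}

static void __exit my_trace_exit(void)
{
	unregister_ftrace_function(&my_trace_ops);
}

module_init(my_trace_init);
module_exit(my_trace_exit);
MODULE_LICENSE("GPL");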
It also tests if a ftrace_ops states it will perform its own protection. Although, even if the ftrace_ops states it will protect itself, the ftrace infrastructure may still provide protection if the arch does not support all features or another ftrace_ops is registered. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++ kernel/trace/ftrace.c | 21 +++++++ kernel/trace/trace_selftest.c | 136 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 163 insertions(+) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 65a14e47a0db..9962e954a633 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -220,6 +220,10 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1, */ #define register_ftrace_function(ops) ({ 0; }) #define unregister_ftrace_function(ops) ({ 0; }) +static inline int ftrace_nr_registered_ops(void) +{ + return 0; +} static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } static inline void ftrace_stop(void) { } @@ -275,6 +279,8 @@ extern void unregister_ftrace_function_probe_all(char *glob); extern int ftrace_text_reserved(void *start, void *end); +extern int ftrace_nr_registered_ops(void); + /* * The dyn_ftrace record's flags field is split into two parts. * the first part which is '0-FTRACE_REF_MAX' is a counter of diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ad765b4ba426..528d997c7f99 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -111,6 +111,27 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) #endif +/** + * ftrace_nr_registered_ops - return number of ops registered + * + * Returns the number of ftrace_ops registered and tracing functions + */ +int ftrace_nr_registered_ops(void) +{ + struct ftrace_ops *ops; + int cnt = 0; + + mutex_lock(&ftrace_lock); + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) + cnt++; + + mutex_unlock(&ftrace_lock); + + return cnt; +} + /* * Traverse the ftrace_global_list, invoking all entries. The reason that we * can use rcu_dereference_raw() is that elements removed from this list diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1fb6da85ff8b..86422f91dbe1 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -406,8 +406,141 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, return ret; } + +static int trace_selftest_recursion_cnt; +static void trace_selftest_test_recursion_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * This function is registered without the recursion safe flag. + * The ftrace infrastructure should provide the recursion + * protection. If not, this will crash the kernel! + */ + trace_selftest_recursion_cnt++; + DYN_FTRACE_TEST_NAME(); +} + +static void trace_selftest_test_recursion_safe_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + /* + * We said we would provide our own recursion. By calling + * this function again, we should recurse back into this function + * and count again. But this only happens if the arch supports + * all of ftrace features and nothing else is using the function + * tracing utility. 
+ */ + if (trace_selftest_recursion_cnt++) + return; + DYN_FTRACE_TEST_NAME(); +} + +static struct ftrace_ops test_rec_probe = { + .func = trace_selftest_test_recursion_func, +}; + +static struct ftrace_ops test_recsafe_probe = { + .func = trace_selftest_test_recursion_safe_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE, +}; + +static int +trace_selftest_function_recursion(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int cnt; + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion: "); + + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_rec_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_rec_probe); + + ret = -1; + if (trace_selftest_recursion_cnt != 1) { + pr_cont("*callback not called once (%d)* ", + trace_selftest_recursion_cnt); + goto out; + } + + trace_selftest_recursion_cnt = 1; + + pr_cont("PASSED\n"); + pr_info("Testing ftrace recursion safe: "); + + ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); + if (ret) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_recsafe_probe); + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_recsafe_probe); + + /* + * If arch supports all ftrace features, and no other task + * was on the list, we should be fine. + */ + if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) + cnt = 2; /* Should have recursed */ + else + cnt = 1; + + ret = -1; + if (trace_selftest_recursion_cnt != cnt) { + pr_cont("*callback not called expected %d times (%d)* ", + cnt, trace_selftest_recursion_cnt); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +# define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ /* @@ -455,7 +588,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ret = trace_selftest_startup_dynamic_tracing(trace, tr, DYN_FTRACE_TEST_NAME); + if (ret) + goto out; + ret = trace_selftest_function_recursion(); out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; -- cgit v1.2.3 From ad97772ad82f57c83968079d0880c71ab126ab04 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Jul 2012 13:45:59 -0400 Subject: ftrace: Add selftest to test function save-regs support Add selftests to test the save-regs functionality of ftrace. If the arch supports saving regs, then it will make sure that regs is at least not NULL in the callback. If the arch does not support saving regs, it makes sure that the registering of the ftrace_ops that requests saving regs fails. It then tests the registering of the ftrace_ops succeeds if the 'IF_SUPPORTED' flag is set. Then it makes sure that the regs passed to the function is NULL. 
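As a usage sketch only (not taken from the patch), this is roughly how a caller could ask for saved registers while still loading on architectures without support, which is the combination the selftest above verifies. The names are hypothetical and the callback signature assumes the pt_regs-passing interface added in this series.

#include <linux/module.h>
#include <linux/ftrace.h>

static unsigned long my_regs_seen;

static void my_regs_callback(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct pt_regs *regs)
{
	/* regs is NULL when the arch could not save a full pt_regs */
	if (regs)
		my_regs_seen++;
}

static struct ftrace_ops my_regs_ops = {
	.func	= my_regs_callback,
	/* IF_SUPPORTED: registration succeeds even without arch support,
	 * in which case the callback simply receives regs == NULL. */
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE |
		  FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED,
};

static int __init my_regs_init(void)
{
	return register_ftrace_function(&my_regs_ops);
}

static void __exit my_regs_exit(void)
{
	unregister_ftrace_function(&my_regs_ops);
}

module_init(my_regs_init);
module_exit(my_regs_exit);
MODULE_LICENSE("GPL");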
Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_selftest.c | 114 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..593debefc4e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +#endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -#endif extern int ring_buffer_expanded; extern bool tracing_selftest_disabled; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 86422f91dbe1..1003a4d5eb25 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -543,6 +543,116 @@ out: # define trace_selftest_function_recursion() ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ +static enum { + TRACE_SELFTEST_REGS_START, + TRACE_SELFTEST_REGS_FOUND, + TRACE_SELFTEST_REGS_NOT_FOUND, +} trace_selftest_regs_stat; + +static void trace_selftest_test_regs_func(unsigned long ip, + unsigned long pip, + struct ftrace_ops *op, + struct pt_regs *pt_regs) +{ + if (pt_regs) + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; + else + trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; +} + +static struct ftrace_ops test_regs_probe = { + .func = trace_selftest_test_regs_func, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, +}; + +static int +trace_selftest_function_regs(void) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + char *func_name; + int len; + int ret; + int supported = 0; + +#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS + supported = 1; +#endif + + /* The previous test PASSED */ + pr_cont("PASSED\n"); + pr_info("Testing ftrace regs%s: ", + !supported ? "(no arch support)" : ""); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* Handle PPC64 '.' name */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + len = strlen(func_name); + + ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); + /* + * If DYNAMIC_FTRACE is not set, then we just trace all functions. + * This test really doesn't care. + */ + if (ret && ret != -ENODEV) { + pr_cont("*Could not set filter* "); + goto out; + } + + ret = register_ftrace_function(&test_regs_probe); + /* + * Now if the arch does not support passing regs, then this should + * have failed. 
+ */ + if (!supported) { + if (!ret) { + pr_cont("*registered save-regs without arch support* "); + goto out; + } + test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; + ret = register_ftrace_function(&test_regs_probe); + } + if (ret) { + pr_cont("*could not register callback* "); + goto out; + } + + + DYN_FTRACE_TEST_NAME(); + + unregister_ftrace_function(&test_regs_probe); + + ret = -1; + + switch (trace_selftest_regs_stat) { + case TRACE_SELFTEST_REGS_START: + pr_cont("*callback never called* "); + goto out; + + case TRACE_SELFTEST_REGS_FOUND: + if (supported) + break; + pr_cont("*callback received regs without arch support* "); + goto out; + + case TRACE_SELFTEST_REGS_NOT_FOUND: + if (!supported) + break; + pr_cont("*callback received NULL regs* "); + goto out; + } + + ret = 0; +out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + return ret; +} + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -592,6 +702,10 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) goto out; ret = trace_selftest_function_recursion(); + if (ret) + goto out; + + ret = trace_selftest_function_regs(); out: ftrace_enabled = save_ftrace_enabled; tracer_enabled = save_tracer_enabled; -- cgit v1.2.3 From 647664eaf4033501739ac1f42dd52ce8c9266ccc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:08 +0900 Subject: ftrace: add ftrace_set_filter_ip() for address based filter Add a new filter update interface ftrace_set_filter_ip() to set ftrace filter by ip address, not only glob pattern. Link: http://lkml.kernel.org/r/20120605102808.27845.67952.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. 
Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +++ kernel/trace/ftrace.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9962e954a633..3e711122f66c 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -317,6 +317,8 @@ struct dyn_ftrace { }; int ftrace_force_update(void); +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset); int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, @@ -544,6 +546,7 @@ static inline int ftrace_text_reserved(void *start, void *end) */ #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) +#define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; }) #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_free_filter(ops) do { } while (0) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 528d997c7f99..9dcf15d38380 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3242,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, } static int -ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, - int reset, int enable) +ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) +{ + struct ftrace_func_entry *entry; + + if (!ftrace_location(ip)) + return -EINVAL; + + if (remove) { + entry = ftrace_lookup_ip(hash, ip); + if (!entry) + return -ENOENT; + free_hash_entry(hash, entry); + return 0; + } + + return add_hash_entry(hash, ip); +} + +static int +ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, + unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; struct ftrace_hash *hash; @@ -3272,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, ret = -EINVAL; goto out_regex_unlock; } + if (ip) { + ret = ftrace_match_addr(hash, ip, remove); + if (ret < 0) + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3288,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, return ret; } +static int +ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, + int reset, int enable) +{ + return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); +} + +/** + * ftrace_set_filter_ip - set a function to filter on in ftrace by address + * @ops - the ops to set the filter with + * @ip - the address to add to or remove from the filter. + * @remove - non zero to remove the ip from the filter + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled + * If @ip is NULL, it failes to update filter. 
+ */ +int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, + int remove, int reset) +{ + return ftrace_set_addr(ops, ip, remove, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); +} + /** * ftrace_set_filter - set a function to filter on in ftrace * @ops - the ops to set the filter with -- cgit v1.2.3 From 72ef3794c5cd5f5f0e6355c24a529224c449cd14 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jun 2012 19:28:14 +0900 Subject: kprobes: Inverse taking of module_mutex with kprobe_mutex Currently module_mutex is taken before kprobe_mutex, but this can cause issues when we have kprobes register ftrace, as the ftrace mutex is taken before enabling a tracepoint, which currently takes the module mutex. If module_mutex is taken before kprobe_mutex, then we can not have kprobes use the ftrace infrastructure. There seems to be no reason that the kprobe_mutex can't be taken before the module_mutex. Running lockdep shows that it is safe among the kernels I've run. Link: http://lkml.kernel.org/r/20120605102814.27845.21047.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Cc: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c62b8546cc90..7a8a1222c7b1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) { LIST_HEAD(free_list); + mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); - mutex_lock(&kprobe_mutex); /* * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) @@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Step 4: Free cleaned kprobes after quiesence period */ do_free_cleaned_kprobes(&free_list); - mutex_unlock(&kprobe_mutex); mutex_unlock(&module_mutex); + mutex_unlock(&kprobe_mutex); /* Step 5: Kick optimizer again if needed */ if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) -- cgit v1.2.3 From f7fa6ef0ded995aad68650a877198f70e44b7621 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:20 +0900 Subject: kprobes: cleanup to separate probe-able check Separate probe-able address checking code from register_kprobe(). Link: http://lkml.kernel.org/r/20120605102820.27845.90133.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. 
Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 82 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7a8a1222c7b1..6137fe32b4b8 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1313,67 +1313,80 @@ static inline int check_kprobe_rereg(struct kprobe *p) return ret; } -int __kprobes register_kprobe(struct kprobe *p) +static __kprobes int check_kprobe_address_safe(struct kprobe *p, + struct module **probed_mod) { int ret = 0; - struct kprobe *old_p; - struct module *probed_mod; - kprobe_opcode_t *addr; - - addr = kprobe_addr(p); - if (IS_ERR(addr)) - return PTR_ERR(addr); - p->addr = addr; - - ret = check_kprobe_rereg(p); - if (ret) - return ret; jump_label_lock(); preempt_disable(); + + /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; - goto cannot_probe; + goto out; } - /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ - p->flags &= KPROBE_FLAG_DISABLED; - - /* - * Check if are we probing a module. - */ - probed_mod = __module_text_address((unsigned long) p->addr); - if (probed_mod) { - /* Return -ENOENT if fail. */ - ret = -ENOENT; + /* Check if are we probing a module */ + *probed_mod = __module_text_address((unsigned long) p->addr); + if (*probed_mod) { /* * We must hold a refcount of the probed module while updating * its code to prohibit unexpected unloading. */ - if (unlikely(!try_module_get(probed_mod))) - goto cannot_probe; + if (unlikely(!try_module_get(*probed_mod))) { + ret = -ENOENT; + goto out; + } /* * If the module freed .init.text, we couldn't insert * kprobes in there. */ - if (within_module_init((unsigned long)p->addr, probed_mod) && - probed_mod->state != MODULE_STATE_COMING) { - module_put(probed_mod); - goto cannot_probe; + if (within_module_init((unsigned long)p->addr, *probed_mod) && + (*probed_mod)->state != MODULE_STATE_COMING) { + module_put(*probed_mod); + *probed_mod = NULL; + ret = -ENOENT; } - /* ret will be updated by following code */ } +out: preempt_enable(); jump_label_unlock(); + return ret; +} + +int __kprobes register_kprobe(struct kprobe *p) +{ + int ret; + struct kprobe *old_p; + struct module *probed_mod; + kprobe_opcode_t *addr; + + /* Adjust probe address from symbol */ + addr = kprobe_addr(p); + if (IS_ERR(addr)) + return PTR_ERR(addr); + p->addr = addr; + + ret = check_kprobe_rereg(p); + if (ret) + return ret; + + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; p->nmissed = 0; INIT_LIST_HEAD(&p->list); - mutex_lock(&kprobe_mutex); + ret = check_kprobe_address_safe(p, &probed_mod); + if (ret) + return ret; + + mutex_lock(&kprobe_mutex); jump_label_lock(); /* needed to call jump_label_text_reserved() */ get_online_cpus(); /* For avoiding text_mutex deadlock. 
*/ @@ -1410,11 +1423,6 @@ out: module_put(probed_mod); return ret; - -cannot_probe: - preempt_enable(); - jump_label_unlock(); - return ret; } EXPORT_SYMBOL_GPL(register_kprobe); -- cgit v1.2.3 From 25764288d8dc4792f0f487baf043ccfee5d8c2ba Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:26 +0900 Subject: kprobes: Move locks into appropriate functions Break a big critical region into fine-grained pieces at registering kprobe path. This helps us to solve circular locking dependency when introducing ftrace-based kprobes. Link: http://lkml.kernel.org/r/20120605102826.27845.81689.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/kprobes.c | 63 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6137fe32b4b8..9e47f44f3531 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -759,20 +759,28 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + mutex_lock(&text_mutex); + ap = alloc_aggr_kprobe(p); if (!ap) - return; + goto out; op = container_of(ap, struct optimized_kprobe, kp); if (!arch_prepared_optinsn(&op->optinsn)) { /* If failed to setup optimizing, fallback to kprobe */ arch_remove_optimized_kprobe(op); kfree(op); - return; + goto out; } init_aggr_kprobe(ap, p); - optimize_kprobe(ap); + optimize_kprobe(ap); /* This just kicks optimizer thread */ + +out: + mutex_unlock(&text_mutex); + jump_label_unlock(); } #ifdef CONFIG_SYSCTL @@ -1144,12 +1152,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) if (p->post_handler && !ap->post_handler) ap->post_handler = aggr_post_handler; - if (kprobe_disabled(ap) && !kprobe_disabled(p)) { - ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) - /* Arm the breakpoint again. */ - __arm_kprobe(ap); - } return 0; } @@ -1189,11 +1191,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, int ret = 0; struct kprobe *ap = orig_p; + /* For preparing optimization, jump_label_text_reserved() is called */ + jump_label_lock(); + /* + * Get online CPUs to avoid text_mutex deadlock.with stop machine, + * which is invoked by unoptimize_kprobe() in add_new_kprobe() + */ + get_online_cpus(); + mutex_lock(&text_mutex); + if (!kprobe_aggrprobe(orig_p)) { /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ ap = alloc_aggr_kprobe(orig_p); - if (!ap) - return -ENOMEM; + if (!ap) { + ret = -ENOMEM; + goto out; + } init_aggr_kprobe(ap, orig_p); } else if (kprobe_unused(ap)) /* This probe is going to die. Rescue it */ @@ -1213,7 +1226,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, * free aggr_probe. It will be used next time, or * freed by unregister_kprobe. */ - return ret; + goto out; /* Prepare optimized instructions if possible. 
*/ prepare_optimized_kprobe(ap); @@ -1228,7 +1241,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, /* Copy ap's insn slot to p */ copy_kprobe(ap, p); - return add_new_kprobe(ap, p); + ret = add_new_kprobe(ap, p); + +out: + mutex_unlock(&text_mutex); + put_online_cpus(); + jump_label_unlock(); + + if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arm_kprobe(ap); + } + return ret; } static int __kprobes in_kprobes_functions(unsigned long addr) @@ -1387,10 +1413,6 @@ int __kprobes register_kprobe(struct kprobe *p) return ret; mutex_lock(&kprobe_mutex); - jump_label_lock(); /* needed to call jump_label_text_reserved() */ - - get_online_cpus(); /* For avoiding text_mutex deadlock. */ - mutex_lock(&text_mutex); old_p = get_kprobe(p->addr); if (old_p) { @@ -1399,7 +1421,9 @@ int __kprobes register_kprobe(struct kprobe *p) goto out; } + mutex_lock(&text_mutex); /* Avoiding text modification */ ret = arch_prepare_kprobe(p); + mutex_unlock(&text_mutex); if (ret) goto out; @@ -1408,15 +1432,12 @@ int __kprobes register_kprobe(struct kprobe *p) &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); /* Try to optimize kprobe */ try_to_optimize_kprobe(p); out: - mutex_unlock(&text_mutex); - put_online_cpus(); - jump_label_unlock(); mutex_unlock(&kprobe_mutex); if (probed_mod) -- cgit v1.2.3 From ae6aa16fdc163afe6b04b6c073ad4ddd4663c03b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:32 +0900 Subject: kprobes: introduce ftrace based optimization Introduce function trace based kprobes optimization. With ftrace optimization, kprobes on an mcount calling address use ftrace's mcount call instead of a breakpoint. Furthermore, this optimization works with preemptive kernels, unlike the current jump-based optimization. Of course, this feature works only if the probe is on an mcount call. If kprobe.break_handler is set, that probe is not optimized with ftrace (nor put on ftrace). The reason for this limitation is that break_handler may be used only from jprobes, which change the ip address (for fetching the function arguments), while the function tracer ignores a modified ip address. Changes in v2: - Fix ftrace_ops registering right after setting its filter. - Unregister ftrace_ops if no kprobe is using it. - Remove notrace dependency from __kprobes macro. Link: http://lkml.kernel.org/r/20120605102832.27845.63461.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/kprobes.h | 27 +++++++++++ kernel/kprobes.c | 105 ++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 119 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index b6e1f8c00577..aa0d05e852e3 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -38,6 +38,7 @@ #include #include #include +#include #ifdef CONFIG_KPROBES #include @@ -48,14 +49,26 @@ #define KPROBE_REENTER 0x00000004 #define KPROBE_HIT_SSDONE 0x00000008 +/* + * If function tracer is enabled and the arch supports full + * passing of pt_regs to function tracing, then kprobes can + * optimize on top of function tracing.
+ */ +#if defined(CONFIG_FUNCTION_TRACER) && defined(ARCH_SUPPORTS_FTRACE_SAVE_REGS) \ + && defined(ARCH_SUPPORTS_KPROBES_ON_FTRACE) +# define KPROBES_CAN_USE_FTRACE +#endif + /* Attach to insert probes on any functions which should be ignored*/ #define __kprobes __attribute__((__section__(".kprobes.text"))) + #else /* CONFIG_KPROBES */ typedef int kprobe_opcode_t; struct arch_specific_insn { int dummy; }; #define __kprobes + #endif /* CONFIG_KPROBES */ struct kprobe; @@ -128,6 +141,7 @@ struct kprobe { * NOTE: * this flag is only for optimized_kprobe. */ +#define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ /* Has this kprobe gone ? */ static inline int kprobe_gone(struct kprobe *p) @@ -146,6 +160,13 @@ static inline int kprobe_optimized(struct kprobe *p) { return p->flags & KPROBE_FLAG_OPTIMIZED; } + +/* Is this kprobe uses ftrace ? */ +static inline int kprobe_ftrace(struct kprobe *p) +{ + return p->flags & KPROBE_FLAG_FTRACE; +} + /* * Special probe type that uses setjmp-longjmp type tricks to resume * execution at a specified entry with a matching prototype corresponding @@ -295,6 +316,12 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, #endif #endif /* CONFIG_OPTPROBES */ +#ifdef KPROBES_CAN_USE_FTRACE +extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct pt_regs *regs); +extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +#endif + /* Get the kprobe at this addr (if any) - called with preemption disabled */ struct kprobe *get_kprobe(void *addr); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e47f44f3531..69c16efc315b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -759,6 +759,10 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) struct kprobe *ap; struct optimized_kprobe *op; + /* Impossible to optimize ftrace-based kprobe */ + if (kprobe_ftrace(p)) + return; + /* For preparing optimization, jump_label_text_reserved() is called */ jump_label_lock(); mutex_lock(&text_mutex); @@ -915,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) } #endif /* CONFIG_OPTPROBES */ +#ifdef KPROBES_CAN_USE_FTRACE +static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { + .regs_func = kprobe_ftrace_handler, + .flags = FTRACE_OPS_FL_SAVE_REGS, +}; +static int kprobe_ftrace_enabled; + +/* Must ensure p->addr is really on ftrace */ +static int __kprobes prepare_kprobe(struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + return arch_prepare_kprobe(p); + + return arch_prepare_kprobe_ftrace(p); +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes arm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 0, 0); + WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + kprobe_ftrace_enabled++; + if (kprobe_ftrace_enabled == 1) { + ret = register_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } +} + +/* Caller must lock kprobe_mutex */ +static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) +{ + int ret; + + kprobe_ftrace_enabled--; + if (kprobe_ftrace_enabled == 0) { + ret = unregister_ftrace_function(&kprobe_ftrace_ops); + WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + } + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, + (unsigned long)p->addr, 1, 0); + WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); +} +#else /* !KPROBES_CAN_USE_FTRACE */ +#define prepare_kprobe(p) 
arch_prepare_kprobe(p) +#define arm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) do {} while (0) +#endif + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + if (unlikely(kprobe_ftrace(kp))) { + arm_kprobe_ftrace(kp); + return; + } /* * Here, since __arm_kprobe() doesn't use stop_machine(), * this doesn't cause deadlock on text_mutex. So, we don't @@ -929,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void __kprobes disarm_kprobe(struct kprobe *kp) +static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) { + if (unlikely(kprobe_ftrace(kp))) { + disarm_kprobe_ftrace(kp); + return; + } /* Ditto */ mutex_lock(&text_mutex); - __disarm_kprobe(kp, true); + __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); } @@ -1343,6 +1406,26 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { int ret = 0; + unsigned long ftrace_addr; + + /* + * If the address is located on a ftrace nop, set the + * breakpoint to the following instruction. + */ + ftrace_addr = ftrace_location((unsigned long)p->addr); + if (ftrace_addr) { +#ifdef KPROBES_CAN_USE_FTRACE + /* Given address is not on the instruction boundary */ + if ((unsigned long)p->addr != ftrace_addr) + return -EILSEQ; + /* break_handler (jprobe) can not work with ftrace */ + if (p->break_handler) + return -EINVAL; + p->flags |= KPROBE_FLAG_FTRACE; +#else /* !KPROBES_CAN_USE_FTRACE */ + return -EINVAL; +#endif + } jump_label_lock(); preempt_disable(); @@ -1350,7 +1433,6 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Ensure it is not in reserved area nor out of text */ if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr) || jump_label_text_reserved(p->addr, p->addr)) { ret = -EINVAL; goto out; @@ -1422,7 +1504,7 @@ int __kprobes register_kprobe(struct kprobe *p) } mutex_lock(&text_mutex); /* Avoiding text modification */ - ret = arch_prepare_kprobe(p); + ret = prepare_kprobe(p); mutex_unlock(&text_mutex); if (ret) goto out; @@ -1480,7 +1562,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { - disarm_kprobe(orig_p); + disarm_kprobe(orig_p, true); orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -2078,10 +2160,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, if (!pp) pp = p; - seq_printf(pi, "%s%s%s\n", + seq_printf(pi, "%s%s%s%s\n", (kprobe_gone(p) ? "[GONE]" : ""), ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), - (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); + (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), + (kprobe_ftrace(pp) ? 
"[FTRACE]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -2160,14 +2243,12 @@ static void __kprobes arm_all_kprobes(void) goto already_enabled; /* Arming kprobes doesn't optimize kprobe itself */ - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - __arm_kprobe(p); + arm_kprobe(p); } - mutex_unlock(&text_mutex); kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); @@ -2195,15 +2276,13 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); - mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - __disarm_kprobe(p, false); + disarm_kprobe(p, false); } } - mutex_unlock(&text_mutex); mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ -- cgit v1.2.3 From e52538965119319447c0800c534da73142c27be2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 5 Jun 2012 19:28:38 +0900 Subject: kprobes/x86: ftrace based optimization for x86 Add function tracer based kprobe optimization support handlers on x86. This allows kprobes to use function tracer for probing on mcount call. Link: http://lkml.kernel.org/r/20120605102838.27845.26317.stgit@localhost.localdomain Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ananth N Mavinakayanahalli Cc: "Frank Ch. Eigler" Cc: Andrew Morton Cc: Frederic Weisbecker Signed-off-by: Masami Hiramatsu [ Updated to new port of ftrace save regs functions ] Signed-off-by: Steven Rostedt --- arch/x86/include/asm/kprobes.h | 1 + arch/x86/kernel/kprobes.c | 48 ++++++++++++++++++++++++++++++++++++++++++ include/linux/kprobes.h | 2 +- kernel/kprobes.c | 2 +- 4 files changed, 51 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 547882539157..d3ddd17405d0 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -27,6 +27,7 @@ #include #define __ARCH_WANT_KPROBES_INSN_SLOT +#define ARCH_SUPPORTS_KPROBES_ON_FTRACE struct pt_regs; struct kprobe; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e2f751efb7b1..47ae1023a93c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1052,6 +1052,54 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } +#ifdef KPROBES_CAN_USE_FTRACE +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + struct kprobe *p; + struct kprobe_ctlblk *kcb; + unsigned long flags; + + /* Disable irq for emulating a breakpoint and avoiding preempt */ + local_irq_save(flags); + + p = get_kprobe((kprobe_opcode_t *)ip); + if (unlikely(!p) || kprobe_disabled(p)) + goto end; + + kcb = get_kprobe_ctlblk(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(p); + } else { + regs->ip += sizeof(kprobe_opcode_t); + + __this_cpu_write(current_kprobe, p); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + if (p->pre_handler) + p->pre_handler(p, regs); + + if (unlikely(p->post_handler)) { + /* Emulate singlestep as if there is a 5byte nop */ + regs->ip = ip + MCOUNT_INSN_SIZE; + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); 
+ } + __this_cpu_write(current_kprobe, NULL); + regs->ip = ip; /* Recover for next callback */ + } +end: + local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ + p->ainsn.insn = NULL; + p->ainsn.boostable = -1; + return 0; +} +#endif + int __init arch_init_kprobes(void) { return arch_init_optprobes(); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index aa0d05e852e3..23755ba42abc 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -318,7 +318,7 @@ extern int proc_kprobes_optimization_handler(struct ctl_table *table, #endif /* CONFIG_OPTPROBES */ #ifdef KPROBES_CAN_USE_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct pt_regs *regs); + struct ftrace_ops *ops, struct pt_regs *regs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); #endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 69c16efc315b..35b4315d84f5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -921,7 +921,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) #ifdef KPROBES_CAN_USE_FTRACE static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { - .regs_func = kprobe_ftrace_handler, + .func = kprobe_ftrace_handler, .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int kprobe_ftrace_enabled; -- cgit v1.2.3 From 3c18c10bde65b6dcaffab7a4d040285e4defa49b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 31 Jul 2012 10:23:37 -0400 Subject: tracing: Fix wakeup_rt self test on virtual machines The wakeup_rt self test used msleep() calls to wait for real time tasks to wake up and run. On bare-metal hardware, this was enough as the scheduler should let the RT task run well before the non-RT task wakes up from the msleep(). If it did not, then that would mean the scheduler was broken. But when dealing with virtual machines, this is a different story. If the RT task wakes up on a VCPU, it's up to the host to decide when that task gets to schedule, which can be far behind the time that the non-RT task wakes up. In this case, the test would fail incorrectly. As we are not testing the scheduler, but instead the wakeup tracing, we can use completions to wait and not depend on scheduler timings to see if events happen on time. Link: http://lkml.kernel.org/r/1343663105.3847.7.camel@fedora Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace_selftest.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 1003a4d5eb25..2c00a691a540 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -1041,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data) set_current_state(TASK_INTERRUPTIBLE); schedule(); + complete(x); + /* we are awake, now wait to disappear */ while (!kthread_should_stop()) { /* @@ -1084,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* reset the max latency */ tracing_max_latency = 0; - /* sleep to let the RT thread sleep too */ - msleep(100); + while (p->on_rq) { + /* + * Sleep to make sure the RT thread is asleep too. + * On virtual machines we can't rely on timings, + * but we want to make sure this test still works. + */ + msleep(100); + } - /* - * Yes this is slightly racy.
It is possible that for some - * strange reason that the RT thread we created, did not - * call schedule for 100ms after doing the completion, - * and we do a wakeup on a task that already is awake. - * But that is extremely unlikely, and the worst thing that - * happens in such a case, is that we disable tracing. - * Honestly, if this race does happen something is horrible - * wrong with the system. - */ + init_completion(&isrt); wake_up_process(p); - /* give a little time to let the thread wake up */ - msleep(100); + /* Wait for the task to wake up */ + wait_for_completion(&isrt); /* stop the tracing. */ tracing_stop(); -- cgit v1.2.3 From 92d8d4a8b0f4c6eba70f6e62b48e38bd005a56e6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 19 Jun 2012 17:47:52 +0200 Subject: tracing/filter: Add missing initialization Add missing initialization for ret variable. Its initialization is based on the re_cnt variable, which is being set deep down in the ftrace_function_filter_re function. I'm not sure compilers would be smart enough to see this in near future, so killing the warning this way. Link: http://lkml.kernel.org/r/1340120894-9465-2-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..c154797a7ff7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, static int __ftrace_function_set_filter(int filter, char *buf, int len, struct function_filter_data *data) { - int i, re_cnt, ret; + int i, re_cnt, ret = -EINVAL; int *reset; char **re; -- cgit v1.2.3 From 87abb3b15c62033409f5bf2ffb5620c94f91cf2c Mon Sep 17 00:00:00 2001 From: Wang Tianhong Date: Thu, 2 Aug 2012 14:02:00 +0800 Subject: tracing/trivial: Fix some typos in kernel/trace Fix some typos in kernel/trace. Link: http://lkml.kernel.org/r/1343887320.2228.9.camel@louis-ThinkPad-T410 Signed-off-by: Wang Tianhong Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..b32ed0e385a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); * to the buffer after this will fail and return NULL. * * This is different than ring_buffer_record_disable() as - * it works like an on/off switch, where as the disable() verison + * it works like an on/off switch, where as the disable() version * must be paired with a enable(). */ void ring_buffer_record_off(struct ring_buffer *buffer) @@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); * ring_buffer_record_off(). * * This is different than ring_buffer_record_enable() as - * it works like an on/off switch, where as the enable() verison + * it works like an on/off switch, where as the enable() version * must be paired with a disable(). 
*/ void ring_buffer_record_on(struct ring_buffer *buffer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a120f98c4112..d1a8d07ec866 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size); static int __init set_tracing_thresh(char *str) { - unsigned long threshhold; + unsigned long threshold; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &threshhold); + ret = strict_strtoul(str, 0, &threshold); if (ret < 0) return 0; - tracing_thresh = threshhold * 1000; + tracing_thresh = threshold * 1000; return 1; } __setup("tracing_thresh=", set_tracing_thresh); -- cgit v1.2.3 From 4018994f3d8785275ef0e7391b75c3462c029e56 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:37 +0200 Subject: perf: Add ability to attach user level registers dump to sample Introducing PERF_SAMPLE_REGS_USER sample type bit to trigger the dump of user level registers on sample. Registers we want to dump are specified by sample_regs_user bitmask. Only user level registers are dumped at the moment. Meaning the register values of the user space context as it was before the user entered the kernel for whatever reason (syscall, irq, exception, or a PMI happening in userspace). The layout of the sample_regs_user bitmap is described in asm/perf_regs.h for archs that support register dump. This is going to be useful to bring Dwarf CFI based stack unwinding on top of samples. Original-patch-by: Frederic Weisbecker [ Dump registers ABI specification. ] Signed-off-by: Jiri Olsa Suggested-by: Stephane Eranian Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-3-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/perf_regs.c | 15 +++++++++++ include/linux/perf_event.h | 35 +++++++++++++++++++++--- include/linux/perf_regs.h | 6 +++++ kernel/events/core.c | 66 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 119 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 3d6923528b1c..c5a3e5cfe07f 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include #include @@ -71,6 +73,11 @@ int perf_reg_validate(u64 mask) return 0; } + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} #else /* CONFIG_X86_64 */ #define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ (1ULL << PERF_REG_X86_ES) | \ @@ -87,4 +94,12 @@ int perf_reg_validate(u64 mask) return 0; } + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} #endif /* CONFIG_X86_32 */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7602ccb3f40e..3d4d84745f07 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -130,8 +130,9 @@ enum perf_event_sample_format { PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_BRANCH_STACK = 1U << 11, + PERF_SAMPLE_REGS_USER = 1U << 12, - PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 13, /* non-ABI */ }; /* @@ -162,6 
+163,15 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_KERNEL|\ PERF_SAMPLE_BRANCH_HV) +/* + * Values to determine ABI of the registers dump. + */ +enum perf_sample_regs_abi { + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, +}; + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -194,6 +204,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ #define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ #define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 88 /* add: sample_regs_user */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -271,7 +282,13 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; - __u64 branch_sample_type; /* enum branch_sample_type */ + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + + /* + * Defines set of user regs to dump on samples. + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_user; }; /* @@ -548,6 +565,9 @@ enum perf_event_type { * char data[size];}&& PERF_SAMPLE_RAW * * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK + * + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER * }; */ PERF_RECORD_SAMPLE = 9, @@ -609,6 +629,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -654,6 +675,11 @@ struct perf_branch_stack { struct perf_branch_entry entries[0]; }; +struct perf_regs_user { + __u64 abi; + struct pt_regs *regs; +}; + struct task_struct; /* @@ -1133,6 +1159,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; + struct perf_regs_user regs_user; }; static inline void perf_sample_data_init(struct perf_sample_data *data, @@ -1142,7 +1169,9 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->br_stack = NULL; - data->period = period; + data->period = period; + data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE; + data->regs_user.regs = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index a2f1a98f7839..3c73d5fe18be 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -5,6 +5,7 @@ #include u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); +u64 perf_reg_abi(struct task_struct *task); #else static inline u64 perf_reg_value(struct pt_regs *regs, int idx) { @@ -15,5 +16,10 @@ static inline int perf_reg_validate(u64 mask) { return mask ? 
-ENOSYS : 0; } + +static inline u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_NONE; +} #endif /* CONFIG_HAVE_PERF_REGS */ #endif /* _LINUX_PERF_REGS_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index b7935fcec7d9..d3ce97525b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3756,6 +3756,37 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +static void +perf_output_sample_regs(struct perf_output_handle *handle, + struct pt_regs *regs, u64 mask) +{ + int bit; + + for_each_set_bit(bit, (const unsigned long *) &mask, + sizeof(mask) * BITS_PER_BYTE) { + u64 val; + + val = perf_reg_value(regs, bit); + perf_output_put(handle, val); + } +} + +static void perf_sample_regs_user(struct perf_regs_user *regs_user, + struct pt_regs *regs) +{ + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + regs_user->regs = regs; + regs_user->abi = perf_reg_abi(current); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4016,6 +4047,23 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, nr); } } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + u64 abi = data->regs_user.abi; + + /* + * If there are no regs to dump, notice it through + * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). + */ + perf_output_put(handle, abi); + + if (abi) { + u64 mask = event->attr.sample_regs_user; + perf_output_sample_regs(handle, + data->regs_user.regs, + mask); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -4067,6 +4115,20 @@ void perf_prepare_sample(struct perf_event_header *header, } header->size += size; } + + if (sample_type & PERF_SAMPLE_REGS_USER) { + /* regs dump ABI info */ + int size = sizeof(u64); + + perf_sample_regs_user(&data->regs_user, regs); + + if (data->regs_user.regs) { + u64 mask = event->attr.sample_regs_user; + size += hweight64(mask) * sizeof(u64); + } + + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6142,6 +6204,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, attr->branch_sample_type = mask; } } + + if (attr->sample_type & PERF_SAMPLE_REGS_USER) + ret = perf_reg_validate(attr->sample_regs_user); + out: return ret; -- cgit v1.2.3 From 91d7753a45f8525dc75b6be01e427dc1c378dc16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Aug 2012 15:20:38 +0200 Subject: perf: Factor __output_copy to be usable with specific copy function Adding a generic way to use __output_copy function with specific copy function via DEFINE_PERF_OUTPUT_COPY macro. Using this to add new __output_copy_user function, that provides output copy from user pointers. For x86 the copy_from_user_nmi function is used and __copy_from_user_inatomic for the rest of the architectures. This new function will be used in user stack dump on sample, coming in next patches. Signed-off-by: Jiri Olsa Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. 
Eigler Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-4-git-send-email-jolsa@redhat.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/asm/perf_event.h | 2 ++ include/linux/perf_event.h | 2 +- kernel/events/internal.h | 62 ++++++++++++++++++++++++++------------- kernel/events/ring_buffer.c | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cb4e43bce98a..4fabcdf1cfa7 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -262,4 +262,6 @@ static inline void perf_check_microcode(void) { } static inline void amd_pmu_disable_virt(void) { } #endif +#define arch_perf_out_copy_user copy_from_user_nmi + #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3d4d84745f07..d41394a1af36 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1319,7 +1319,7 @@ static inline bool has_branch_stack(struct perf_event *event) extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); -extern void perf_output_copy(struct perf_output_handle *handle, +extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..7fd5408493d2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -2,6 +2,7 @@ #define _KERNEL_EVENTS_INTERNAL_H #include +#include /* Buffer handling */ @@ -76,30 +77,49 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } -static inline void -__output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned int \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned int len) \ +{ \ + unsigned long size, written; \ + \ + do { \ + size = min_t(unsigned long, handle->size, len); \ + \ + written = memcpy_func(handle->addr, buf, size); \ + \ + len -= written; \ + handle->addr += written; \ + buf += written; \ + handle->size -= written; \ + if (!handle->size) { \ + struct ring_buffer *rb = handle->rb; \ + \ + handle->page++; \ + handle->page &= rb->nr_pages - 1; \ + handle->addr = rb->data_pages[handle->page]; \ + handle->size = PAGE_SIZE << page_order(rb); \ + } \ + } while (len && written == size); \ + \ + return len; \ +} + +static inline int memcpy_common(void *dst, const void *src, size_t n) { - do { - unsigned long size = min_t(unsigned long, handle->size, len); - - memcpy(handle->addr, buf, size); - - len -= size; - handle->addr += size; - buf += size; - handle->size -= size; - if (!handle->size) { - struct ring_buffer *rb = handle->rb; - - handle->page++; - handle->page &= rb->nr_pages - 1; - handle->addr = rb->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(rb); - } - } while (len); + memcpy(dst, src, n); + return n; } +DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) + +#ifndef 
arch_perf_out_copy_user +#define arch_perf_out_copy_user __copy_from_user_inatomic +#endif + +DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) + /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..b4c2ad3dee7a 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -182,10 +182,10 @@ out: return -ENOSPC; } -void perf_output_copy(struct perf_output_handle *handle, +unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - __output_copy(handle, buf, len); + return __output_copy(handle, buf, len); } void perf_output_end(struct perf_output_handle *handle) -- cgit v1.2.3 From 5685e0ff45f5df67e79e9b052b6ffd501ff38c11 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:39 +0200 Subject: perf: Add perf_output_skip function to skip bytes in sample Introducing perf_output_skip function to be able to skip data within the perf ring buffer. When writing data into perf ring buffer we first reserve needed place in ring buffer and then copy the actual data. There's a possibility we won't be able to fill all the reserved size with data, so we need a way to skip the remaining bytes. This is going to be useful when storing the user stack dump, where we might end up with less data than we originally requested. Signed-off-by: Jiri Olsa Acked-by: Frederic Weisbecker Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-5-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 ++ kernel/events/internal.h | 4 ++++ kernel/events/ring_buffer.c | 6 ++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d41394a1af36..8a73f75beb16 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1321,6 +1321,8 @@ extern int perf_output_begin(struct perf_output_handle *handle, extern void perf_output_end(struct perf_output_handle *handle); extern unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); +extern unsigned int perf_output_skip(struct perf_output_handle *handle, + unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); extern void perf_event_enable(struct perf_event *event); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 7fd5408493d2..ce7bdfc1d045 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -114,6 +114,10 @@ static inline int memcpy_common(void *dst, const void *src, size_t n) DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) +#define MEMCPY_SKIP(dst, src, n) (n) + +DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) + #ifndef arch_perf_out_copy_user #define arch_perf_out_copy_user __copy_from_user_inatomic #endif diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b4c2ad3dee7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -188,6 +188,12 @@ unsigned int perf_output_copy(struct 
perf_output_handle *handle, return __output_copy(handle, buf, len); } +unsigned int perf_output_skip(struct perf_output_handle *handle, + unsigned int len) +{ + return __output_skip(handle, NULL, len); +} + void perf_output_end(struct perf_output_handle *handle) { perf_output_put_handle(handle); -- cgit v1.2.3 From c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Aug 2012 15:20:40 +0200 Subject: perf: Add ability to attach user stack dump to sample Introducing PERF_SAMPLE_STACK_USER sample type bit to trigger the dump of the user level stack on sample. The size of the dump is specified by sample_stack_user value. Being able to dump parts of the user stack, starting from the stack pointer, will be useful to make a post mortem dwarf CFI based stack unwinding. Added HAVE_PERF_USER_STACK_DUMP config option to determine if the architecture provides user stack dump on perf event samples. This needs access to the user stack pointer which is not unified across architectures. Enabling this for x86 architecture. Signed-off-by: Jiri Olsa Original-patch-by: Frederic Weisbecker Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-6-git-send-email-jolsa@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- arch/Kconfig | 7 +++ arch/x86/Kconfig | 1 + include/linux/perf_event.h | 18 +++++- kernel/events/core.c | 150 ++++++++++++++++++++++++++++++++++++++++++++- kernel/events/internal.h | 16 +++++ 5 files changed, 190 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 68d827b7ae82..2a83a3f6a615 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -228,6 +228,13 @@ config HAVE_PERF_REGS Support selective register dumps for perf events. This includes bit-mapping of each registers and a unique architecture id. +config HAVE_PERF_USER_STACK_DUMP + bool + help + Support user stack dumps for perf event samples. This needs + access to the user stack pointer which is not unified across + architectures. 
+ config HAVE_ARCH_JUMP_LABEL bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3fab6ec9edc4..a2d19ee750ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -61,6 +61,7 @@ config X86 select PERF_EVENTS select HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select ANON_INODES select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386 select HAVE_CMPXCHG_LOCAL if !M386 diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8a73f75beb16..d1d25f6a5e24 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -131,8 +131,9 @@ enum perf_event_sample_format { PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_BRANCH_STACK = 1U << 11, PERF_SAMPLE_REGS_USER = 1U << 12, + PERF_SAMPLE_STACK_USER = 1U << 13, - PERF_SAMPLE_MAX = 1U << 13, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 14, /* non-ABI */ }; /* @@ -205,6 +206,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ #define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ #define PERF_ATTR_SIZE_VER3 88 /* add: sample_regs_user */ +#define PERF_ATTR_SIZE_VER4 96 /* add: sample_stack_user */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -289,6 +291,14 @@ struct perf_event_attr { * See asm/perf_regs.h for details. */ __u64 sample_regs_user; + + /* + * Defines size of the user stack to dump on samples. + */ + __u32 sample_stack_user; + + /* Align to u64. */ + __u32 __reserved_2; }; /* @@ -568,6 +578,10 @@ enum perf_event_type { * * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * }; */ PERF_RECORD_SAMPLE = 9, @@ -1160,6 +1174,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; struct perf_regs_user regs_user; + u64 stack_user_size; }; static inline void perf_sample_data_init(struct perf_sample_data *data, @@ -1172,6 +1187,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->period = period; data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE; data->regs_user.regs = NULL; + data->stack_user_size = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/kernel/events/core.c b/kernel/events/core.c index d3ce97525b9f..2ba890450d15 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" @@ -3787,6 +3788,101 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user, } } +/* + * Get remaining task size from user stack pointer. + * + * It'd be better to take stack vma map and limit this more + * precisly, but there's no way to get it safely under interrupt, + * so using TASK_SIZE as limit. + */ +static u64 perf_ustack_task_size(struct pt_regs *regs) +{ + unsigned long addr = perf_user_stack_pointer(regs); + + if (!addr || addr >= TASK_SIZE) + return 0; + + return TASK_SIZE - addr; +} + +static u16 +perf_sample_ustack_size(u16 stack_size, u16 header_size, + struct pt_regs *regs) +{ + u64 task_size; + + /* No regs, no stack pointer, no dump. */ + if (!regs) + return 0; + + /* + * Check if we fit in with the requested stack size into the: + * - TASK_SIZE + * If we don't, we limit the size to the TASK_SIZE. + * + * - remaining sample size + * If we don't, we customize the stack size to + * fit in to the remaining sample size. 
+ */ + + task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); + stack_size = min(stack_size, (u16) task_size); + + /* Current header size plus static size and dynamic size. */ + header_size += 2 * sizeof(u64); + + /* Do we fit in with the current stack dump size? */ + if ((u16) (header_size + stack_size) < header_size) { + /* + * If we overflow the maximum size for the sample, + * we customize the stack dump size to fit in. + */ + stack_size = USHRT_MAX - header_size - sizeof(u64); + stack_size = round_up(stack_size, sizeof(u64)); + } + + return stack_size; +} + +static void +perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, + struct pt_regs *regs) +{ + /* Case of a kernel thread, nothing to dump */ + if (!regs) { + u64 size = 0; + perf_output_put(handle, size); + } else { + unsigned long sp; + unsigned int rem; + u64 dyn_size; + + /* + * We dump: + * static size + * - the size requested by user or the best one we can fit + * in to the sample max size + * data + * - user stack dump data + * dynamic size + * - the actual dumped size + */ + + /* Static size. */ + perf_output_put(handle, dump_size); + + /* Data. */ + sp = perf_user_stack_pointer(regs); + rem = __output_copy_user(handle, (void *) sp, dump_size); + dyn_size = dump_size - rem; + + perf_output_skip(handle, rem); + + /* Dynamic size. */ + perf_output_put(handle, dyn_size); + } +} + static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) @@ -4064,6 +4160,11 @@ void perf_output_sample(struct perf_output_handle *handle, mask); } } + + if (sample_type & PERF_SAMPLE_STACK_USER) + perf_output_sample_ustack(handle, + data->stack_user_size, + data->regs_user.regs); } void perf_prepare_sample(struct perf_event_header *header, @@ -4129,6 +4230,35 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + + if (sample_type & PERF_SAMPLE_STACK_USER) { + /* + * Either we need PERF_SAMPLE_STACK_USER bit to be allways + * processed as the last one or have additional check added + * in case new sample type is added, because we could eat + * up the rest of the sample size. + */ + struct perf_regs_user *uregs = &data->regs_user; + u16 stack_size = event->attr.sample_stack_user; + u16 size = sizeof(u64); + + if (!uregs->abi) + perf_sample_regs_user(uregs, regs); + + stack_size = perf_sample_ustack_size(stack_size, header->size, + uregs->regs); + + /* + * If there is something to dump, add space for the dump + * itself and for the field that tells the dynamic size, + * which is how many have been actually dumped. + */ + if (stack_size) + size += sizeof(u64) + stack_size; + + data->stack_user_size = stack_size; + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -6205,8 +6335,26 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, } } - if (attr->sample_type & PERF_SAMPLE_REGS_USER) + if (attr->sample_type & PERF_SAMPLE_REGS_USER) { ret = perf_reg_validate(attr->sample_regs_user); + if (ret) + return ret; + } + + if (attr->sample_type & PERF_SAMPLE_STACK_USER) { + if (!arch_perf_have_user_stack_dump()) + return -ENOSYS; + + /* + * We have __u32 type for the size, but so far + * we can only use __u16 as maximum due to the + * __u16 sample size limit. 
+ */ + if (attr->sample_stack_user >= USHRT_MAX) + ret = -EINVAL; + else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) + ret = -EINVAL; + } out: return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ce7bdfc1d045..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -158,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) recursion[rctx]--; } +#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP +static inline bool arch_perf_have_user_stack_dump(void) +{ + return true; +} + +#define perf_user_stack_pointer(regs) user_stack_pointer(regs) +#else +static inline bool arch_perf_have_user_stack_dump(void) +{ + return false; +} + +#define perf_user_stack_pointer(regs) 0 +#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ + #endif /* _KERNEL_EVENTS_INTERNAL_H */ -- cgit v1.2.3 From d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Aug 2012 15:20:41 +0200 Subject: perf: Add attribute to filter out callchains Introducing following bits to the the perf_event_attr struct: - exclude_callchain_kernel to filter out kernel callchain from the sample dump - exclude_callchain_user to filter out user callchain from the sample dump We need to be able to disable standard user callchain dump when we use the dwarf cfi callchain mode, because frame pointer based user callchains are useless in this mode. Implementing also exclude_callchain_kernel to have complete set of options. Signed-off-by: Jiri Olsa [ Added kernel callchains filtering ] Cc: "Frank Ch. Eigler" Cc: Arun Sharma Cc: Benjamin Redelings Cc: Corey Ashford Cc: Cyrill Gorcunov Cc: Frank Ch. Eigler Cc: Ingo Molnar Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter Cc: Stephane Eranian Cc: Tom Zanussi Cc: Ulrich Drepper Link: http://lkml.kernel.org/r/1344345647-11536-7-git-send-email-jolsa@redhat.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 5 ++++- kernel/events/callchain.c | 38 ++++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d1d25f6a5e24..297ca3db6b4a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -268,7 +268,10 @@ struct perf_event_attr { exclude_host : 1, /* don't count in host */ exclude_guest : 1, /* don't count in guest */ - __reserved_1 : 43; + exclude_callchain_kernel : 1, /* exclude kernel callchains */ + exclude_callchain_user : 1, /* exclude user callchains */ + + __reserved_1 : 41; union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) int rctx; struct perf_callchain_entry *entry; + int kernel = !event->attr.exclude_callchain_kernel; + int user = !event->attr.exclude_callchain_user; + + if (!kernel && !user) + return NULL; entry = get_callchain_entry(&rctx); if (rctx == -1) @@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) entry->nr = 0; - if (!user_mode(regs)) { + if (kernel && !user_mode(regs)) { perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; 
} - if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) - goto exit_put; - - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + if (user) { + if (!user_mode(regs)) { + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + /* + * Disallow cross-task user callchains. + */ + if (event->ctx->task && event->ctx->task != current) + goto exit_put; + + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } } exit_put: -- cgit v1.2.3 From a2546fae01124fb8063747439300fcf39bac033a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Feb 2011 13:15:59 -0500 Subject: ftrace: Add -mfentry to Makefile on function tracer Thanks to Andi Kleen, gcc 4.6.0 now supports -mfentry for x86 (and hopefully soon for other archs). What this does is to have the function profiler start at the beginning of the function instead of after the stack is set up. As plain -pg (mcount) is called after the stack is set up, and in some cases can have issues with the function graph tracer. It also requires frame pointers to be enabled. The -mfentry now calls __fentry__ at the beginning of the function. This allows for compiling without frame pointers and even has the ability to access parameters if needed. If the architecture and the compiler both support -mfentry then use that instead. Link: http://lkml.kernel.org/r/20120807194059.392617243@goodmis.org Acked-by: H. Peter Anvin Acked-by: Ingo Molnar Cc: Michal Marek Cc: Andi Kleen Signed-off-by: Steven Rostedt --- Makefile | 6 +++++- kernel/trace/Kconfig | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index ddf5be952e45..e7ca93f17851 100644 --- a/Makefile +++ b/Makefile @@ -609,7 +609,11 @@ KBUILD_CFLAGS += $(call cc-option, -femit-struct-debug-baseonly) endif ifdef CONFIG_FUNCTION_TRACER -KBUILD_CFLAGS += -pg +ifdef CONFIG_HAVE_FENTRY +CC_USING_FENTRY := $(call cc-option, -mfentry -DCC_USING_FENTRY) +endif +KBUILD_CFLAGS += -pg $(CC_USING_FENTRY) +KBUILD_AFLAGS += $(CC_USING_FENTRY) ifdef CONFIG_DYNAMIC_FTRACE ifdef CONFIG_HAVE_C_RECORDMCOUNT BUILD_C_RECORDMCOUNT := y diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..9301a0e35e0c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_FENTRY + bool + help + Arch supports the gcc options -pg with -mfentry + config HAVE_C_RECORDMCOUNT bool help -- cgit v1.2.3 From 781d06248234e221edb560a18461d65808a8a942 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Feb 2011 13:27:22 -0500 Subject: ftrace: Do not test frame pointers if -mfentry is used The function graph has a test to check if the frame pointer is corrupted, which can happen with various options of gcc with mcount. But this is not an issue with -mfentry as -mfentry does not need nor use frame pointers for function graph tracing. Link: http://lkml.kernel.org/r/20120807194059.773895870@goodmis.org Acked-by: H. 
Peter Anvin Acked-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..99b4378393d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it @@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, * * Currently, x86_32 with optimize for size (-Os) makes the latest * gcc do the above. + * + * Note, -mfentry does not use frame pointers, and this test + * is not needed if CC_USING_FENTRY is set. */ if (unlikely(current->ret_stack[index].fp != frame_pointer)) { ftrace_graph_stop(); -- cgit v1.2.3 From 61e1d394984110e2e76f25572d5b1b5d48796751 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 1 Jun 2012 14:49:50 +0530 Subject: uprobes: Remove redundant lock_page/unlock_page Since read_opcode() reads from the referenced page and doesnt modify the page contents nor the page attributes, there is no need to lock the page. Signed-off-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..7cff24c60dd7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -280,12 +280,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ if (ret <= 0) return ret; - lock_page(page); vaddr_new = kmap_atomic(page); vaddr &= ~PAGE_MASK; memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); kunmap_atomic(vaddr_new); - unlock_page(page); put_page(page); -- cgit v1.2.3 From 8bd874456e2ec49b9e64372ddc89a6f88901d184 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Aug 2012 18:12:30 +0200 Subject: uprobes: Remove check for uprobe variable in handle_swbp() by the time we get here (after we pass cleanup_ret) uprobe is always is set. If it is NULL we leave very early in the code. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov --- kernel/events/uprobes.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7cff24c60dd7..0cefde276641 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1516,17 +1516,15 @@ cleanup_ret: utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; } - if (uprobe) { - if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) + if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) - /* - * cannot singlestep; cannot skip instruction; - * re-execute the instruction. - */ - instruction_pointer_set(regs, bp_vaddr); + /* + * cannot singlestep; cannot skip instruction; + * re-execute the instruction. 
+ */ + instruction_pointer_set(regs, bp_vaddr); - put_uprobe(uprobe); - } + put_uprobe(uprobe); } /* -- cgit v1.2.3 From 647c42dfd40fec032a4c8525a755160f0765921f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 13:15:09 +0200 Subject: uprobes: Kill uprobes_state->count uprobes_state->count is only needed to avoid the slow path in uprobe_pre_sstep_notifier(). It is also checked in uprobe_munmap() but ironically its only goal to decrement this counter. However, it is very broken. Just some examples: - uprobe_mmap() can race with uprobe_unregister() and wrongly increment the counter if it hits the non-uprobe "int3". Note that install_breakpoint() checks ->consumers first and returns -EEXIST if it is NULL. "atomic_sub() if error" in uprobe_mmap() looks obviously wrong too. - uprobe_munmap() can race with uprobe_register() and wrongly decrement the counter by the same reason. - Suppose an appication tries to increase the mmapped area via sys_mremap(). vma_adjust() does uprobe_munmap(whole_vma) first, this can nullify the counter temporarily and race with another thread which can hit the bp, the application will be killed by SIGTRAP. - Suppose an application mmaps 2 consecutive areas in the same file and one (or both) of these areas has uprobes. In the likely case mmap_region()->vma_merge() suceeds. Like above, this leads to uprobe_munmap/uprobe_mmap from vma_merge()->vma_adjust() but then mmap_region() does another uprobe_mmap(resulting_vma) and doubles the counter. This patch only removes this counter and fixes the compile errors, then we will try to cleanup the changed code and add something else instead. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 38 ++------------------------------------ 2 files changed, 3 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index efe4b3308c74..03ae547c1c31 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -99,8 +99,8 @@ struct xol_area { struct uprobes_state { struct xol_area *xol_area; - atomic_t count; }; + extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0cefde276641..6f1664d217dc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -678,18 +678,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, uprobe->flags |= UPROBE_COPY_INSN; } - /* - * Ideally, should be updating the probe count after the breakpoint - * has been successfully inserted. However a thread could hit the - * breakpoint we just inserted even before the probe count is - * incremented. If this is the first breakpoint placed, breakpoint - * notifier might ignore uprobes and pass the trap to the thread. - * Hence increment before and decrement on failure. 
- */ - atomic_inc(&mm->uprobes_state.count); ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret) - atomic_dec(&mm->uprobes_state.count); return ret; } @@ -697,8 +686,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) - atomic_dec(&mm->uprobes_state.count); + set_orig_insn(&uprobe->arch, mm, vaddr, true); } /* @@ -1051,13 +1039,6 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!is_swbp_at_addr(vma->vm_mm, vaddr)) continue; - - /* - * Unable to insert a breakpoint, but - * breakpoint lies underneath. Increment the - * probe count. - */ - atomic_inc(&vma->vm_mm->uprobes_state.count); } if (!ret) @@ -1068,9 +1049,6 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_unlock(uprobes_mmap_hash(inode)); - if (ret) - atomic_sub(count, &vma->vm_mm->uprobes_state.count); - return ret; } @@ -1089,9 +1067,6 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!atomic_read(&vma->vm_mm->uprobes_state.count)) - return; - inode = vma->vm_file->f_mapping->host; if (!inode) return; @@ -1100,13 +1075,6 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, vma, start, end, &tmp_list); list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); @@ -1217,7 +1185,6 @@ void uprobe_clear_state(struct mm_struct *mm) void uprobe_reset_state(struct mm_struct *mm) { mm->uprobes_state.xol_area = NULL; - atomic_set(&mm->uprobes_state.count, 0); } /* @@ -1585,8 +1552,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask; - if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) - /* task is currently not uprobed */ + if (!current->mm) return 0; utask = current->utask; -- cgit v1.2.3 From f1a45d023193f7d8e55e384090b645d609325393 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 14:13:23 +0200 Subject: uprobes: Kill dup_mmap()->uprobe_mmap(), simplify uprobe_mmap/munmap 1. Kill dup_mmap()->uprobe_mmap(), it was only needed to calculate new_mm->uprobes_state.count removed by the previous patch. If the forking process has a pending uprobe (int3) in vma, it will be copied by copy_page_range(), note that it checks vma->anon_vma so "Don't copy ptes" is not possible after install_breakpoint() which does anon_vma_prepare(). 2. Remove is_swbp_at_addr() and "int count" in uprobe_mmap(). Again, this was needed for uprobes_state.count. As a side effect this fixes the bug pointed out by Srikar, this code lacked the necessary put_uprobe(). 3. uprobe_munmap() becomes a nop after the previous patch. Remove the meaningless code but do not remove the helper, we will need it. 
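For orientation, a condensed sketch of what uprobe_munmap() reduces to after this change (summarising the diff that follows below, not a replacement for it):

	void uprobe_munmap(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
	{
		if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
			return;

		if (!atomic_read(&vma->vm_mm->mm_users))	/* called by mmput() ? */
			return;

		/* TODO: unmapping uprobe(s) will need more work */
	}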
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 30 +++--------------------------- kernel/fork.c | 3 --- 2 files changed, 3 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f1664d217dc..ce59c100d65f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1010,7 +1010,7 @@ int uprobe_mmap(struct vm_area_struct *vma) struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret, count; + int ret; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1023,8 +1023,6 @@ int uprobe_mmap(struct vm_area_struct *vma) build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); ret = 0; - count = 0; - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!ret) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); @@ -1034,19 +1032,11 @@ int uprobe_mmap(struct vm_area_struct *vma) * We can race against uprobe_register(), see the * comment near uprobe_hash(). */ - if (ret == -EEXIST) { + if (ret == -EEXIST) ret = 0; - - if (!is_swbp_at_addr(vma->vm_mm, vaddr)) - continue; - } - - if (!ret) - count++; } put_uprobe(uprobe); } - mutex_unlock(uprobes_mmap_hash(inode)); return ret; @@ -1057,27 +1047,13 @@ int uprobe_mmap(struct vm_area_struct *vma) */ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct list_head tmp_list; - struct uprobe *uprobe, *u; - struct inode *inode; - if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - inode = vma->vm_file->f_mapping->host; - if (!inode) - return; - - mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, vma, start, end, &tmp_list); - - list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - put_uprobe(uprobe); - } - mutex_unlock(uprobes_mmap_hash(inode)); + /* TODO: unmapping uprobe(s) will need more work */ } /* Slot allocation for XOL */ diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..912b6f6fe5b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,9 +454,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; - - if (file) - uprobe_mmap(tmp); } /* a new mm has just been created */ arch_dup_mmap(oldmm, mm); -- cgit v1.2.3 From 5e5be71ab3fd8bd2076606923791ece1634c199c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 6 Aug 2012 14:49:56 +0200 Subject: uprobes: Change uprobe_mmap() to ignore the errors but check fatal_signal_pending() Once install_breakpoint() fails uprobe_mmap() "ignores" all other uprobes and returns the error. It was never really needed to to stop after the first error, and in fact it was always wrong at least in -ENOTSUPP case. Change uprobe_mmap() to ignore the errors and always return 0. This is not what we want in the long term, but until we teach the callers to handle the failure it would be better to remove the pointless complications. And this doesn't look too bad, the only "reasonable" error is ENOMEM but in this case the caller should be oom-killed in the likely case or the system has more serious problems. However it makes sense to stop if fatal_signal_pending() == T. In particular this helps if the task was oom-killed. 
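The new policy boils down to the following loop shape in uprobe_mmap(), a condensed sketch of the code added by the diff below:

	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
		if (!fatal_signal_pending(current)) {
			unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);

			/* install_breakpoint() errors (e.g. -ENOMEM) are ignored */
			install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
		}
		put_uprobe(uprobe);
	}
	mutex_unlock(uprobes_mmap_hash(inode));

	return 0;	/* the callers can't handle a failure anyway */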
Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ce59c100d65f..298fbbdf57e6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -994,23 +994,16 @@ static void build_probe_list(struct inode *inode, } /* - * Called from mmap_region. - * called with mm->mmap_sem acquired. + * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. * - * Return -ve no if we fail to insert probes and we cannot - * bail-out. - * Return 0 otherwise. i.e: - * - * - successful insertion of probes - * - (or) no possible probes to be inserted. - * - (or) insertion of probes failed but we can bail-out. + * Currently we ignore all errors and always return 0, the callers + * can't handle the failure anyway. */ int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode; - int ret; if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) return 0; @@ -1022,24 +1015,16 @@ int uprobe_mmap(struct vm_area_struct *vma) mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); - ret = 0; list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - if (!ret) { + if (!fatal_signal_pending(current)) { unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); - - ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); - /* - * We can race against uprobe_register(), see the - * comment near uprobe_hash(). - */ - if (ret == -EEXIST) - ret = 0; + install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); } put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); - return ret; + return 0; } /* -- cgit v1.2.3 From 78f7411668aa0b2006d331f6a288416dd91b8e5d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:35:08 +0200 Subject: uprobes: Do not use -EEXIST in install_breakpoint() paths -EEXIST from install_breakpoint() no longer makes sense, all callers should simply treat it as "success". Change the code to return zero and simplify register_for_each_vma(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 298fbbdf57e6..3e2996b809be 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -332,7 +332,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned */ result = is_swbp_at_addr(mm, vaddr); if (result == 1) - return -EEXIST; + return 0; if (result) return result; @@ -657,7 +657,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, * Hence behave as if probe already existed. */ if (!uprobe->consumers) - return -EEXIST; + return 0; if (!(uprobe->flags & UPROBE_COPY_INSN)) { ret = copy_insn(uprobe, vma->vm_file); @@ -817,17 +817,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; - if (is_register) { + if (is_register) err = install_breakpoint(uprobe, mm, vma, info->vaddr); - /* - * We can race against uprobe_mmap(), see the - * comment near uprobe_hash(). 
- */ - if (err == -EEXIST) - err = 0; - } else { + else remove_breakpoint(uprobe, mm, info->vaddr); - } + unlock: up_write(&mm->mmap_sem); free: -- cgit v1.2.3 From f8ac4ec9c064b330dcc49e03c450fe74298c4622 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:11:42 +0200 Subject: uprobes: Introduce MMF_HAS_UPROBES Add the new MMF_HAS_UPROBES flag. It is set by install_breakpoint() and it is copied by dup_mmap(), uprobe_pre_sstep_notifier() checks it to avoid the slow path if the task was never probed. Perhaps it makes sense to check it in valid_vma(is_register => false) as well. This needs the new dup_mmap()->uprobe_dup_mmap() hook. We can't use uprobe_reset_state() or put MMF_HAS_UPROBES into MMF_INIT_MASK, we need oldmm->mmap_sem to avoid the race with uprobe_register() or mmap() from another thread. Currently we never clear this bit, it can be false-positive after uprobe_unregister() or uprobe_munmap() or if dup_mmap() hits the probed VM_DONTCOPY vma. But this is fine correctness-wise and has no effect unless the task hits the non-uprobe breakpoint. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/sched.h | 2 ++ include/linux/uprobes.h | 5 +++++ kernel/events/uprobes.c | 22 +++++++++++++++++++++- kernel/fork.c | 1 + 4 files changed, 29 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b8c86648a2f9..3667c332e61d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -446,6 +446,8 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HAS_UPROBES 19 /* might have uprobes */ + #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) struct sighand_struct { diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 03ae547c1c31..4a37ab153247 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -108,6 +108,7 @@ extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_con extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_mmap(struct vm_area_struct *vma); extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); extern void uprobe_copy_process(struct task_struct *t); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); @@ -138,6 +139,10 @@ static inline void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } +static inline void +uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) +{ +} static inline void uprobe_notify_resume(struct pt_regs *regs) { } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3e2996b809be..33870b17e1dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -647,6 +647,7 @@ static int install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr) { + bool first_uprobe; int ret; /* @@ -678,7 +679,17 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, uprobe->flags |= UPROBE_COPY_INSN; } + /* + * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), + * the task can hit this breakpoint right after __replace_page(). 
+ */ + first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + if (first_uprobe) + set_bit(MMF_HAS_UPROBES, &mm->flags); + ret = set_swbp(&uprobe->arch, mm, vaddr); + if (ret && first_uprobe) + clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; } @@ -1032,6 +1043,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + return; + /* TODO: unmapping uprobe(s) will need more work */ } @@ -1142,6 +1156,12 @@ void uprobe_reset_state(struct mm_struct *mm) mm->uprobes_state.xol_area = NULL; } +void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) + set_bit(MMF_HAS_UPROBES, &newmm->flags); +} + /* * - search for a free slot. */ @@ -1507,7 +1527,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { struct uprobe_task *utask; - if (!current->mm) + if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) return 0; utask = current->utask; diff --git a/kernel/fork.c b/kernel/fork.c index 912b6f6fe5b8..cbb5f9fcd3e8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -353,6 +353,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ -- cgit v1.2.3 From 61559a8165da2b6bab7621ac36379c6280efacb6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 17:17:46 +0200 Subject: uprobes: Fold uprobe_reset_state() into uprobe_dup_mmap() Now that we have uprobe_dup_mmap() we can fold uprobe_reset_state() into the new hook and remove it. mmput()->uprobe_clear_state() can't be called before dup_mmap(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 4 ---- kernel/events/uprobes.c | 10 ++-------- kernel/fork.c | 2 -- 3 files changed, 2 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4a37ab153247..30297f95c8af 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -118,7 +118,6 @@ extern void uprobe_notify_resume(struct pt_regs *regs); extern bool uprobe_deny_signal(void); extern bool __weak arch_uprobe_skip_sstep(struct arch_uprobe *aup, struct pt_regs *regs); extern void uprobe_clear_state(struct mm_struct *mm); -extern void uprobe_reset_state(struct mm_struct *mm); #else /* !CONFIG_UPROBES */ struct uprobes_state { }; @@ -163,8 +162,5 @@ static inline void uprobe_copy_process(struct task_struct *t) static inline void uprobe_clear_state(struct mm_struct *mm) { } -static inline void uprobe_reset_state(struct mm_struct *mm) -{ -} #endif /* !CONFIG_UPROBES */ #endif /* _LINUX_UPROBES_H */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 33870b17e1dd..610e1c8050cf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1148,16 +1148,10 @@ void uprobe_clear_state(struct mm_struct *mm) kfree(area); } -/* - * uprobe_reset_state - Free the area allocated for slots. 
- */ -void uprobe_reset_state(struct mm_struct *mm) -{ - mm->uprobes_state.xol_area = NULL; -} - void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { + newmm->uprobes_state.xol_area = NULL; + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) set_bit(MMF_HAS_UPROBES, &newmm->flags); } diff --git a/kernel/fork.c b/kernel/fork.c index cbb5f9fcd3e8..2343c9eaaaf4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -837,8 +837,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; #endif - uprobe_reset_state(mm); - if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.3 From ded86e7c8fc4404414c4700010c9962ea8bd083a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 8 Aug 2012 18:07:03 +0200 Subject: uprobes: Remove "verify" argument from set_orig_insn() Nobody does set_orig_insn(verify => false), and I think nobody will. Remove this argument. IIUC set_orig_insn(verify => false) was needed to single-step without xol area. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 2 +- kernel/events/uprobes.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 30297f95c8af..6d4fe79a1a6a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,7 +102,7 @@ struct uprobes_state { }; extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr, bool verify); +extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 610e1c8050cf..1666632e6edf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -345,24 +345,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned * @mm: the probed process address space. * @auprobe: arch specific probepoint information. * @vaddr: the virtual address to insert the opcode. - * @verify: if true, verify existance of breakpoint instruction. * * For mm @mm, restore the original opcode (opcode) at @vaddr. * Return 0 (success) or a negative errno. 
*/ int __weak -set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) +set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) { - if (verify) { - int result; + int result; + + result = is_swbp_at_addr(mm, vaddr); + if (!result) + return -EINVAL; - result = is_swbp_at_addr(mm, vaddr); - if (!result) - return -EINVAL; + if (result != 1) + return result; - if (result != 1) - return result; - } return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); } @@ -697,7 +695,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { - set_orig_insn(&uprobe->arch, mm, vaddr, true); + set_orig_insn(&uprobe->arch, mm, vaddr); } /* -- cgit v1.2.3 From 76bab1b78ab6f25d5f74165f94526c25fc93d984 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Mon, 27 Aug 2012 15:13:45 +0800 Subject: tracing: Skip printing "OK" if failed to disable event No acutal case found. But logically, we should skip "OK" in case any error met. Link: http://lkml.kernel.org/r/1346051625-25231-1-git-send-email-yuanhan.liu@linux.intel.com Signed-off-by: Yuanhan Liu Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6825d833a257..bbb0e63d78e9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1646,9 +1646,11 @@ static __init void event_trace_self_tests(void) event_test_stuff(); ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); - if (WARN_ON_ONCE(ret)) + if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); + continue; + } pr_cont("OK\n"); } -- cgit v1.2.3 From ea632e9f12033346cc68247faa3b924d54936b8b Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Sep 2012 19:45:14 -0700 Subject: trace: Stop compiling in trace_clock unconditionally Commit 56449f437 "tracing: make the trace clocks available generally", in April 2009, made trace_clock available unconditionally, since CONFIG_X86_DS used it too. Commit faa4602e47 "x86, perf, bts, mm: Delete the never used BTS-ptrace code", in March 2010, removed CONFIG_X86_DS, and now only CONFIG_RING_BUFFER (split out from CONFIG_TRACING for general use) has a dependency on trace_clock. So, only compile in trace_clock with CONFIG_RING_BUFFER or CONFIG_TRACING enabled. Link: http://lkml.kernel.org/r/20120903024513.GA19583@leaf Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrew Morton Cc: "Eric W. 
Biederman" Cc: Al Viro Signed-off-by: Josh Triplett Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- kernel/trace/Kconfig | 5 +++++ kernel/trace/Makefile | 6 +----- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index c0cc67ad764c..29d993be7dba 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -98,7 +98,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ -obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9301a0e35e0c..4cea4f41c1d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -62,8 +62,12 @@ config HAVE_C_RECORDMCOUNT config TRACER_MAX_TRACE bool +config TRACE_CLOCK + bool + config RING_BUFFER bool + select TRACE_CLOCK config FTRACE_NMI_ENTER bool @@ -114,6 +118,7 @@ config TRACING select NOP_TRACER select BINARY_PRINTF select EVENT_TRACING + select TRACE_CLOCK config GENERIC_TRACER bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 837090808aac..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -19,11 +19,7 @@ endif CFLAGS_trace_events_filter.o := -I$(src) -# -# Make the trace clocks available generally: it's infrastructure -# relied on by ptrace for example: -# -obj-y += trace_clock.o +obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o -- cgit v1.2.3 From c6aaf4d0bb86e2154ea31a33804cec300611255f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 5 Sep 2012 23:31:25 +0900 Subject: kprobes/x86: Fix to support jprobes on ftrace-based kprobe Fix kprobes/x86 to support jprobes on ftrace-based kprobes. Because of -mfentry support of ftrace, ftrace is now put on the beginning of function where jprobes are put. Originally ftrace-based kprobes doesn't support jprobe because it will change regs->ip and ftrace doesn't support changing IP and ftrace itself doesn't conflict jprobe. However, ftrace -mfentry support moves mcount call on the top of functions where jprobes are put. This means that jprobe always conflicts with ftrace-based kprobe and fails. This patch allows ftrace-based kprobes to support jprobes by allowing to modify regs->ip and kprobes breakpoint handler also allows to skip singlestepping because there is a ftrace call (not an original instruction). Link: http://lkml.kernel.org/r/20120905143125.10329.90836.stgit@localhost.localdomain Reported-by: Fengguang Wu Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- arch/x86/kernel/kprobes.c | 42 +++++++++++++++++++++++++++++------------- kernel/kprobes.c | 3 --- 2 files changed, 29 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index f49f60cca40d..b7c2a85d1926 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,6 +541,8 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb return 1; } +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb); /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled throughout this function. @@ -599,6 +601,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) } else if (kprobe_running()) { p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { +#ifdef KPROBES_CAN_USE_FTRACE + if (kprobe_ftrace(p)) { + skip_singlestep(p, regs, kcb); + return 1; + } +#endif setup_singlestep(p, regs, kcb, 0); return 1; } @@ -1053,6 +1061,21 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } #ifdef KPROBES_CAN_USE_FTRACE +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + /* + * Emulate singlestep (and also recover regs->ip) + * as if there is a 5byte nop + */ + regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; + if (unlikely(p->post_handler)) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + p->post_handler(p, regs, 0); + } + __this_cpu_write(current_kprobe, NULL); +} + /* Ftrace callback handler for kprobes */ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *regs) @@ -1077,19 +1100,12 @@ void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, p); kcb->kprobe_status = KPROBE_HIT_ACTIVE; - if (p->pre_handler) - p->pre_handler(p, regs); - - if (unlikely(p->post_handler)) { - /* - * Emulate singlestep (and also recover regs->ip) - * as if there is a 5byte nop - */ - regs->ip = ip + MCOUNT_INSN_SIZE; - kcb->kprobe_status = KPROBE_HIT_SSDONE; - p->post_handler(p, regs, 0); - } - __this_cpu_write(current_kprobe, NULL); + if (!p->pre_handler || !p->pre_handler(p, regs)) + skip_singlestep(p, regs, kcb); + /* + * If pre_handler returns !0, it sets regs->ip and + * resets current kprobe. + */ } end: local_irq_restore(flags); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 35b4315d84f5..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1418,9 +1418,6 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p, /* Given address is not on the instruction boundary */ if ((unsigned long)p->addr != ftrace_addr) return -EILSEQ; - /* break_handler (jprobe) can not work with ftrace */ - if (p->break_handler) - return -EINVAL; p->flags |= KPROBE_FLAG_FTRACE; #else /* !KPROBES_CAN_USE_FTRACE */ return -EINVAL; -- cgit v1.2.3 From 6d1d8dfa8b65831cfa9a528e3d17efa7e7f4226c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 30 Aug 2012 19:26:22 +0200 Subject: uprobes: Don't put NULL pointer in uprobe_register() alloc_uprobe() might return a NULL pointer, put_uprobe() can't deal with this. 
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1666632e6edf..336f06948de1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -897,7 +897,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * } mutex_unlock(uprobes_hash(inode)); - put_uprobe(uprobe); + if (uprobe) + put_uprobe(uprobe); return ret; } -- cgit v1.2.3 From 6f47caa0e1e4887aa2ddca8388d058d35725d815 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 18 Aug 2012 17:01:57 +0200 Subject: uprobes: uprobes_treelock should not disable irqs Nobody plays with uprobes_tree/uprobes_treelock in interrupt context, no need to disable irqs. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 336f06948de1..ba9f1e7c6060 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -411,11 +411,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - unsigned long flags; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); return uprobe; } @@ -462,12 +461,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) */ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { - unsigned long flags; struct uprobe *u; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); /* For now assume that the instruction need not be single-stepped */ uprobe->flags |= UPROBE_SKIP_SSTEP; @@ -705,11 +703,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad */ static void delete_uprobe(struct uprobe *uprobe) { - unsigned long flags; - - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); iput(uprobe->inode); put_uprobe(uprobe); atomic_dec(&uprobe_events); @@ -968,7 +964,6 @@ static void build_probe_list(struct inode *inode, struct list_head *head) { loff_t min, max; - unsigned long flags; struct rb_node *n, *t; struct uprobe *u; @@ -976,7 +971,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock_irqsave(&uprobes_treelock, flags); + spin_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -994,7 +989,7 @@ static void build_probe_list(struct inode *inode, atomic_inc(&u->ref); } } - spin_unlock_irqrestore(&uprobes_treelock, flags); + spin_unlock(&uprobes_treelock); } /* -- cgit v1.2.3 From 9f68f672c47b9bd4cfe0a667ecb0b1382c61e2de Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 16:15:09 +0200 Subject: uprobes: Introduce MMF_RECALC_UPROBES Add the new MMF_RECALC_UPROBES flag, it means that MMF_HAS_UPROBES can be false positive after 
remove_breakpoint() or uprobe_munmap(). It is also set by uprobe_dup_mmap(), this is not optimal but simple. We could add the new hook, uprobe_dup_vma(), to set MMF_HAS_UPROBES only if the new mm actually has uprobes, but I don't think this makes sense. The next patch will use this flag to clear MMF_HAS_UPROBES. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/sched.h | 3 ++- kernel/events/uprobes.c | 39 +++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 3667c332e61d..255661d48834 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -446,7 +446,8 @@ extern int get_dumpable(struct mm_struct *mm); #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ -#define MMF_HAS_UPROBES 19 /* might have uprobes */ +#define MMF_HAS_UPROBES 19 /* has uprobes */ +#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ba9f1e7c6060..9a7f08bab91f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -684,7 +684,9 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, set_bit(MMF_HAS_UPROBES, &mm->flags); ret = set_swbp(&uprobe->arch, mm, vaddr); - if (ret && first_uprobe) + if (!ret) + clear_bit(MMF_RECALC_UPROBES, &mm->flags); + else if (first_uprobe) clear_bit(MMF_HAS_UPROBES, &mm->flags); return ret; @@ -693,6 +695,11 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, static void remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) { + /* can happen if uprobe_register() fails */ + if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) + return; + + set_bit(MMF_RECALC_UPROBES, &mm->flags); set_orig_insn(&uprobe->arch, mm, vaddr); } @@ -1026,6 +1033,25 @@ int uprobe_mmap(struct vm_area_struct *vma) return 0; } +static bool +vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + loff_t min, max; + struct inode *inode; + struct rb_node *n; + + inode = vma->vm_file->f_mapping->host; + + min = vaddr_to_offset(vma, start); + max = min + (end - start) - 1; + + spin_lock(&uprobes_treelock); + n = find_node_in_range(inode, min, max); + spin_unlock(&uprobes_treelock); + + return !!n; +} + /* * Called in context of a munmap of a vma. */ @@ -1037,10 +1063,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? 
*/ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || + test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return; - /* TODO: unmapping uprobe(s) will need more work */ + if (vma_has_uprobes(vma, start, end)) + set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); } /* Slot allocation for XOL */ @@ -1146,8 +1174,11 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) + if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); + /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ + set_bit(MMF_RECALC_UPROBES, &newmm->flags); + } } /* -- cgit v1.2.3 From 499a4f3ec057a0f79636cc3c1e581bb6e977a30f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Aug 2012 17:41:34 +0200 Subject: uprobes: Teach find_active_uprobe() to clear MMF_HAS_UPROBES A stale MMF_HAS_UPROBES does not really hurt; it just sends the task down the "slow" and unnecessary handle_swbp() path when it hits a non-uprobe breakpoint. So this patch changes find_active_uprobe() to check every valid vma and clear MMF_HAS_UPROBES if no uprobes were found. This adds a slow O(n) path, but it is only taken in the unlikely case where the task hits a normal breakpoint for the first time after uprobe_unregister(). Note the "not strictly accurate" comment in mmf_recalc_uprobes(). We could fix this by teaching vma_has_uprobes() to return a bit more info, but I am not sure it is worth the trouble. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9a7f08bab91f..e4a906ce2e1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1396,6 +1396,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) return false; } +static void mmf_recalc_uprobes(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!valid_vma(vma, false)) + continue; + /* + * This is not strictly accurate, we can race with + * uprobe_unregister() and see the already removed + * uprobe if delete_uprobe() was not yet called. + */ + if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) + return; + } + + clear_bit(MMF_HAS_UPROBES, &mm->flags); +} + static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) { struct mm_struct *mm = current->mm; @@ -1417,6 +1436,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } else { *is_swbp = -EFAULT; } + + if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + mmf_recalc_uprobes(mm); up_read(&mm->mmap_sem); return uprobe; -- cgit v1.2.3 From 9d778782266f95e5c6ec43ed8195ba331c821018 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Aug 2012 18:12:28 +0200 Subject: uprobes: Introduce arch_uprobe_enable/disable_step() As Oleg pointed out in [0], uprobes should not use the ptrace interface for enabling/disabling single-stepping. [0] http://lkml.kernel.org/r/20120730141638.GA5306@redhat.com Add the new "__weak arch" helpers, which simply call user_*_single_step(), as a preparation. This is only needed to avoid breaking the powerpc port; we will fold this logic into the arch_uprobe_pre/post_xol() hooks later.
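To illustrate what the __weak split prepares for, an architecture can later supply its own versions of these hooks. The sketch below is hypothetical and not part of this series; the bodies merely mirror the generic fallbacks the patch installs, where a real port (powerpc, per the changelog) would drive its own single-step machinery instead of the ptrace helpers.

void arch_uprobe_enable_step(struct arch_uprobe *arch)
{
	/* arch-specific single-step setup would go here, replacing
	 * the generic ptrace-based helper used by the weak default */
	user_enable_single_step(current);
}

void arch_uprobe_disable_step(struct arch_uprobe *arch)
{
	/* matching arch-specific teardown */
	user_disable_single_step(current);
}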
We should also change handle_singlestep(), _disable_step(&uprobe->arch) should be called before put_uprobe(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- include/linux/uprobes.h | 2 ++ kernel/events/uprobes.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 6d4fe79a1a6a..e6f0331e3d45 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -112,6 +112,8 @@ extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); extern void uprobe_copy_process(struct task_struct *t); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern void __weak arch_uprobe_enable_step(struct arch_uprobe *arch); +extern void __weak arch_uprobe_disable_step(struct arch_uprobe *arch); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); extern void uprobe_notify_resume(struct pt_regs *regs); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e4a906ce2e1d..912ef48d28ab 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1444,6 +1444,16 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) return uprobe; } +void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) +{ + user_enable_single_step(current); +} + +void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) +{ + user_disable_single_step(current); +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1490,7 +1500,7 @@ static void handle_swbp(struct pt_regs *regs) utask->state = UTASK_SSTEP; if (!pre_ssout(uprobe, regs, bp_vaddr)) { - user_enable_single_step(current); + arch_uprobe_enable_step(&uprobe->arch); return; } @@ -1526,10 +1536,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) else WARN_ON_ONCE(1); + arch_uprobe_disable_step(&uprobe->arch); put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - user_disable_single_step(current); xol_free_insn_slot(current); spin_lock_irq(¤t->sighand->siglock); -- cgit v1.2.3 From 5224c3a31549f1c056039545b289e1b01ed02f12 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Fri, 7 Sep 2012 18:12:19 -0700 Subject: tracing: Add an option for disabling markers In our application, we have trace markers spread through user-space. We have markers in GL, X, etc. These are super handy for Chrome's about:tracing feature (Chrome + system + kernel trace view), but can be very distracting when you're trying to debug a kernel issue. I normally, use "grep -v tracing_mark_write" but it would be nice if I could just temporarily disable markers all together. 
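With this option the markers can be silenced at runtime rather than filtered out afterwards: assuming debugfs is mounted at /sys/kernel/debug, writing 0 to tracing/options/markers (echo 0 > /sys/kernel/debug/tracing/options/markers) makes writes to trace_marker fail with -EINVAL, and writing 1 accepts them again. The exact path is an assumption based on how trace options are normally exposed, not something spelled out in this changelog.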
Link: http://lkml.kernel.org/r/1347066739-26285-1-git-send-email-msb@chromium.org CC: Frederic Weisbecker Signed-off-by: Mandeep Singh Baines Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++++- kernel/trace/trace.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 08acf42e325b..1ec5c1dab629 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -328,7 +328,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -470,6 +470,7 @@ static const char *trace_options[] = { "overwrite", "disable_on_free", "irq-info", + "markers", NULL }; @@ -3886,6 +3887,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tracing_disabled) return -EINVAL; + if (!(trace_flags & TRACE_ITER_MARKERS)) + return -EINVAL; + if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 593debefc4e9..63a2da0b9a6e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,7 @@ enum trace_iterator_flags { TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, + TRACE_ITER_MARKERS = 0x1000000, }; /* -- cgit v1.2.3 From 8781915ad2716adcd8cd5cc52cee791fc8b00fdf Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 12 Sep 2012 11:47:57 -0300 Subject: trace: Move trace event enable from fs_initcall to core_initcall This patch splits trace event initialization in two stages: * ftrace enable * sysfs event entry creation This allows to capture trace events from an earlier point by using 'trace_event' kernel parameter and is important to trace boot-up allocations. Note that, in order to enable events at core_initcall, it's necessary to move init_ftrace_syscalls() from core_initcall to early_initcall. 
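As a usage sketch (the specific event names are illustrative, not taken from this patch): booting with trace_event=kmem:kmalloc,kmem:kfree on the kernel command line should now enable those events from core_initcall time onward, so allocations made during early boot show up when /sys/kernel/debug/tracing/trace is read after the system is up.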
Link: http://lkml.kernel.org/r/1347461277-25302-1-git-send-email-elezegarcia@gmail.com Signed-off-by: Ezequiel Garcia Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 108 ++++++++++++++++++++++++++++-------------- kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 73 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbb0e63d78e9..d608d09d08c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } +static void event_remove(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + list_del(&call->list); +} + +static int event_init(struct ftrace_event_call *call) +{ + int ret = 0; + + if (WARN_ON(!call->name)) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0 && ret != -ENOSYS) + pr_warn("Could not initialize trace events/%s\n", + call->name); + } + + return ret; +} + static int __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, const struct file_operations *id, @@ -1209,19 +1234,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, struct dentry *d_events; int ret; - /* The linker may leave blanks */ - if (!call->name) - return -EINVAL; - - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace events/%s\n", - call->name); - return ret; - } - } + ret = event_init(call); + if (ret < 0) + return ret; d_events = event_trace_events_dir(); if (!d_events) @@ -1272,13 +1287,10 @@ static void remove_subsystem_dir(const char *name) */ static void __trace_remove_event_call(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); - if (call->event.funcs) - __unregister_ftrace_event(&call->event); - debugfs_remove_recursive(call->dir); - list_del(&call->list); + event_remove(call); trace_destroy_fields(call); destroy_preds(call); + debugfs_remove_recursive(call->dir); remove_subsystem_dir(call->class->system); } @@ -1450,15 +1462,43 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +static __init int event_trace_enable(void) +{ + struct ftrace_event_call **iter, *call; + char *buf = bootup_event_buf; + char *token; + int ret; + + for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { + + call = *iter; + ret = event_init(call); + if (!ret) + list_add(&call->list, &ftrace_events); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + } + return 0; +} + static __init int event_trace_init(void) { - struct ftrace_event_call **call; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; int ret; - char *buf = bootup_event_buf; - char *token; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -1497,24 +1537,19 @@ static __init int event_trace_init(void) if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, + /* + * Early initialization already enabled ftrace event. 
+ * Now it's only necessary to create the event directory. + */ + list_for_each_entry(call, &ftrace_events, list) { + + ret = event_create_dir(call, d_events, + &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - } - - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(token, 1); - if (ret) - pr_warning("Failed to enable trace event: %s\n", token); + if (ret < 0) + event_remove(call); } ret = register_module_notifier(&trace_module_nb); @@ -1523,6 +1558,7 @@ static __init int event_trace_init(void) return 0; } +core_initcall(event_trace_enable); fs_initcall(event_trace_init); #ifdef CONFIG_FTRACE_STARTUP_TEST diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..2485a7d09b11 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -487,7 +487,7 @@ int __init init_ftrace_syscalls(void) return 0; } -core_initcall(init_ftrace_syscalls); +early_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3
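Taken together, this last patch stages the trace-event bring-up as follows. The initcall lines are the ones the patch actually uses; the comments are an editorial summary, not text from the patch.

early_initcall(init_ftrace_syscalls);	/* syscall event metadata is prepared first */
core_initcall(event_trace_enable);	/* events are registered and any listed in
					 * trace_event= are enabled early in boot */
fs_initcall(event_trace_init);		/* debugfs entries are created later, once the
					 * filesystem infrastructure is available */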