diff options
author | Yonghong Song <yhs@fb.com> | 2017-10-23 23:53:08 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2017-10-25 10:47:47 +0900 |
commit | e87c6bc3852b981e71c757be20771546ce9f76f3 (patch) | |
tree | bad3be630137d8e873f4ad5a1ea77b4aa1853184 /kernel/trace | |
parent | 0b4c6841fee03e096b735074a0c4aab3a8e92986 (diff) |
bpf: permit multiple bpf attachments for a single perf event
This patch enables multiple bpf attachments for a
kprobe/uprobe/tracepoint single trace event.
Each trace_event keeps a list of attached perf events.
When an event happens, all attached bpf programs will
be executed based on the order of attachment.
A global bpf_event_mutex lock is introduced to protect
prog_array attaching and detaching. An alternative will
be introduce a mutex lock in every trace_event_call
structure, but it takes a lot of extra memory.
So a global bpf_event_mutex lock is a good compromise.
The bpf prog detachment involves allocation of memory.
If the allocation fails, a dummy do-nothing program
will replace to-be-detached program in-place.
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/trace')
-rw-r--r-- | kernel/trace/bpf_trace.c | 82 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 6 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 34 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 3 |
4 files changed, 99 insertions, 26 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3126da2f468a..b65011d320e3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -17,7 +17,7 @@ /** * trace_call_bpf - invoke BPF program - * @prog: BPF program + * @call: tracepoint event * @ctx: opaque context pointer * * kprobe handlers execute BPF programs via this helper. @@ -29,7 +29,7 @@ * 1 - store kprobe event into ring buffer * Other values are reserved and currently alias to 1 */ -unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; @@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) goto out; } - rcu_read_lock(); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); + /* + * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock + * to all call sites, we did a bpf_prog_array_valid() there to check + * whether call->prog_array is empty or not, which is + * a heurisitc to speed up execution. + * + * If bpf_prog_array_valid() fetched prog_array was + * non-NULL, we go into trace_call_bpf() and do the actual + * proper rcu_dereference() under RCU lock. + * If it turns out that prog_array is NULL then, we bail out. + * For the opposite, if the bpf_prog_array_valid() fetched pointer + * was NULL, you'll skip the prog_array with the risk of missing + * out of events when it was updated in between this and the + * rcu_dereference() which is accepted risk. + */ + ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN); out: __this_cpu_dec(bpf_prog_active); @@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = { const struct bpf_prog_ops perf_event_prog_ops = { }; + +static DEFINE_MUTEX(bpf_event_mutex); + +int perf_event_attach_bpf_prog(struct perf_event *event, + struct bpf_prog *prog) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret = -EEXIST; + + mutex_lock(&bpf_event_mutex); + + if (event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); + if (ret < 0) + goto out; + + /* set the new array to event->tp_event and set event->prog */ + event->prog = prog; + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + +out: + mutex_unlock(&bpf_event_mutex); + return ret; +} + +void perf_event_detach_bpf_prog(struct perf_event *event) +{ + struct bpf_prog_array __rcu *old_array; + struct bpf_prog_array *new_array; + int ret; + + mutex_lock(&bpf_event_mutex); + + if (!event->prog) + goto out; + + old_array = rcu_dereference_protected(event->tp_event->prog_array, + lockdep_is_held(&bpf_event_mutex)); + + ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); + if (ret < 0) { + bpf_prog_array_delete_safe(old_array, event->prog); + } else { + rcu_assign_pointer(event->tp_event->prog_array, new_array); + bpf_prog_array_free(old_array); + } + + bpf_prog_put(event->prog); + event->prog = NULL; + +out: + mutex_unlock(&bpf_event_mutex); +} diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8a907e12b6b9..abf92e478cfb 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,13 +1174,12 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); @@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; - struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; head = this_cpu_ptr(call->perf_events); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 696afe72d3b1..71a6af34d7a9 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_metadata *sys_data, - struct syscall_trace_enter *rec) { +static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_metadata *sys_data, + struct syscall_trace_enter *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) param.args[i] = rec->args[i]; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) @@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (!sys_data) return; - prog = READ_ONCE(sys_data->enter_event->prog); head = this_cpu_ptr(sys_data->enter_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); + if (!valid_prog_array && hlist_empty(head)) return; /* get the size after alignment with the u32 buffer size field */ @@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || + if ((valid_prog_array && + !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_unlock(&syscall_trace_lock); } -static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, - struct syscall_trace_exit *rec) { +static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, + struct syscall_trace_exit *rec) +{ struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; @@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; - return trace_call_bpf(prog, ¶m); + return trace_call_bpf(call, ¶m); } static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) @@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; struct hlist_head *head; - struct bpf_prog *prog; + bool valid_prog_array; int syscall_nr; int rctx; int size; @@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) if (!sys_data) return; - prog = READ_ONCE(sys_data->exit_event->prog); head = this_cpu_ptr(sys_data->exit_event->perf_events); - if (!prog && hlist_empty(head)) + valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); + if (!valid_prog_array && hlist_empty(head)) return; /* We can probably do that at build time */ @@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || + if ((valid_prog_array && + !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 4525e0271a53..153c0e411461 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; - struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; - if (prog && !trace_call_bpf(prog, regs)) + if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); |