Diffstat (limited to 'kernel')
-rw-r--r--   kernel/events/core.c | 266
-rw-r--r--   kernel/extable.c     |   9
-rw-r--r--   kernel/kprobes.c     |  73
3 files changed, 219 insertions, 129 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e235bb991bdd..77a932b54a64 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -355,6 +355,8 @@ enum event_type_t {
     EVENT_FLEXIBLE = 0x1,
     EVENT_PINNED = 0x2,
     EVENT_TIME = 0x4,
+    /* see ctx_resched() for details */
+    EVENT_CPU = 0x8,
     EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -678,6 +680,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
     info->timestamp = ctx->timestamp;
 }
 
+static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
+
 #define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
 #define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
 
@@ -690,61 +694,46 @@ perf_cgroup_set_timestamp(struct task_struct *task,
 static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
     struct perf_cpu_context *cpuctx;
-    struct pmu *pmu;
+    struct list_head *list;
     unsigned long flags;
 
     /*
-     * disable interrupts to avoid geting nr_cgroup
-     * changes via __perf_event_disable(). Also
-     * avoids preemption.
+     * Disable interrupts and preemption to avoid this CPU's
+     * cgrp_cpuctx_entry to change under us.
      */
     local_irq_save(flags);
 
-    /*
-     * we reschedule only in the presence of cgroup
-     * constrained events.
-     */
+    list = this_cpu_ptr(&cgrp_cpuctx_list);
+    list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+        WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 
-    list_for_each_entry_rcu(pmu, &pmus, entry) {
-        cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-        if (cpuctx->unique_pmu != pmu)
-            continue; /* ensure we process each cpuctx once */
-
-        /*
-         * perf_cgroup_events says at least one
-         * context on this CPU has cgroup events.
-         *
-         * ctx->nr_cgroups reports the number of cgroup
-         * events for a context.
-         */
-        if (cpuctx->ctx.nr_cgroups > 0) {
-            perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-            perf_pmu_disable(cpuctx->ctx.pmu);
+        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+        perf_pmu_disable(cpuctx->ctx.pmu);
 
-            if (mode & PERF_CGROUP_SWOUT) {
-                cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-                /*
-                 * must not be done before ctxswout due
-                 * to event_filter_match() in event_sched_out()
-                 */
-                cpuctx->cgrp = NULL;
-            }
+        if (mode & PERF_CGROUP_SWOUT) {
+            cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+            /*
+             * must not be done before ctxswout due
+             * to event_filter_match() in event_sched_out()
+             */
+            cpuctx->cgrp = NULL;
+        }
 
-            if (mode & PERF_CGROUP_SWIN) {
-                WARN_ON_ONCE(cpuctx->cgrp);
-                /*
-                 * set cgrp before ctxsw in to allow
-                 * event_filter_match() to not have to pass
-                 * task around
-                 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-                 * because cgorup events are only per-cpu
-                 */
-                cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
-                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
-            }
-            perf_pmu_enable(cpuctx->ctx.pmu);
-            perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+        if (mode & PERF_CGROUP_SWIN) {
+            WARN_ON_ONCE(cpuctx->cgrp);
+            /*
+             * set cgrp before ctxsw in to allow
+             * event_filter_match() to not have to pass
+             * task around
+             * we pass the cpuctx->ctx to perf_cgroup_from_task()
+             * because cgorup events are only per-cpu
+             */
+            cpuctx->cgrp = perf_cgroup_from_task(task,
+                                                 &cpuctx->ctx);
+            cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
         }
+        perf_pmu_enable(cpuctx->ctx.pmu);
+        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
     }
 
     local_irq_restore(flags);
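The rewrite above replaces the walk over every registered PMU with a walk over a per-CPU list that only contains contexts which currently have cgroup events. Below is a self-contained user-space sketch of that bookkeeping; the hand-rolled list and all names are illustrative, not the kernel code.

/*
 * Standalone model (not kernel code): keep a side list holding only the
 * contexts that have cgroup events, so the switch path walks that short
 * list instead of every PMU's context.
 */
#include <stdio.h>
#include <stddef.h>

struct list_node {
    struct list_node *prev, *next;
};

static void list_init(struct list_node *h) { h->prev = h->next = h; }

static void list_add(struct list_node *n, struct list_node *h)
{
    n->next = h->next;
    n->prev = h;
    h->next->prev = n;
    h->next = n;
}

static void list_del(struct list_node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

struct cpu_context {
    const char *name;
    int nr_cgroup_events;
    struct list_node cgrp_entry;    /* linked only while nr_cgroup_events > 0 */
};

static struct list_node cgrp_active;    /* models the per-CPU cgrp_cpuctx_list */

/* Link the context on its first cgroup event, unlink on its last one. */
static void update_cgroup_list(struct cpu_context *ctx, int add)
{
    if (add && ctx->nr_cgroup_events++ == 0)
        list_add(&ctx->cgrp_entry, &cgrp_active);
    else if (!add && --ctx->nr_cgroup_events == 0)
        list_del(&ctx->cgrp_entry);
}

/* The "switch" path visits only contexts that actually have cgroup events. */
static void cgroup_switch(void)
{
    struct list_node *n;

    for (n = cgrp_active.next; n != &cgrp_active; n = n->next) {
        struct cpu_context *ctx = (struct cpu_context *)
            ((char *)n - offsetof(struct cpu_context, cgrp_entry));
        printf("resched %s\n", ctx->name);
    }
}

int main(void)
{
    struct cpu_context a = { .name = "pmu-A" }, b = { .name = "pmu-B" };

    list_init(&cgrp_active);
    update_cgroup_list(&a, 1);  /* only pmu-A has cgroup events */
    cgroup_switch();            /* visits pmu-A, never looks at pmu-B */
    update_cgroup_list(&a, 0);
    (void)b;
    return 0;
}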
@@ -889,6 +878,7 @@ list_update_cgroup_event(struct perf_event *event,
              struct perf_event_context *ctx, bool add)
 {
     struct perf_cpu_context *cpuctx;
+    struct list_head *cpuctx_entry;
 
     if (!is_cgroup_event(event))
         return;
@@ -902,15 +892,16 @@ static void list_update_cgroup_event(struct perf_event *event,
      * this will always be called from the right CPU.
      */
     cpuctx = __get_cpu_context(ctx);
-
-    /*
-     * cpuctx->cgrp is NULL until a cgroup event is sched in or
-     * ctx->nr_cgroup == 0 .
-     */
-    if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
-        cpuctx->cgrp = event->cgrp;
-    else if (!add)
+    cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
+    /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
+    if (add) {
+        list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
+        if (perf_cgroup_from_task(current, ctx) == event->cgrp)
+            cpuctx->cgrp = event->cgrp;
+    } else {
+        list_del(cpuctx_entry);
         cpuctx->cgrp = NULL;
+    }
 }
 
 #else /* !CONFIG_CGROUP_PERF */
@@ -1453,6 +1444,20 @@ static void update_group_times(struct perf_event *leader)
         update_event_times(event);
 }
 
+static enum event_type_t get_event_type(struct perf_event *event)
+{
+    struct perf_event_context *ctx = event->ctx;
+    enum event_type_t event_type;
+
+    lockdep_assert_held(&ctx->lock);
+
+    event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
+    if (!ctx->task)
+        event_type |= EVENT_CPU;
+
+    return event_type;
+}
+
 static struct list_head *
 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 {
@@ -2226,7 +2231,8 @@ ctx_sched_in(struct perf_event_context *ctx,
          struct task_struct *task);
 
 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-                   struct perf_event_context *ctx)
+                   struct perf_event_context *ctx,
+                   enum event_type_t event_type)
 {
     if (!cpuctx->task_ctx)
         return;
@@ -2234,7 +2240,7 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
     if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
         return;
 
-    ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+    ctx_sched_out(ctx, cpuctx, event_type);
 }
 
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
@@ -2249,13 +2255,51 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
     ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }
 
+/*
+ * We want to maintain the following priority of scheduling:
+ *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
+ *  - task pinned (EVENT_PINNED)
+ *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
+ *  - task flexible (EVENT_FLEXIBLE).
+ *
+ * In order to avoid unscheduling and scheduling back in everything every
+ * time an event is added, only do it for the groups of equal priority and
+ * below.
+ *
+ * This can be called after a batch operation on task events, in which case
+ * event_type is a bit mask of the types of events involved. For CPU events,
+ * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
+ */
 static void ctx_resched(struct perf_cpu_context *cpuctx,
-            struct perf_event_context *task_ctx)
+            struct perf_event_context *task_ctx,
+            enum event_type_t event_type)
 {
+    enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+    bool cpu_event = !!(event_type & EVENT_CPU);
+
+    /*
+     * If pinned groups are involved, flexible groups also need to be
+     * scheduled out.
+     */
+    if (event_type & EVENT_PINNED)
+        event_type |= EVENT_FLEXIBLE;
+
     perf_pmu_disable(cpuctx->ctx.pmu);
     if (task_ctx)
-        task_ctx_sched_out(cpuctx, task_ctx);
-    cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+        task_ctx_sched_out(cpuctx, task_ctx, event_type);
+
+    /*
+     * Decide which cpu ctx groups to schedule out based on the types
+     * of events that caused rescheduling:
+     *  - EVENT_CPU: schedule out corresponding groups;
+     *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
+     *  - otherwise, do nothing more.
+     */
+    if (cpu_event)
+        cpu_ctx_sched_out(cpuctx, ctx_event_type);
+    else if (ctx_event_type & EVENT_PINNED)
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
     perf_event_sched_in(cpuctx, task_ctx, current);
     perf_pmu_enable(cpuctx->ctx.pmu);
 }
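The new ctx_resched() decides how much to tear down purely from the event_type mask. Here is a small stand-alone model of just that decision, following the code as shown above; printing stands in for the actual (un)scheduling, and it is a sketch rather than the kernel implementation.

/* User-space model of the ctx_resched() decision, not kernel code. */
#include <stdio.h>

enum event_type_t {
    EVENT_FLEXIBLE = 0x1,
    EVENT_PINNED   = 0x2,
    EVENT_CPU      = 0x8,
    EVENT_ALL      = EVENT_FLEXIBLE | EVENT_PINNED,
};

static void ctx_resched_model(int has_task_ctx, unsigned int event_type)
{
    unsigned int ctx_event_type = event_type & EVENT_ALL;
    int cpu_event = !!(event_type & EVENT_CPU);

    /* Pinned groups force the flexible ones out as well. */
    if (event_type & EVENT_PINNED)
        event_type |= EVENT_FLEXIBLE;

    if (has_task_ctx)
        printf("task ctx: sched out 0x%x\n", event_type & EVENT_ALL);

    if (cpu_event)
        printf("cpu ctx:  sched out 0x%x\n", ctx_event_type);
    else if (ctx_event_type & EVENT_PINNED)
        printf("cpu ctx:  sched out 0x%x (flexible only)\n", EVENT_FLEXIBLE);
    else
        printf("cpu ctx:  untouched\n");

    printf("sched everything back in\n\n");
}

int main(void)
{
    /* A task-pinned event displaces the CPU context's flexible groups... */
    ctx_resched_model(1, EVENT_PINNED);
    /* ...while a task-flexible event leaves the CPU context alone. */
    ctx_resched_model(1, EVENT_FLEXIBLE);
    /* A CPU event with no task context only touches the CPU context. */
    ctx_resched_model(0, EVENT_CPU | EVENT_PINNED);
    return 0;
}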
@@ -2302,7 +2346,7 @@ static int __perf_install_in_context(void *info)
     if (reprogram) {
         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
         add_event_to_ctx(event, ctx);
-        ctx_resched(cpuctx, task_ctx);
+        ctx_resched(cpuctx, task_ctx, get_event_type(event));
     } else {
         add_event_to_ctx(event, ctx);
     }
@@ -2469,7 +2513,7 @@ static void __perf_event_enable(struct perf_event *event,
     if (ctx->task)
         WARN_ON_ONCE(task_ctx != ctx);
 
-    ctx_resched(cpuctx, task_ctx);
+    ctx_resched(cpuctx, task_ctx, get_event_type(event));
 }
 
 /*
@@ -2896,7 +2940,7 @@ unlock:
 
     if (do_switch) {
         raw_spin_lock(&ctx->lock);
-        task_ctx_sched_out(cpuctx, ctx);
+        task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
         raw_spin_unlock(&ctx->lock);
     }
 }
@@ -2943,7 +2987,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
         return;
 
     list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
-        pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
+        pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
 
         if (WARN_ON_ONCE(!pmu->sched_task))
             continue;
@@ -3133,8 +3177,12 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
      * We want to keep the following priority order:
      * cpu pinned (that don't need to move), task pinned,
      * cpu flexible, task flexible.
+     *
+     * However, if task's ctx is not carrying any pinned
+     * events, no need to flip the cpuctx's events around.
      */
-    cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+    if (!list_empty(&ctx->pinned_groups))
+        cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
     perf_event_sched_in(cpuctx, ctx, task);
     perf_pmu_enable(ctx->pmu);
     perf_ctx_unlock(cpuctx, ctx);
@@ -3449,6 +3497,7 @@ static int event_enable_on_exec(struct perf_event *event,
 static void perf_event_enable_on_exec(int ctxn)
 {
     struct perf_event_context *ctx, *clone_ctx = NULL;
+    enum event_type_t event_type = 0;
     struct perf_cpu_context *cpuctx;
     struct perf_event *event;
     unsigned long flags;
@@ -3462,15 +3511,17 @@ static void perf_event_enable_on_exec(int ctxn)
     cpuctx = __get_cpu_context(ctx);
     perf_ctx_lock(cpuctx, ctx);
     ctx_sched_out(ctx, cpuctx, EVENT_TIME);
-    list_for_each_entry(event, &ctx->event_list, event_entry)
+    list_for_each_entry(event, &ctx->event_list, event_entry) {
         enabled |= event_enable_on_exec(event, ctx);
+        event_type |= get_event_type(event);
+    }
 
     /*
      * Unclone and reschedule this context if we enabled any event.
      */
     if (enabled) {
         clone_ctx = unclone_ctx(ctx);
-        ctx_resched(cpuctx, ctx);
+        ctx_resched(cpuctx, ctx, event_type);
     }
 
     perf_ctx_unlock(cpuctx, ctx);
@@ -8044,6 +8095,9 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
     if (task == TASK_TOMBSTONE)
         return;
 
+    if (!ifh->nr_file_filters)
+        return;
+
     mm = get_task_mm(event->ctx->task);
     if (!mm)
         goto restart;
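perf_event_enable_on_exec() above now folds get_event_type() of every enabled event into a single mask and reschedules once for the whole batch. A user-space sketch of that accumulation follows; the struct and fields are illustrative stand-ins for the kernel types.

/* Sketch (not kernel code) of folding per-event types into one mask. */
#include <stdio.h>

enum event_type_t {
    EVENT_FLEXIBLE = 0x1,
    EVENT_PINNED   = 0x2,
    EVENT_CPU      = 0x8,
};

struct event {
    int pinned;     /* stands in for event->attr.pinned */
    int per_cpu;    /* stands in for ctx->task == NULL */
};

static enum event_type_t get_event_type(const struct event *e)
{
    unsigned int t = e->pinned ? EVENT_PINNED : EVENT_FLEXIBLE;

    if (e->per_cpu)
        t |= EVENT_CPU;
    return t;
}

int main(void)
{
    struct event batch[] = {
        { .pinned = 0, .per_cpu = 0 },  /* task flexible */
        { .pinned = 1, .per_cpu = 0 },  /* task pinned */
    };
    unsigned int event_type = 0;
    unsigned int i;

    /* Accumulate once over the batch instead of rescheduling per event. */
    for (i = 0; i < sizeof(batch) / sizeof(batch[0]); i++)
        event_type |= get_event_type(&batch[i]);

    printf("single ctx_resched() with event_type=0x%x\n", event_type);
    return 0;
}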
@@ -8214,6 +8268,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
              * attribute.
              */
             if (state == IF_STATE_END) {
+                ret = -EINVAL;
                 if (kernel && event->attr.exclude_kernel)
                     goto fail;
 
@@ -8221,6 +8276,18 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                 if (!filename)
                     goto fail;
 
+                /*
+                 * For now, we only support file-based filters
+                 * in per-task events; doing so for CPU-wide
+                 * events requires additional context switching
+                 * trickery, since same object code will be
+                 * mapped at different virtual addresses in
+                 * different processes.
+                 */
+                ret = -EOPNOTSUPP;
+                if (!event->ctx->task)
+                    goto fail_free_name;
+
                 /* look up the path and grab its inode */
                 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
                 if (ret)
@@ -8236,6 +8303,8 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                     !S_ISREG(filter->inode->i_mode))
                     /* free_filters_list() will iput() */
                     goto fail;
+
+                event->addr_filters.nr_file_filters++;
             }
 
             /* ready to consume more filters */
@@ -8275,24 +8344,13 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
     if (WARN_ON_ONCE(event->parent))
         return -EINVAL;
 
-    /*
-     * For now, we only support filtering in per-task events; doing so
-     * for CPU-wide events requires additional context switching trickery,
-     * since same object code will be mapped at different virtual
-     * addresses in different processes.
-     */
-    if (!event->ctx->task)
-        return -EOPNOTSUPP;
-
     ret = perf_event_parse_addr_filter(event, filter_str, &filters);
     if (ret)
-        return ret;
+        goto fail_clear_files;
 
     ret = event->pmu->addr_filters_validate(&filters);
-    if (ret) {
-        free_filters_list(&filters);
-        return ret;
-    }
+    if (ret)
+        goto fail_free_filters;
 
     /* remove existing filters, if any */
     perf_addr_filters_splice(event, &filters);
@@ -8301,6 +8359,14 @@ perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
     perf_event_for_each_child(event, perf_event_addr_filters_apply);
 
     return ret;
+
+fail_free_filters:
+    free_filters_list(&filters);
+
+fail_clear_files:
+    event->addr_filters.nr_file_filters = 0;
+
+    return ret;
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -8652,37 +8718,10 @@ static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
     return NULL;
 }
 
-static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
-{
-    int cpu;
-
-    for_each_possible_cpu(cpu) {
-        struct perf_cpu_context *cpuctx;
-
-        cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
-
-        if (cpuctx->unique_pmu == old_pmu)
-            cpuctx->unique_pmu = pmu;
-    }
-}
-
 static void free_pmu_context(struct pmu *pmu)
 {
-    struct pmu *i;
-
     mutex_lock(&pmus_lock);
-    /*
-     * Like a real lame refcount.
-     */
-    list_for_each_entry(i, &pmus, entry) {
-        if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
-            update_pmu_context(i, pmu);
-            goto out;
-        }
-    }
-
     free_percpu(pmu->pmu_cpu_context);
-out:
     mutex_unlock(&pmus_lock);
 }
@@ -8886,8 +8925,6 @@ skip_type:
         cpuctx->ctx.pmu = pmu;
         __perf_mux_hrtimer_init(cpuctx, cpu);
-
-        cpuctx->unique_pmu = pmu;
     }
 
 got_cpu_context:
@@ -9005,6 +9042,14 @@ static struct pmu *perf_init_event(struct perf_event *event)
 
     idx = srcu_read_lock(&pmus_srcu);
 
+    /* Try parent's PMU first: */
+    if (event->parent && event->parent->pmu) {
+        pmu = event->parent->pmu;
+        ret = perf_try_init_event(pmu, event);
+        if (!ret)
+            goto unlock;
+    }
+
     rcu_read_lock();
     pmu = idr_find(&pmu_idr, event->attr.type);
     rcu_read_unlock();
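The new fail_free_filters/fail_clear_files labels follow the usual goto-unwind error-handling pattern: later failures jump to earlier labels and fall through, so each piece of partially built state is undone exactly once. A minimal stand-alone illustration of the pattern follows; the names and helpers are hypothetical, not the perf code.

/* Self-contained demo of goto-based error unwinding, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct filters { int n; };

static int parse(struct filters *f)    { f->n = 2; return 0; }   /* 0 = ok */
static int validate(struct filters *f) { return f->n > 1 ? -1 : 0; }
static void free_filters(struct filters *f) { f->n = 0; }

static int set_filter(int *nr_file_filters)
{
    struct filters f = { 0 };
    int ret;

    *nr_file_filters = 1;           /* state built up while parsing */

    ret = parse(&f);
    if (ret)
        goto fail_clear_files;      /* nothing else to undo yet */

    ret = validate(&f);
    if (ret)
        goto fail_free_filters;     /* undo the parse, then fall through */

    return 0;

fail_free_filters:
    free_filters(&f);
fail_clear_files:
    *nr_file_filters = 0;
    return ret;
}

int main(void)
{
    int nr_file_filters = 0;
    int ret = set_filter(&nr_file_filters);

    printf("ret=%d nr_file_filters=%d\n", ret, nr_file_filters);
    return ret ? EXIT_FAILURE : EXIT_SUCCESS;
}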
@@ -10265,7 +10310,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
      * in.
      */
     raw_spin_lock_irq(&child_ctx->lock);
-    task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
+    task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
 
     /*
      * Now that the context is inactive, destroy the task <-> ctx relation
@@ -10714,6 +10759,9 @@ static void __init perf_event_init_all_cpus(void)
         INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
         raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+#ifdef CONFIG_CGROUP_PERF
+        INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
+#endif
         INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
     }
 }
diff --git a/kernel/extable.c b/kernel/extable.c
index e3beec4a2339..e1359474baa5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -20,6 +20,7 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
+#include <linux/kprobes.h>
 
 #include <asm/sections.h>
 #include <linux/uaccess.h>
@@ -104,6 +105,8 @@ int __kernel_text_address(unsigned long addr)
         return 1;
     if (is_ftrace_trampoline(addr))
         return 1;
+    if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+        return 1;
     /*
      * There might be init symbols in saved stacktraces.
      * Give those symbols a chance to be printed in
@@ -123,7 +126,11 @@ int kernel_text_address(unsigned long addr)
         return 1;
     if (is_module_text_address(addr))
         return 1;
-    return is_ftrace_trampoline(addr);
+    if (is_ftrace_trampoline(addr))
+        return 1;
+    if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
+        return 1;
+    return 0;
 }
 
 /*
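kernel_text_address() is a chain of increasingly specific checks, and the extable.c change simply appends one more source of executable text: the out-of-line instruction slots used by kprobes. Below is a user-space sketch of the same shape; the address ranges and stubbed predicates are made up for illustration.

/* Illustrative model of layered text-address classification, not kernel code. */
#include <stdio.h>
#include <stdbool.h>

static bool is_core_text(unsigned long a)         { return a >= 0x1000 && a < 0x2000; }
static bool is_module_text(unsigned long a)       { return a >= 0x2000 && a < 0x3000; }
static bool is_ftrace_trampoline(unsigned long a) { return a >= 0x3000 && a < 0x3100; }
static bool is_kprobe_insn_slot(unsigned long a)  { return a >= 0x4000 && a < 0x4100; }

static int text_address(unsigned long addr)
{
    if (is_core_text(addr))
        return 1;
    if (is_module_text(addr))
        return 1;
    if (is_ftrace_trampoline(addr))
        return 1;
    /* New case: out-of-line instruction slots used by kprobes. */
    if (is_kprobe_insn_slot(addr))
        return 1;
    return 0;
}

int main(void)
{
    printf("%d %d\n", text_address(0x4050), text_address(0x5000)); /* prints: 1 0 */
    return 0;
}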
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 43460104f119..ebb4dadca66b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -149,9 +149,11 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
     struct kprobe_insn_page *kip;
     kprobe_opcode_t *slot = NULL;
 
+    /* Since the slot array is not protected by rcu, we need a mutex */
     mutex_lock(&c->mutex);
  retry:
-    list_for_each_entry(kip, &c->pages, list) {
+    rcu_read_lock();
+    list_for_each_entry_rcu(kip, &c->pages, list) {
         if (kip->nused < slots_per_page(c)) {
             int i;
             for (i = 0; i < slots_per_page(c); i++) {
@@ -159,6 +161,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
                     kip->slot_used[i] = SLOT_USED;
                     kip->nused++;
                     slot = kip->insns + (i * c->insn_size);
+                    rcu_read_unlock();
                     goto out;
                 }
             }
@@ -167,6 +170,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
             WARN_ON(1);
         }
     }
+    rcu_read_unlock();
 
     /* If there are any garbage slots, collect it and try again. */
     if (c->nr_garbage && collect_garbage_slots(c) == 0)
@@ -193,7 +197,7 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
     kip->nused = 1;
     kip->ngarbage = 0;
     kip->cache = c;
-    list_add(&kip->list, &c->pages);
+    list_add_rcu(&kip->list, &c->pages);
     slot = kip->insns;
 out:
     mutex_unlock(&c->mutex);
@@ -213,7 +217,8 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
          * next time somebody inserts a probe.
          */
         if (!list_is_singular(&kip->list)) {
-            list_del(&kip->list);
+            list_del_rcu(&kip->list);
+            synchronize_rcu();
             kip->cache->free(kip->insns);
             kfree(kip);
         }
@@ -235,8 +240,7 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
             continue;
         kip->ngarbage = 0;  /* we will collect all garbages */
         for (i = 0; i < slots_per_page(c); i++) {
-            if (kip->slot_used[i] == SLOT_DIRTY &&
-                collect_one_slot(kip, i))
+            if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
                 break;
         }
     }
@@ -248,29 +252,60 @@ void __free_insn_slot(struct kprobe_insn_cache *c,
               kprobe_opcode_t *slot, int dirty)
 {
     struct kprobe_insn_page *kip;
+    long idx;
 
     mutex_lock(&c->mutex);
-    list_for_each_entry(kip, &c->pages, list) {
-        long idx = ((long)slot - (long)kip->insns) /
-                (c->insn_size * sizeof(kprobe_opcode_t));
-        if (idx >= 0 && idx < slots_per_page(c)) {
-            WARN_ON(kip->slot_used[idx] != SLOT_USED);
-            if (dirty) {
-                kip->slot_used[idx] = SLOT_DIRTY;
-                kip->ngarbage++;
-                if (++c->nr_garbage > slots_per_page(c))
-                    collect_garbage_slots(c);
-            } else
-                collect_one_slot(kip, idx);
+    rcu_read_lock();
+    list_for_each_entry_rcu(kip, &c->pages, list) {
+        idx = ((long)slot - (long)kip->insns) /
+            (c->insn_size * sizeof(kprobe_opcode_t));
+        if (idx >= 0 && idx < slots_per_page(c))
             goto out;
-        }
     }
-    /* Could not free this slot. */
+    /* Could not find this slot. */
     WARN_ON(1);
+    kip = NULL;
 out:
+    rcu_read_unlock();
+    /* Mark and sweep: this may sleep */
+    if (kip) {
+        /* Check double free */
+        WARN_ON(kip->slot_used[idx] != SLOT_USED);
+        if (dirty) {
+            kip->slot_used[idx] = SLOT_DIRTY;
+            kip->ngarbage++;
+            if (++c->nr_garbage > slots_per_page(c))
+                collect_garbage_slots(c);
+        } else {
+            collect_one_slot(kip, idx);
+        }
+    }
     mutex_unlock(&c->mutex);
 }
 
+/*
+ * Check given address is on the page of kprobe instruction slots.
+ * This will be used for checking whether the address on a stack
+ * is on a text area or not.
+ */
+bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
+{
+    struct kprobe_insn_page *kip;
+    bool ret = false;
+
+    rcu_read_lock();
+    list_for_each_entry_rcu(kip, &c->pages, list) {
+        if (addr >= (unsigned long)kip->insns &&
+            addr < (unsigned long)kip->insns + PAGE_SIZE) {
+            ret = true;
+            break;
+        }
+    }
+    rcu_read_unlock();
+
+    return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
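The __is_insn_slot_addr() helper added above simply asks whether an address lands inside any recorded slot page; the kernel walks an RCU-protected list so the stack unwinder can call it safely. The following is a simplified user-space model of that range check, using a plain array and no locking; names and constants are illustrative.

/* User-space model of the slot-page range check, not the kernel code. */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096UL

struct insn_page {
    unsigned long insns;    /* base address of the slot page */
};

static bool is_insn_slot_addr(const struct insn_page *pages, int npages,
                              unsigned long addr)
{
    int i;

    for (i = 0; i < npages; i++) {
        if (addr >= pages[i].insns &&
            addr < pages[i].insns + PAGE_SIZE)
            return true;
    }
    return false;
}

int main(void)
{
    struct insn_page pages[] = { { 0x10000 }, { 0x30000 } };

    printf("%d\n", is_insn_slot_addr(pages, 2, 0x10040));  /* inside page: 1 */
    printf("%d\n", is_insn_slot_addr(pages, 2, 0x20000));  /* outside:     0 */
    return 0;
}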