summaryrefslogtreecommitdiff
path: root/kernel/events
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 15:03:58 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2024-09-18 15:03:58 +0200
commit9f0c253ddddca608457a42e509267bed2dee0a50 (patch)
tree6e00e1ed61e997c06169c70c9365f213fe412fc8 /kernel/events
parent941c122da5c8355335dc16011c1c291a32cd1118 (diff)
parent5e645f31139183ac9a282238da18ca6bbc1c6f4a (diff)
Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf events updates from Ingo Molnar: - Implement per-PMU context rescheduling to significantly improve single-PMU performance, and related cleanups/fixes (Peter Zijlstra and Namhyung Kim) - Fix ancient bug resulting in a lot of events being dropped erroneously at higher sampling frequencies (Luo Gengkun) - uprobes enhancements: - Implement RCU-protected hot path optimizations for better performance: "For baseline vs SRCU, peak throughput increased from 3.7 M/s (million uprobe triggerings per second) up to about 8 M/s. For uretprobes it's a bit more modest with bump from 2.4 M/s to 5 M/s. For SRCU vs RCU Tasks Trace, peak throughput for uprobes increases further from 8 M/s to 10.3 M/s (+28%!), and for uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more work to do on uretprobes side. Even single-thread (no contention) performance is slightly better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055 M/s to 2.174 M/s (+5.8%) for uretprobes." (Andrii Nakryiko et al) - Document mmap_lock, don't abuse get_user_pages_remote() (Oleg Nesterov) - Cleanups & fixes to prepare for future work: - Remove uprobe_register_refctr() - Simplify error handling for alloc_uprobe() - Make uprobe_register() return struct uprobe * - Fold __uprobe_unregister() into uprobe_unregister() - Shift put_uprobe() from delete_uprobe() to uprobe_unregister() - BPF: Fix use-after-free in bpf_uprobe_multi_link_attach() (Oleg Nesterov) - New feature & ABI extension: allow events to use PERF_SAMPLE_READ with inheritance, enabling sample based profiling of a group of counters over a hierarchy of processes or threads (Ben Gainey) - Intel uncore & power events updates: - Add Arrow Lake and Lunar Lake support - Add PERF_EV_CAP_READ_SCOPE - Clean up and enhance cpumask and hotplug support (Kan Liang) - Add LNL uncore iMC freerunning support - Use D0:F0 as a default device (Zhenyu Wang) - Intel PT: fix AUX snapshot handling race (Adrian Hunter) - Misc fixes and cleanups (James Clark, Jiri Olsa, 
Oleg Nesterov and Peter Zijlstra) * tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits) dmaengine: idxd: Clean up cpumask and hotplug for perfmon iommu/vt-d: Clean up cpumask and hotplug for perfmon perf/x86/intel/cstate: Clean up cpumask and hotplug perf: Add PERF_EV_CAP_READ_SCOPE perf: Generic hotplug support for a PMU with a scope uprobes: perform lockless SRCU-protected uprobes_tree lookup rbtree: provide rb_find_rcu() / rb_find_add_rcu() perf/uprobe: split uprobe_unregister() uprobes: travers uprobe's consumer list locklessly under SRCU protection uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks uprobes: protected uprobe lifetime with SRCU uprobes: revamp uprobe refcounting and lifetime management bpf: Fix use-after-free in bpf_uprobe_multi_link_attach() perf/core: Fix small negative period being ignored perf: Really fix event_function_call() locking perf: Optimize __pmu_ctx_sched_out() perf: Add context time freeze perf: Fix event_function_call() locking perf: Extract a few helpers perf: Optimize context reschedule for single PMU cases ...
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/core.c584
-rw-r--r--kernel/events/uprobes.c499
2 files changed, 701 insertions, 382 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b21c8f24a987..4f03eb908e7f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
+enum event_type_t {
+ EVENT_FLEXIBLE = 0x01,
+ EVENT_PINNED = 0x02,
+ EVENT_TIME = 0x04,
+ EVENT_FROZEN = 0x08,
+ /* see ctx_resched() for details */
+ EVENT_CPU = 0x10,
+ EVENT_CGROUP = 0x20,
+
+ /* compound helpers */
+ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+ EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
+};
+
+static inline void __perf_ctx_lock(struct perf_event_context *ctx)
+{
+ raw_spin_lock(&ctx->lock);
+ WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
+}
+
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- raw_spin_lock(&cpuctx->ctx.lock);
+ __perf_ctx_lock(&cpuctx->ctx);
if (ctx)
- raw_spin_lock(&ctx->lock);
+ __perf_ctx_lock(ctx);
+}
+
+static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
+{
+ /*
+ * If ctx_sched_in() didn't again set any ALL flags, clean up
+ * after ctx_sched_out() by clearing is_active.
+ */
+ if (ctx->is_active & EVENT_FROZEN) {
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+ else
+ ctx->is_active &= ~EVENT_FROZEN;
+ }
+ raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
+ __perf_ctx_unlock(ctx);
+ __perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
@@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
@@ -291,22 +327,25 @@ again:
if (!task_function_call(task, event_function, &efs))
return;
- raw_spin_lock_irq(&ctx->lock);
+ local_irq_disable();
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
+ perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
- if (task == TASK_TOMBSTONE) {
- raw_spin_unlock_irq(&ctx->lock);
- return;
- }
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
if (ctx->is_active) {
- raw_spin_unlock_irq(&ctx->lock);
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
- raw_spin_unlock_irq(&ctx->lock);
+unlock:
+ perf_ctx_unlock(cpuctx, ctx);
+ local_irq_enable();
}
/*
@@ -369,16 +408,6 @@ unlock:
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
-enum event_type_t {
- EVENT_FLEXIBLE = 0x1,
- EVENT_PINNED = 0x2,
- EVENT_TIME = 0x4,
- /* see ctx_resched() for details */
- EVENT_CPU = 0x8,
- EVENT_CGROUP = 0x10,
- EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
/*
* perf_sched_events : >0 events exist
*/
@@ -407,6 +436,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
+static cpumask_var_t perf_online_core_mask;
+static cpumask_var_t perf_online_die_mask;
+static cpumask_var_t perf_online_cluster_mask;
+static cpumask_var_t perf_online_pkg_mask;
+static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
@@ -685,30 +719,32 @@ do { \
___p; \
})
+#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
+ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
+ if (_cgroup && !_epc->nr_cgroups) \
+ continue; \
+ else if (_pmu && _epc->pmu != _pmu) \
+ continue; \
+ else
+
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
- }
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
- }
}
-static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
-static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
@@ -865,7 +901,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
- ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
@@ -877,7 +913,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
- ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
+ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -1769,6 +1805,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
typeof(*event), group_node))
/*
+ * Does the event attribute request inherit with PERF_SAMPLE_READ
+ */
+static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
+{
+ return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
+}
+
+/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
*/
@@ -1798,6 +1842,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
@@ -2022,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
+ if (has_inherit_and_sample_read(&event->attr))
+ local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
@@ -2317,6 +2365,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
event_sched_out(event, ctx);
}
+static inline void
+__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx, final);
+ }
+}
+
+static inline void
+ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ __ctx_time_update(cpuctx, ctx, false);
+}
+
+/*
+ * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
+ */
+static inline void
+ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
+{
+ ctx_time_update(cpuctx, ctx);
+ if (ctx->is_active & EVENT_TIME)
+ ctx->is_active |= EVENT_FROZEN;
+}
+
+static inline void
+ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
+{
+ if (ctx->is_active & EVENT_TIME) {
+ if (ctx->is_active & EVENT_FROZEN)
+ return;
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+}
+
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
@@ -2336,10 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, false);
- }
+ ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
@@ -2424,12 +2508,8 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
-
perf_pmu_disable(event->pmu_ctx->pmu);
+ ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
@@ -2645,7 +2725,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
- enum event_type_t event_type)
+ struct pmu *pmu,
+ enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
@@ -2655,18 +2736,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, event_type);
+ ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+ struct perf_event_context *ctx,
+ struct pmu *pmu)
{
- ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, EVENT_PINNED);
- ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
@@ -2684,16 +2766,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
-/*
- * XXX: ctx_resched() reschedule entire perf_event_context while adding new
- * event to the context or enabling existing event in the context. We can
- * probably optimize it by rescheduling only affected pmu_ctx.
- */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
- enum event_type_t event_type)
+ struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
+ struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
@@ -2704,10 +2782,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
- perf_ctx_disable(&cpuctx->ctx, false);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
if (task_ctx) {
- perf_ctx_disable(task_ctx, false);
- task_ctx_sched_out(task_ctx, event_type);
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_disable(epc->pmu);
+
+ task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
@@ -2718,15 +2800,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- ctx_sched_out(&cpuctx->ctx, event_type);
+ ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
+
+ perf_event_sched_in(cpuctx, task_ctx, pmu);
- perf_event_sched_in(cpuctx, task_ctx);
+ for_each_epc(epc, &cpuctx->ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
- perf_ctx_enable(&cpuctx->ctx, false);
- if (task_ctx)
- perf_ctx_enable(task_ctx, false);
+ if (task_ctx) {
+ for_each_epc(epc, task_ctx, pmu, false)
+ perf_pmu_enable(epc->pmu);
+ }
}
void perf_pmu_resched(struct pmu *pmu)
@@ -2735,7 +2821,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
- ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
+ ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
@@ -2791,9 +2877,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
+ get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
@@ -2936,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
- if (ctx->is_active)
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2945,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
- if (!event_filter_match(event)) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (!event_filter_match(event))
return;
- }
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, EVENT_TIME);
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
- }
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
- ctx_resched(cpuctx, task_ctx, get_event_type(event));
+ ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
@@ -3231,7 +3313,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
- if (ctx->task && !ctx->is_active) {
+ if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
@@ -3239,7 +3321,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
cpc->task_epc = NULL;
}
- if (!event_type)
+ if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
@@ -3265,8 +3347,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
+/*
+ * Be very careful with the @pmu argument since this will change ctx state.
+ * The @pmu argument works for ctx_resched(), because that is symmetric in
+ * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
+ *
+ * However, if you were to be asymmetrical, you could end up with messed up
+ * state, eg. ctx->is_active cleared even though most EPCs would still actually
+ * be active.
+ */
static void
-ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
@@ -3297,34 +3388,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
*
* would only update time for the pinned events.
*/
- if (is_active & EVENT_TIME) {
- /* update (and stop) ctx time */
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
+
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ ctx->is_active &= ~event_type;
+
+ if (!(ctx->is_active & EVENT_ALL)) {
/*
- * CPU-release for the below ->is_active store,
- * see __load_acquire() in perf_event_time_now()
+ * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
+ * does not observe a hole. perf_ctx_unlock() will clean up.
*/
- barrier();
+ if (ctx->is_active & EVENT_FROZEN)
+ ctx->is_active &= EVENT_TIME_FROZEN;
+ else
+ ctx->is_active = 0;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
+ if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
- }
}
/*
@@ -3517,12 +3610,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_disable(ctx, false);
- /* PMIs are disabled; ctx->nr_pending is stable. */
- if (local_read(&ctx->nr_pending) ||
- local_read(&next_ctx->nr_pending)) {
+ /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
+ if (local_read(&ctx->nr_no_switch_fast) ||
+ local_read(&next_ctx->nr_no_switch_fast)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
+ *
+ * Likewise, when a context contains inherit +
+ * SAMPLE_READ events they should be switched
+ * out using the slow path so that they are
+ * treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
@@ -3563,7 +3661,7 @@ unlock:
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
- task_ctx_sched_out(ctx, EVENT_ALL);
+ task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
@@ -3861,29 +3959,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
-static void ctx_groups_sched_in(struct perf_event_context *ctx,
- struct perf_event_groups *groups,
- bool cgroup)
+static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
- struct perf_event_pmu_context *pmu_ctx;
-
- list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- if (cgroup && !pmu_ctx->nr_cgroups)
- continue;
- pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
- }
-}
+ struct perf_event_context *ctx = pmu_ctx->ctx;
-static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
- struct pmu *pmu)
-{
- pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
+ if (event_type & EVENT_PINNED)
+ pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
+ if (event_type & EVENT_FLEXIBLE)
+ pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
@@ -3907,7 +3998,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
- if (!is_active)
+ if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
@@ -3919,12 +4010,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (is_active & EVENT_PINNED)
- ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
+ if (is_active & EVENT_PINNED) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
+ }
/* Then walk through the lower prio flexible groups */
- if (is_active & EVENT_FLEXIBLE)
- ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
+ if (is_active & EVENT_FLEXIBLE) {
+ for_each_epc(pmu_ctx, ctx, pmu, cgroup)
+ __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
+ }
}
static void perf_event_context_sched_in(struct task_struct *task)
@@ -3967,10 +4062,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
- ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
- perf_event_sched_in(cpuctx, ctx);
+ perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
@@ -4093,7 +4188,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
- delta = (delta + 7) / 8; /* low pass filter */
+ if (delta >= 0)
+ delta += 7;
+ else
+ delta -= 7;
+ delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
@@ -4311,14 +4410,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
- __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
- __pmu_ctx_sched_in(task_epc->ctx, pmu);
+ __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4384,7 +4483,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
@@ -4396,9 +4495,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
- ctx_resched(cpuctx, ctx, event_type);
- } else {
- ctx_sched_in(ctx, EVENT_TIME);
+ ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4459,16 +4556,24 @@ struct perf_read_data {
int ret;
};
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
+
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
+ int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
- if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
- int local_cpu = smp_processor_id();
+ if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
+ if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
+ return local_cpu;
+ }
+
+ if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
@@ -4501,10 +4606,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
@@ -4539,8 +4641,11 @@ unlock:
raw_spin_unlock(&ctx->lock);
}
-static inline u64 perf_event_count(struct perf_event *event)
+static inline u64 perf_event_count(struct perf_event *event, bool self)
{
+ if (self)
+ return local64_read(&event->count);
+
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
@@ -4701,10 +4806,7 @@ again:
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
- if (ctx->is_active & EVENT_TIME) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- }
+ ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
@@ -5205,7 +5307,7 @@ static void perf_pending_task_sync(struct perf_event *event)
*/
if (task_work_cancel(current, head)) {
event->pending_work = 0;
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
return;
}
@@ -5499,7 +5601,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
- total += perf_event_count(event);
+ total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -5508,7 +5610,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
- total += perf_event_count(child);
+ total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
@@ -5590,14 +5692,14 @@ static int __perf_read_group_add(struct perf_event *leader,
/*
* Write {count,id} tuples for every sibling.
*/
- values[n++] += perf_event_count(leader);
+ values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
- values[n++] += perf_event_count(sub);
+ values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -6177,7 +6279,7 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
- userpg->offset = perf_event_count(event);
+ userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
@@ -6874,7 +6976,7 @@ static void perf_pending_task(struct callback_head *head)
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
- local_dec(&event->ctx->nr_pending);
+ local_dec(&event->ctx->nr_no_switch_fast);
rcuwait_wake_up(&event->pending_work_wait);
}
rcu_read_unlock();
@@ -7256,7 +7358,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 values[5];
int n = 0;
- values[n++] = perf_event_count(event);
+ values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
@@ -7274,14 +7376,15 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event,
- u64 enabled, u64 running)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
+ bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
@@ -7301,7 +7404,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
- values[n++] = perf_event_count(leader);
+ values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
@@ -7316,7 +7419,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
- values[n++] = perf_event_count(sub);
+ values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
@@ -7337,6 +7440,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
+ *
+ * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
+ * counts rather than attempting to accumulate some value across all children on
+ * all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
@@ -9747,7 +9854,7 @@ static int __perf_event_overflow(struct perf_event *event,
if (!event->pending_work &&
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
- local_inc(&event->ctx->nr_pending);
+ local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
@@ -11484,10 +11591,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
+static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return topology_sibling_cpumask(cpu);
+ case PERF_PMU_SCOPE_DIE:
+ return topology_die_cpumask(cpu);
+ case PERF_PMU_SCOPE_CLUSTER:
+ return topology_cluster_cpumask(cpu);
+ case PERF_PMU_SCOPE_PKG:
+ return topology_core_cpumask(cpu);
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return cpu_online_mask;
+ }
+
+ return NULL;
+}
+
+static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
+{
+ switch (scope) {
+ case PERF_PMU_SCOPE_CORE:
+ return perf_online_core_mask;
+ case PERF_PMU_SCOPE_DIE:
+ return perf_online_die_mask;
+ case PERF_PMU_SCOPE_CLUSTER:
+ return perf_online_cluster_mask;
+ case PERF_PMU_SCOPE_PKG:
+ return perf_online_pkg_mask;
+ case PERF_PMU_SCOPE_SYS_WIDE:
+ return perf_online_sys_mask;
+ }
+
+ return NULL;
+}
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cpumask *mask = perf_scope_cpumask(pmu->scope);
+
+ if (mask)
+ return cpumap_print_to_pagebuf(true, buf, mask);
+ return 0;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
+ &dev_attr_cpumask.attr,
NULL,
};
@@ -11499,6 +11656,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
if (n == 2 && !pmu->nr_addr_filters)
return 0;
+ /* cpumask */
+ if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
+ return 0;
+
return a->mode;
}
@@ -11583,6 +11744,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc;
}
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
+ ret = -EINVAL;
+ goto free_pdc;
+ }
+
pmu->name = name;
if (type >= 0)
@@ -11737,6 +11903,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
event_has_any_exclude_flag(event))
ret = -EINVAL;
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
+ int cpu;
+
+ if (pmu_cpumask && cpumask) {
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ ret = -ENODEV;
+ else
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
+ } else {
+ ret = -ENODEV;
+ }
+ }
+
if (ret && event->destroy)
event->destroy(event);
}
@@ -12064,10 +12246,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * We do not support PERF_SAMPLE_READ on inherited events unless
+ * PERF_SAMPLE_TID is also selected, which allows inherited events to
+ * collect per-thread samples.
* See perf_output_read().
*/
- if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
+ if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
@@ -13091,7 +13275,7 @@ static void sync_child_event(struct perf_event *child_event)
perf_event_read_event(child_event, task);
}
- child_val = perf_event_count(child_event);
+ child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
@@ -13182,7 +13366,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
@@ -13697,6 +13881,12 @@ static void __init perf_event_init_all_cpus(void)
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
+ zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
+
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
@@ -13740,12 +13930,46 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, EVENT_TIME);
+ ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
+static void perf_event_clear_cpumask(unsigned int cpu)
+{
+ int target[PERF_PMU_MAX_SCOPE];
+ unsigned int scope;
+ struct pmu *pmu;
+
+ cpumask_clear_cpu(cpu, perf_online_mask);
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+ struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
+
+ target[scope] = -1;
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
+ continue;
+ target[scope] = cpumask_any_but(cpumask, cpu);
+ if (target[scope] < nr_cpu_ids)
+ cpumask_set_cpu(target[scope], pmu_cpumask);
+ }
+
+ /* migrate */
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
+ if (pmu->scope == PERF_PMU_SCOPE_NONE ||
+ WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
+ continue;
+
+ if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
+ perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13753,6 +13977,11 @@ static void perf_event_exit_cpu_context(int cpu)
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
+ /*
+ * Clear the cpumasks, and migrate to other CPUs if possible.
+ * Must be invoked before the __perf_event_exit_context.
+ */
+ perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
@@ -13760,7 +13989,6 @@ static void perf_event_exit_cpu_context(int cpu)
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
- cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
@@ -13769,6 +13997,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
+static void perf_event_setup_cpumask(unsigned int cpu)
+{
+ struct cpumask *pmu_cpumask;
+ unsigned int scope;
+
+ cpumask_set_cpu(cpu, perf_online_mask);
+
+ /*
+ * Early boot stage, the cpumask hasn't been set yet.
+ * The perf_online_<domain>_masks includes the first CPU of each domain.
+ * Always unconditionally set the boot CPU for the perf_online_<domain>_masks.
+ */
+ if (!topology_sibling_cpumask(cpu)) {
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ pmu_cpumask = perf_scope_cpumask(scope);
+ if (WARN_ON_ONCE(!pmu_cpumask))
+ continue;
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+ return;
+ }
+
+ for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
+ const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
+
+ pmu_cpumask = perf_scope_cpumask(scope);
+
+ if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
+ continue;
+
+ if (!cpumask_empty(cpumask) &&
+ cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
+ cpumask_set_cpu(cpu, pmu_cpumask);
+ }
+}
+
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
@@ -13777,7 +14041,7 @@ int perf_event_init_cpu(unsigned int cpu)
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
- cpumask_set_cpu(cpu, perf_online_mask);
+ perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 50d7949be2b1..4b7e590dc428 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -40,6 +40,9 @@ static struct rb_root uprobes_tree = RB_ROOT;
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
+static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
+
+DEFINE_STATIC_SRCU(uprobes_srcu);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
@@ -57,8 +60,9 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
- struct uprobe_consumer *consumers;
+ struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
+ struct rcu_head rcu;
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
@@ -109,6 +113,11 @@ struct xol_area {
unsigned long vaddr; /* Page(s) of instruction slots */
};
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+ pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
+}
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -453,7 +462,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_lock held for write.
+ * Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -587,25 +596,63 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
*(uprobe_opcode_t *)&auprobe->insn);
}
+/* uprobe should have guaranteed positive refcount */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
+/*
+ * uprobe should have guaranteed lifetime, which can be either of:
+ * - caller already has refcount taken (and wants an extra one);
+ * - uprobe is RCU protected and won't be freed until after grace period;
+ * - we are holding uprobes_treelock (for read or write, doesn't matter).
+ */
+static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
+{
+ if (refcount_inc_not_zero(&uprobe->ref))
+ return uprobe;
+ return NULL;
+}
+
+static inline bool uprobe_is_active(struct uprobe *uprobe)
+{
+ return !RB_EMPTY_NODE(&uprobe->rb_node);
+}
+
+static void uprobe_free_rcu(struct rcu_head *rcu)
+{
+ struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
+
+ kfree(uprobe);
+}
+
static void put_uprobe(struct uprobe *uprobe)
{
- if (refcount_dec_and_test(&uprobe->ref)) {
- /*
- * If application munmap(exec_vma) before uprobe_unregister()
- * gets called, we don't get a chance to remove uprobe from
- * delayed_uprobe_list from remove_breakpoint(). Do it here.
- */
- mutex_lock(&delayed_uprobe_lock);
- delayed_uprobe_remove(uprobe, NULL);
- mutex_unlock(&delayed_uprobe_lock);
- kfree(uprobe);
+ if (!refcount_dec_and_test(&uprobe->ref))
+ return;
+
+ write_lock(&uprobes_treelock);
+
+ if (uprobe_is_active(uprobe)) {
+ write_seqcount_begin(&uprobes_seqcount);
+ rb_erase(&uprobe->rb_node, &uprobes_tree);
+ write_seqcount_end(&uprobes_seqcount);
}
+
+ write_unlock(&uprobes_treelock);
+
+ /*
+ * If application munmap(exec_vma) before uprobe_unregister()
+ * gets called, we don't get a chance to remove uprobe from
+ * delayed_uprobe_list from remove_breakpoint(). Do it here.
+ */
+ mutex_lock(&delayed_uprobe_lock);
+ delayed_uprobe_remove(uprobe, NULL);
+ mutex_unlock(&delayed_uprobe_lock);
+
+ call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
@@ -647,62 +694,86 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
-static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
+/*
+ * Assumes being inside RCU protected region.
+ * No refcount is taken on returned uprobe.
+ */
+static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
- struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
+ struct rb_node *node;
+ unsigned int seq;
- if (node)
- return get_uprobe(__node_2_uprobe(node));
+ lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
+
+ do {
+ seq = read_seqcount_begin(&uprobes_seqcount);
+ node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
+ /*
+ * Lockless RB-tree lookups can result only in false negatives.
+ * If the element is found, it is correct and can be returned
+ * under RCU protection. If we find nothing, we need to
+ * validate that seqcount didn't change. If it did, we have to
+ * try again as we might have missed the element (false
+ * negative). If seqcount is unchanged, search truly failed.
+ */
+ if (node)
+ return __node_2_uprobe(node);
+ } while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL;
}
/*
- * Find a uprobe corresponding to a given inode:offset
- * Acquires uprobes_treelock
+ * Attempt to insert a new uprobe into uprobes_tree.
+ *
+ * If uprobe already exists (for given inode+offset), we just increment
+ * refcount of previously existing uprobe.
+ *
+ * If not, a provided new instance of uprobe is inserted into the tree (with
+ * assumed initial refcount == 1).
+ *
+ * In any case, we return a uprobe instance that ends up being in uprobes_tree.
+ * Caller has to clean up new uprobe instance, if it ended up not being
+ * inserted into the tree.
+ *
+ * We assume that uprobes_treelock is held for writing.
*/
-static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
-{
- struct uprobe *uprobe;
-
- read_lock(&uprobes_treelock);
- uprobe = __find_uprobe(inode, offset);
- read_unlock(&uprobes_treelock);
-
- return uprobe;
-}
-
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
+again:
+ node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
+ if (node) {
+ struct uprobe *u = __node_2_uprobe(node);
+
+ if (!try_get_uprobe(u)) {
+ rb_erase(node, &uprobes_tree);
+ RB_CLEAR_NODE(&u->rb_node);
+ goto again;
+ }
- node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
- if (node)
- return get_uprobe(__node_2_uprobe(node));
+ return u;
+ }
- /* get access + creation ref */
- refcount_set(&uprobe->ref, 2);
- return NULL;
+ return uprobe;
}
/*
- * Acquire uprobes_treelock.
- * Matching uprobe already exists in rbtree;
- * increment (access refcount) and return the matching uprobe.
- *
- * No matching uprobe; insert the uprobe in rb_tree;
- * get a double refcount (access + creation) and return NULL.
+ * Acquire uprobes_treelock and insert uprobe into uprobes_tree
+ * (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
write_lock(&uprobes_treelock);
+ write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
+ write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
return u;
@@ -725,18 +796,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
- return NULL;
+ return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
+ INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
+ RB_CLEAR_NODE(&uprobe->rb_node);
+ refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
- if (cur_uprobe) {
+ if (cur_uprobe != uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
@@ -753,32 +827,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
- uc->next = uprobe->consumers;
- uprobe->consumers = uc;
+ list_add_rcu(&uc->cons_node, &uprobe->consumers);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
- * Return true if the @uc is deleted successfully
- * or return false.
+ * Should never be called with consumer that's not part of @uprobe->consumers.
*/
-static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
+static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
- struct uprobe_consumer **con;
- bool ret = false;
-
down_write(&uprobe->consumer_rwsem);
- for (con = &uprobe->consumers; *con; con = &(*con)->next) {
- if (*con == uc) {
- *con = uc->next;
- ret = true;
- break;
- }
- }
+ list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
-
- return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
@@ -863,21 +924,20 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
-static inline bool consumer_filter(struct uprobe_consumer *uc,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
- return !uc->filter || uc->filter(uc, ctx, mm);
+ return !uc->filter || uc->filter(uc, mm);
}
-static bool filter_chain(struct uprobe *uprobe,
- enum uprobe_filter_ctx ctx, struct mm_struct *mm)
+static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
- ret = consumer_filter(uc, ctx, mm);
+ list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+ srcu_read_lock_held(&uprobes_srcu)) {
+ ret = consumer_filter(uc, mm);
if (ret)
break;
}
@@ -921,27 +981,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
-static inline bool uprobe_is_active(struct uprobe *uprobe)
-{
- return !RB_EMPTY_NODE(&uprobe->rb_node);
-}
-/*
- * There could be threads that have already hit the breakpoint. They
- * will recheck the current insn and restart if find_uprobe() fails.
- * See find_active_uprobe().
- */
-static void delete_uprobe(struct uprobe *uprobe)
-{
- if (WARN_ON(!uprobe_is_active(uprobe)))
- return;
-
- write_lock(&uprobes_treelock);
- rb_erase(&uprobe->rb_node, &uprobes_tree);
- write_unlock(&uprobes_treelock);
- RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
- put_uprobe(uprobe);
-}
-
struct map_info {
struct map_info *next;
struct mm_struct *mm;
@@ -1046,7 +1085,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
-
+ /*
+ * We take mmap_lock for writing to avoid the race with
+ * find_active_uprobe_rcu() which takes mmap_lock for reading.
+ * Thus this install_breakpoint() can not make
+ * is_trap_at_addr() true right after find_uprobe_rcu()
+ * returns NULL in find_active_uprobe_rcu().
+ */
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
@@ -1059,12 +1104,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
- if (consumer_filter(new,
- UPROBE_FILTER_REGISTER, mm))
+ if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
- if (!filter_chain(uprobe,
- UPROBE_FILTER_UNREGISTER, mm))
+ if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
@@ -1079,152 +1122,140 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
-static void
-__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
+/**
+ * uprobe_unregister_nosync - unregister an already registered probe.
+ * @uprobe: uprobe to remove
+ * @uc: identify which probe if multiple probes are colocated.
+ */
+void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
- if (WARN_ON(!consumer_del(uprobe, uc)))
- return;
-
+ down_write(&uprobe->register_rwsem);
+ consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
- /* TODO : cant unregister? schedule a worker thread */
- if (!uprobe->consumers && !err)
- delete_uprobe(uprobe);
-}
-
-/*
- * uprobe_unregister - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
- * @uc: identify which probe if multiple probes are colocated.
- */
-void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
-{
- struct uprobe *uprobe;
+ up_write(&uprobe->register_rwsem);
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
+ /* TODO : cant unregister? schedule a worker thread */
+ if (unlikely(err)) {
+ uprobe_warn(current, "unregister, leaking uprobe");
return;
+ }
- down_write(&uprobe->register_rwsem);
- __uprobe_unregister(uprobe, uc);
- up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
-EXPORT_SYMBOL_GPL(uprobe_unregister);
+EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
-/*
- * __uprobe_register - register a probe
+void uprobe_unregister_sync(void)
+{
+ /*
+ * Now that handler_chain() and handle_uretprobe_chain() iterate over
+ * uprobe->consumers list under RCU protection without holding
+ * uprobe->register_rwsem, we need to wait for RCU grace period to
+ * make sure that we can't call into just unregistered
+ * uprobe_consumer's callbacks anymore. If we don't do that, fast and
+ * unlucky enough caller can free consumer's memory and cause
+ * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
+ */
+ synchronize_srcu(&uprobes_srcu);
+}
+EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
+
+/**
+ * uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
+ * @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
- * Apart from the access refcount, __uprobe_register() takes a creation
+ * Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
- * unregisters. Caller of __uprobe_register() is required to keep @inode
+ * unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
- * Return errno if it cannot successully install probes
- * else return 0 (success)
+ * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
-static int __uprobe_register(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
+struct uprobe *uprobe_register(struct inode *inode,
+ loff_t offset, loff_t ref_ctr_offset,
+ struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->read_folio &&
!shmem_mapping(inode->i_mapping))
- return -EIO;
+ return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
- retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
- if (!uprobe)
- return -ENOMEM;
if (IS_ERR(uprobe))
- return PTR_ERR(uprobe);
+ return uprobe;
- /*
- * We can race with uprobe_unregister()->delete_uprobe().
- * Check uprobe_is_active() and retry if it is false.
- */
down_write(&uprobe->register_rwsem);
- ret = -EAGAIN;
- if (likely(uprobe_is_active(uprobe))) {
- consumer_add(uprobe, uc);
- ret = register_for_each_vma(uprobe, uc);
- if (ret)
- __uprobe_unregister(uprobe, uc);
- }
+ consumer_add(uprobe, uc);
+ ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
- if (unlikely(ret == -EAGAIN))
- goto retry;
- return ret;
-}
+ if (ret) {
+ uprobe_unregister_nosync(uprobe, uc);
+ /*
+ * Registration might have partially succeeded, so we can have
+ * this consumer being called right at this time. We need to
+ * sync here. It's ok, it's unlikely slow path.
+ */
+ uprobe_unregister_sync();
+ return ERR_PTR(ret);
+ }
-int uprobe_register(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, 0, uc);
+ return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
-int uprobe_register_refctr(struct inode *inode, loff_t offset,
- loff_t ref_ctr_offset, struct uprobe_consumer *uc)
-{
- return __uprobe_register(inode, offset, ref_ctr_offset, uc);
-}
-EXPORT_SYMBOL_GPL(uprobe_register_refctr);
-
-/*
- * uprobe_apply - unregister an already registered probe.
- * @inode: the file in which the probe has to be removed.
- * @offset: offset from the start of the file.
+/**
+ * uprobe_apply - add or remove the breakpoints according to @uc->filter
+ * @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
+ * Return: 0 on success or negative error code.
*/
-int uprobe_apply(struct inode *inode, loff_t offset,
- struct uprobe_consumer *uc, bool add)
+int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
- struct uprobe *uprobe;
struct uprobe_consumer *con;
- int ret = -ENOENT;
-
- uprobe = find_uprobe(inode, offset);
- if (WARN_ON(!uprobe))
- return ret;
+ int ret = -ENOENT, srcu_idx;
down_write(&uprobe->register_rwsem);
- for (con = uprobe->consumers; con && con != uc ; con = con->next)
- ;
- if (con)
- ret = register_for_each_vma(uprobe, add ? uc : NULL);
+
+ srcu_idx = srcu_read_lock(&uprobes_srcu);
+ list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
+ srcu_read_lock_held(&uprobes_srcu)) {
+ if (con == uc) {
+ ret = register_for_each_vma(uprobe, add ? uc : NULL);
+ break;
+ }
+ }
+ srcu_read_unlock(&uprobes_srcu, srcu_idx);
+
up_write(&uprobe->register_rwsem);
- put_uprobe(uprobe);
return ret;
}
@@ -1305,15 +1336,17 @@ static void build_probe_list(struct inode *inode,
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
- list_add(&u->pending_list, head);
- get_uprobe(u);
+ /* if uprobe went away, it's safe to ignore it */
+ if (try_get_uprobe(u))
+ list_add(&u->pending_list, head);
}
}
read_unlock(&uprobes_treelock);
@@ -1384,7 +1417,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
- filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
+ filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@@ -1770,6 +1803,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
+ /*
+ * uprobe's refcnt has to be positive at this point, kept by
+ * utask->return_instances items; return_instances can't be
+ * removed right now, as task is blocked due to duping; so
+ * get_uprobe() is safe to use here.
+ */
get_uprobe(n->uprobe);
n->next = NULL;
@@ -1781,12 +1820,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return 0;
}
-static void uprobe_warn(struct task_struct *t, const char *msg)
-{
- pr_warn("uprobe: %s:%d failed to %s\n",
- current->comm, current->pid, msg);
-}
-
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
@@ -1883,9 +1916,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
+ /* we need to bump refcount to store uprobe in utask */
+ if (!try_get_uprobe(uprobe))
+ return;
+
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
- return;
+ goto fail;
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
@@ -1912,8 +1949,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
-
- ri->uprobe = get_uprobe(uprobe);
+ ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
@@ -1924,8 +1960,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
utask->return_instances = ri;
return;
- fail:
+fail:
kfree(ri);
+ put_uprobe(uprobe);
}
/* Prepare to single-step probed instruction out of line. */
@@ -1940,9 +1977,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
if (!utask)
return -ENOMEM;
+ if (!try_get_uprobe(uprobe))
+ return -EINVAL;
+
xol_vaddr = xol_get_insn_slot(uprobe);
- if (!xol_vaddr)
- return -ENOMEM;
+ if (!xol_vaddr) {
+ err = -ENOMEM;
+ goto err_out;
+ }
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
@@ -1950,12 +1992,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
- return err;
+ goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
+err_out:
+ put_uprobe(uprobe);
+ return err;
}
/*
@@ -2028,13 +2073,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
- /*
- * The NULL 'tsk' here ensures that any faults that occur here
- * will not be accounted to the task. 'mm' *is* current->mm,
- * but we treat this as a 'remote' access since it is
- * essentially a kernel access to the memory.
- */
- result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
+ result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
@@ -2045,7 +2084,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
-static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
+/* assumes being inside RCU protected region */
+static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
@@ -2058,7 +2098,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
- uprobe = find_uprobe(inode, offset);
+ uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
@@ -2079,9 +2119,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
+ bool has_consumers = false;
+
+ current->utask->auprobe = &uprobe->arch;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+ srcu_read_lock_held(&uprobes_srcu)) {
int rc = 0;
if (uc->handler) {
@@ -2094,16 +2137,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
need_prep = true;
remove &= rc;
+ has_consumers = true;
}
+ current->utask->auprobe = NULL;
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
- if (remove && uprobe->consumers) {
- WARN_ON(!uprobe_is_active(uprobe));
- unapply_uprobe(uprobe, current->mm);
+ if (remove && has_consumers) {
+ down_read(&uprobe->register_rwsem);
+
+ /* re-check that removal is still required, this time under lock */
+ if (!filter_chain(uprobe, current->mm)) {
+ WARN_ON(!uprobe_is_active(uprobe));
+ unapply_uprobe(uprobe, current->mm);
+ }
+
+ up_read(&uprobe->register_rwsem);
}
- up_read(&uprobe->register_rwsem);
}
static void
@@ -2111,13 +2162,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
+ int srcu_idx;
- down_read(&uprobe->register_rwsem);
- for (uc = uprobe->consumers; uc; uc = uc->next) {
+ srcu_idx = srcu_read_lock(&uprobes_srcu);
+ list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
+ srcu_read_lock_held(&uprobes_srcu)) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
- up_read(&uprobe->register_rwsem);
+ srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
@@ -2202,13 +2255,15 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
- int is_swbp;
+ int is_swbp, srcu_idx;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ srcu_idx = srcu_read_lock(&uprobes_srcu);
+
+ uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -2224,7 +2279,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
- return;
+ goto out;
}
/* change it in advance for ->handler() and restart */
@@ -2259,12 +2314,12 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
- if (!pre_ssout(uprobe, regs, bp_vaddr))
- return;
+ if (pre_ssout(uprobe, regs, bp_vaddr))
+ goto out;
- /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
- put_uprobe(uprobe);
+ /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
+ srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
/*