diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-19 13:34:06 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-11-19 13:34:06 -0800 |
commit | f41dac3efb7582cd3f518fadf7764d424f453788 (patch) | |
tree | da2aee8b82625525adf671911ed862af43a1b60d /kernel/events | |
parent | 9d7d4ad222aea8ab482e78858d03b10221c7fe78 (diff) | |
parent | 2c47e7a74f445426d156278e339b7abb259e50de (diff) |
Merge tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull performance events updates from Ingo Molnar:
"Uprobes:
- Add BPF session support (Jiri Olsa)
- Switch to RCU Tasks Trace flavor for better performance (Andrii
Nakryiko)
- Massively increase uretprobe SMP scalability by SRCU-protecting
the uretprobe lifetime (Andrii Nakryiko)
- Kill xol_area->slot_count (Oleg Nesterov)
Core facilities:
- Implement targeted high-frequency profiling by adding the ability
for an event to "pause" or "resume" AUX area tracing (Adrian
Hunter)
VM profiling/sampling:
- Correct perf sampling with guest VMs (Colton Lewis)
New hardware support:
- x86/intel: Add PMU support for Intel ArrowLake-H CPUs (Dapeng Mi)
Misc fixes and enhancements:
- x86/intel/pt: Fix buffer full but size is 0 case (Adrian Hunter)
- x86/amd: Warn only on new bits set (Breno Leitao)
- x86/amd/uncore: Avoid a false positive warning about snprintf
truncation in amd_uncore_umc_ctx_init (Jean Delvare)
- uprobes: Re-order struct uprobe_task to save some space
(Christophe JAILLET)
- x86/rapl: Move the pmu allocation out of CPU hotplug (Kan Liang)
- x86/rapl: Clean up cpumask and hotplug (Kan Liang)
- uprobes: Deuglify xol_get_insn_slot/xol_free_insn_slot paths (Oleg
Nesterov)"
* tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
perf/core: Correct perf sampling with guest VMs
perf/x86: Refactor misc flag assignments
perf/powerpc: Use perf_arch_instruction_pointer()
perf/core: Hoist perf_instruction_pointer() and perf_misc_flags()
perf/arm: Drop unused functions
uprobes: Re-order struct uprobe_task to save some space
perf/x86/amd/uncore: Avoid a false positive warning about snprintf truncation in amd_uncore_umc_ctx_init
perf/x86/intel: Do not enable large PEBS for events with aux actions or aux sampling
perf/x86/intel/pt: Add support for pause / resume
perf/core: Add aux_pause, aux_resume, aux_start_paused
perf/x86/intel/pt: Fix buffer full but size is 0 case
uprobes: SRCU-protect uretprobe lifetime (with timeout)
uprobes: allow put_uprobe() from non-sleepable softirq context
perf/x86/rapl: Clean up cpumask and hotplug
perf/x86/rapl: Move the pmu allocation out of CPU hotplug
uprobe: Add support for session consumer
uprobe: Add data pointer to consumer handlers
perf/x86/amd: Warn only on new bits set
uprobes: fold xol_take_insn_slot() into xol_get_insn_slot()
uprobes: kill xol_area->slot_count
...
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/core.c | 102 | ||||
-rw-r--r-- | kernel/events/internal.h | 1 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 608 |
3 files changed, 535 insertions, 176 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index b65446be00a7..5d4a54f50826 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2142,7 +2142,7 @@ static void perf_put_aux_event(struct perf_event *event) static bool perf_need_aux_event(struct perf_event *event) { - return !!event->attr.aux_output || !!event->attr.aux_sample_size; + return event->attr.aux_output || has_aux_action(event); } static int perf_get_aux_event(struct perf_event *event, @@ -2167,6 +2167,10 @@ static int perf_get_aux_event(struct perf_event *event, !perf_aux_output_match(event, group_leader)) return 0; + if ((event->attr.aux_pause || event->attr.aux_resume) && + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) + return 0; + if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) return 0; @@ -7003,6 +7007,29 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); #endif +static bool should_sample_guest(struct perf_event *event) +{ + return !event->attr.exclude_guest && perf_guest_state(); +} + +unsigned long perf_misc_flags(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_arch_guest_misc_flags(regs); + + return perf_arch_misc_flags(regs); +} + +unsigned long perf_instruction_pointer(struct perf_event *event, + struct pt_regs *regs) +{ + if (should_sample_guest(event)) + return perf_guest_get_ip(); + + return perf_arch_instruction_pointer(regs); +} + static void perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) @@ -7820,7 +7847,7 @@ void perf_prepare_sample(struct perf_sample_data *data, __perf_event_header__init_id(data, event, filtered_sample_type); if (filtered_sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); + data->ip = perf_instruction_pointer(event, regs); data->sample_flags |= PERF_SAMPLE_IP; } @@ -7984,7 +8011,7 @@ void perf_prepare_header(struct perf_event_header *header, { header->type = PERF_RECORD_SAMPLE; header->size = perf_sample_data_size(data, event); - header->misc = perf_misc_flags(regs); + header->misc = perf_misc_flags(event, regs); /* * If you're adding more sample types here, you likely need to do @@ -7997,6 +8024,49 @@ void perf_prepare_header(struct perf_event_header *header, WARN_ON_ONCE(header->size & 7); } +static void __perf_event_aux_pause(struct perf_event *event, bool pause) +{ + if (pause) { + if (!event->hw.aux_paused) { + event->hw.aux_paused = 1; + event->pmu->stop(event, PERF_EF_PAUSE); + } + } else { + if (event->hw.aux_paused) { + event->hw.aux_paused = 0; + event->pmu->start(event, PERF_EF_RESUME); + } + } +} + +static void perf_event_aux_pause(struct perf_event *event, bool pause) +{ + struct perf_buffer *rb; + + if (WARN_ON_ONCE(!event)) + return; + + rb = ring_buffer_get(event); + if (!rb) + return; + + scoped_guard (irqsave) { + /* + * Guard against self-recursion here. Another event could trip + * this same from NMI context. + */ + if (READ_ONCE(rb->aux_in_pause_resume)) + break; + + WRITE_ONCE(rb->aux_in_pause_resume, 1); + barrier(); + __perf_event_aux_pause(event, pause); + barrier(); + WRITE_ONCE(rb->aux_in_pause_resume, 0); + } + ring_buffer_put(rb); +} + static __always_inline int __perf_event_output(struct perf_event *event, struct perf_sample_data *data, @@ -9799,9 +9869,12 @@ static int __perf_event_overflow(struct perf_event *event, ret = __perf_event_account_interrupt(event, throttle); + if (event->attr.aux_pause) + perf_event_aux_pause(event->aux_event, true); + if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && !bpf_overflow_handler(event, data, regs)) - return ret; + goto out; /* * XXX event_limit might not quite work as expected on inherited @@ -9863,6 +9936,9 @@ static int __perf_event_overflow(struct perf_event *event, event->pending_wakeup = 1; irq_work_queue(&event->pending_irq); } +out: + if (event->attr.aux_resume) + perf_event_aux_pause(event->aux_event, false); return ret; } @@ -12254,11 +12330,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } if (event->attr.aux_output && - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || + event->attr.aux_pause || event->attr.aux_resume)) { err = -EOPNOTSUPP; goto err_pmu; } + if (event->attr.aux_pause && event->attr.aux_resume) { + err = -EINVAL; + goto err_pmu; + } + + if (event->attr.aux_start_paused) { + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { + err = -EOPNOTSUPP; + goto err_pmu; + } + event->hw.aux_paused = 1; + } + if (cgroup_fd != -1) { err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); if (err) @@ -13052,7 +13142,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, * Grouping is not supported for kernel events, neither is 'AUX', * make sure the caller's intentions are adjusted. */ - if (attr->aux_output) + if (attr->aux_output || attr->aux_action) return ERR_PTR(-EINVAL); event = perf_event_alloc(attr, cpu, task, NULL, NULL, diff --git a/kernel/events/internal.h b/kernel/events/internal.h index e072d995d670..249288d82b8d 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -52,6 +52,7 @@ struct perf_buffer { void (*free_aux)(void *); refcount_t aux_refcount; int aux_in_sampling; + int aux_in_pause_resume; void **aux_pages; void *aux_priv; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b52cb2ae6d6..a76ddc5fc982 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -26,6 +26,9 @@ #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/rcupdate_trace.h> +#include <linux/workqueue.h> +#include <linux/srcu.h> #include <linux/uprobes.h> @@ -42,8 +45,6 @@ static struct rb_root uprobes_tree = RB_ROOT; static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); -DEFINE_STATIC_SRCU(uprobes_srcu); - #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; @@ -51,6 +52,9 @@ static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); +/* Covers return_instance's uprobe lifetime. */ +DEFINE_STATIC_SRCU(uretprobes_srcu); + /* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0 @@ -62,10 +66,13 @@ struct uprobe { struct list_head pending_list; struct list_head consumers; struct inode *inode; /* Also hold a ref to inode */ - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct work; + }; loff_t offset; loff_t ref_ctr_offset; - unsigned long flags; + unsigned long flags; /* "unsigned long" so bitops work */ /* * The generic code assumes that it has two members of unknown type @@ -100,7 +107,6 @@ static LIST_HEAD(delayed_uprobe_list); */ struct xol_area { wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ unsigned long *bitmap; /* 0 = free slot */ struct page *page; @@ -620,17 +626,23 @@ static inline bool uprobe_is_active(struct uprobe *uprobe) return !RB_EMPTY_NODE(&uprobe->rb_node); } -static void uprobe_free_rcu(struct rcu_head *rcu) +static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) { struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); kfree(uprobe); } -static void put_uprobe(struct uprobe *uprobe) +static void uprobe_free_srcu(struct rcu_head *rcu) { - if (!refcount_dec_and_test(&uprobe->ref)) - return; + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); + + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); +} + +static void uprobe_free_deferred(struct work_struct *work) +{ + struct uprobe *uprobe = container_of(work, struct uprobe, work); write_lock(&uprobes_treelock); @@ -651,7 +663,162 @@ static void put_uprobe(struct uprobe *uprobe) delayed_uprobe_remove(uprobe, NULL); mutex_unlock(&delayed_uprobe_lock); - call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); + /* start srcu -> rcu_tasks_trace -> kfree chain */ + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (!refcount_dec_and_test(&uprobe->ref)) + return; + + INIT_WORK(&uprobe->work, uprobe_free_deferred); + schedule_work(&uprobe->work); +} + +/* Initialize hprobe as SRCU-protected "leased" uprobe */ +static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) +{ + WARN_ON(!uprobe); + hprobe->state = HPROBE_LEASED; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = srcu_idx; +} + +/* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ +static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) +{ + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; + hprobe->uprobe = uprobe; + hprobe->srcu_idx = -1; +} + +/* + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be + * used only once for a given hprobe. + * + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever + * is appropriate. + */ +static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) +{ + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); + switch (*hstate) { + case HPROBE_LEASED: + case HPROBE_STABLE: + return hprobe->uprobe; + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ + return NULL; + default: + WARN(1, "hprobe invalid state %d", *hstate); + return NULL; + } +} + +/* + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. + * hprobe_finalize() can only be used from current context after + * hprobe_consume() call (which determines uprobe and hstate value). + */ +static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) +{ + switch (hstate) { + case HPROBE_LEASED: + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + break; + case HPROBE_STABLE: + put_uprobe(hprobe->uprobe); + break; + case HPROBE_GONE: + case HPROBE_CONSUMED: + break; + default: + WARN(1, "hprobe invalid state %d", hstate); + break; + } +} + +/* + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of + * them can win the race to perform SRCU unlocking. Whoever wins must perform + * SRCU unlock. + * + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe + * to begin with or we failed to bump its refcount and it's going away. + * + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU + * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has + * an extra refcount for caller to assume and use. Otherwise, it's not + * guaranteed that returned uprobe has a positive refcount, so caller has to + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current + * SRCU lock region. See dup_utask(). + */ +static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) +{ + enum hprobe_state hstate; + + /* + * return_instance's hprobe is protected by RCU. + * Underlying uprobe is itself protected from reuse by SRCU. + */ + lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); + + hstate = READ_ONCE(hprobe->state); + switch (hstate) { + case HPROBE_STABLE: + /* uprobe has positive refcount, bump refcount, if necessary */ + return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; + case HPROBE_GONE: + /* + * SRCU was unlocked earlier and we didn't manage to take + * uprobe refcnt, so it's effectively NULL + */ + return NULL; + case HPROBE_CONSUMED: + /* + * uprobe was consumed, so it's effectively NULL as far as + * uretprobe processing logic is concerned + */ + return NULL; + case HPROBE_LEASED: { + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); + /* + * Try to switch hprobe state, guarding against + * hprobe_consume() or another hprobe_expire() racing with us. + * Note, if we failed to get uprobe refcount, we use special + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't + * be used as it will be freed after SRCU is unlocked. + */ + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { + /* We won the race, we are the ones to unlock SRCU */ + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); + return get ? get_uprobe(uprobe) : uprobe; + } + + /* + * We lost the race, undo refcount bump (if it ever happened), + * unless caller would like an extra refcount anyways. + */ + if (uprobe && !get) + put_uprobe(uprobe); + /* + * Even if hprobe_consume() or another hprobe_expire() wins + * the state update race and unlocks SRCU from under us, we + * still have a guarantee that underyling uprobe won't be + * freed due to ongoing caller's SRCU lock region, so we can + * return it regardless. Also, if `get` was true, we also have + * an extra ref for the caller to own. This is used in dup_utask(). + */ + return uprobe; + } + default: + WARN(1, "unknown hprobe state %d", hstate); + return NULL; + } } static __always_inline @@ -706,7 +873,7 @@ static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset) struct rb_node *node; unsigned int seq; - lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); + lockdep_assert(rcu_read_lock_trace_held()); do { seq = read_seqcount_begin(&uprobes_seqcount); @@ -825,8 +992,11 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset, static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) { + static atomic64_t id; + down_write(&uprobe->consumer_rwsem); list_add_rcu(&uc->cons_node, &uprobe->consumers); + uc->id = (__u64) atomic64_inc_return(&id); up_write(&uprobe->consumer_rwsem); } @@ -934,8 +1104,7 @@ static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm) bool ret = false; down_read(&uprobe->consumer_rwsem); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { ret = consumer_filter(uc, mm); if (ret) break; @@ -1156,7 +1325,8 @@ void uprobe_unregister_sync(void) * unlucky enough caller can free consumer's memory and cause * handler_chain() or handle_uretprobe_chain() to do an use-after-free. */ - synchronize_srcu(&uprobes_srcu); + synchronize_rcu_tasks_trace(); + synchronize_srcu(&uretprobes_srcu); } EXPORT_SYMBOL_GPL(uprobe_unregister_sync); @@ -1240,19 +1410,18 @@ EXPORT_SYMBOL_GPL(uprobe_register); int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) { struct uprobe_consumer *con; - int ret = -ENOENT, srcu_idx; + int ret = -ENOENT; down_write(&uprobe->register_rwsem); - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + rcu_read_lock_trace(); + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { if (con == uc) { ret = register_for_each_vma(uprobe, add ? uc : NULL); break; } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); up_write(&uprobe->register_rwsem); @@ -1475,9 +1644,15 @@ static vm_fault_t xol_fault(const struct vm_special_mapping *sm, return 0; } +static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) +{ + return -EPERM; +} + static const struct vm_special_mapping xol_mapping = { .name = "[uprobes]", .fault = xol_fault, + .mremap = xol_mremap, }; /* Slot allocation for XOL */ @@ -1553,7 +1728,6 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); - atomic_set(&area->slot_count, 1); insns = arch_uprobe_trampoline(&insns_size); arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); @@ -1626,92 +1800,57 @@ void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) } } -/* - * - search for a free slot. - */ -static unsigned long xol_take_insn_slot(struct xol_area *area) +static unsigned long xol_get_slot_nr(struct xol_area *area) { - unsigned long slot_addr; - int slot_nr; - - do { - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); - if (slot_nr < UINSNS_PER_PAGE) { - if (!test_and_set_bit(slot_nr, area->bitmap)) - break; + unsigned long slot_nr; - slot_nr = UINSNS_PER_PAGE; - continue; - } - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); - } while (slot_nr >= UINSNS_PER_PAGE); - - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); - atomic_inc(&area->slot_count); + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); + if (slot_nr < UINSNS_PER_PAGE) { + if (!test_and_set_bit(slot_nr, area->bitmap)) + return slot_nr; + } - return slot_addr; + return UINSNS_PER_PAGE; } /* * xol_get_insn_slot - allocate a slot for xol. - * Returns the allocated slot address or 0. */ -static unsigned long xol_get_insn_slot(struct uprobe *uprobe) +static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) { - struct xol_area *area; - unsigned long xol_vaddr; + struct xol_area *area = get_xol_area(); + unsigned long slot_nr; - area = get_xol_area(); if (!area) - return 0; + return false; - xol_vaddr = xol_take_insn_slot(area); - if (unlikely(!xol_vaddr)) - return 0; + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); - arch_uprobe_copy_ixol(area->page, xol_vaddr, + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); - - return xol_vaddr; + return true; } /* - * xol_free_insn_slot - If slot was earlier allocated by - * @xol_get_insn_slot(), make the slot available for - * subsequent requests. + * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() */ -static void xol_free_insn_slot(struct task_struct *tsk) +static void xol_free_insn_slot(struct uprobe_task *utask) { - struct xol_area *area; - unsigned long vma_end; - unsigned long slot_addr; - - if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) - return; + struct xol_area *area = current->mm->uprobes_state.xol_area; + unsigned long offset = utask->xol_vaddr - area->vaddr; + unsigned int slot_nr; - slot_addr = tsk->utask->xol_vaddr; - if (unlikely(!slot_addr)) + utask->xol_vaddr = 0; + /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ + if (WARN_ON_ONCE(offset >= PAGE_SIZE)) return; - area = tsk->mm->uprobes_state.xol_area; - vma_end = area->vaddr + PAGE_SIZE; - if (area->vaddr <= slot_addr && slot_addr < vma_end) { - unsigned long offset; - int slot_nr; - - offset = slot_addr - area->vaddr; - slot_nr = offset / UPROBE_XOL_SLOT_BYTES; - if (slot_nr >= UINSNS_PER_PAGE) - return; - - clear_bit(slot_nr, area->bitmap); - atomic_dec(&area->slot_count); - smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ - if (waitqueue_active(&area->wq)) - wake_up(&area->wq); - - tsk->utask->xol_vaddr = 0; - } + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; + clear_bit(slot_nr, area->bitmap); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ + if (waitqueue_active(&area->wq)) + wake_up(&area->wq); } void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, @@ -1750,11 +1889,18 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } -static struct return_instance *free_ret_instance(struct return_instance *ri) +static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) { struct return_instance *next = ri->next; - put_uprobe(ri->uprobe); - kfree(ri); + + if (cleanup_hprobe) { + enum hprobe_state hstate; + + (void)hprobe_consume(&ri->hprobe, &hstate); + hprobe_finalize(&ri->hprobe, hstate); + } + + kfree_rcu(ri, rcu); return next; } @@ -1770,18 +1916,50 @@ void uprobe_free_utask(struct task_struct *t) if (!utask) return; - if (utask->active_uprobe) - put_uprobe(utask->active_uprobe); + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); + + timer_delete_sync(&utask->ri_timer); ri = utask->return_instances; while (ri) - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); - xol_free_insn_slot(t); kfree(utask); t->utask = NULL; } +#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ + +#define for_each_ret_instance_rcu(pos, head) \ + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) + +static void ri_timer(struct timer_list *timer) +{ + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); + struct return_instance *ri; + + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ + guard(srcu)(&uretprobes_srcu); + /* RCU protects return_instance from freeing. */ + guard(rcu)(); + + for_each_ret_instance_rcu(ri, utask->return_instances) + hprobe_expire(&ri->hprobe, false); +} + +static struct uprobe_task *alloc_utask(void) +{ + struct uprobe_task *utask; + + utask = kzalloc(sizeof(*utask), GFP_KERNEL); + if (!utask) + return NULL; + + timer_setup(&utask->ri_timer, ri_timer, 0); + + return utask; +} + /* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. @@ -1793,38 +1971,73 @@ void uprobe_free_utask(struct task_struct *t) static struct uprobe_task *get_utask(void) { if (!current->utask) - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + current->utask = alloc_utask(); return current->utask; } +static size_t ri_size(int consumers_cnt) +{ + struct return_instance *ri; + + return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; +} + +#define DEF_CNT 4 + +static struct return_instance *alloc_return_instance(void) +{ + struct return_instance *ri; + + ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); + if (!ri) + return ZERO_SIZE_PTR; + + ri->consumers_cnt = DEF_CNT; + return ri; +} + +static struct return_instance *dup_return_instance(struct return_instance *old) +{ + size_t size = ri_size(old->consumers_cnt); + + return kmemdup(old, size, GFP_KERNEL); +} + static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) { struct uprobe_task *n_utask; struct return_instance **p, *o, *n; + struct uprobe *uprobe; - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); + n_utask = alloc_utask(); if (!n_utask) return -ENOMEM; t->utask = n_utask; + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ + guard(srcu)(&uretprobes_srcu); + p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) { - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); + n = dup_return_instance(o); if (!n) return -ENOMEM; - *n = *o; + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ + uprobe = hprobe_expire(&o->hprobe, true); + /* - * uprobe's refcnt has to be positive at this point, kept by - * utask->return_instances items; return_instances can't be - * removed right now, as task is blocked due to duping; so - * get_uprobe() is safe to use here. + * New utask will have stable properly refcounted uprobe or + * NULL. Even if we failed to get refcounted uprobe, we still + * need to preserve full set of return_instances for proper + * uretprobe handling and nesting in forked task. */ - get_uprobe(n->uprobe); - n->next = NULL; + hprobe_init_stable(&n->hprobe, uprobe); - *p = n; + n->next = NULL; + rcu_assign_pointer(*p, n); p = &n->next; + n_utask->depth++; } @@ -1900,45 +2113,34 @@ static void cleanup_return_instances(struct uprobe_task *utask, bool chained, enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { - ri = free_ret_instance(ri); + ri = free_ret_instance(ri, true /* cleanup_hprobe */); utask->depth--; } - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); } -static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, + struct return_instance *ri) { - struct return_instance *ri; - struct uprobe_task *utask; + struct uprobe_task *utask = current->utask; unsigned long orig_ret_vaddr, trampoline_vaddr; bool chained; + int srcu_idx; if (!get_xol_area()) - return; - - utask = get_utask(); - if (!utask) - return; + goto free; if (utask->depth >= MAX_URETPROBE_DEPTH) { printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" " nestedness limit pid/tgid=%d/%d\n", current->pid, current->tgid); - return; + goto free; } - /* we need to bump refcount to store uprobe in utask */ - if (!try_get_uprobe(uprobe)) - return; - - ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); - if (!ri) - goto fail; - trampoline_vaddr = uprobe_get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) - goto fail; + goto free; /* drop the entries invalidated by longjmp() */ chained = (orig_ret_vaddr == trampoline_vaddr); @@ -1956,53 +2158,51 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) * attack from user-space. */ uprobe_warn(current, "handle tail call"); - goto fail; + goto free; } orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - ri->uprobe = uprobe; + + /* __srcu_read_lock() because SRCU lock survives switch to user space */ + srcu_idx = __srcu_read_lock(&uretprobes_srcu); + ri->func = instruction_pointer(regs); ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; + + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); ri->next = utask->return_instances; - utask->return_instances = ri; + rcu_assign_pointer(utask->return_instances, ri); + + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); return; -fail: +free: kfree(ri); - put_uprobe(uprobe); } /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) { - struct uprobe_task *utask; - unsigned long xol_vaddr; + struct uprobe_task *utask = current->utask; int err; - utask = get_utask(); - if (!utask) - return -ENOMEM; - if (!try_get_uprobe(uprobe)) return -EINVAL; - xol_vaddr = xol_get_insn_slot(uprobe); - if (!xol_vaddr) { + if (!xol_get_insn_slot(uprobe, utask)) { err = -ENOMEM; goto err_out; } - utask->xol_vaddr = xol_vaddr; utask->vaddr = bp_vaddr; - err = arch_uprobe_pre_xol(&uprobe->arch, regs); if (unlikely(err)) { - xol_free_insn_slot(current); + xol_free_insn_slot(utask); goto err_out; } @@ -2125,35 +2325,90 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb return uprobe; } +static struct return_instance* +push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) +{ + if (unlikely(ri == ZERO_SIZE_PTR)) + return ri; + + if (unlikely(idx >= ri->consumers_cnt)) { + struct return_instance *old_ri = ri; + + ri->consumers_cnt += DEF_CNT; + ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); + if (!ri) { + kfree(old_ri); + return ZERO_SIZE_PTR; + } + } + + ri->consumers[idx].id = id; + ri->consumers[idx].cookie = cookie; + return ri; +} + +static struct return_consumer * +return_consumer_find(struct return_instance *ri, int *iter, int id) +{ + struct return_consumer *ric; + int idx = *iter; + + for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { + if (ric->id == id) { + *iter = idx + 1; + return ric; + } + } + return NULL; +} + +static bool ignore_ret_handler(int rc) +{ + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; +} + static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; - int remove = UPROBE_HANDLER_REMOVE; - bool need_prep = false; /* prepare return uprobe, when needed */ - bool has_consumers = false; + bool has_consumers = false, remove = true; + struct return_instance *ri = NULL; + int push_idx = 0; current->utask->auprobe = &uprobe->arch; - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + __u64 cookie = 0; int rc = 0; if (uc->handler) { - rc = uc->handler(uc, regs); - WARN(rc & ~UPROBE_HANDLER_MASK, + rc = uc->handler(uc, regs, &cookie); + WARN(rc < 0 || rc > 2, "bad rc=0x%x from %ps()\n", rc, uc->handler); } - if (uc->ret_handler) - need_prep = true; - - remove &= rc; + remove &= rc == UPROBE_HANDLER_REMOVE; has_consumers = true; + + if (!uc->ret_handler || ignore_ret_handler(rc)) + continue; + + if (!ri) + ri = alloc_return_instance(); + + if (session) + ri = push_consumer(ri, push_idx++, uc->id, cookie); } current->utask->auprobe = NULL; - if (need_prep && !remove) - prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (!ZERO_OR_NULL_PTR(ri)) { + /* + * The push_idx value has the final number of return consumers, + * and ri->consumers_cnt has number of allocated consumers. + */ + ri->consumers_cnt = push_idx; + prepare_uretprobe(uprobe, regs, ri); + } if (remove && has_consumers) { down_read(&uprobe->register_rwsem); @@ -2169,19 +2424,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) } static void -handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) { - struct uprobe *uprobe = ri->uprobe; + struct return_consumer *ric; struct uprobe_consumer *uc; - int srcu_idx; + int ric_idx = 0; + + /* all consumers unsubscribed meanwhile */ + if (unlikely(!uprobe)) + return; - srcu_idx = srcu_read_lock(&uprobes_srcu); - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, - srcu_read_lock_held(&uprobes_srcu)) { - if (uc->ret_handler) - uc->ret_handler(uc, ri->func, regs); + rcu_read_lock_trace(); + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { + bool session = uc->handler && uc->ret_handler; + + if (uc->ret_handler) { + ric = return_consumer_find(ri, &ric_idx, uc->id); + if (!session || ric) + uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL); + } } - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } static struct return_instance *find_next_ret_chain(struct return_instance *ri) @@ -2200,6 +2463,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs) { struct uprobe_task *utask; struct return_instance *ri, *next; + struct uprobe *uprobe; + enum hprobe_state hstate; bool valid; utask = current->utask; @@ -2230,21 +2495,24 @@ void uprobe_handle_trampoline(struct pt_regs *regs) * trampoline addresses on the stack are replaced with correct * original return addresses */ - utask->return_instances = ri->next; + rcu_assign_pointer(utask->return_instances, ri->next); + + uprobe = hprobe_consume(&ri->hprobe, &hstate); if (valid) - handle_uretprobe_chain(ri, regs); - ri = free_ret_instance(ri); + handle_uretprobe_chain(ri, uprobe, regs); + hprobe_finalize(&ri->hprobe, hstate); + + /* We already took care of hprobe, no need to waste more time on that. */ + ri = free_ret_instance(ri, false /* !cleanup_hprobe */); utask->depth--; } while (ri != next); } while (!valid); - utask->return_instances = ri; return; - sigill: +sigill: uprobe_warn(current, "handle uretprobe, sending SIGILL."); force_sig(SIGILL); - } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -2266,13 +2534,13 @@ static void handle_swbp(struct pt_regs *regs) { struct uprobe *uprobe; unsigned long bp_vaddr; - int is_swbp, srcu_idx; + int is_swbp; bp_vaddr = uprobe_get_swbp_addr(regs); if (bp_vaddr == uprobe_get_trampoline_vaddr()) return uprobe_handle_trampoline(regs); - srcu_idx = srcu_read_lock(&uprobes_srcu); + rcu_read_lock_trace(); uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); if (!uprobe) { @@ -2330,7 +2598,7 @@ static void handle_swbp(struct pt_regs *regs) out: /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ - srcu_read_unlock(&uprobes_srcu, srcu_idx); + rcu_read_unlock_trace(); } /* @@ -2353,7 +2621,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) put_uprobe(uprobe); utask->active_uprobe = NULL; utask->state = UTASK_RUNNING; - xol_free_insn_slot(current); + xol_free_insn_slot(utask); spin_lock_irq(¤t->sighand->siglock); recalc_sigpending(); /* see uprobe_deny_signal() */ |