diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.kexec | 4 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 58 | ||||
-rw-r--r-- | kernel/bpf/core.c | 12 | ||||
-rw-r--r-- | kernel/bpf/memalloc.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/legacy_freezer.c | 8 | ||||
-rw-r--r-- | kernel/crash_core.c | 10 | ||||
-rw-r--r-- | kernel/cred.c | 251 | ||||
-rw-r--r-- | kernel/events/core.c | 69 | ||||
-rw-r--r-- | kernel/exit.c | 3 | ||||
-rw-r--r-- | kernel/freezer.c | 2 | ||||
-rw-r--r-- | kernel/kprobes.c | 4 | ||||
-rw-r--r-- | kernel/locking/osq_lock.c | 37 | ||||
-rw-r--r-- | kernel/resource.c | 4 | ||||
-rw-r--r-- | kernel/sys_ni.c | 14 | ||||
-rw-r--r-- | kernel/time/posix-stubs.c | 45 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 100 | ||||
-rw-r--r-- | kernel/trace/rethook.c | 23 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 209 | ||||
-rw-r--r-- | kernel/trace/synth_event_gen_test.c | 11 | ||||
-rw-r--r-- | kernel/trace/trace.c | 190 | ||||
-rw-r--r-- | kernel/trace/trace.h | 1 | ||||
-rw-r--r-- | kernel/trace/trace_events_hist.c | 12 | ||||
-rw-r--r-- | kernel/trace/trace_events_synth.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 6 | ||||
-rw-r--r-- | kernel/workqueue.c | 22 |
25 files changed, 410 insertions, 691 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 7aff28ded2f4..946dffa048b7 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -36,6 +36,8 @@ config KEXEC config KEXEC_FILE bool "Enable kexec file based system call" depends on ARCH_SUPPORTS_KEXEC_FILE + select CRYPTO + select CRYPTO_SHA256 select KEXEC_CORE help This is new version of kexec system call. This system call is @@ -94,10 +96,8 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" depends on ARCH_SUPPORTS_CRASH_DUMP - depends on ARCH_SUPPORTS_KEXEC select CRASH_CORE select KEXEC_CORE - select KEXEC help Generate crash dump after being started by kexec. This should be normally only set in special crash dump kernels diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..c85ff9162a5c 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map, mutex_unlock(&aux->poke_mutex); } +void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, + struct bpf_prog *new, struct bpf_prog *old) +{ + WARN_ON_ONCE(1); +} + static void prog_array_map_poke_run(struct bpf_map *map, u32 key, struct bpf_prog *old, struct bpf_prog *new) { - u8 *old_addr, *new_addr, *old_bypass_addr; struct prog_poke_elem *elem; struct bpf_array_aux *aux; @@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, list_for_each_entry(elem, &aux->poke_progs, list) { struct bpf_jit_poke_descriptor *poke; - int i, ret; + int i; for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; @@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, * activated, so tail call updates can arrive from here * while JIT is still finishing its final fixup for * non-activated poke entries. - * 3) On program teardown, the program's kallsym entry gets - * removed out of RCU callback, but we can only untrack - * from sleepable context, therefore bpf_arch_text_poke() - * might not see that this is in BPF text section and - * bails out with -EINVAL. As these are unreachable since - * RCU grace period already passed, we simply skip them. - * 4) Also programs reaching refcount of zero while patching + * 3) Also programs reaching refcount of zero while patching * is in progress is okay since we're protected under * poke_mutex and untrack the programs before the JIT - * buffer is freed. When we're still in the middle of - * patching and suddenly kallsyms entry of the program - * gets evicted, we just skip the rest which is fine due - * to point 3). - * 5) Any other error happening below from bpf_arch_text_poke() - * is a unexpected bug. + * buffer is freed. */ if (!READ_ONCE(poke->tailcall_target_stable)) continue; @@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key, poke->tail_call.key != key) continue; - old_bypass_addr = old ? NULL : poke->bypass_addr; - old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL; - new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL; - - if (new) { - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, new_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - if (!old) { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - poke->bypass_addr, - NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } - } else { - ret = bpf_arch_text_poke(poke->tailcall_bypass, - BPF_MOD_JUMP, - old_bypass_addr, - poke->bypass_addr); - BUG_ON(ret < 0 && ret != -EINVAL); - /* let other CPUs finish the execution of program - * so that it will not possible to expose them - * to invalid nop, stack unwind, nop state - */ - if (!ret) - synchronize_rcu(); - ret = bpf_arch_text_poke(poke->tailcall_target, - BPF_MOD_JUMP, - old_addr, NULL); - BUG_ON(ret < 0 && ret != -EINVAL); - } + bpf_arch_poke_desc_update(poke, new, old); } } } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd3afe57ece3..fe254ae035fe 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -371,14 +371,18 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { - const s32 off_min = S16_MIN, off_max = S16_MAX; + s64 off_min, off_max, off; s32 delta = end_new - end_old; - s32 off; - if (insn->code == (BPF_JMP32 | BPF_JA)) + if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; - else + off_min = S32_MIN; + off_max = S32_MAX; + } else { off = insn->off; + off_min = S16_MIN; + off_max = S16_MAX; + } if (curr < pos && curr + off + 1 >= end_old) off += delta; diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 63b909d277d4..6a51cfe4c2d6 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -978,6 +978,8 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); + if (ret) + *(struct bpf_mem_cache **)ret = c; set_active_memcg(old_memcg); mem_cgroup_put(memcg); } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 122dacb3a443..66d1708042a7 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -66,9 +66,15 @@ static struct freezer *parent_freezer(struct freezer *freezer) bool cgroup_freezing(struct task_struct *task) { bool ret; + unsigned int state; rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; + /* Check if the cgroup is still FREEZING, but not FROZEN. The extra + * !FROZEN check is required, because the FREEZING bit is not cleared + * when the state FROZEN is reached. + */ + state = task_freezer(task)->state; + ret = (state & CGROUP_FREEZING) && !(state & CGROUP_FROZEN); rcu_read_unlock(); return ret; diff --git a/kernel/crash_core.c b/kernel/crash_core.c index efe87d501c8c..d4313b53837e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -199,7 +199,7 @@ static __initdata char *suffix_tbl[] = { * It returns 0 on success and -EINVAL on failure. */ static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, + unsigned long long *crash_size, const char *suffix) { char *cur = cmdline; @@ -268,9 +268,9 @@ static int __init __parse_crashkernel(char *cmdline, unsigned long long *crash_base, const char *suffix) { - char *first_colon, *first_space; - char *ck_cmdline; - char *name = "crashkernel="; + char *first_colon, *first_space; + char *ck_cmdline; + char *name = "crashkernel="; BUG_ON(!crash_size || !crash_base); *crash_size = 0; @@ -440,7 +440,7 @@ retry: return; } - if ((crash_base > CRASH_ADDR_LOW_MAX) && + if ((crash_base >= CRASH_ADDR_LOW_MAX) && crash_low_size && reserve_crashkernel_low(crash_low_size)) { memblock_phys_free(crash_base, crash_size); return; diff --git a/kernel/cred.c b/kernel/cred.c index 3c714cb31660..c033a201c808 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -43,10 +43,6 @@ static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; */ struct cred init_cred = { .usage = ATOMIC_INIT(4), -#ifdef CONFIG_DEBUG_CREDENTIALS - .subscribers = ATOMIC_INIT(2), - .magic = CRED_MAGIC, -#endif .uid = GLOBAL_ROOT_UID, .gid = GLOBAL_ROOT_GID, .suid = GLOBAL_ROOT_UID, @@ -66,31 +62,6 @@ struct cred init_cred = { .ucounts = &init_ucounts, }; -static inline void set_cred_subscribers(struct cred *cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - atomic_set(&cred->subscribers, n); -#endif -} - -static inline int read_cred_subscribers(const struct cred *cred) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - return atomic_read(&cred->subscribers); -#else - return 0; -#endif -} - -static inline void alter_cred_subscribers(const struct cred *_cred, int n) -{ -#ifdef CONFIG_DEBUG_CREDENTIALS - struct cred *cred = (struct cred *) _cred; - - atomic_add(n, &cred->subscribers); -#endif -} - /* * The RCU callback to actually dispose of a set of credentials */ @@ -100,20 +71,9 @@ static void put_cred_rcu(struct rcu_head *rcu) kdebug("put_cred_rcu(%p)", cred); -#ifdef CONFIG_DEBUG_CREDENTIALS - if (cred->magic != CRED_MAGIC_DEAD || - atomic_read(&cred->usage) != 0 || - read_cred_subscribers(cred) != 0) - panic("CRED: put_cred_rcu() sees %p with" - " mag %x, put %p, usage %d, subscr %d\n", - cred, cred->magic, cred->put_addr, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); -#else - if (atomic_read(&cred->usage) != 0) - panic("CRED: put_cred_rcu() sees %p with usage %d\n", - cred, atomic_read(&cred->usage)); -#endif + if (atomic_long_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %ld\n", + cred, atomic_long_read(&cred->usage)); security_cred_free(cred); key_put(cred->session_keyring); @@ -137,16 +97,10 @@ static void put_cred_rcu(struct rcu_head *rcu) */ void __put_cred(struct cred *cred) { - kdebug("__put_cred(%p{%d,%d})", cred, - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - - BUG_ON(atomic_read(&cred->usage) != 0); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(cred) != 0); - cred->magic = CRED_MAGIC_DEAD; - cred->put_addr = __builtin_return_address(0); -#endif + kdebug("__put_cred(%p{%ld})", cred, + atomic_long_read(&cred->usage)); + + BUG_ON(atomic_long_read(&cred->usage) != 0); BUG_ON(cred == current->cred); BUG_ON(cred == current->real_cred); @@ -164,9 +118,8 @@ void exit_creds(struct task_struct *tsk) { struct cred *real_cred, *cred; - kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); + kdebug("exit_creds(%u,%p,%p,{%ld})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_long_read(&tsk->cred->usage)); real_cred = (struct cred *) tsk->real_cred; tsk->real_cred = NULL; @@ -174,15 +127,10 @@ void exit_creds(struct task_struct *tsk) cred = (struct cred *) tsk->cred; tsk->cred = NULL; - validate_creds(cred); if (real_cred == cred) { - alter_cred_subscribers(cred, -2); put_cred_many(cred, 2); } else { - validate_creds(real_cred); - alter_cred_subscribers(real_cred, -1); put_cred(real_cred); - alter_cred_subscribers(cred, -1); put_cred(cred); } @@ -230,10 +178,7 @@ struct cred *cred_alloc_blank(void) if (!new) return NULL; - atomic_set(&new->usage, 1); -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif + atomic_long_set(&new->usage, 1); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -264,8 +209,6 @@ struct cred *prepare_creds(void) const struct cred *old; struct cred *new; - validate_process_creds(); - new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; @@ -276,8 +219,7 @@ struct cred *prepare_creds(void) memcpy(new, old, sizeof(struct cred)); new->non_rcu = 0; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); + atomic_long_set(&new->usage, 1); get_group_info(new->group_info); get_uid(new->user); get_user_ns(new->user_ns); @@ -300,7 +242,6 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; - validate_creds(new); return new; error: @@ -362,10 +303,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) clone_flags & CLONE_THREAD ) { p->real_cred = get_cred_many(p->cred, 2); - alter_cred_subscribers(p->cred, 2); - kdebug("share_creds(%p{%d,%d})", - p->cred, atomic_read(&p->cred->usage), - read_cred_subscribers(p->cred)); + kdebug("share_creds(%p{%ld})", + p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -404,8 +343,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(new, 2); - validate_creds(new); return 0; error_put: @@ -457,17 +394,11 @@ int commit_creds(struct cred *new) struct task_struct *task = current; const struct cred *old = task->real_cred; - kdebug("commit_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); + kdebug("commit_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); BUG_ON(task->cred != old); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(old) < 2); - validate_creds(old); - validate_creds(new); -#endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); get_cred(new); /* we will require a ref for the subj creds too */ @@ -502,14 +433,12 @@ int commit_creds(struct cred *new) * RLIMIT_NPROC limits on user->processes have already been checked * in set_user(). */ - alter_cred_subscribers(new, 2); if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); - alter_cred_subscribers(old, -2); /* send notifications */ if (!uid_eq(new->uid, old->uid) || @@ -539,14 +468,10 @@ EXPORT_SYMBOL(commit_creds); */ void abort_creds(struct cred *new) { - kdebug("abort_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); + kdebug("abort_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); -#ifdef CONFIG_DEBUG_CREDENTIALS - BUG_ON(read_cred_subscribers(new) != 0); -#endif - BUG_ON(atomic_read(&new->usage) < 1); + BUG_ON(atomic_long_read(&new->usage) < 1); put_cred(new); } EXPORT_SYMBOL(abort_creds); @@ -562,12 +487,8 @@ const struct cred *override_creds(const struct cred *new) { const struct cred *old = current->cred; - kdebug("override_creds(%p{%d,%d})", new, - atomic_read(&new->usage), - read_cred_subscribers(new)); - - validate_creds(old); - validate_creds(new); + kdebug("override_creds(%p{%ld})", new, + atomic_long_read(&new->usage)); /* * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'. @@ -576,18 +497,12 @@ const struct cred *override_creds(const struct cred *new) * we are only installing the cred into the thread-synchronous * '->cred' pointer, not the '->real_cred' pointer that is * visible to other threads under RCU. - * - * Also note that we did validate_creds() manually, not depending - * on the validation in 'get_cred()'. */ get_new_cred((struct cred *)new); - alter_cred_subscribers(new, 1); rcu_assign_pointer(current->cred, new); - alter_cred_subscribers(old, -1); - kdebug("override_creds() = %p{%d,%d}", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); + kdebug("override_creds() = %p{%ld}", old, + atomic_long_read(&old->usage)); return old; } EXPORT_SYMBOL(override_creds); @@ -603,15 +518,10 @@ void revert_creds(const struct cred *old) { const struct cred *override = current->cred; - kdebug("revert_creds(%p{%d,%d})", old, - atomic_read(&old->usage), - read_cred_subscribers(old)); + kdebug("revert_creds(%p{%ld})", old, + atomic_long_read(&old->usage)); - validate_creds(old); - validate_creds(override); - alter_cred_subscribers(old, 1); rcu_assign_pointer(current->cred, old); - alter_cred_subscribers(override, -1); put_cred(override); } EXPORT_SYMBOL(revert_creds); @@ -731,12 +641,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) kdebug("prepare_kernel_cred() alloc %p", new); old = get_task_cred(daemon); - validate_creds(old); *new = *old; new->non_rcu = 0; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); + atomic_long_set(&new->usage, 1); get_uid(new->user); get_user_ns(new->user_ns); get_group_info(new->group_info); @@ -760,7 +668,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) goto error; put_cred(old); - validate_creds(new); return new; error: @@ -825,109 +732,3 @@ int set_create_files_as(struct cred *new, struct inode *inode) return security_kernel_create_files_as(new, inode); } EXPORT_SYMBOL(set_create_files_as); - -#ifdef CONFIG_DEBUG_CREDENTIALS - -bool creds_are_invalid(const struct cred *cred) -{ - if (cred->magic != CRED_MAGIC) - return true; - return false; -} -EXPORT_SYMBOL(creds_are_invalid); - -/* - * dump invalid credentials - */ -static void dump_invalid_creds(const struct cred *cred, const char *label, - const struct task_struct *tsk) -{ - pr_err("%s credentials: %p %s%s%s\n", - label, cred, - cred == &init_cred ? "[init]" : "", - cred == tsk->real_cred ? "[real]" : "", - cred == tsk->cred ? "[eff]" : ""); - pr_err("->magic=%x, put_addr=%p\n", - cred->magic, cred->put_addr); - pr_err("->usage=%d, subscr=%d\n", - atomic_read(&cred->usage), - read_cred_subscribers(cred)); - pr_err("->*uid = { %d,%d,%d,%d }\n", - from_kuid_munged(&init_user_ns, cred->uid), - from_kuid_munged(&init_user_ns, cred->euid), - from_kuid_munged(&init_user_ns, cred->suid), - from_kuid_munged(&init_user_ns, cred->fsuid)); - pr_err("->*gid = { %d,%d,%d,%d }\n", - from_kgid_munged(&init_user_ns, cred->gid), - from_kgid_munged(&init_user_ns, cred->egid), - from_kgid_munged(&init_user_ns, cred->sgid), - from_kgid_munged(&init_user_ns, cred->fsgid)); -#ifdef CONFIG_SECURITY - pr_err("->security is %p\n", cred->security); - if ((unsigned long) cred->security >= PAGE_SIZE && - (((unsigned long) cred->security & 0xffffff00) != - (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) - pr_err("->security {%x, %x}\n", - ((u32*)cred->security)[0], - ((u32*)cred->security)[1]); -#endif -} - -/* - * report use of invalid credentials - */ -void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) -{ - pr_err("Invalid credentials\n"); - pr_err("At %s:%u\n", file, line); - dump_invalid_creds(cred, "Specified", current); - BUG(); -} -EXPORT_SYMBOL(__invalid_creds); - -/* - * check the credentials on a process - */ -void __validate_process_creds(struct task_struct *tsk, - const char *file, unsigned line) -{ - if (tsk->cred == tsk->real_cred) { - if (unlikely(read_cred_subscribers(tsk->cred) < 2 || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } else { - if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || - read_cred_subscribers(tsk->cred) < 1 || - creds_are_invalid(tsk->real_cred) || - creds_are_invalid(tsk->cred))) - goto invalid_creds; - } - return; - -invalid_creds: - pr_err("Invalid process credentials\n"); - pr_err("At %s:%u\n", file, line); - - dump_invalid_creds(tsk->real_cred, "Real", tsk); - if (tsk->cred != tsk->real_cred) - dump_invalid_creds(tsk->cred, "Effective", tsk); - else - pr_err("Effective creds == Real creds\n"); - BUG(); -} -EXPORT_SYMBOL(__validate_process_creds); - -/* - * check creds for do_exit() - */ -void validate_creds_for_do_exit(struct task_struct *tsk) -{ - kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", - tsk->real_cred, tsk->cred, - atomic_read(&tsk->cred->usage), - read_cred_subscribers(tsk->cred)); - - __validate_process_creds(tsk, __FILE__, __LINE__); -} - -#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/events/core.c b/kernel/events/core.c index b704d83a28b2..9efd0d7775e7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1814,31 +1814,34 @@ static inline void perf_event__state_init(struct perf_event *event) PERF_EVENT_STATE_INACTIVE; } -static void __perf_event_read_size(struct perf_event *event, int nr_siblings) +static int __perf_event_read_size(u64 read_format, int nr_siblings) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_ID) + if (read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_LOST) + if (read_format & PERF_FORMAT_LOST) entry += sizeof(u64); - if (event->attr.read_format & PERF_FORMAT_GROUP) { + if (read_format & PERF_FORMAT_GROUP) { nr += nr_siblings; size += sizeof(u64); } - size += entry * nr; - event->read_size = size; + /* + * Since perf_event_validate_size() limits this to 16k and inhibits + * adding more siblings, this will never overflow. + */ + return size + nr * entry; } static void __perf_event_header_size(struct perf_event *event, u64 sample_type) @@ -1888,8 +1891,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) */ static void perf_event__header_size(struct perf_event *event) { - __perf_event_read_size(event, - event->group_leader->nr_siblings); + event->read_size = + __perf_event_read_size(event->attr.read_format, + event->group_leader->nr_siblings); __perf_event_header_size(event, event->attr.sample_type); } @@ -1920,23 +1924,44 @@ static void perf_event__id_header_size(struct perf_event *event) event->id_header_size = size; } +/* + * Check that adding an event to the group does not result in anybody + * overflowing the 64k event limit imposed by the output buffer. + * + * Specifically, check that the read_size for the event does not exceed 16k, + * read_size being the one term that grows with groups size. Since read_size + * depends on per-event read_format, also (re)check the existing events. + * + * This leaves 48k for the constant size fields and things like callchains, + * branch stacks and register sets. + */ static bool perf_event_validate_size(struct perf_event *event) { - /* - * The values computed here will be over-written when we actually - * attach the event. - */ - __perf_event_read_size(event, event->group_leader->nr_siblings + 1); - __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ); - perf_event__id_header_size(event); + struct perf_event *sibling, *group_leader = event->group_leader; + + if (__perf_event_read_size(event->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + + if (__perf_event_read_size(group_leader->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; /* - * Sum the lot; should not exceed the 64k limit we have on records. - * Conservative limit to allow for callchains and other variable fields. + * When creating a new group leader, group_leader->ctx is initialized + * after the size has been validated, but we cannot safely use + * for_each_sibling_event() until group_leader->ctx is set. A new group + * leader cannot have any siblings yet, so we can safely skip checking + * the non-existent siblings. */ - if (event->read_size + event->header_size + - event->id_header_size + sizeof(struct perf_event_header) >= 16*1024) - return false; + if (event == group_leader) + return true; + + for_each_sibling_event(sibling, group_leader) { + if (__perf_event_read_size(sibling->attr.read_format, + group_leader->nr_siblings + 1) > 16*1024) + return false; + } return true; } diff --git a/kernel/exit.c b/kernel/exit.c index ee9f43bed49a..aedc0832c9f4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -824,8 +824,6 @@ void __noreturn do_exit(long code) ptrace_event(PTRACE_EVENT_EXIT, code); user_events_exit(tsk); - validate_creds_for_do_exit(tsk); - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -909,7 +907,6 @@ void __noreturn do_exit(long code) if (tsk->task_frag.page) put_page(tsk->task_frag.page); - validate_creds_for_do_exit(tsk); exit_task_stack_account(tsk); check_stack_usage(); diff --git a/kernel/freezer.c b/kernel/freezer.c index c450fa8b8b5e..759006a9a910 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -201,7 +201,7 @@ void __thaw_task(struct task_struct *p) if (WARN_ON_ONCE(freezing(p))) goto unlock; - if (task_call_func(p, __restore_freezer_state, NULL)) + if (!frozen(p) || task_call_func(p, __restore_freezer_state, NULL)) goto unlock; wake_up_state(p, TASK_FROZEN); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 075a632e6c7c..d5a0ee40bf66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2252,7 +2252,7 @@ int register_kretprobe(struct kretprobe *rp) rp->rph = NULL; return -ENOMEM; } - rp->rph->rp = rp; + rcu_assign_pointer(rp->rph->rp, rp); rp->nmissed = 0; /* Establish function entry probe point */ ret = register_kprobe(&rp->kp); @@ -2300,7 +2300,7 @@ void unregister_kretprobes(struct kretprobe **rps, int num) #ifdef CONFIG_KRETPROBE_ON_RETHOOK rethook_free(rps[i]->rh); #else - rps[i]->rph->rp = NULL; + rcu_assign_pointer(rps[i]->rph->rp, NULL); #endif } mutex_unlock(&kprobe_mutex); diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52b92..75a6f6133866 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -11,6 +11,13 @@ * called from interrupt context and we have preemption disabled while * spinning. */ + +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; + static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* @@ -37,32 +44,28 @@ static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. + * + * If osq_lock() is being cancelled there must be a previous node + * and 'old_cpu' is its CPU #. + * For osq_unlock() there is never a previous node and old_cpu is + * set to OSQ_UNLOCKED_VAL. */ static inline struct optimistic_spin_node * osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) + int old_cpu) { - struct optimistic_spin_node *next = NULL; int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg_acquire(&lock->tail, curr, old) == curr) { + atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its * unlock()/unqueue(). */ - break; + return NULL; } /* @@ -76,15 +79,15 @@ osq_wait_next(struct optimistic_spin_queue *lock, * wait for a new @node->next from its Step-C. */ if (node->next) { + struct optimistic_spin_node *next; + next = xchg(&node->next, NULL); if (next) - break; + return next; } cpu_relax(); } - - return next; } bool osq_lock(struct optimistic_spin_queue *lock) @@ -186,7 +189,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * back to @prev. */ - next = osq_wait_next(lock, node, prev); + next = osq_wait_next(lock, node, prev->cpu); if (!next) return false; @@ -226,7 +229,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) return; } - next = osq_wait_next(lock, node, NULL); + next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL); if (next) WRITE_ONCE(next->locked, 1); } diff --git a/kernel/resource.c b/kernel/resource.c index 866ef3663a0b..91be1bc50b60 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1844,8 +1844,8 @@ get_free_mem_region(struct device *dev, struct resource *base, write_lock(&resource_lock); for (addr = gfr_start(base, size, align, flags); - gfr_continue(base, addr, size, flags); - addr = gfr_next(addr, size, flags)) { + gfr_continue(base, addr, align, flags); + addr = gfr_next(addr, align, flags)) { if (__region_intersects(base, addr, size, 0, IORES_DESC_NONE) != REGION_DISJOINT) continue; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e1a6e3c675c0..9a846439b36a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -201,6 +201,20 @@ COND_SYSCALL(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time32); COND_SYSCALL_COMPAT(recvmmsg_time64); +/* Posix timer syscalls may be configured out */ +COND_SYSCALL(timer_create); +COND_SYSCALL(timer_gettime); +COND_SYSCALL(timer_getoverrun); +COND_SYSCALL(timer_settime); +COND_SYSCALL(timer_delete); +COND_SYSCALL(clock_adjtime); +COND_SYSCALL(getitimer); +COND_SYSCALL(setitimer); +COND_SYSCALL(alarm); +COND_SYSCALL_COMPAT(timer_create); +COND_SYSCALL_COMPAT(getitimer); +COND_SYSCALL_COMPAT(setitimer); + /* * Architecture specific syscalls: see further below */ diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index 828aeecbd1e8..9b6fcb8d85e7 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -17,40 +17,6 @@ #include <linux/time_namespace.h> #include <linux/compat.h> -#ifdef CONFIG_ARCH_HAS_SYSCALL_WRAPPER -/* Architectures may override SYS_NI and COMPAT_SYS_NI */ -#include <asm/syscall_wrapper.h> -#endif - -asmlinkage long sys_ni_posix_timers(void) -{ - pr_err_once("process %d (%s) attempted a POSIX timer syscall " - "while CONFIG_POSIX_TIMERS is not set\n", - current->pid, current->comm); - return -ENOSYS; -} - -#ifndef SYS_NI -#define SYS_NI(name) SYSCALL_ALIAS(sys_##name, sys_ni_posix_timers) -#endif - -#ifndef COMPAT_SYS_NI -#define COMPAT_SYS_NI(name) SYSCALL_ALIAS(compat_sys_##name, sys_ni_posix_timers) -#endif - -SYS_NI(timer_create); -SYS_NI(timer_gettime); -SYS_NI(timer_getoverrun); -SYS_NI(timer_settime); -SYS_NI(timer_delete); -SYS_NI(clock_adjtime); -SYS_NI(getitimer); -SYS_NI(setitimer); -SYS_NI(clock_adjtime32); -#ifdef __ARCH_WANT_SYS_ALARM -SYS_NI(alarm); -#endif - /* * We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC * as it is easy to remain compatible with little code. CLOCK_BOOTTIME @@ -158,18 +124,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, which_clock); } -#ifdef CONFIG_COMPAT -COMPAT_SYS_NI(timer_create); -#endif - -#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -COMPAT_SYS_NI(getitimer); -COMPAT_SYS_NI(setitimer); -#endif - #ifdef CONFIG_COMPAT_32BIT_TIME -SYS_NI(timer_settime32); -SYS_NI(timer_gettime32); SYSCALL_DEFINE2(clock_settime32, const clockid_t, which_clock, struct old_timespec32 __user *, tp) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8de8bec5f366..b01ae7d36021 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash, hash->count++; } -static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +static struct ftrace_func_entry * +add_hash_entry(struct ftrace_hash *hash, unsigned long ip) { struct ftrace_func_entry *entry; entry = kmalloc(sizeof(*entry), GFP_KERNEL); if (!entry) - return -ENOMEM; + return NULL; entry->ip = ip; __add_hash_entry(hash, entry); - return 0; + return entry; } static void @@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) struct ftrace_func_entry *entry; struct ftrace_hash *new_hash; int size; - int ret; int i; new_hash = alloc_ftrace_hash(size_bits); @@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - ret = add_hash_entry(new_hash, entry->ip); - if (ret < 0) + if (add_hash_entry(new_hash, entry->ip) == NULL) goto free_hash; } } @@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS /* Protected by rcu_tasks for reading, and direct_mutex for writing */ -static struct ftrace_hash *direct_functions = EMPTY_HASH; +static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); int ftrace_direct_func_count; @@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } -static struct ftrace_func_entry* -ftrace_add_rec_direct(unsigned long ip, unsigned long addr, - struct ftrace_hash **free_hash) -{ - struct ftrace_func_entry *entry; - - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - return NULL; - - *free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return NULL; - - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); - return entry; -} - static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter) /* Do nothing if it exists */ if (entry) return 0; - - ret = add_hash_entry(hash, rec->ip); + if (add_hash_entry(hash, rec->ip) == NULL) + ret = -ENOMEM; } return ret; } @@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return 0; } - return add_hash_entry(hash, ip); + entry = add_hash_entry(hash, ip); + return entry ? 0 : -ENOMEM; } static int @@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long */ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL; struct ftrace_func_entry *entry, *new; int err = -EBUSY, size, i; @@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) } } - /* ... and insert them to direct_functions hash. */ err = -ENOMEM; + + /* Make a copy hash to place the new and the old entries in */ + size = hash->count + direct_functions->count; + if (size > 32) + size = 32; + new_hash = alloc_ftrace_hash(fls(size)); + if (!new_hash) + goto out_unlock; + + /* Now copy over the existing direct entries */ + size = 1 << direct_functions->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) { + new = add_hash_entry(new_hash, entry->ip); + if (!new) + goto out_unlock; + new->direct = entry->direct; + } + } + + /* ... and add the new entries */ + size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(entry, &hash->buckets[i], hlist) { - new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + new = add_hash_entry(new_hash, entry->ip); if (!new) - goto out_remove; + goto out_unlock; + /* Update both the copy and the hash entry */ + new->direct = addr; entry->direct = addr; } } + free_hash = direct_functions; + rcu_assign_pointer(direct_functions, new_hash); + new_hash = NULL; + ops->func = call_direct_funcs; ops->flags = MULTI_FLAGS; ops->trampoline = FTRACE_REGS_ADDR; @@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) err = register_ftrace_function_nolock(ops); - out_remove: - if (err) - remove_direct_functions_hash(hash, addr); - out_unlock: mutex_unlock(&direct_mutex); - if (free_hash) { + if (free_hash && free_hash != EMPTY_HASH) { synchronize_rcu_tasks(); free_ftrace_hash(free_hash); } + + if (new_hash) + free_ftrace_hash(new_hash); + return err; } EXPORT_SYMBOL_GPL(register_ftrace_direct); @@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer) if (entry) continue; - if (add_hash_entry(hash, rec->ip) < 0) + if (add_hash_entry(hash, rec->ip) == NULL) goto out; } else { if (entry) { diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index 6fd7d4ecbbc6..fa03094e9e69 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -48,7 +48,7 @@ static void rethook_free_rcu(struct rcu_head *head) */ void rethook_stop(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rcu_assign_pointer(rh->handler, NULL); } /** @@ -63,7 +63,7 @@ void rethook_stop(struct rethook *rh) */ void rethook_free(struct rethook *rh) { - WRITE_ONCE(rh->handler, NULL); + rethook_stop(rh); call_rcu(&rh->rcu, rethook_free_rcu); } @@ -82,6 +82,12 @@ static int rethook_fini_pool(struct objpool_head *head, void *context) return 0; } +static inline rethook_handler_t rethook_get_handler(struct rethook *rh) +{ + return (rethook_handler_t)rcu_dereference_check(rh->handler, + rcu_read_lock_any_held()); +} + /** * rethook_alloc() - Allocate struct rethook. * @data: a data to pass the @handler when hooking the return. @@ -107,7 +113,7 @@ struct rethook *rethook_alloc(void *data, rethook_handler_t handler, return ERR_PTR(-ENOMEM); rh->data = data; - rh->handler = handler; + rcu_assign_pointer(rh->handler, handler); /* initialize the objpool for rethook nodes */ if (objpool_init(&rh->pool, num, size, GFP_KERNEL, rh, @@ -135,9 +141,10 @@ static void free_rethook_node_rcu(struct rcu_head *head) */ void rethook_recycle(struct rethook_node *node) { - lockdep_assert_preemption_disabled(); + rethook_handler_t handler; - if (likely(READ_ONCE(node->rethook->handler))) + handler = rethook_get_handler(node->rethook); + if (likely(handler)) objpool_push(node, &node->rethook->pool); else call_rcu(&node->rcu, free_rethook_node_rcu); @@ -153,9 +160,7 @@ NOKPROBE_SYMBOL(rethook_recycle); */ struct rethook_node *rethook_try_get(struct rethook *rh) { - rethook_handler_t handler = READ_ONCE(rh->handler); - - lockdep_assert_preemption_disabled(); + rethook_handler_t handler = rethook_get_handler(rh); /* Check whether @rh is going to be freed. */ if (unlikely(!handler)) @@ -300,7 +305,7 @@ unsigned long rethook_trampoline_handler(struct pt_regs *regs, rhn = container_of(first, struct rethook_node, llist); if (WARN_ON_ONCE(rhn->frame != frame)) break; - handler = READ_ONCE(rhn->rethook->handler); + handler = rethook_get_handler(rhn->rethook); if (handler) handler(rhn, rhn->rethook->data, correct_ret_addr, regs); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 43cc47d7faaf..9286f88fcd32 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) *cnt = rb_time_cnt(top); - /* If top and bottom counts don't match, this interrupted a write */ - if (*cnt != rb_time_cnt(bottom)) + /* If top, msb or bottom counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) return false; /* The shift to msb will lose its cnt bits */ @@ -700,44 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) return local_try_cmpxchg(l, &expect, set); } -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - unsigned long cnt, top, bottom, msb; - unsigned long cnt2, top2, bottom2, msb2; - u64 val; - - /* The cmpxchg always fails if it interrupted an update */ - if (!__rb_time_read(t, &val, &cnt2)) - return false; - - if (val != expect) - return false; - - cnt = local_read(&t->cnt); - if ((cnt & 3) != cnt2) - return false; - - cnt2 = cnt + 1; - - rb_time_split(val, &top, &bottom, &msb); - top = rb_time_val_cnt(top, cnt); - bottom = rb_time_val_cnt(bottom, cnt); - - rb_time_split(set, &top2, &bottom2, &msb2); - top2 = rb_time_val_cnt(top2, cnt2); - bottom2 = rb_time_val_cnt(bottom2, cnt2); - - if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) - return false; - if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) - return false; - if (!rb_time_read_cmpxchg(&t->top, top, top2)) - return false; - if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) - return false; - return true; -} - #else /* 64 bits */ /* local64_t always succeeds */ @@ -751,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val) { local64_set(&t->time, val); } - -static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) -{ - return local64_try_cmpxchg(&t->time, &expect, set); -} #endif /* @@ -924,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f if (!nr_pages || !full) return true; - dirty = ring_buffer_nr_dirty_pages(buffer, cpu); + /* + * Add one as dirty will never equal nr_pages, as the sub-buffer + * that the writer is on is not counted as dirty. + * This is needed if "buffer_percent" is set to 100. + */ + dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1; - return (dirty * 100) > (full * nr_pages); + return (dirty * 100) >= (full * nr_pages); } /* @@ -987,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) /* make sure the waiters see the new index */ smp_wmb(); - rb_wake_up_waiters(&rbwork->work); + /* This can be called in any context */ + irq_work_queue(&rbwork->work); } /** @@ -1787,6 +1750,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) free_buffer_page(bpage); } + free_page((unsigned long)cpu_buffer->free_page); + kfree(cpu_buffer); } @@ -2407,7 +2372,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter) */ barrier(); - if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE) + if ((iter->head + length) > commit || length > BUF_PAGE_SIZE) /* Writer corrupted the read? */ goto reset; @@ -2981,25 +2946,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static u64 rb_time_delta(struct ring_buffer_event *event) -{ - switch (event->type_len) { - case RINGBUF_TYPE_PADDING: - return 0; - - case RINGBUF_TYPE_TIME_EXTEND: - return rb_event_time_stamp(event); - - case RINGBUF_TYPE_TIME_STAMP: - return 0; - - case RINGBUF_TYPE_DATA: - return event->time_delta; - default: - return 0; - } -} - static inline bool rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) @@ -3007,8 +2953,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, unsigned long new_index, old_index; struct buffer_page *bpage; unsigned long addr; - u64 write_stamp; - u64 delta; new_index = rb_event_index(event); old_index = new_index + rb_event_ts_length(event); @@ -3017,41 +2961,34 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, bpage = READ_ONCE(cpu_buffer->tail_page); - delta = rb_time_delta(event); - - if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) - return false; - - /* Make sure the write stamp is read before testing the location */ - barrier(); - + /* + * Make sure the tail_page is still the same and + * the next write location is the end of this event + */ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; unsigned long event_length = rb_event_length(event); - /* Something came in, can't discard */ - if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, - write_stamp, write_stamp - delta)) - return false; - /* - * It's possible that the event time delta is zero - * (has the same time stamp as the previous event) - * in which case write_stamp and before_stamp could - * be the same. In such a case, force before_stamp - * to be different than write_stamp. It doesn't - * matter what it is, as long as its different. + * For the before_stamp to be different than the write_stamp + * to make sure that the next event adds an absolute + * value and does not rely on the saved write stamp, which + * is now going to be bogus. + * + * By setting the before_stamp to zero, the next event + * is not going to use the write_stamp and will instead + * create an absolute timestamp. This means there's no + * reason to update the wirte_stamp! */ - if (!delta) - rb_time_set(&cpu_buffer->before_stamp, 0); + rb_time_set(&cpu_buffer->before_stamp, 0); /* * If an event were to come in now, it would see that the * write_stamp and the before_stamp are different, and assume * that this event just added itself before updating * the write stamp. The interrupting event will fix the - * write stamp for us, and use the before stamp as its delta. + * write stamp for us, and use an absolute timestamp. */ /* @@ -3488,7 +3425,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, return; /* - * If this interrupted another event, + * If this interrupted another event, */ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) goto out; @@ -3582,7 +3519,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * absolute timestamp. * Don't bother if this is the start of a new page (w == 0). */ - if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) { + if (!w) { + /* Use the sub-buffer timestamp */ + info->delta = 0; + } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) { info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND; info->length += RB_LEN_TIME_EXTEND; } else { @@ -3605,26 +3545,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - if (a_ok && b_ok) - check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } if (likely(tail == w)) { - u64 save_before; - bool s_ok; - /* Nothing interrupted us between A and C */ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts); - barrier(); - /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before); - RB_WARN_ON(cpu_buffer, !s_ok); + /* + * If something came in between C and D, the write stamp + * may now not be in sync. But that's fine as the before_stamp + * will be different and then next event will just be forced + * to use an absolute timestamp. + */ if (likely(!(info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)))) /* This did not interrupt any time update */ @@ -3632,41 +3565,40 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, else /* Just use full timestamp for interrupting event */ info->delta = info->ts; - barrier(); check_buffer(cpu_buffer, info, tail); - if (unlikely(info->ts != save_before)) { - /* SLOW PATH - Interrupted between C and E */ - - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - RB_WARN_ON(cpu_buffer, !a_ok); - - /* Write stamp must only go forward */ - if (save_before > info->after) { - /* - * We do not care about the result, only that - * it gets updated atomically. - */ - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, save_before); - } - } } else { u64 ts; /* SLOW PATH - Interrupted between A and C */ - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - /* Was interrupted before here, write_stamp must be valid */ + + /* Save the old before_stamp */ + a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); RB_WARN_ON(cpu_buffer, !a_ok); + + /* + * Read a new timestamp and update the before_stamp to make + * the next event after this one force using an absolute + * timestamp. This is in case an interrupt were to come in + * between E and F. + */ ts = rb_time_stamp(cpu_buffer->buffer); + rb_time_set(&cpu_buffer->before_stamp, ts); + barrier(); - /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts && - rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, ts)) { - /* Nothing came after this event between C and E */ + /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + /* Was interrupted before here, write_stamp must be valid */ + RB_WARN_ON(cpu_buffer, !a_ok); + barrier(); + /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && + info->after == info->before && info->after < ts) { + /* + * Nothing came after this event between C and F, it is + * safe to use info->after for the delta as it + * matched info->before and is still valid. + */ info->delta = ts - info->after; } else { /* - * Interrupted between C and E: + * Interrupted between C and F: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that @@ -3717,6 +3649,12 @@ rb_reserve_next_event(struct trace_buffer *buffer, int nr_loops = 0; int add_ts_default; + /* ring buffer does cmpxchg, make sure it is safe in NMI context */ + if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && + (unlikely(in_nmi()))) { + return NULL; + } + rb_start_commit(cpu_buffer); /* The commit page can not change after this */ @@ -3740,6 +3678,8 @@ rb_reserve_next_event(struct trace_buffer *buffer, if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) { add_ts_default = RB_ADD_STAMP_ABSOLUTE; info.length += RB_LEN_TIME_EXTEND; + if (info.length > BUF_MAX_DATA_SIZE) + goto out_fail; } else { add_ts_default = RB_ADD_STAMP_NONE; } @@ -5121,7 +5061,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags) if (!iter) return NULL; - iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags); + /* Holds the entire event: data and meta data */ + iter->event = kmalloc(BUF_PAGE_SIZE, flags); if (!iter->event) { kfree(iter); return NULL; diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index 8dfe85499d4a..354c2117be43 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void) ret = test_trace_synth_event(); WARN_ON(ret); + + /* Disable when done */ + trace_array_set_clr_event(gen_synth_test->tr, + "synthetic", + "gen_synth_test", false); + trace_array_set_clr_event(empty_synth_test->tr, + "synthetic", + "empty_synth_test", false); + trace_array_set_clr_event(create_synth_test->tr, + "synthetic", + "create_synth_test", false); out: return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9aebf904ff97..a0defe156b57 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&tr->max_lock); + + /* Any waiters on the old snapshot buffer need to wake up */ + ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS); } /** @@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static int wait_on_pipe(struct trace_iterator *iter, int full) { + int ret; + /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) return 0; - return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, - full); + ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full); + +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * Make sure this is still the snapshot buffer, as if a snapshot were + * to happen, this would now be the main buffer. + */ + if (iter->snapshot) + iter->array_buffer = &iter->tr->max_buffer; +#endif + return ret; } #ifdef CONFIG_FTRACE_STARTUP_TEST @@ -2360,13 +2374,7 @@ int is_tracing_stopped(void) return global_trace.stop_count; } -/** - * tracing_start - quick start of the tracer - * - * If tracing is enabled but was stopped by tracing_stop, - * this will start the tracer back up. - */ -void tracing_start(void) +static void tracing_start_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; @@ -2374,119 +2382,83 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (--global_trace.stop_count) { - if (global_trace.stop_count < 0) { + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (--tr->stop_count) { + if (WARN_ON_ONCE(tr->stop_count < 0)) { /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - global_trace.stop_count = 0; + tr->stop_count = 0; } goto out; } /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); - - out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); -} - -static void tracing_start_tr(struct trace_array *tr) -{ - struct trace_buffer *buffer; - unsigned long flags; - - if (tracing_disabled) - return; - - /* If global, we need to also start the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_start(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - - if (--tr->stop_count) { - if (tr->stop_count < 0) { - /* Someone screwed up their debugging */ - WARN_ON_ONCE(1); - tr->stop_count = 0; - } - goto out; - } - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_enable(buffer); + arch_spin_unlock(&tr->max_lock); out: raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** - * tracing_stop - quick stop of the tracer + * tracing_start - quick start of the tracer * - * Light weight way to stop tracing. Use in conjunction with - * tracing_start. + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. */ -void tracing_stop(void) +void tracing_start(void) + +{ + return tracing_start_tr(&global_trace); +} + +static void tracing_stop_tr(struct trace_array *tr) { struct trace_buffer *buffer; unsigned long flags; - raw_spin_lock_irqsave(&global_trace.start_lock, flags); - if (global_trace.stop_count++) + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) goto out; /* Prevent the buffers from switching */ - arch_spin_lock(&global_trace.max_lock); + arch_spin_lock(&tr->max_lock); - buffer = global_trace.array_buffer.buffer; + buffer = tr->array_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #ifdef CONFIG_TRACER_MAX_TRACE - buffer = global_trace.max_buffer.buffer; + buffer = tr->max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); #endif - arch_spin_unlock(&global_trace.max_lock); + arch_spin_unlock(&tr->max_lock); out: - raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } -static void tracing_stop_tr(struct trace_array *tr) +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. + */ +void tracing_stop(void) { - struct trace_buffer *buffer; - unsigned long flags; - - /* If global, we need to also stop the max tracer */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) - return tracing_stop(); - - raw_spin_lock_irqsave(&tr->start_lock, flags); - if (tr->stop_count++) - goto out; - - buffer = tr->array_buffer.buffer; - if (buffer) - ring_buffer_record_disable(buffer); - - out: - raw_spin_unlock_irqrestore(&tr->start_lock, flags); + return tracing_stop_tr(&global_trace); } static int trace_save_cmdline(struct task_struct *tsk) @@ -2770,8 +2742,11 @@ void trace_buffered_event_enable(void) for_each_tracing_cpu(cpu) { page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); - if (!page) - goto failed; + /* This is just an optimization and can handle failures */ + if (!page) { + pr_err("Failed to allocate event buffer\n"); + break; + } event = page_address(page); memset(event, 0, sizeof(*event)); @@ -2785,10 +2760,6 @@ void trace_buffered_event_enable(void) WARN_ON_ONCE(1); preempt_enable(); } - - return; - failed: - trace_buffered_event_disable(); } static void enable_trace_buffered_event(void *data) @@ -2823,11 +2794,9 @@ void trace_buffered_event_disable(void) if (--trace_buffered_event_ref) return; - preempt_disable(); /* For each CPU, set the buffer as used. */ - smp_call_function_many(tracing_buffer_mask, - disable_trace_buffered_event, NULL, 1); - preempt_enable(); + on_each_cpu_mask(tracing_buffer_mask, disable_trace_buffered_event, + NULL, true); /* Wait for all current users to finish */ synchronize_rcu(); @@ -2836,17 +2805,19 @@ void trace_buffered_event_disable(void) free_page((unsigned long)per_cpu(trace_buffered_event, cpu)); per_cpu(trace_buffered_event, cpu) = NULL; } + /* - * Make sure trace_buffered_event is NULL before clearing - * trace_buffered_event_cnt. + * Wait for all CPUs that potentially started checking if they can use + * their event buffer only after the previous synchronize_rcu() call and + * they still read a valid pointer from trace_buffered_event. It must be + * ensured they don't see cleared trace_buffered_event_cnt else they + * could wrongly decide to use the pointed-to buffer which is now freed. */ - smp_wmb(); + synchronize_rcu(); - preempt_disable(); - /* Do the work on each cpu */ - smp_call_function_many(tracing_buffer_mask, - enable_trace_buffered_event, NULL, 1); - preempt_enable(); + /* For each CPU, relinquish the buffer */ + on_each_cpu_mask(tracing_buffer_mask, enable_trace_buffered_event, NULL, + true); } static struct trace_buffer *temp_buffer; @@ -4765,7 +4736,11 @@ static int s_show(struct seq_file *m, void *v) iter->leftover = ret; } else { - print_trace_line(iter); + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.full = 0; + trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n"); + } ret = trace_print_seq(m, &iter->seq); /* * If we overflow the seq_file buffer, then it will @@ -5007,6 +4982,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp) return 0; } +int tracing_single_release_file_tr(struct inode *inode, struct file *filp) +{ + tracing_release_file_tr(inode, filp); + return single_release(inode, filp); +} + static int tracing_mark_open(struct inode *inode, struct file *filp) { stream_open(inode, filp); @@ -6387,13 +6368,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (!tr->array_buffer.buffer) return 0; + /* Do not allow tracing while resizing ring buffer */ + tracing_stop_tr(tr); + ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu); if (ret < 0) - return ret; + goto out_start; #ifdef CONFIG_TRACER_MAX_TRACE - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || - !tr->current_trace->use_max_tr) + if (!tr->allocated_snapshot) goto out; ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); @@ -6418,7 +6401,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, WARN_ON(1); tracing_disabled = 1; } - return ret; + goto out_start; } update_buffer_entries(&tr->max_buffer, cpu); @@ -6427,7 +6410,8 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, #endif /* CONFIG_TRACER_MAX_TRACE */ update_buffer_entries(&tr->array_buffer, cpu); - + out_start: + tracing_start_tr(tr); return ret; } @@ -8547,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, wait_index = READ_ONCE(iter->wait_index); - ret = wait_on_pipe(iter, iter->tr->buffer_percent); + ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent); if (ret) goto out; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b7f4ea25a194..0489e72c8169 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); int tracing_open_file_tr(struct inode *inode, struct file *filp); int tracing_release_file_tr(struct inode *inode, struct file *filp); +int tracing_single_release_file_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1abc07fba1b9..5ecf3c8bde20 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5623,10 +5623,12 @@ static int event_hist_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_show, file); } @@ -5634,7 +5636,7 @@ const struct file_operations event_hist_fops = { .open = event_hist_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #ifdef CONFIG_HIST_TRIGGERS_DEBUG @@ -5900,10 +5902,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file) { int ret; - ret = security_locked_down(LOCKDOWN_TRACEFS); + ret = tracing_open_file_tr(inode, file); if (ret) return ret; + /* Clear private_data to avoid warning in single_open() */ + file->private_data = NULL; return single_open(file, hist_debug_show, file); } @@ -5911,7 +5915,7 @@ const struct file_operations event_hist_debug_fops = { .open = event_hist_debug_open, .read = seq_read, .llseek = seq_lseek, - .release = single_release, + .release = tracing_single_release_file_tr, }; #endif diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 846e02c0fb59..e7af286af4f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields); * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event * @mod: The module creating the event, NULL if not created from a module - * @args: Variable number of arg (pairs), one pair for each field + * @...: Variable number of arg (pairs), one pair for each field * * NOTE: Users normally won't want to call this function directly, but * rather use the synth_event_gen_cmd_start() wrapper, which @@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state) * synth_event_trace - Trace a synthetic event * @file: The trace_event_file representing the synthetic event * @n_vals: The number of values in vals - * @args: Variable number of args containing the event values + * @...: Variable number of args containing the event values * * Trace a synthetic event using the values passed in the variable * argument list. diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d8b302d01083..3e7fa44dc2b2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, { struct print_entry *field; struct trace_seq *s = &iter->seq; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); seq_print_ip_sym(s, field->ip, flags); - trace_seq_printf(s, ": %s", field->buf); + trace_seq_printf(s, ": %.*s", max, field->buf); return trace_handle_return(s); } @@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, struct trace_event *event) { struct print_entry *field; + int max = iter->ent_size - offsetof(struct print_entry, buf); trace_assign_type(field, iter->ent); - trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf); + trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf); return trace_handle_return(&iter->seq); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..2989b57e154a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu) pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } - if (cpumask_empty(wq_unbound_cpumask)) - return cpu; - new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { @@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) +{ + if (!cpumask_intersects(wq_unbound_cpumask, mask)) { + pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", + cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); + return; + } + + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); +} + /** * workqueue_init_early - early init for workqueue subsystem * @@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); + restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |