diff options
Diffstat (limited to 'kernel')
39 files changed, 469 insertions, 264 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b1f66480135b..14750e7c5ee4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -26,8 +26,10 @@ static void bpf_array_free_percpu(struct bpf_array *array) { int i; - for (i = 0; i < array->map.max_entries; i++) + for (i = 0; i < array->map.max_entries; i++) { free_percpu(array->pptrs[i]); + cond_resched(); + } } static int bpf_array_alloc_percpu(struct bpf_array *array) @@ -43,6 +45,7 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) return -ENOMEM; } array->pptrs[i] = ptr; + cond_resched(); } return 0; @@ -73,11 +76,11 @@ static int array_map_alloc_check(union bpf_attr *attr) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; - int numa_node = bpf_map_attr_numa_node(attr); + int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool unpriv = !capable(CAP_SYS_ADMIN); + u64 cost, array_size, mask64; struct bpf_array *array; - u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -109,8 +112,19 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ - if (array_size >= U32_MAX - PAGE_SIZE) + cost = array_size; + if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); + if (percpu) { + cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-ENOMEM); + } + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); @@ -121,20 +135,13 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); + array->map.pages = cost; array->elem_size = elem_size; - if (!percpu) - goto out; - - array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); - - if (array_size >= U32_MAX - PAGE_SIZE || - bpf_array_alloc_percpu(array)) { + if (percpu && bpf_array_alloc_percpu(array)) { bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } -out: - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; return &array->map; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 29ca9208dcfa..d315b393abdd 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1590,7 +1590,7 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, * so always copy 'cnt' prog_ids to the user. * In a rare race the user will see zero prog_ids */ - ids = kcalloc(cnt, sizeof(u32), GFP_USER); + ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; rcu_read_lock(); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index fbfdada6caee..a4bb0b34375a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -334,7 +334,7 @@ static int cpu_map_kthread_run(void *data) static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) { - gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; int numa, err; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 7b469d10d0e9..b4b5b81e7251 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -555,7 +555,10 @@ static void trie_free(struct bpf_map *map) struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; - raw_spin_lock(&trie->lock); + /* Wait for outstanding programs to complete + * update/lookup/delete/get_next_key and free the trie. + */ + synchronize_rcu(); /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent @@ -566,10 +569,9 @@ static void trie_free(struct bpf_map *map) slot = &trie->root; for (;;) { - node = rcu_dereference_protected(*slot, - lockdep_is_held(&trie->lock)); + node = rcu_dereference_protected(*slot, 1); if (!node) - goto unlock; + goto out; if (rcu_access_pointer(node->child[0])) { slot = &node->child[0]; @@ -587,8 +589,8 @@ static void trie_free(struct bpf_map *map) } } -unlock: - raw_spin_unlock(&trie->lock); +out: + kfree(trie); } static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 48c33417d13c..a927e89dad6e 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -521,8 +521,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; - int err = -EINVAL; u64 cost; + int err; if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); @@ -547,6 +547,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); + err = -EINVAL; if (cost >= U32_MAX - PAGE_SIZE) goto free_stab; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e24aa3241387..43f95d190eea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1845,7 +1845,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr = {}; int err; - if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) + if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; err = check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5fb69a85d967..c6eff108aa99 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1356,6 +1356,13 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_CTX; } +static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return type_is_pkt_pointer(reg->type); +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1416,10 +1423,10 @@ static int check_generic_ptr_alignment(struct bpf_verifier_env *env, } static int check_ptr_alignment(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) + const struct bpf_reg_state *reg, int off, + int size, bool strict_alignment_once) { - bool strict = env->strict_alignment; + bool strict = env->strict_alignment || strict_alignment_once; const char *pointer_desc = ""; switch (reg->type) { @@ -1576,9 +1583,9 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) * if t==write && value_regno==-1, some unknown value is stored into memory * if t==read && value_regno==-1, don't care what we read from memory */ -static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off, - int bpf_size, enum bpf_access_type t, - int value_regno) +static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, + int off, int bpf_size, enum bpf_access_type t, + int value_regno, bool strict_alignment_once) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; @@ -1590,7 +1597,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return size; /* alignment checks will add in reg->off themselves */ - err = check_ptr_alignment(env, reg, off, size); + err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); if (err) return err; @@ -1735,21 +1742,23 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d context is not allowed\n", - insn->dst_reg); + if (is_ctx_reg(env, insn->dst_reg) || + is_pkt_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + insn->dst_reg, is_ctx_reg(env, insn->dst_reg) ? + "context" : "packet"); return -EACCES; } /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1); + BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; /* check whether atomic_add can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1); + BPF_SIZE(insn->code), BPF_WRITE, -1, true); } /* when register 'regno' is passed into function that will read 'access_size' @@ -2388,7 +2397,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn * is inferred from register state. */ for (i = 0; i < meta.access_size; i++) { - err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1); + err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, + BPF_WRITE, -1, false); if (err) return err; } @@ -4632,7 +4642,7 @@ static int do_check(struct bpf_verifier_env *env) */ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg); + insn->dst_reg, false); if (err) return err; @@ -4684,7 +4694,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg); + insn->src_reg, false); if (err) return err; @@ -4719,7 +4729,7 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, - -1); + -1, false); if (err) return err; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8cda3bc3ae22..4bfb2908ec15 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp) if (cgroup_is_threaded(cgrp)) return 0; + /* + * If @cgroup is populated or has domain controllers enabled, it + * can't be switched. While the below cgroup_can_be_thread_root() + * test can catch the same conditions, that's only when @parent is + * not mixable, so let's check it explicitly. + */ + if (cgroup_is_populated(cgrp) || + cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask) + return -EOPNOTSUPP; + /* we're joining the parent's domain, ensure its validity */ if (!cgroup_is_valid_domain(dom_cgrp) || !cgroup_can_be_thread_root(dom_cgrp)) diff --git a/kernel/compat.c b/kernel/compat.c index 3247fe761f60..3f5fa8902e7d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -488,25 +488,6 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) } EXPORT_SYMBOL_GPL(get_compat_sigset); -int -put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set, - unsigned int size) -{ - /* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */ -#ifdef __BIG_ENDIAN - compat_sigset_t v; - switch (_NSIG_WORDS) { - case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3]; - case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2]; - case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1]; - case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0]; - } - return copy_to_user(compat, &v, size) ? -EFAULT : 0; -#else - return copy_to_user(compat, set, size) ? -EFAULT : 0; -#endif -} - #ifdef CONFIG_NUMA COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, compat_uptr_t __user *, pages32, diff --git a/kernel/events/core.c b/kernel/events/core.c index 96db9ae5d5af..709a55b9ad97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -724,9 +724,15 @@ static inline void __update_cgrp_time(struct perf_cgroup *cgrp) static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp_out = cpuctx->cgrp; - if (cgrp_out) - __update_cgrp_time(cgrp_out); + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; + + if (cgrp) { + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + __update_cgrp_time(cgrp); + } + } } static inline void update_cgrp_time_from_event(struct perf_event *event) @@ -754,6 +760,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, { struct perf_cgroup *cgrp; struct perf_cgroup_info *info; + struct cgroup_subsys_state *css; /* * ctx->lock held by caller @@ -764,8 +771,12 @@ perf_cgroup_set_timestamp(struct task_struct *task, return; cgrp = perf_cgroup_from_task(task, ctx); - info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; + } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); @@ -2246,7 +2257,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type = event_type & EVENT_ALL; + enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2256,6 +2267,8 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; + ctx_event_type = event_type & EVENT_ALL; + perf_pmu_disable(cpuctx->ctx.pmu); if (task_ctx) task_ctx_sched_out(cpuctx, task_ctx, event_type); diff --git a/kernel/extable.c b/kernel/extable.c index a17fdb63dc3e..6a5b61ebc66c 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -static inline int init_kernel_text(unsigned long addr) +int init_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_sinittext && addr < (unsigned long)_einittext) diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 21b0122cb39c..1d5632d8bbcc 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -14,6 +14,15 @@ static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); +static void fei_post_handler(struct kprobe *kp, struct pt_regs *regs, + unsigned long flags) +{ + /* + * A dummy post handler is required to prohibit optimizing, because + * jump optimization does not support execution path overriding. + */ +} + struct fei_attr { struct list_head list; struct kprobe kp; @@ -56,6 +65,7 @@ static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) return NULL; } attr->kp.pre_handler = fei_kprobe_handler; + attr->kp.post_handler = fei_post_handler; attr->retval = adjust_error_retval(addr, 0); INIT_LIST_HEAD(&attr->list); } diff --git a/kernel/fork.c b/kernel/fork.c index be8aa5b98666..e5d9d405ae4e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -592,7 +592,7 @@ static void check_mm(struct mm_struct *mm) * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ -static void __mmdrop(struct mm_struct *mm) +void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); mm_free_pgd(mm); @@ -603,18 +603,7 @@ static void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); free_mm(mm); } - -void mmdrop(struct mm_struct *mm) -{ - /* - * The implicit full barrier implied by atomic_dec_and_test() is - * required by the membarrier system call before returning to - * user-space, after storing to rq->curr. - */ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} -EXPORT_SYMBOL_GPL(mmdrop); +EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e6a9c36470ee..82b8b18ee1eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1726,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p) irq_domain_debug_show_one(m, d, 0); return 0; } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { - .open = irq_domain_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir || d->debugfs_file) return; d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &dfs_domain_ops); + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1760,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root) if (!domain_dir) return; - debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + debugfs_create_file("default", 0444, domain_dir, NULL, + &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 5187dfe809ac..4c5770407031 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -16,6 +16,7 @@ struct cpumap { unsigned int available; unsigned int allocated; unsigned int managed; + bool initialized; bool online; unsigned long alloc_map[IRQ_MATRIX_SIZE]; unsigned long managed_map[IRQ_MATRIX_SIZE]; @@ -81,9 +82,11 @@ void irq_matrix_online(struct irq_matrix *m) BUG_ON(cm->online); - bitmap_zero(cm->alloc_map, m->matrix_bits); - cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc); - cm->allocated = 0; + if (!cm->initialized) { + cm->available = m->alloc_size; + cm->available -= cm->managed + m->systembits_inalloc; + cm->initialized = true; + } m->global_available += cm->available; cm->online = true; m->online_maps++; @@ -370,14 +373,16 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu, if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end)) return; - if (cm->online) { - clear_bit(bit, cm->alloc_map); - cm->allocated--; + clear_bit(bit, cm->alloc_map); + cm->allocated--; + + if (cm->online) m->total_allocated--; - if (!managed) { - cm->available++; + + if (!managed) { + cm->available++; + if (cm->online) m->global_available++; - } } trace_irq_matrix_free(bit, cpu, m, cm); } diff --git a/kernel/jump_label.c b/kernel/jump_label.c index b4517095db6a..01ebdf1f9f40 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -16,6 +16,7 @@ #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> +#include <asm/sections.h> #ifdef HAVE_JUMP_LABEL @@ -366,12 +367,16 @@ static void __jump_label_update(struct static_key *key, { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { /* - * entry->code set to 0 invalidates module init text sections - * kernel_text_address() verifies we are not in core kernel - * init code, see jump_label_invalidate_module_init(). + * An entry->code of 0 indicates an entry which has been + * disabled because it was in an init text area. */ - if (entry->code && kernel_text_address(entry->code)) - arch_jump_label_transform(entry, jump_label_type(entry)); + if (entry->code) { + if (kernel_text_address(entry->code)) + arch_jump_label_transform(entry, jump_label_type(entry)); + else + WARN_ONCE(1, "can't patch jump_label at %pS", + (void *)(unsigned long)entry->code); + } } } @@ -417,6 +422,19 @@ void __init jump_label_init(void) cpus_read_unlock(); } +/* Disable any jump label entries in __init/__exit code */ +void __init jump_label_invalidate_initmem(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (init_section_contains((void *)(unsigned long)iter->code, 1)) + iter->code = 0; + } +} + #ifdef CONFIG_MODULES static enum jump_label_type jump_label_init_type(struct jump_entry *entry) @@ -633,6 +651,7 @@ static void jump_label_del_module(struct module *mod) } } +/* Disable any jump label entries in module init code */ static void jump_label_invalidate_module_init(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,67 +978,90 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); - WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); - kprobe_ftrace_enabled++; - if (kprobe_ftrace_enabled == 1) { + if (ret) { + pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; + } + + if (kprobe_ftrace_enabled == 0) { ret = register_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (ret) { + pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + goto err_ftrace; + } } + + kprobe_ftrace_enabled++; + return ret; + +err_ftrace: + /* + * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a + * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental + * empty filter_hash which would undesirably trace all functions. + */ + ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + return ret; } /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; - kprobe_ftrace_enabled--; - if (kprobe_ftrace_enabled == 0) { + if (kprobe_ftrace_enabled == 1) { ret = unregister_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + return ret; } + + kprobe_ftrace_enabled--; + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) do {} while (0) -#define disarm_kprobe_ftrace(p) do {} while (0) +#define arm_kprobe_ftrace(p) (-ENODEV) +#define disarm_kprobe_ftrace(p) (-ENODEV) #endif /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp) { - if (unlikely(kprobe_ftrace(kp))) { - arm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return arm_kprobe_ftrace(kp); + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt) { - if (unlikely(kprobe_ftrace(kp))) { - disarm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* @@ -1362,9 +1385,15 @@ out: if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ - arm_kprobe(ap); + ret = arm_kprobe(ap); + if (ret) { + ap->flags |= KPROBE_FLAG_DISABLED; + list_del_rcu(&p->list); + synchronize_sched(); + } + } } return ret; } @@ -1573,8 +1602,14 @@ int register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arm_kprobe(p); + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_sched(); + goto out; + } + } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); @@ -1608,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap) static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; + int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) - return NULL; + return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ @@ -1626,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) * should have already been disarmed, so * skip unneed disarming process. */ - if (!kprobes_all_disarmed) - disarm_kprobe(orig_p, true); + if (!kprobes_all_disarmed) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); + } + } orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1644,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p) /* Disable kprobe. This will disarm it if needed. */ ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; + if (IS_ERR(ap)) + return PTR_ERR(ap); if (ap == p) /* @@ -2078,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p) int disable_kprobe(struct kprobe *kp) { int ret = 0; + struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; + p = __disable_kprobe(kp); + if (IS_ERR(p)) + ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; @@ -2116,7 +2159,9 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); + ret = arm_kprobe(p); + if (ret) + p->flags |= KPROBE_FLAG_DISABLED; } out: mutex_unlock(&kprobe_mutex); @@ -2407,11 +2452,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); @@ -2428,46 +2474,74 @@ static void arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) - if (!kprobe_disabled(p)) - arm_kprobe(p); + /* Arm all kprobes on a best-effort basis */ + hlist_for_each_entry_rcu(p, head, hlist) { + if (!kprobe_disabled(p)) { + err = arm_kprobe(p); + if (err) { + errors++; + ret = err; + } + total++; + } + } } - printk(KERN_INFO "Kprobes globally enabled\n"); + if (errors) + pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); - return; + return ret; } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); - return; + return 0; } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; + /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry_rcu(p, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - disarm_kprobe(p, false); + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { + err = disarm_kprobe(p, false); + if (err) { + errors++; + ret = err; + } + total++; + } } } + + if (errors) + pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally disabled\n"); + mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); + + return ret; } /* @@ -2494,6 +2568,7 @@ static ssize_t write_enabled_file_bool(struct file *file, { char buf[32]; size_t buf_size; + int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,17 +2579,20 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - arm_all_kprobes(); + ret = arm_all_kprobes(); break; case 'n': case 'N': case '0': - disarm_all_kprobes(); + ret = disarm_all_kprobes(); break; default: return -EINVAL; } + if (ret) + return ret; + return count; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 858a07590e39..2048359f33d2 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -1082,15 +1082,16 @@ static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock); /** - * mutex_lock_interruptible - acquire the mutex, interruptible - * @lock: the mutex to be acquired + * mutex_lock_interruptible() - Acquire the mutex, interruptible by signals. + * @lock: The mutex to be acquired. * - * Lock the mutex like mutex_lock(), and return 0 if the mutex has - * been acquired or sleep until the mutex becomes available. If a - * signal arrives while waiting for the lock then this function - * returns -EINTR. + * Lock the mutex like mutex_lock(). If a signal is delivered while the + * process is sleeping, this function will return without acquiring the + * mutex. * - * This function is similar to (but not equivalent to) down_interruptible(). + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * signal arrived. */ int __sched mutex_lock_interruptible(struct mutex *lock) { @@ -1104,6 +1105,18 @@ int __sched mutex_lock_interruptible(struct mutex *lock) EXPORT_SYMBOL(mutex_lock_interruptible); +/** + * mutex_lock_killable() - Acquire the mutex, interruptible by fatal signals. + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). If a signal which will be fatal to + * the current process is delivered while the process is sleeping, this + * function will return without acquiring the mutex. + * + * Context: Process context. + * Return: 0 if the lock was successfully acquired or %-EINTR if a + * fatal signal arrived. + */ int __sched mutex_lock_killable(struct mutex *lock) { might_sleep(); @@ -1115,6 +1128,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +/** + * mutex_lock_io() - Acquire the mutex and mark the process as waiting for I/O + * @lock: The mutex to be acquired. + * + * Lock the mutex like mutex_lock(). While the task is waiting for this + * mutex, it will be accounted as being in the IO wait state by the + * scheduler. + * + * Context: Process context. + */ void __sched mutex_lock_io(struct mutex *lock) { int token; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38ece035039e..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,6 +379,14 @@ queue: tail = encode_tail(smp_processor_id(), idx); node += idx; + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); @@ -408,14 +416,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* - * The above xchg_tail() is also a load of @lock which - * generates, through decode_tail(), a pointer. The address - * dependency matches the RELEASE of xchg_tail() such that - * the subsequent access to @prev happens after. + * We must ensure that the stores to @node are observed before + * the write to prev->next. The address dependency from + * xchg_tail is not sufficient to ensure this because the read + * component of xchg_tail is unordered with respect to the + * initialisation of @node. */ - - WRITE_ONCE(prev->next, node); + smp_store_release(&prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 65cc0cb984e6..940633c63254 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1616,11 +1616,12 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) { DEFINE_WAKE_Q(wake_q); + unsigned long flags; bool postunlock; - raw_spin_lock_irq(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); postunlock = __rt_mutex_futex_unlock(lock, &wake_q); - raw_spin_unlock_irq(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (postunlock) rt_mutex_postunlock(&wake_q); diff --git a/kernel/memremap.c b/kernel/memremap.c index 4849be5f9b3c..895e6b76b25e 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -275,8 +275,15 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap) return (res->start + resource_size(res)) >> PAGE_SHIFT; } +static unsigned long pfn_next(unsigned long pfn) +{ + if (pfn % 1024 == 0) + cond_resched(); + return pfn + 1; +} + #define for_each_device_pfn(pfn, map) \ - for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn)) static void devm_memremap_pages_release(void *data) { @@ -337,10 +344,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) resource_size_t align_start, align_size, align_end; struct vmem_altmap *altmap = pgmap->altmap_valid ? &pgmap->altmap : NULL; + struct resource *res = &pgmap->res; unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; - int error, nid, is_ram, i = 0; - struct resource *res = &pgmap->res; + int error, nid, is_ram; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -409,8 +416,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) list_del(&page->lru); page->pgmap = pgmap; percpu_ref_get(pgmap->ref); - if (!(++i % 1024)) - cond_resched(); } devm_add_action(dev, devm_memremap_pages_release, pgmap); @@ -422,7 +427,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) err_pfn_remap: err_radix: pgmap_radix_release(res, pgoff); - devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); diff --git a/kernel/module.c b/kernel/module.c index ad2d420024f6..e42764acedb4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4228,7 +4228,7 @@ static int modules_open(struct inode *inode, struct file *file) m->private = kallsyms_show_value() ? NULL : (void *)8ul; } - return 0; + return err; } static const struct file_operations proc_modules_operations = { diff --git a/kernel/panic.c b/kernel/panic.c index 2cfef408fec9..4b794f1d8561 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -640,7 +640,7 @@ device_initcall(register_warn_debugfs); */ __visible void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted in: %p\n", + panic("stack-protector: Kernel stack is corrupted in: %pB\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fc1123583fa6..f274fbef821d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2397,7 +2397,7 @@ skip: if (console_lock_spinning_disable_and_check()) { printk_safe_exit_irqrestore(flags); - return; + goto out; } printk_safe_exit_irqrestore(flags); @@ -2430,6 +2430,7 @@ skip: if (retry && console_trylock()) goto again; +out: if (wake_klogd) wake_up_klogd(); } diff --git a/kernel/relay.c b/kernel/relay.c index c3029402f15c..c955b10c973c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -163,7 +163,7 @@ static struct rchan_buf *relay_create_buf(struct rchan *chan) { struct rchan_buf *buf; - if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) + if (chan->n_subbufs > KMALLOC_MAX_SIZE / sizeof(size_t *)) return NULL; buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf724c1952ea..c94895bc5a2c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev) #endif } -static inline void finish_lock_switch(struct rq *rq) +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + rq_unpin_lock(rq, rf); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->lock.owner = next; #endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); } @@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -6678,13 +6683,18 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) parent_quota = parent_b->hierarchical_quota; /* - * Ensure max(child_quota) <= parent_quota, inherit when no + * Ensure max(child_quota) <= parent_quota. On cgroup2, + * always take the min. On cgroup1, only inherit when no * limit is set: */ - if (quota == RUNTIME_INF) - quota = parent_quota; - else if (parent_quota != RUNTIME_INF && quota > parent_quota) - return -EINVAL; + if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { + quota = min(quota, parent_quota); + } else { + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF && quota > parent_quota) + return -EINVAL; + } } cfs_b->hierarchical_quota = quota; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dd062a1c8cf0..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@ #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY 50 - struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9bb0e0c412ec..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq) struct sched_dl_entity *dl_se = &curr->dl; u64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); + u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 1ca0130ed4f9..72c401b3b15c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -32,7 +32,7 @@ static DEFINE_SPINLOCK(sched_debug_lock); if (m) \ seq_printf(m, x); \ else \ - printk(x); \ + pr_cont(x); \ } while (0) /* @@ -501,12 +501,12 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) { struct task_struct *g, *p; - SEQ_printf(m, - "\nrunnable tasks:\n" - " S task PID tree-key switches prio" - " wait-time sum-exec sum-sleep\n" - "-------------------------------------------------------" - "----------------------------------------------------\n"); + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); + SEQ_printf(m, " S task PID tree-key switches prio" + " wait-time sum-exec sum-sleep\n"); + SEQ_printf(m, "-------------------------------------------------------" + "----------------------------------------------------\n"); rcu_read_lock(); for_each_process_thread(g, p) { @@ -527,9 +527,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED - SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); #else - SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -595,9 +597,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #ifdef CONFIG_RT_GROUP_SCHED - SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); #else - SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "rt_rq[%d]:\n", cpu); #endif #define P(x) \ @@ -624,7 +628,8 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { struct dl_bw *dl_bw; - SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); + SEQ_printf(m, "\n"); + SEQ_printf(m, "dl_rq[%d]:\n", cpu); #define PU(x) \ SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 663b2355a3aa..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 now = rq_clock_task(rq); u64 delta_exec; + u64 now; if (curr->sched_class != &rt_sched_class) return; + now = rq_clock_task(rq); delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 940fa408a288..dc77548167ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1076,14 +1076,16 @@ long seccomp_get_metadata(struct task_struct *task, size = min_t(unsigned long, size, sizeof(kmd)); - if (copy_from_user(&kmd, data, size)) + if (size < sizeof(kmd.filter_off)) + return -EINVAL; + + if (copy_from_user(&kmd.filter_off, data, sizeof(kmd.filter_off))) return -EFAULT; filter = get_nth_filter(task, kmd.filter_off); if (IS_ERR(filter)) return PTR_ERR(filter); - memset(&kmd, 0, sizeof(kmd)); if (filter->log) kmd.flags |= SECCOMP_FILTER_FLAG_LOG; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 75043046914e..10b7186d0638 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/hashtable.h> #include <linux/compat.h> +#include <linux/nospec.h> #include "timekeeping.h" #include "posix-timers.h" @@ -1346,11 +1347,15 @@ static const struct k_clock * const posix_clocks[] = { static const struct k_clock *clockid_to_kclock(const clockid_t id) { - if (id < 0) + clockid_t idx = id; + + if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; + } - if (id >= ARRAY_SIZE(posix_clocks) || !posix_clocks[id]) + if (id >= ARRAY_SIZE(posix_clocks)) return NULL; - return posix_clocks[id]; + + return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 48150ab42de9..4a4fd567fb26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1894,6 +1894,12 @@ int timers_dead_cpu(unsigned int cpu) raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + /* + * The current CPUs base clock might be stale. Update it + * before moving the timers over. + */ + forward_timer_base(new_base); + BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fc2838ac8b78..01e6b3a38871 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,7 +661,41 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, +static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_tp; + default: + return tracing_func_proto(func_id); + } +} + +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); + return true; +} + +const struct bpf_verifier_ops tracepoint_verifier_ops = { + .get_func_proto = tp_prog_func_proto, + .is_valid_access = tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops tracepoint_prog_ops = { +}; + +BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx, struct bpf_perf_event_value *, buf, u32, size) { int err = -EINVAL; @@ -678,8 +712,8 @@ clear: return err; } -static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { - .func = bpf_perf_prog_read_value_tp, +static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { + .func = bpf_perf_prog_read_value, .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -687,7 +721,7 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -695,34 +729,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; case BPF_FUNC_perf_prog_read_value: - return &bpf_perf_prog_read_value_proto_tp; + return &bpf_perf_prog_read_value_proto; default: return tracing_func_proto(func_id); } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ - if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - - BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); - return true; -} - -const struct bpf_verifier_ops tracepoint_verifier_ops = { - .get_func_proto = tp_prog_func_proto, - .is_valid_access = tp_prog_is_valid_access, -}; - -const struct bpf_prog_ops tracepoint_prog_ops = { -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -779,7 +791,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, } const struct bpf_verifier_ops perf_event_verifier_ops = { - .get_func_proto = tp_prog_func_proto, + .get_func_proto = pe_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; @@ -872,6 +884,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return -EINVAL; if (copy_from_user(&query, uquery, sizeof(query))) return -EFAULT; + if (query.ids_len > BPF_TRACE_MAX_PROGS) + return -E2BIG; mutex_lock(&bpf_event_mutex); ret = bpf_prog_array_copy_info(event->tp_event->prog_array, diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1fad24acd444..ae4147eaebd4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -659,7 +659,7 @@ static int create_trace_kprobe(int argc, char **argv) char *symbol = NULL, *event = NULL, *group = NULL; int maxactive = 0; char *arg; - unsigned long offset = 0; + long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -747,7 +747,7 @@ static int create_trace_kprobe(int argc, char **argv) symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); - if (ret) { + if (ret || offset < 0 || offset > UINT_MAX) { pr_info("Failed to parse either an address or a symbol.\n"); return ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index d59357308677..daf54bda4dc8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -320,7 +320,7 @@ static fetch_func_t get_fetch_size_function(const struct fetch_type *type, } /* Split symbol and offset. */ -int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) +int traceprobe_split_symbol_offset(char *symbol, long *offset) { char *tmp; int ret; @@ -328,13 +328,11 @@ int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset) if (!offset) return -EINVAL; - tmp = strchr(symbol, '+'); + tmp = strpbrk(symbol, "+-"); if (tmp) { - /* skip sign because kstrtoul doesn't accept '+' */ - ret = kstrtoul(tmp + 1, 0, offset); + ret = kstrtol(tmp, 0, offset); if (ret) return ret; - *tmp = '\0'; } else *offset = 0; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index e101c5bb9eda..6a4d3fa94042 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -365,7 +365,7 @@ extern int traceprobe_conflict_field_name(const char *name, extern void traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); -extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset); +extern int traceprobe_split_symbol_offset(char *symbol, long *offset); /* Sum up total data length for dynamic arraies (strings) */ static nokprobe_inline int diff --git a/kernel/user.c b/kernel/user.c index 9a20acce460d..36288d840675 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -101,6 +101,7 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, + .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* @@ -191,6 +192,8 @@ struct user_struct *alloc_uid(kuid_t uid) new->uid = uid; atomic_set(&new->__count, 1); + ratelimit_state_init(&new->ratelimit, HZ, 100); + ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); /* * Before adding this, check whether we raced diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 017044c26233..6ec6ba65127b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3018,14 +3018,6 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) return ret; } -/* - * See cancel_delayed_work() - */ -bool cancel_work(struct work_struct *work) -{ - return __cancel_work(work, false); -} - /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -4180,6 +4172,22 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** + * current_work - retrieve %current task's work struct + * + * Determine if %current task is a workqueue worker and what it's working on. + * Useful to find out the context that the %current task is running in. + * + * Return: work struct if %current task is a workqueue worker, %NULL otherwise. + */ +struct work_struct *current_work(void) +{ + struct worker *worker = current_wq_worker(); + + return worker ? worker->current_work : NULL; +} +EXPORT_SYMBOL(current_work); + +/** * current_is_workqueue_rescuer - is %current workqueue rescuer? * * Determine whether %current is a workqueue rescuer. Can be used from @@ -5321,7 +5329,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) ret = device_register(&wq_dev->dev); if (ret) { - kfree(wq_dev); + put_device(&wq_dev->dev); wq->wq_dev = NULL; return ret; } |