diff options
Diffstat (limited to 'kernel')
54 files changed, 1012 insertions, 420 deletions
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6ae02be7a48e..286ab3db0fde 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -28,6 +28,7 @@ #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/completion.h> #include <trace/events/xdp.h> #include <linux/btf_ids.h> @@ -73,6 +74,7 @@ struct bpf_cpu_map_entry { struct rcu_head rcu; struct work_struct kthread_stop_wq; + struct completion kthread_running; }; struct bpf_cpu_map { @@ -129,11 +131,17 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_frame *xdpf; + void *ptr; - while ((xdpf = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf); + while ((ptr = ptr_ring_consume(ring))) { + WARN_ON_ONCE(1); + if (unlikely(__ptr_test_bit(0, &ptr))) { + __ptr_clear_bit(0, &ptr); + kfree_skb(ptr); + continue; + } + xdp_return_frame(ptr); + } } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -153,7 +161,6 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) static void cpu_map_kthread_stop(struct work_struct *work) { struct bpf_cpu_map_entry *rcpu; - int err; rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); @@ -163,14 +170,7 @@ static void cpu_map_kthread_stop(struct work_struct *work) rcu_barrier(); /* kthread_stop will wake_up_process and wait for it to complete */ - err = kthread_stop(rcpu->kthread); - if (err) { - /* kthread_stop may be called before cpu_map_kthread_run - * is executed, so we need to release the memory related - * to rcpu. - */ - put_cpu_map_entry(rcpu); - } + kthread_stop(rcpu->kthread); } static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, @@ -298,11 +298,11 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames, return nframes; } - static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; + complete(&rcpu->kthread_running); set_current_state(TASK_INTERRUPTIBLE); /* When kthread gives stop order, then rcpu have been disconnected @@ -467,6 +467,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, goto free_ptr_ring; /* Setup kthread */ + init_completion(&rcpu->kthread_running); rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, "cpumap/%d/map:%d", cpu, map->id); @@ -480,6 +481,12 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, kthread_bind(rcpu->kthread, cpu); wake_up_process(rcpu->kthread); + /* Make sure kthread has been running, so kthread_stop() will not + * stop the kthread prematurely and all pending frames or skbs + * will be handled by the kthread before kthread_stop() returns. + */ + wait_for_completion(&rcpu->kthread_running); + return rcpu; free_prog: diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4174f76133df..99d0625b6c82 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -118,9 +118,8 @@ static struct inode *bpf_get_inode(struct super_block *sb, return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); - inode->i_atime = current_time(inode); + inode->i_atime = inode_set_ctime_current(inode); inode->i_mtime = inode->i_atime; - inode->i_ctime = inode->i_atime; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); @@ -148,8 +147,7 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; + dir->i_mtime = inode_set_ctime_current(dir); } static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 930b5555cfd3..02a021c524ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5573,16 +5573,17 @@ static int update_stack_depth(struct bpf_verifier_env *env, * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth(struct bpf_verifier_env *env) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) { - int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + int depth = 0, frame = 0, i, subprog_end; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; + i = subprog[idx].start; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -5621,7 +5622,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - int next_insn; + int next_insn, sidx; if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; @@ -5631,14 +5632,14 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; - idx = find_subprog(env, next_insn); - if (idx < 0) { + sidx = find_subprog(env, next_insn); + if (sidx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", next_insn); return -EFAULT; } - if (subprog[idx].is_async_cb) { - if (subprog[idx].has_tail_call) { + if (subprog[sidx].is_async_cb) { + if (subprog[sidx].has_tail_call) { verbose(env, "verifier bug. subprog has tail_call and async cb\n"); return -EFAULT; } @@ -5647,6 +5648,7 @@ continue_func: continue; } i = next_insn; + idx = sidx; if (subprog[idx].has_tail_call) tail_call_reachable = true; @@ -5682,6 +5684,22 @@ continue_func: goto continue_func; } +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *si = env->subprog_info; + int ret; + + for (int i = 0; i < env->subprog_cnt; i++) { + if (!i || si[i].is_async_cb) { + ret = check_max_stack_depth_subprog(env, i); + if (ret < 0) + return ret; + } + continue; + } + return 0; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON static int get_callee_stack_depth(struct bpf_verifier_env *env, const struct bpf_insn *insn, int idx) diff --git a/kernel/cpu.c b/kernel/cpu.c index 88a7ede322bd..f6811c857102 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -592,7 +592,10 @@ static void lockdep_release_cpus_lock(void) void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT + enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; +static unsigned int cpu_smt_max_threads __ro_after_init; +unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX; void __init cpu_smt_disable(bool force) { @@ -606,16 +609,33 @@ void __init cpu_smt_disable(bool force) pr_info("SMT: disabled\n"); cpu_smt_control = CPU_SMT_DISABLED; } + cpu_smt_num_threads = 1; } /* * The decision whether SMT is supported can only be done after the full * CPU identification. Called from architecture code. */ -void __init cpu_smt_check_topology(void) +void __init cpu_smt_set_num_threads(unsigned int num_threads, + unsigned int max_threads) { - if (!topology_smt_supported()) + WARN_ON(!num_threads || (num_threads > max_threads)); + + if (max_threads == 1) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; + + cpu_smt_max_threads = max_threads; + + /* + * If SMT has been disabled via the kernel command line or SMT is + * not supported, set cpu_smt_num_threads to 1 for consistency. + * If enabled, take the architecture requested number of threads + * to bring up into account. + */ + if (cpu_smt_control != CPU_SMT_ENABLED) + cpu_smt_num_threads = 1; + else if (num_threads < cpu_smt_num_threads) + cpu_smt_num_threads = num_threads; } static int __init smt_cmdline_disable(char *str) @@ -625,9 +645,23 @@ static int __init smt_cmdline_disable(char *str) } early_param("nosmt", smt_cmdline_disable); +/* + * For Archicture supporting partial SMT states check if the thread is allowed. + * Otherwise this has already been checked through cpu_smt_max_threads when + * setting the SMT level. + */ +static inline bool cpu_smt_thread_allowed(unsigned int cpu) +{ +#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC + return topology_smt_thread_allowed(cpu); +#else + return true; +#endif +} + static inline bool cpu_smt_allowed(unsigned int cpu) { - if (cpu_smt_control == CPU_SMT_ENABLED) + if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) return true; if (topology_is_primary_thread(cpu)) @@ -642,7 +676,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) return !cpumask_test_cpu(cpu, &cpus_booted_once_mask); } -/* Returns true if SMT is not supported of forcefully (irreversibly) disabled */ +/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */ bool cpu_smt_possible(void) { return cpu_smt_control != CPU_SMT_FORCE_DISABLED && @@ -650,22 +684,8 @@ bool cpu_smt_possible(void) } EXPORT_SYMBOL_GPL(cpu_smt_possible); -static inline bool cpuhp_smt_aware(void) -{ - return topology_smt_supported(); -} - -static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) -{ - return cpu_primary_thread_mask; -} #else static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } -static inline bool cpuhp_smt_aware(void) { return false; } -static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) -{ - return cpu_present_mask; -} #endif static inline enum cpuhp_state @@ -1793,6 +1813,16 @@ static int __init parallel_bringup_parse_param(char *arg) } early_param("cpuhp.parallel", parallel_bringup_parse_param); +static inline bool cpuhp_smt_aware(void) +{ + return cpu_smt_max_threads > 1; +} + +static inline const struct cpumask *cpuhp_get_primary_thread_mask(void) +{ + return cpu_primary_thread_mask; +} + /* * On architectures which have enabled parallel bringup this invokes all BP * prepare states for each of the to be onlined APs first. The last state @@ -2626,6 +2656,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) for_each_online_cpu(cpu) { if (topology_is_primary_thread(cpu)) continue; + /* + * Disable can be called with CPU_SMT_ENABLED when changing + * from a higher to lower number of SMT threads per core. + */ + if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) + continue; ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); if (ret) break; @@ -2660,6 +2696,8 @@ int cpuhp_smt_enable(void) /* Skip online CPUs and CPUs on offline nodes */ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) continue; + if (!cpu_smt_thread_allowed(cpu)) + continue; ret = _cpu_up(cpu, 0, CPUHP_ONLINE); if (ret) break; @@ -2838,20 +2876,19 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = { #ifdef CONFIG_HOTPLUG_SMT +static bool cpu_smt_num_threads_valid(unsigned int threads) +{ + if (IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC)) + return threads >= 1 && threads <= cpu_smt_max_threads; + return threads == 1 || threads == cpu_smt_max_threads; +} + static ssize_t __store_smt_control(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int ctrlval, ret; - - if (sysfs_streq(buf, "on")) - ctrlval = CPU_SMT_ENABLED; - else if (sysfs_streq(buf, "off")) - ctrlval = CPU_SMT_DISABLED; - else if (sysfs_streq(buf, "forceoff")) - ctrlval = CPU_SMT_FORCE_DISABLED; - else - return -EINVAL; + int ctrlval, ret, num_threads, orig_threads; + bool force_off; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED) return -EPERM; @@ -2859,21 +2896,39 @@ __store_smt_control(struct device *dev, struct device_attribute *attr, if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) return -ENODEV; + if (sysfs_streq(buf, "on")) { + ctrlval = CPU_SMT_ENABLED; + num_threads = cpu_smt_max_threads; + } else if (sysfs_streq(buf, "off")) { + ctrlval = CPU_SMT_DISABLED; + num_threads = 1; + } else if (sysfs_streq(buf, "forceoff")) { + ctrlval = CPU_SMT_FORCE_DISABLED; + num_threads = 1; + } else if (kstrtoint(buf, 10, &num_threads) == 0) { + if (num_threads == 1) + ctrlval = CPU_SMT_DISABLED; + else if (cpu_smt_num_threads_valid(num_threads)) + ctrlval = CPU_SMT_ENABLED; + else + return -EINVAL; + } else { + return -EINVAL; + } + ret = lock_device_hotplug_sysfs(); if (ret) return ret; - if (ctrlval != cpu_smt_control) { - switch (ctrlval) { - case CPU_SMT_ENABLED: - ret = cpuhp_smt_enable(); - break; - case CPU_SMT_DISABLED: - case CPU_SMT_FORCE_DISABLED: - ret = cpuhp_smt_disable(ctrlval); - break; - } - } + orig_threads = cpu_smt_num_threads; + cpu_smt_num_threads = num_threads; + + force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED; + + if (num_threads > orig_threads) + ret = cpuhp_smt_enable(); + else if (num_threads < orig_threads || force_off) + ret = cpuhp_smt_disable(ctrlval); unlock_device_hotplug(); return ret ? ret : count; @@ -2901,6 +2956,17 @@ static ssize_t control_show(struct device *dev, { const char *state = smt_states[cpu_smt_control]; +#ifdef CONFIG_HOTPLUG_SMT + /* + * If SMT is enabled but not all threads are enabled then show the + * number of threads. If all threads are enabled show "on". Otherwise + * show the state name. + */ + if (cpu_smt_control == CPU_SMT_ENABLED && + cpu_smt_num_threads != cpu_smt_max_threads) + return sysfs_emit(buf, "%d\n", cpu_smt_num_threads); +#endif + return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index be61332c66b5..d7ee4bc3f2ba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -205,8 +205,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) arch_exit_to_user_mode_prepare(regs, ti_work); - /* Ensure that the address limit is intact and no locks are held */ - addr_limit_user_check(); + /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); diff --git a/kernel/events/core.c b/kernel/events/core.c index 78ae7b6f90fd..2554f5fc70dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8249,7 +8249,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) unsigned int size; memset(comm, 0, sizeof(comm)); - strlcpy(comm, comm_event->task->comm, sizeof(comm)); + strscpy(comm, comm_event->task->comm, sizeof(comm)); size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; @@ -8704,7 +8704,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } cpy_name: - strlcpy(tmp, name, sizeof(tmp)); + strscpy(tmp, name, sizeof(tmp)); name = tmp; got_name: /* @@ -9128,7 +9128,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) goto err; - strlcpy(name, sym, KSYM_NAME_LEN); + strscpy(name, sym, KSYM_NAME_LEN); name_len = strlen(name) + 1; while (!IS_ALIGNED(name_len, sizeof(u64))) name[name_len++] = '\0'; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ee8c0acf39df..dc94e0bf2c94 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -473,11 +473,12 @@ void handle_nested_irq(unsigned int irq) action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; - goto out_unlock; + raw_spin_unlock_irq(&desc->lock); + return; } kstat_incr_irqs_this_cpu(desc); - irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + atomic_inc(&desc->threads_active); raw_spin_unlock_irq(&desc->lock); action_ret = IRQ_NONE; @@ -487,11 +488,7 @@ void handle_nested_irq(unsigned int irq) if (!irq_settings_no_debug(desc)) note_interrupt(desc, action_ret); - raw_spin_lock_irq(&desc->lock); - irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); - -out_unlock: - raw_spin_unlock_irq(&desc->lock); + wake_threads_waitq(desc); } EXPORT_SYMBOL_GPL(handle_nested_irq); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index bdd35bb9c735..bcc7f21db9ee 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -108,8 +108,6 @@ extern int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state); -extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); - irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); @@ -121,6 +119,8 @@ void irq_resend_init(struct irq_desc *desc); bool irq_wait_for_poll(struct irq_desc *desc); void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); +void wake_threads_waitq(struct irq_desc *desc); + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d2742af0f0fd..d309ba84e08a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -108,6 +108,16 @@ bool synchronize_hardirq(unsigned int irq) } EXPORT_SYMBOL(synchronize_hardirq); +static void __synchronize_irq(struct irq_desc *desc) +{ + __synchronize_hardirq(desc, true); + /* + * We made sure that no hardirq handler is running. Now verify that no + * threaded handlers are active. + */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +} + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -127,16 +137,8 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) { - __synchronize_hardirq(desc, true); - /* - * We made sure that no hardirq handler is - * running. Now verify that no threaded handlers are - * active. - */ - wait_event(desc->wait_for_threads, - !atomic_read(&desc->threads_active)); - } + if (desc) + __synchronize_irq(desc); } EXPORT_SYMBOL(synchronize_irq); @@ -1216,7 +1218,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, return ret; } -static void wake_threads_waitq(struct irq_desc *desc) +void wake_threads_waitq(struct irq_desc *desc) { if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); @@ -1944,7 +1946,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) * supports it also make sure that there is no (not yet serviced) * interrupt in flight at the hardware level. */ - __synchronize_hardirq(desc, true); + __synchronize_irq(desc); #ifdef CONFIG_DEBUG_SHIRQ /* diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index edec335c0a7a..5f2c66860ac6 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -68,11 +68,16 @@ static int irq_sw_resend(struct irq_desc *desc) */ if (!desc->parent_irq) return -EINVAL; + + desc = irq_to_desc(desc->parent_irq); + if (!desc) + return -EINVAL; } /* Add to resend_list and activate the softirq: */ raw_spin_lock(&irq_resend_lock); - hlist_add_head(&desc->resend_node, &irq_resend_list); + if (hlist_unhashed(&desc->resend_node)) + hlist_add_head(&desc->resend_node, &irq_resend_list); raw_spin_unlock(&irq_resend_lock); tasklet_schedule(&resend_tasklet); return 0; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 016d997131d4..18edd57b5fe8 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -163,12 +163,12 @@ unsigned long kallsyms_sym_address(int idx) return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; } -static bool cleanup_symbol_name(char *s) +static void cleanup_symbol_name(char *s) { char *res; if (!IS_ENABLED(CONFIG_LTO_CLANG)) - return false; + return; /* * LLVM appends various suffixes for local functions and variables that @@ -178,26 +178,21 @@ static bool cleanup_symbol_name(char *s) * - foo.llvm.[0-9a-f]+ */ res = strstr(s, ".llvm."); - if (res) { + if (res) *res = '\0'; - return true; - } - return false; + return; } static int compare_symbol_name(const char *name, char *namebuf) { - int ret; - - ret = strcmp(name, namebuf); - if (!ret) - return ret; - - if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf)) - return 0; - - return ret; + /* The kallsyms_seqs_of_names is sorted based on names after + * cleanup_symbol_name() (see scripts/kallsyms.c) if clang lto is enabled. + * To ensure correct bisection in kallsyms_lookup_names(), do + * cleanup_symbol_name(namebuf) before comparing name and namebuf. + */ + cleanup_symbol_name(namebuf); + return strcmp(name, namebuf); } static unsigned int get_symbol_seq(int index) diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index a2e3745d15c4..e05ddc33a752 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -196,7 +196,7 @@ static bool match_cleanup_name(const char *s, const char *name) if (!IS_ENABLED(CONFIG_LTO_CLANG)) return false; - p = strchr(s, '.'); + p = strstr(s, ".llvm."); if (!p) return false; @@ -344,27 +344,6 @@ static int test_kallsyms_basic_function(void) goto failed; } - /* - * The first '.' may be the initial letter, in which case the - * entire symbol name will be truncated to an empty string in - * cleanup_symbol_name(). Do not test these symbols. - * - * For example: - * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head - * .E_read_words - * .E_leading_bytes - * .E_trailing_bytes - * .E_write_words - * .E_copy - * .str.292.llvm.12122243386960820698 - * .str.24.llvm.12122243386960820698 - * .str.29.llvm.12122243386960820698 - * .str.75.llvm.12122243386960820698 - * .str.99.llvm.12122243386960820698 - */ - if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0]) - continue; - lookup_addr = kallsyms_lookup_name(namebuf); memset(stat, 0, sizeof(*stat)); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fc6095d502d..ca385b61d546 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1545,6 +1545,17 @@ static int check_ftrace_location(struct kprobe *p) return 0; } +static bool is_cfi_preamble_symbol(unsigned long addr) +{ + char symbuf[KSYM_NAME_LEN]; + + if (lookup_symbol_name(addr, symbuf)) + return false; + + return str_has_prefix("__cfi_", symbuf) || + str_has_prefix("__pfx_", symbuf); +} + static int check_kprobe_address_safe(struct kprobe *p, struct module **probed_mod) { @@ -1563,7 +1574,8 @@ static int check_kprobe_address_safe(struct kprobe *p, within_kprobe_blacklist((unsigned long) p->addr) || jump_label_text_reserved(p->addr, p->addr) || static_call_text_reserved(p->addr, p->addr) || - find_bug((unsigned long)p->addr)) { + find_bug((unsigned long)p->addr) || + is_cfi_preamble_symbol((unsigned long)p->addr)) { ret = -EINVAL; goto out; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 949d3deae506..270c7f80ce84 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -45,6 +45,7 @@ torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(int, rt_boost, 2, "Do periodic rt-boost. 0=Disable, 1=Only for rt_mutex, 2=For all lock types."); torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens."); +torture_param(int, writer_fifo, 0, "Run writers at sched_set_fifo() priority"); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)"); /* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */ @@ -809,7 +810,8 @@ static int lock_torture_writer(void *arg) bool skip_main_lock; VERBOSE_TOROUT_STRING("lock_torture_writer task started"); - set_user_nice(current, MAX_NICE); + if (!rt_task(current)) + set_user_nice(current, MAX_NICE); do { if ((torture_random(&rand) & 0xfffff) == 0) @@ -1015,8 +1017,7 @@ static void lock_torture_cleanup(void) if (writer_tasks) { for (i = 0; i < cxt.nrealwriters_stress; i++) - torture_stop_kthread(lock_torture_writer, - writer_tasks[i]); + torture_stop_kthread(lock_torture_writer, writer_tasks[i]); kfree(writer_tasks); writer_tasks = NULL; } @@ -1244,8 +1245,9 @@ static int __init lock_torture_init(void) goto create_reader; /* Create writer. */ - firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], - writer_tasks[i]); + firsterr = torture_create_kthread_cb(lock_torture_writer, &cxt.lwsa[i], + writer_tasks[i], + writer_fifo ? sched_set_fifo : NULL); if (torture_init_error(firsterr)) goto unwind; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 728f434de2bb..21db0df0eb00 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -333,21 +333,43 @@ static __always_inline int __waiter_prio(struct task_struct *task) return prio; } +/* + * Update the waiter->tree copy of the sort keys. + */ static __always_inline void waiter_update_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) { - waiter->prio = __waiter_prio(task); - waiter->deadline = task->dl.deadline; + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->tree.entry)); + + waiter->tree.prio = __waiter_prio(task); + waiter->tree.deadline = task->dl.deadline; +} + +/* + * Update the waiter->pi_tree copy of the sort keys (from the tree copy). + */ +static __always_inline void +waiter_clone_prio(struct rt_mutex_waiter *waiter, struct task_struct *task) +{ + lockdep_assert_held(&waiter->lock->wait_lock); + lockdep_assert_held(&task->pi_lock); + lockdep_assert(RB_EMPTY_NODE(&waiter->pi_tree.entry)); + + waiter->pi_tree.prio = waiter->tree.prio; + waiter->pi_tree.deadline = waiter->tree.deadline; } /* - * Only use with rt_mutex_waiter_{less,equal}() + * Only use with rt_waiter_node_{less,equal}() */ +#define task_to_waiter_node(p) \ + &(struct rt_waiter_node){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = __waiter_prio(p), .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .tree = *task_to_waiter_node(p) } -static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_less(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio < right->prio) return 1; @@ -364,8 +386,8 @@ static __always_inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, return 0; } -static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) +static __always_inline int rt_waiter_node_equal(struct rt_waiter_node *left, + struct rt_waiter_node *right) { if (left->prio != right->prio) return 0; @@ -385,7 +407,7 @@ static __always_inline int rt_mutex_waiter_equal(struct rt_mutex_waiter *left, static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, struct rt_mutex_waiter *top_waiter) { - if (rt_mutex_waiter_less(waiter, top_waiter)) + if (rt_waiter_node_less(&waiter->tree, &top_waiter->tree)) return true; #ifdef RT_MUTEX_BUILD_SPINLOCKS @@ -393,30 +415,30 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, * Note that RT tasks are excluded from same priority (lateral) * steals to prevent the introduction of an unbounded latency. */ - if (rt_prio(waiter->prio) || dl_prio(waiter->prio)) + if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) return false; - return rt_mutex_waiter_equal(waiter, top_waiter); + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); #else return false; #endif } #define __node_2_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, tree_entry) + rb_entry((node), struct rt_mutex_waiter, tree.entry) static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) { struct rt_mutex_waiter *aw = __node_2_waiter(a); struct rt_mutex_waiter *bw = __node_2_waiter(b); - if (rt_mutex_waiter_less(aw, bw)) + if (rt_waiter_node_less(&aw->tree, &bw->tree)) return 1; if (!build_ww_mutex()) return 0; - if (rt_mutex_waiter_less(bw, aw)) + if (rt_waiter_node_less(&bw->tree, &aw->tree)) return 0; /* NOTE: relies on waiter->ww_ctx being set before insertion */ @@ -434,48 +456,58 @@ static __always_inline bool __waiter_less(struct rb_node *a, const struct rb_nod static __always_inline void rt_mutex_enqueue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); + lockdep_assert_held(&lock->wait_lock); + + rb_add_cached(&waiter->tree.entry, &lock->waiters, __waiter_less); } static __always_inline void rt_mutex_dequeue(struct rt_mutex_base *lock, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->tree_entry)) + lockdep_assert_held(&lock->wait_lock); + + if (RB_EMPTY_NODE(&waiter->tree.entry)) return; - rb_erase_cached(&waiter->tree_entry, &lock->waiters); - RB_CLEAR_NODE(&waiter->tree_entry); + rb_erase_cached(&waiter->tree.entry, &lock->waiters); + RB_CLEAR_NODE(&waiter->tree.entry); } -#define __node_2_pi_waiter(node) \ - rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) +#define __node_2_rt_node(node) \ + rb_entry((node), struct rt_waiter_node, entry) -static __always_inline bool -__pi_waiter_less(struct rb_node *a, const struct rb_node *b) +static __always_inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) { - return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); + return rt_waiter_node_less(__node_2_rt_node(a), __node_2_rt_node(b)); } static __always_inline void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); + lockdep_assert_held(&task->pi_lock); + + rb_add_cached(&waiter->pi_tree.entry, &task->pi_waiters, __pi_waiter_less); } static __always_inline void rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) + lockdep_assert_held(&task->pi_lock); + + if (RB_EMPTY_NODE(&waiter->pi_tree.entry)) return; - rb_erase_cached(&waiter->pi_tree_entry, &task->pi_waiters); - RB_CLEAR_NODE(&waiter->pi_tree_entry); + rb_erase_cached(&waiter->pi_tree.entry, &task->pi_waiters); + RB_CLEAR_NODE(&waiter->pi_tree.entry); } -static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) +static __always_inline void rt_mutex_adjust_prio(struct rt_mutex_base *lock, + struct task_struct *p) { struct task_struct *pi_task = NULL; + lockdep_assert_held(&lock->wait_lock); + lockdep_assert(rt_mutex_owner(lock) == p); lockdep_assert_held(&p->pi_lock); if (task_has_pi_waiters(p)) @@ -571,9 +603,14 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * Chain walk basics and protection scope * * [R] refcount on task - * [P] task->pi_lock held + * [Pn] task->pi_lock held * [L] rtmutex->wait_lock held * + * Normal locking order: + * + * rtmutex->wait_lock + * task->pi_lock + * * Step Description Protected by * function arguments: * @task [R] @@ -588,27 +625,32 @@ static __always_inline struct rt_mutex_base *task_blocked_on_lock(struct task_st * again: * loop_sanity_check(); * retry: - * [1] lock(task->pi_lock); [R] acquire [P] - * [2] waiter = task->pi_blocked_on; [P] - * [3] check_exit_conditions_1(); [P] - * [4] lock = waiter->lock; [P] - * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L] - * unlock(task->pi_lock); release [P] + * [1] lock(task->pi_lock); [R] acquire [P1] + * [2] waiter = task->pi_blocked_on; [P1] + * [3] check_exit_conditions_1(); [P1] + * [4] lock = waiter->lock; [P1] + * [5] if (!try_lock(lock->wait_lock)) { [P1] try to acquire [L] + * unlock(task->pi_lock); release [P1] * goto retry; * } - * [6] check_exit_conditions_2(); [P] + [L] - * [7] requeue_lock_waiter(lock, waiter); [P] + [L] - * [8] unlock(task->pi_lock); release [P] + * [6] check_exit_conditions_2(); [P1] + [L] + * [7] requeue_lock_waiter(lock, waiter); [P1] + [L] + * [8] unlock(task->pi_lock); release [P1] * put_task_struct(task); release [R] * [9] check_exit_conditions_3(); [L] * [10] task = owner(lock); [L] * get_task_struct(task); [L] acquire [R] - * lock(task->pi_lock); [L] acquire [P] - * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L] - * [12] check_exit_conditions_4(); [P] + [L] - * [13] unlock(task->pi_lock); release [P] + * lock(task->pi_lock); [L] acquire [P2] + * [11] requeue_pi_waiter(tsk, waiters(lock));[P2] + [L] + * [12] check_exit_conditions_4(); [P2] + [L] + * [13] unlock(task->pi_lock); release [P2] * unlock(lock->wait_lock); release [L] * goto again; + * + * Where P1 is the blocking task and P2 is the lock owner; going up one step + * the owner becomes the next blocked task etc.. + * +* */ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, enum rtmutex_chainwalk chwalk, @@ -756,7 +798,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -764,13 +806,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, } /* - * [4] Get the next lock + * [4] Get the next lock; per holding task->pi_lock we can't unblock + * and guarantee @lock's existence. */ lock = waiter->lock; /* * [5] We need to trylock here as we are holding task->pi_lock, * which is the reverse lock order versus the other rtmutex * operations. + * + * Per the above, holding task->pi_lock guarantees lock exists, so + * inverting this lock order is infeasible from a life-time + * perspective. */ if (!raw_spin_trylock(&lock->wait_lock)) { raw_spin_unlock_irq(&task->pi_lock); @@ -874,17 +921,18 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * or * * DL CBS enforcement advancing the effective deadline. - * - * Even though pi_waiters also uses these fields, and that tree is only - * updated in [11], we can do this here, since we hold [L], which - * serializes all pi_waiters access and rb_erase() does not care about - * the values of the node being removed. */ waiter_update_prio(waiter, task); rt_mutex_enqueue(lock, waiter); - /* [8] Release the task */ + /* + * [8] Release the (blocking) task in preparation for + * taking the owner task in [10]. + * + * Since we hold lock->waiter_lock, task cannot unblock, even if we + * release task->pi_lock. + */ raw_spin_unlock(&task->pi_lock); put_task_struct(task); @@ -908,7 +956,12 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, return 0; } - /* [10] Grab the next task, i.e. the owner of @lock */ + /* + * [10] Grab the next task, i.e. the owner of @lock + * + * Per holding lock->wait_lock and checking for !owner above, there + * must be an owner and it cannot go away. + */ task = get_task_struct(rt_mutex_owner(lock)); raw_spin_lock(&task->pi_lock); @@ -921,8 +974,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * and adjust the priority of the owner. */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else if (prerequeue_top_waiter == waiter) { /* @@ -937,8 +991,9 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); + waiter_clone_prio(waiter, task); rt_mutex_enqueue_pi(task, waiter); - rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(lock, task); } else { /* * Nothing changed. No need to do any priority @@ -1154,6 +1209,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, waiter->task = task; waiter->lock = lock; waiter_update_prio(waiter, task); + waiter_clone_prio(waiter, task); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -1187,7 +1243,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1234,6 +1290,8 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, { struct rt_mutex_waiter *waiter; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1246,7 +1304,7 @@ static void __sched mark_wakeup_next_waiter(struct rt_wake_q_head *wqh, * task unblocks. */ rt_mutex_dequeue_pi(current, waiter); - rt_mutex_adjust_prio(current); + rt_mutex_adjust_prio(lock, current); /* * As we are waking up the top waiter, and the waiter stays @@ -1482,7 +1540,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(lock, owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index cb9fdff76a8a..a6974d044593 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -459,7 +459,7 @@ void __sched rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (!waiter || rt_waiter_node_equal(&waiter->tree, task_to_waiter_node(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index c47e8361bfb5..1162e07cdaea 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -17,27 +17,44 @@ #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> + +/* + * This is a helper for the struct rt_mutex_waiter below. A waiter goes in two + * separate trees and they need their own copy of the sort keys because of + * different locking requirements. + * + * @entry: rbtree node to enqueue into the waiters tree + * @prio: Priority of the waiter + * @deadline: Deadline of the waiter if applicable + * + * See rt_waiter_node_less() and waiter_*_prio(). + */ +struct rt_waiter_node { + struct rb_node entry; + int prio; + u64 deadline; +}; + /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * - * @tree_entry: pi node to enqueue into the mutex waiters tree - * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree + * @tree: node to enqueue into the mutex waiters tree + * @pi_tree: node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task * @lock: Pointer to the rt_mutex on which the waiter blocks * @wake_state: Wakeup state to use (TASK_NORMAL or TASK_RTLOCK_WAIT) - * @prio: Priority of the waiter - * @deadline: Deadline of the waiter if applicable * @ww_ctx: WW context pointer + * + * @tree is ordered by @lock->wait_lock + * @pi_tree is ordered by rt_mutex_owner(@lock)->pi_lock */ struct rt_mutex_waiter { - struct rb_node tree_entry; - struct rb_node pi_tree_entry; + struct rt_waiter_node tree; + struct rt_waiter_node pi_tree; struct task_struct *task; struct rt_mutex_base *lock; unsigned int wake_state; - int prio; - u64 deadline; struct ww_acquire_ctx *ww_ctx; }; @@ -105,7 +122,7 @@ static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock, { struct rb_node *leftmost = rb_first_cached(&lock->waiters); - return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter; + return rb_entry(leftmost, struct rt_mutex_waiter, tree.entry) == waiter; } static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock) @@ -113,8 +130,10 @@ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base * struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; + lockdep_assert_held(&lock->wait_lock); + if (leftmost) { - w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); + w = rb_entry(leftmost, struct rt_mutex_waiter, tree.entry); BUG_ON(w->lock != lock); } return w; @@ -127,8 +146,10 @@ static inline int task_has_pi_waiters(struct task_struct *p) static inline struct rt_mutex_waiter *task_top_pi_waiter(struct task_struct *p) { + lockdep_assert_held(&p->pi_lock); + return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, - pi_tree_entry); + pi_tree.entry); } #define RT_MUTEX_HAS_WAITERS 1UL @@ -190,8 +211,8 @@ static inline void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) static inline void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { debug_rt_mutex_init_waiter(waiter); - RB_CLEAR_NODE(&waiter->pi_tree_entry); - RB_CLEAR_NODE(&waiter->tree_entry); + RB_CLEAR_NODE(&waiter->pi_tree.entry); + RB_CLEAR_NODE(&waiter->tree.entry); waiter->wake_state = TASK_NORMAL; waiter->task = NULL; } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 56f139201f24..3ad2cc4823e5 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -96,25 +96,25 @@ __ww_waiter_first(struct rt_mutex *lock) struct rb_node *n = rb_first(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_next(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_next(&w->tree_entry); + struct rb_node *n = rb_next(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * __ww_waiter_prev(struct rt_mutex *lock, struct rt_mutex_waiter *w) { - struct rb_node *n = rb_prev(&w->tree_entry); + struct rb_node *n = rb_prev(&w->tree.entry); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline struct rt_mutex_waiter * @@ -123,7 +123,7 @@ __ww_waiter_last(struct rt_mutex *lock) struct rb_node *n = rb_last(&lock->rtmutex.waiters.rb_root); if (!n) return NULL; - return rb_entry(n, struct rt_mutex_waiter, tree_entry); + return rb_entry(n, struct rt_mutex_waiter, tree.entry); } static inline void diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 80d9c6d77a45..15781acaac1c 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -30,7 +30,7 @@ static struct kmem_cache *nsproxy_cachep; struct nsproxy init_nsproxy = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .uts_ns = &init_uts_ns, #if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) .ipc_ns = &init_ipc_ns, @@ -55,7 +55,7 @@ static inline struct nsproxy *create_nsproxy(void) nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); if (nsproxy) - atomic_set(&nsproxy->count, 1); + refcount_set(&nsproxy->count, 1); return nsproxy; } diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index e1b4bfa938dd..2b4a946a6ff5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1166,7 +1166,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, int error; if (!hibernation_available()) - return 0; + return n; if (len && buf[len-1] == '\n') len--; diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98c1544cf572..5befd8780dcd 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -511,6 +511,14 @@ static inline void show_rcu_tasks_gp_kthreads(void) {} void rcu_request_urgent_qs_task(struct task_struct *t); #endif /* #else #ifdef CONFIG_TINY_RCU */ +#ifdef CONFIG_TASKS_RCU +struct task_struct *get_rcu_tasks_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RCU + +#ifdef CONFIG_TASKS_RUDE_RCU +struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +#endif // # ifdef CONFIG_TASKS_RUDE_RCU + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index d1221731c7cf..ffdb30495e3c 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -84,15 +84,17 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>"); #endif torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per writer"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, minruntime, 0, "Minimum run time (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, "Shutdown at end of scalability tests."); torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?"); @@ -139,6 +141,7 @@ struct rcu_scale_ops { void (*gp_barrier)(void); void (*sync)(void); void (*exp_sync)(void); + struct task_struct *(*rso_gp_kthread)(void); const char *name; }; @@ -295,6 +298,7 @@ static struct rcu_scale_ops tasks_ops = { .gp_barrier = rcu_barrier_tasks, .sync = synchronize_rcu_tasks, .exp_sync = synchronize_rcu_tasks, + .rso_gp_kthread = get_rcu_tasks_gp_kthread, .name = "tasks" }; @@ -306,6 +310,44 @@ static struct rcu_scale_ops tasks_ops = { #endif // #else // #ifdef CONFIG_TASKS_RCU +#ifdef CONFIG_TASKS_RUDE_RCU + +/* + * Definitions for RCU-tasks-rude scalability testing. + */ + +static int tasks_rude_scale_read_lock(void) +{ + return 0; +} + +static void tasks_rude_scale_read_unlock(int idx) +{ +} + +static struct rcu_scale_ops tasks_rude_ops = { + .ptype = RCU_TASKS_RUDE_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_rude_scale_read_lock, + .readunlock = tasks_rude_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_rude, + .gp_barrier = rcu_barrier_tasks_rude, + .sync = synchronize_rcu_tasks_rude, + .exp_sync = synchronize_rcu_tasks_rude, + .rso_gp_kthread = get_rcu_tasks_rude_gp_kthread, + .name = "tasks-rude" +}; + +#define TASKS_RUDE_OPS &tasks_rude_ops, + +#else // #ifdef CONFIG_TASKS_RUDE_RCU + +#define TASKS_RUDE_OPS + +#endif // #else // #ifdef CONFIG_TASKS_RUDE_RCU + #ifdef CONFIG_TASKS_TRACE_RCU /* @@ -334,6 +376,7 @@ static struct rcu_scale_ops tasks_tracing_ops = { .gp_barrier = rcu_barrier_tasks_trace, .sync = synchronize_rcu_tasks_trace, .exp_sync = synchronize_rcu_tasks_trace, + .rso_gp_kthread = get_rcu_tasks_trace_gp_kthread, .name = "tasks-tracing" }; @@ -410,10 +453,12 @@ rcu_scale_writer(void *arg) { int i = 0; int i_max; + unsigned long jdone; long me = (long)arg; struct rcu_head *rhp = NULL; bool started = false, done = false, alldone = false; u64 t; + DEFINE_TORTURE_RANDOM(tr); u64 *wdp; u64 *wdpp = writer_durations[me]; @@ -424,7 +469,7 @@ rcu_scale_writer(void *arg) sched_set_fifo_low(current); if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); + schedule_timeout_idle(holdoff * HZ); /* * Wait until rcu_end_inkernel_boot() is called for normal GP tests @@ -445,9 +490,12 @@ rcu_scale_writer(void *arg) } } + jdone = jiffies + minruntime * HZ; do { if (writer_holdoff) udelay(writer_holdoff); + if (writer_holdoff_jiffies) + schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1); wdp = &wdpp[i]; *wdp = ktime_get_mono_fast_ns(); if (gp_async) { @@ -475,7 +523,7 @@ retry: if (!started && atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) started = true; - if (!done && i >= MIN_MEAS) { + if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) { done = true; sched_set_normal(current, 0); pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", @@ -518,8 +566,8 @@ static void rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); + "--- %s: gp_async=%d gp_async_max=%d gp_exp=%d holdoff=%d minruntime=%d nreaders=%d nwriters=%d writer_holdoff=%d writer_holdoff_jiffies=%d verbose=%d shutdown=%d\n", + scale_type, tag, gp_async, gp_async_max, gp_exp, holdoff, minruntime, nrealreaders, nrealwriters, writer_holdoff, writer_holdoff_jiffies, verbose, shutdown); } /* @@ -556,6 +604,8 @@ static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; static atomic_t n_kfree_scale_thread_started; static atomic_t n_kfree_scale_thread_ended; +static struct task_struct *kthread_tp; +static u64 kthread_stime; struct kfree_obj { char kfree_obj[8]; @@ -701,6 +751,10 @@ kfree_scale_init(void) unsigned long jif_start; unsigned long orig_jif; + pr_alert("%s" SCALE_FLAG + "--- kfree_rcu_test: kfree_mult=%d kfree_by_call_rcu=%d kfree_nthreads=%d kfree_alloc_num=%d kfree_loops=%d kfree_rcu_test_double=%d kfree_rcu_test_single=%d\n", + scale_type, kfree_mult, kfree_by_call_rcu, kfree_nthreads, kfree_alloc_num, kfree_loops, kfree_rcu_test_double, kfree_rcu_test_single); + // Also, do a quick self-test to ensure laziness is as much as // expected. if (kfree_by_call_rcu && !IS_ENABLED(CONFIG_RCU_LAZY)) { @@ -797,6 +851,18 @@ rcu_scale_cleanup(void) if (gp_exp && gp_async) SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + // If built-in, just report all of the GP kthread's CPU time. + if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread) + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) { + u32 ns; + u64 us; + + kthread_stime = kthread_tp->stime - kthread_stime; + us = div_u64_rem(kthread_stime, 1000, &ns); + pr_info("rcu_scale: Grace-period kthread CPU time: %llu.%03u us\n", us, ns); + show_rcu_gp_kthreads(); + } if (kfree_rcu_test) { kfree_scale_cleanup(); return; @@ -885,7 +951,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_TRACING_OPS + &rcu_ops, &srcu_ops, &srcud_ops, TASKS_OPS TASKS_RUDE_OPS TASKS_TRACING_OPS }; if (!torture_init_begin(scale_type, verbose)) @@ -910,6 +976,11 @@ rcu_scale_init(void) if (cur_ops->init) cur_ops->init(); + if (cur_ops->rso_gp_kthread) { + kthread_tp = cur_ops->rso_gp_kthread(); + if (kthread_tp) + kthread_stime = kthread_tp->stime; + } if (kfree_rcu_test) return kfree_scale_init(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 147551c23baf..ade42d6a9d9b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1581,6 +1581,7 @@ rcu_torture_writer(void *arg) rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); + show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); } @@ -1876,7 +1877,7 @@ static int rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) { int mask = rcutorture_extend_mask_max(); - unsigned long randmask1 = torture_random(trsp) >> 8; + unsigned long randmask1 = torture_random(trsp); unsigned long randmask2 = randmask1 >> 3; unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; @@ -1935,7 +1936,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, if (!((mask - 1) & mask)) return rtrsp; /* Current RCU reader not extendable. */ /* Bias towards larger numbers of loops. */ - i = (torture_random(trsp) >> 3); + i = torture_random(trsp); i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1; for (j = 0; j < i; j++) { mask = rcutorture_extend_mask(*readstate, trsp); @@ -2136,7 +2137,7 @@ static int rcu_nocb_toggle(void *arg) toggle_fuzz = NSEC_PER_USEC; do { r = torture_random(&rand); - cpu = (r >> 4) % (maxcpu + 1); + cpu = (r >> 1) % (maxcpu + 1); if (r & 0x1) { rcu_nocb_cpu_offload(cpu); atomic_long_inc(&n_nocb_offload); diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1970ce5f22d4..91a0fd0d4d9a 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -528,6 +528,38 @@ static struct ref_scale_ops clock_ops = { .name = "clock" }; +static void ref_jiffies_section(const int nloops) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) + x += jiffies; + preempt_enable(); + stopopts = x; +} + +static void ref_jiffies_delay_section(const int nloops, const int udl, const int ndl) +{ + u64 x = 0; + int i; + + preempt_disable(); + for (i = nloops; i >= 0; i--) { + x += jiffies; + un_delay(udl, ndl); + } + preempt_enable(); + stopopts = x; +} + +static struct ref_scale_ops jiffies_ops = { + .readsection = ref_jiffies_section, + .delaysection = ref_jiffies_delay_section, + .name = "jiffies" +}; + //////////////////////////////////////////////////////////////////////// // // Methods leveraging SLAB_TYPESAFE_BY_RCU. @@ -1047,7 +1079,7 @@ ref_scale_init(void) int firsterr = 0; static struct ref_scale_ops *scale_ops[] = { &rcu_ops, &srcu_ops, RCU_TRACE_OPS RCU_TASKS_OPS &refcnt_ops, &rwlock_ops, - &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, + &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops, }; @@ -1107,12 +1139,11 @@ ref_scale_init(void) VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { + init_waitqueue_head(&reader_tasks[i].wq); firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); if (torture_init_error(firsterr)) goto unwind; - - init_waitqueue_head(&(reader_tasks[i].wq)); } // Main Task diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b770add3f843..8d65f7d576a3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -25,6 +25,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. * @rtp_jiffies: Jiffies counter value for statistics. + * @lazy_timer: Timer to unlazify callbacks. + * @urgent_gp: Number of additional non-lazy grace periods. * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. * @rtp_irq_work: IRQ work queue for deferred wakeups. @@ -38,6 +40,8 @@ struct rcu_tasks_percpu { raw_spinlock_t __private lock; unsigned long rtp_jiffies; unsigned long rtp_n_lock_retries; + struct timer_list lazy_timer; + unsigned int urgent_gp; struct work_struct rtp_work; struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; @@ -51,7 +55,6 @@ struct rcu_tasks_percpu { * @cbs_wait: RCU wait allowing a new callback to get kthread's attention. * @cbs_gbl_lock: Lock protecting callback list. * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone. - * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping. @@ -61,6 +64,8 @@ struct rcu_tasks_percpu { * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. + * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. + * @lazy_jiffies: Number of jiffies to allow callbacks to be lazy. * @pregp_func: This flavor's pre-grace-period function (optional). * @pertask_func: This flavor's per-task scan function (optional). * @postscan_func: This flavor's post-task scan function (optional). @@ -92,6 +97,7 @@ struct rcu_tasks { unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; + unsigned long lazy_jiffies; rcu_tasks_gp_func_t gp_func; pregp_func_t pregp_func; pertask_func_t pertask_func; @@ -127,6 +133,7 @@ static struct rcu_tasks rt_name = \ .gp_func = gp, \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ + .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ @@ -139,9 +146,7 @@ static struct rcu_tasks rt_name = \ #ifdef CONFIG_TASKS_RCU /* Track exiting tasks in order to allow them to be waited for. */ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); -#endif -#ifdef CONFIG_TASKS_RCU /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); @@ -171,6 +176,8 @@ static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); static int rcu_task_collapse_lim __read_mostly = 10; module_param(rcu_task_collapse_lim, int, 0444); +static int rcu_task_lazy_lim __read_mostly = 32; +module_param(rcu_task_lazy_lim, int, 0444); /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -229,7 +236,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) #endif /* #ifndef CONFIG_TINY_RCU */ // Initialize per-CPU callback lists for the specified flavor of -// Tasks RCU. +// Tasks RCU. Do not enqueue callbacks before this function is invoked. static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; @@ -237,7 +244,6 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int lim; int shift; - raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rcu_task_enqueue_lim < 0) { rcu_task_enqueue_lim = 1; rcu_task_cb_adjust = true; @@ -260,22 +266,48 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + local_irq_save(flags); // serialize initialization if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); + local_irq_restore(flags); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } - raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim), rcu_task_cb_adjust); } +// Compute wakeup time for lazy callback timer. +static unsigned long rcu_tasks_lazy_time(struct rcu_tasks *rtp) +{ + return jiffies + rtp->lazy_jiffies; +} + +// Timer handler that unlazifies lazy callbacks. +static void call_rcu_tasks_generic_timer(struct timer_list *tlp) +{ + unsigned long flags; + bool needwake = false; + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = from_timer(rtpcp, tlp, lazy_timer); + + rtp = rtpcp->rtpp; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rcu_segcblist_empty(&rtpcp->cblist) && rtp->lazy_jiffies) { + if (!rtpcp->urgent_gp) + rtpcp->urgent_gp = 1; + needwake = true; + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + } + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (needwake) + rcuwait_wake_up(&rtp->cbs_wait); +} + // IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) { @@ -292,6 +324,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { int chosen_cpu; unsigned long flags; + bool havekthread = smp_load_acquire(&rtp->kthread_ptr); int ideal_cpu; unsigned long j; bool needadjust = false; @@ -316,12 +349,19 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) needadjust = true; // Defer adjustment to avoid deadlock. } - if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { - raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. - cblist_init_generic(rtp); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + // Queuing callbacks before initialization not yet supported. + if (WARN_ON_ONCE(!rcu_segcblist_is_enabled(&rtpcp->cblist))) + rcu_segcblist_init(&rtpcp->cblist); + needwake = (func == wakeme_after_rcu) || + (rcu_segcblist_n_cbs(&rtpcp->cblist) == rcu_task_lazy_lim); + if (havekthread && !needwake && !timer_pending(&rtpcp->lazy_timer)) { + if (rtp->lazy_jiffies) + mod_timer(&rtpcp->lazy_timer, rcu_tasks_lazy_time(rtp)); + else + needwake = rcu_segcblist_empty(&rtpcp->cblist); } - needwake = rcu_segcblist_empty(&rtpcp->cblist); + if (needwake) + rtpcp->urgent_gp = 3; rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); if (unlikely(needadjust)) { @@ -415,9 +455,14 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + if (rtpcp->urgent_gp > 0 && rcu_segcblist_pend_cbs(&rtpcp->cblist)) { + if (rtp->lazy_jiffies) + rtpcp->urgent_gp--; needgpcb |= 0x3; - if (!rcu_segcblist_empty(&rtpcp->cblist)) + } else if (rcu_segcblist_empty(&rtpcp->cblist)) { + rtpcp->urgent_gp = 0; + } + if (rcu_segcblist_ready_cbs(&rtpcp->cblist)) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } @@ -525,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) if (unlikely(midboot)) { needgpcb = 0x2; } else { + mutex_unlock(&rtp->tasks_gp_mutex); set_tasks_gp_state(rtp, RTGS_WAIT_CBS); rcuwait_wait_event(&rtp->cbs_wait, (needgpcb = rcu_tasks_need_gpcb(rtp)), TASK_IDLE); + mutex_lock(&rtp->tasks_gp_mutex); } if (needgpcb & 0x2) { @@ -549,11 +596,19 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot) // RCU-tasks kthread that detects grace periods and invokes callbacks. static int __noreturn rcu_tasks_kthread(void *arg) { + int cpu; struct rcu_tasks *rtp = arg; + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + timer_setup(&rtpcp->lazy_timer, call_rcu_tasks_generic_timer, 0); + rtpcp->urgent_gp = 1; + } + /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current, HK_TYPE_RCU); - WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! + smp_store_release(&rtp->kthread_ptr, current); // Let GPs start! /* * Each pass through the following loop makes one check for @@ -635,16 +690,22 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { int cpu; bool havecbs = false; + bool haveurgent = false; + bool haveurgentcbs = false; for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) { + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist))) havecbs = true; + if (data_race(rtpcp->urgent_gp)) + haveurgent = true; + if (!data_race(rcu_segcblist_empty(&rtpcp->cblist)) && data_race(rtpcp->urgent_gp)) + haveurgentcbs = true; + if (havecbs && haveurgent && haveurgentcbs) break; - } } - pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", + pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c%c%c l:%lu %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), @@ -652,6 +713,9 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[havecbs], + ".u"[haveurgent], + ".U"[haveurgentcbs], + rtp->lazy_jiffies, s); } #endif // #ifndef CONFIG_TINY_RCU @@ -1020,11 +1084,16 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); +int rcu_tasks_lazy_ms = -1; +module_param(rcu_tasks_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_kthread(void) { cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; + if (rcu_tasks_lazy_ms >= 0) + rcu_tasks.lazy_jiffies = msecs_to_jiffies(rcu_tasks_lazy_ms); rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -1042,6 +1111,12 @@ void show_rcu_tasks_classic_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_gp_kthread(void) +{ + return rcu_tasks.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); + /* * Contribute to protect against tasklist scan blind spot while the * task is exiting and may be removed from the tasklist. See @@ -1173,10 +1248,15 @@ void rcu_barrier_tasks_rude(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); +int rcu_tasks_rude_lazy_ms = -1; +module_param(rcu_tasks_rude_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_rude_kthread(void) { cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; + if (rcu_tasks_rude_lazy_ms >= 0) + rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } @@ -1188,6 +1268,13 @@ void show_rcu_tasks_rude_gp_kthread(void) } EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) + +struct task_struct *get_rcu_tasks_rude_gp_kthread(void) +{ + return rcu_tasks_rude.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -1793,6 +1880,9 @@ void rcu_barrier_tasks_trace(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); +int rcu_tasks_trace_lazy_ms = -1; +module_param(rcu_tasks_trace_lazy_ms, int, 0444); + static int __init rcu_spawn_tasks_trace_kthread(void) { cblist_init_generic(&rcu_tasks_trace); @@ -1807,6 +1897,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void) if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } + if (rcu_tasks_trace_lazy_ms >= 0) + rcu_tasks_trace.lazy_jiffies = msecs_to_jiffies(rcu_tasks_trace_lazy_ms); rcu_tasks_trace.pregp_func = rcu_tasks_trace_pregp_step; rcu_tasks_trace.postscan_func = rcu_tasks_trace_postscan; rcu_tasks_trace.holdouts_func = check_all_holdout_tasks_trace; @@ -1830,6 +1922,12 @@ void show_rcu_tasks_trace_gp_kthread(void) EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); #endif // !defined(CONFIG_TINY_RCU) +struct task_struct *get_rcu_tasks_trace_gp_kthread(void) +{ + return rcu_tasks_trace.kthread_ptr; +} +EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1449cb69a0e0..cb1caefa8bd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -632,7 +632,7 @@ void __rcu_irq_enter_check_tick(void) // prevents self-deadlock. So we can safely recheck under the lock. // Note that the nohz_full state currently cannot change. raw_spin_lock_rcu_node(rdp->mynode); - if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { // A nohz_full CPU is in the kernel and RCU needs a // quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); @@ -677,12 +677,16 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } /** - * rcu_is_watching - see if RCU thinks that the current CPU is not idle + * rcu_is_watching - RCU read-side critical sections permitted on current CPU? * - * Return true if RCU is watching the running CPU, which means that this - * CPU can safely enter RCU read-side critical sections. In other words, - * if the current CPU is not in its idle loop or is in an interrupt or - * NMI handler, return true. + * Return @true if RCU is watching the running CPU and @false otherwise. + * An @true return means that this CPU can safely enter RCU read-side + * critical sections. + * + * Although calls to rcu_is_watching() from most parts of the kernel + * will return @true, there are important exceptions. For example, if the + * current CPU is deep within its idle loop, in kernel entry/exit code, + * or offline, rcu_is_watching() will return @false. * * Make notrace because it can be called by the internal functions of * ftrace, and making this notrace removes unnecessary recursion calls. diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 43229d2b0c44..5598212d1f27 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -77,9 +77,9 @@ __setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { rcu_nocb_poll = true; - return 0; + return 1; } -early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +__setup("rcu_nocb_poll", parse_rcu_nocb_poll); /* * Don't bother bypassing ->cblist if the call_rcu() rate is low. diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5d113aa59e77..59032aaccd18 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -171,7 +171,8 @@ static void scf_torture_stats_print(void) scfs.n_all_wait += scf_stats_p[i].n_all_wait; } if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || - atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + atomic_read(&n_mb_out_errs) || + (!IS_ENABLED(CONFIG_KASAN) && atomic_read(&n_alloc_errs))) bangstr = "!!! "; pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ", SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched, @@ -312,6 +313,7 @@ static void scf_handler_1(void *scfc_in) // Randomly do an smp_call_function*() invocation. static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { + bool allocfail = false; uintptr_t cpu; int ret = 0; struct scf_check *scfcp = NULL; @@ -323,8 +325,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) { + if (!scfcp) { + WARN_ON_ONCE(!IS_ENABLED(CONFIG_KASAN)); atomic_inc(&n_alloc_errs); + allocfail = true; } else { scfcp->scfc_cpu = -1; scfcp->scfc_wait = scfsp->scfs_wait; @@ -431,7 +435,9 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpus_read_unlock(); else preempt_enable(); - if (!(torture_random(trsp) & 0xfff)) + if (allocfail) + schedule_timeout_idle((1 + longwait) * HZ); // Let no-wait handlers complete. + else if (!(torture_random(trsp) & 0xfff)) schedule_timeout_uninterruptible(1); } diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index d57a5c1c1cd9..3561ab533dd4 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,23 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +static void complete_with_flags(struct completion *x, int wake_flags) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&x->wait.lock, flags); + + if (x->done != UINT_MAX) + x->done++; + swake_up_locked(&x->wait, wake_flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void complete_on_current_cpu(struct completion *x) +{ + return complete_with_flags(x, WF_CURRENT_CPU); +} + /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion @@ -27,14 +44,7 @@ */ void complete(struct completion *x) { - unsigned long flags; - - raw_spin_lock_irqsave(&x->wait.lock, flags); - - if (x->done != UINT_MAX) - x->done++; - swake_up_locked(&x->wait); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + complete_with_flags(x, 0); } EXPORT_SYMBOL(complete); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c52c2eba7c73..4d63e063608a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4193,8 +4193,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { unsigned long flags; int cpu, success = 0; @@ -7030,7 +7029,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3e25be58e2b..ceb5d4c4738e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7741,6 +7741,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (wake_flags & WF_TTWU) { record_wakee(p); + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e93e006a942b..48d0be005f08 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2131,12 +2131,13 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -3229,6 +3230,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_PREEMPT_DYNAMIC extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 76b9b796e695..72505cd3b60a 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head); * If for some reason it would return 0, that means the previously waiting * task is already running, so it will observe condition true (or has already). */ -void swake_up_locked(struct swait_queue_head *q) +void swake_up_locked(struct swait_queue_head *q, int wake_flags) { struct swait_queue *curr; @@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q) return; curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); + try_to_wake_up(curr->task, TASK_NORMAL, wake_flags); list_del_init(&curr->task_list); } EXPORT_SYMBOL(swake_up_locked); @@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked); void swake_up_all_locked(struct swait_queue_head *q) { while (!list_empty(&q->task_list)) - swake_up_locked(q); + swake_up_locked(q, 0); } void swake_up_one(struct swait_queue_head *q) @@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q) unsigned long flags; raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); + swake_up_locked(q, 0); raw_spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(swake_up_one); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 48c53e4739ea..802d98cf2de3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL(__wake_up); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key) +{ + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key); +} + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d3e584065c7f..255999ba9190 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,11 +110,13 @@ struct seccomp_knotif { * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. + * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) + * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { @@ -138,14 +140,17 @@ struct seccomp_kaddfd { * structure is fairly large, we store the notification-specific stuff in a * separate structure. * - * @request: A semaphore that users of this notification can wait on for - * changes. Actual reads and writes are still controlled with - * filter->notify_lock. + * @requests: A semaphore that users of this notification can wait on for + * changes. Actual reads and writes are still controlled with + * filter->notify_lock. + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. */ + struct notification { - struct semaphore request; + atomic_t requests; + u32 flags; u64 next_id; struct list_head notifications; }; @@ -555,6 +560,8 @@ static void __seccomp_filter_release(struct seccomp_filter *orig) * drop its reference count, and notify * about unused filters * + * @tsk: task the filter should be released from. + * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and * barriers are not needed here, as would normally be needed. @@ -574,6 +581,8 @@ void seccomp_filter_release(struct task_struct *tsk) /** * seccomp_sync_threads: sets all threads to use current's filter * + * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. + * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. @@ -1116,8 +1125,11 @@ static int seccomp_do_user_notification(int this_syscall, list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); - up(&match->notif->request); - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); + atomic_inc(&match->notif->requests); + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM); + else + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* * This is where we wait for a reply from userspace. @@ -1450,6 +1462,37 @@ find_notification(struct seccomp_filter *filter, u64 id) return NULL; } +static int recv_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, + void *key) +{ + /* Avoid a wakeup if event not interesting for us. */ + if (key && !(key_to_poll(key) & (EPOLLIN | EPOLLERR))) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static int recv_wait_event(struct seccomp_filter *filter) +{ + DEFINE_WAIT_FUNC(wait, recv_wake_function); + int ret; + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + return 0; + + for (;;) { + ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE); + + if (atomic_dec_if_positive(&filter->notif->requests) >= 0) + break; + + if (ret) + return ret; + + schedule(); + } + finish_wait(&filter->wqh, &wait); + return 0; +} static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) @@ -1467,7 +1510,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, memset(&unotif, 0, sizeof(unotif)); - ret = down_interruptible(&filter->notif->request); + ret = recv_wait_event(filter); if (ret < 0) return ret; @@ -1515,7 +1558,8 @@ out: if (should_sleep_killable(filter, knotif)) complete(&knotif->ready); knotif->state = SECCOMP_NOTIFY_INIT; - up(&filter->notif->request); + atomic_inc(&filter->notif->requests); + wake_up_poll(&filter->wqh, EPOLLIN | EPOLLRDNORM); } mutex_unlock(&filter->notify_lock); } @@ -1561,7 +1605,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->error = resp.error; knotif->val = resp.val; knotif->flags = resp.flags; - complete(&knotif->ready); + if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + complete_on_current_cpu(&knotif->ready); + else + complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); return ret; @@ -1591,6 +1638,22 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_set_flags(struct seccomp_filter *filter, + unsigned long flags) +{ + long ret; + + if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) + return -EINVAL; + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + return ret; + filter->notif->flags = flags; + mutex_unlock(&filter->notify_lock); + return 0; +} + static long seccomp_notify_addfd(struct seccomp_filter *filter, struct seccomp_notif_addfd __user *uaddfd, unsigned int size) @@ -1720,6 +1783,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + case SECCOMP_IOCTL_NOTIF_SET_FLAGS: + return seccomp_notify_set_flags(filter, arg); } /* Extensible Argument ioctls */ @@ -1777,7 +1842,6 @@ static struct file *init_listener(struct seccomp_filter *filter) if (!filter->notif) goto out; - sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); diff --git a/kernel/signal.c b/kernel/signal.c index b5370fe5c198..128e9bb3d1a2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -562,6 +562,10 @@ bool unhandled_signal(struct task_struct *tsk, int sig) if (handler != SIG_IGN && handler != SIG_DFL) return false; + /* If dying, we handle all new signals by ignoring them */ + if (fatal_signal_pending(tsk)) + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } diff --git a/kernel/smp.c b/kernel/smp.c index 385179dae360..8455a53465af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -46,6 +46,8 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); +static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); + static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) @@ -253,13 +255,15 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 * *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { - dump_cpu_task(cpu); + if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) + dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } - dump_stack(); + if (firsttime) + dump_stack(); *ts1 = ts2; return false; @@ -433,9 +437,14 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) struct llist_node *entry, *prev; struct llist_head *head; static bool warned; + atomic_t *tbt; lockdep_assert_irqs_disabled(); + /* Allow waiters to send backtrace NMI from here onwards */ + tbt = this_cpu_ptr(&trigger_backtrace); + atomic_set_release(tbt, 1); + head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); diff --git a/kernel/sys.c b/kernel/sys.c index 05f838929e72..2410e3999ebe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2535,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_AUXV: - if (arg4 || arg5) - return -EINVAL; - error = prctl_get_auxv((void __user *)arg2, arg3); - break; default: return -EINVAL; } @@ -2694,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 88cbc1181b23..c108ed8a9804 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -473,8 +473,8 @@ static void clocksource_watchdog(struct timer_list *unused) /* Check the deviation from the watchdog clocksource. */ md = cs->uncertainty_margin + watchdog->uncertainty_margin; if (abs(cs_nsec - wd_nsec) > md) { - u64 cs_wd_msec; - u64 wd_msec; + s64 cs_wd_msec; + s64 wd_msec; u32 wd_rem; pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", @@ -483,8 +483,8 @@ static void clocksource_watchdog(struct timer_list *unused) watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_u64_rem(cs_nsec - wd_nsec, 1000U * 1000U, &wd_rem); - wd_msec = div_u64_rem(wd_nsec, 1000U * 1000U, &wd_rem); + cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); + wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); if (curr_clocksource == cs) diff --git a/kernel/torture.c b/kernel/torture.c index 1a0519b836ac..b28b05bbef02 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -37,6 +37,7 @@ #include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include <linux/sched/rt.h> #include "rcu/rcu.h" MODULE_LICENSE("GPL"); @@ -54,6 +55,9 @@ module_param(verbose_sleep_frequency, int, 0444); static int verbose_sleep_duration = 1; module_param(verbose_sleep_duration, int, 0444); +static int random_shuffle; +module_param(random_shuffle, int, 0444); + static char *torture_type; static int verbose; @@ -88,8 +92,8 @@ int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_s ktime_t hto = baset_ns; if (trsp) - hto += (torture_random(trsp) >> 3) % fuzzt_ns; - set_current_state(TASK_UNINTERRUPTIBLE); + hto += torture_random(trsp) % fuzzt_ns; + set_current_state(TASK_IDLE); return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); } EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); @@ -350,22 +354,22 @@ torture_onoff(void *arg) if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff); + torture_hrtimeout_jiffies(onoff_holdoff, &rand); VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); } while (!torture_must_stop()) { if (disable_onoff_at_boot && !rcu_inkernel_boot_has_ended()) { - schedule_timeout_interruptible(HZ / 10); + torture_hrtimeout_jiffies(HZ / 10, &rand); continue; } - cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + cpu = torture_random(&rand) % (maxcpu + 1); if (!torture_offline(cpu, &n_offline_attempts, &n_offline_successes, &sum_offline, &min_offline, &max_offline)) torture_online(cpu, &n_online_attempts, &n_online_successes, &sum_online, &min_online, &max_online); - schedule_timeout_interruptible(onoff_interval); + torture_hrtimeout_jiffies(onoff_interval, &rand); } stop: @@ -518,6 +522,7 @@ static void torture_shuffle_task_unregister_all(void) */ static void torture_shuffle_tasks(void) { + DEFINE_TORTURE_RANDOM(rand); struct shuffle_task *stp; cpumask_setall(shuffle_tmp_mask); @@ -537,8 +542,10 @@ static void torture_shuffle_tasks(void) cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); mutex_lock(&shuffle_task_mutex); - list_for_each_entry(stp, &shuffle_task_list, st_l) - set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + list_for_each_entry(stp, &shuffle_task_list, st_l) { + if (!random_shuffle || torture_random(&rand) & 0x1) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + } mutex_unlock(&shuffle_task_mutex); cpus_read_unlock(); @@ -550,9 +557,11 @@ static void torture_shuffle_tasks(void) */ static int torture_shuffle(void *arg) { + DEFINE_TORTURE_RANDOM(rand); + VERBOSE_TOROUT_STRING("torture_shuffle task started"); do { - schedule_timeout_interruptible(shuffle_interval); + torture_hrtimeout_jiffies(shuffle_interval, &rand); torture_shuffle_tasks(); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); @@ -728,12 +737,12 @@ bool stutter_wait(const char *title) cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { - if (!ret) { + if (!ret && !rt_task(current)) { sched_set_normal(current, MAX_NICE); ret = true; } if (spt == 1) { - schedule_timeout_interruptible(1); + torture_hrtimeout_jiffies(1, NULL); } else if (spt == 2) { while (READ_ONCE(stutter_pause_test)) { if (!(i++ & 0xffff)) @@ -741,7 +750,7 @@ bool stutter_wait(const char *title) cond_resched(); } } else { - schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_hrtimeout_jiffies(round_jiffies_relative(HZ), NULL); } torture_shutdown_absorb(title); } @@ -926,7 +935,7 @@ EXPORT_SYMBOL_GPL(torture_kthread_stopping); * it starts, you will need to open-code your own. */ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, - char *f, struct task_struct **tp) + char *f, struct task_struct **tp, void (*cbf)(struct task_struct *tp)) { int ret = 0; @@ -938,6 +947,10 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, *tp = NULL; return ret; } + + if (cbf) + cbf(*tp); + wake_up_process(*tp); // Process is sleeping, so ordering provided. torture_shuffle_task_register(*tp); return ret; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5f2dcabad202..bd1a42b23f3f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -661,8 +661,7 @@ static DEFINE_PER_CPU(int, bpf_trace_nest_level); BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { - struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); - int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct bpf_trace_sample_data *sds; struct perf_raw_record raw = { .frag = { .size = size, @@ -670,7 +669,11 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, }, }; struct perf_sample_data *sd; - int err; + int nest_level, err; + + preempt_disable(); + sds = this_cpu_ptr(&bpf_trace_sds); + nest_level = this_cpu_inc_return(bpf_trace_nest_level); if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { err = -EBUSY; @@ -688,9 +691,9 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, perf_sample_save_raw_data(sd, &raw); err = __bpf_perf_event_output(regs, map, flags, sd); - out: this_cpu_dec(bpf_trace_nest_level); + preempt_enable(); return err; } @@ -715,7 +718,6 @@ static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { - int nest_level = this_cpu_inc_return(bpf_event_output_nest_level); struct perf_raw_frag frag = { .copy = ctx_copy, .size = ctx_size, @@ -732,8 +734,12 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; struct perf_sample_data *sd; struct pt_regs *regs; + int nest_level; u64 ret; + preempt_disable(); + nest_level = this_cpu_inc_return(bpf_event_output_nest_level); + if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) { ret = -EBUSY; goto out; @@ -748,6 +754,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, ret = __bpf_perf_event_output(regs, map, flags, sd); out: this_cpu_dec(bpf_event_output_nest_level); + preempt_enable(); return ret; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 14d8001140c8..52dea5dd5362 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -523,6 +523,8 @@ struct ring_buffer_per_cpu { rb_time_t before_stamp; u64 event_stamp[MAX_NEST]; u64 read_stamp; + /* pages removed since last reset */ + unsigned long pages_removed; /* ring buffer pages to update, > 0 to add, < 0 to remove */ long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ @@ -536,6 +538,7 @@ struct trace_buffer { unsigned flags; int cpus; atomic_t record_disabled; + atomic_t resizing; cpumask_var_t cpumask; struct lock_class_key *reader_lock_key; @@ -558,6 +561,7 @@ struct ring_buffer_iter { struct buffer_page *head_page; struct buffer_page *cache_reader_page; unsigned long cache_read; + unsigned long cache_pages_removed; u64 read_stamp; u64 page_stamp; struct ring_buffer_event *event; @@ -946,6 +950,7 @@ static void rb_wake_up_waiters(struct irq_work *work) /** * ring_buffer_wake_waiters - wake up any waiters on this ring buffer * @buffer: The ring buffer to wake waiters on + * @cpu: The CPU buffer to wake waiters on * * In the case of a file that represents a ring buffer is closing, * it is prudent to wake up any waiters that are on this. @@ -1956,6 +1961,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) to_remove = rb_list_head(to_remove)->next; head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD; } + /* Read iterators need to reset themselves when some pages removed */ + cpu_buffer->pages_removed += nr_removed; next_page = rb_list_head(to_remove)->next; @@ -1977,12 +1984,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) cpu_buffer->head_page = list_entry(next_page, struct buffer_page, list); - /* - * change read pointer to make sure any read iterators reset - * themselves - */ - cpu_buffer->read = 0; - /* pages are removed, resume tracing and then free the pages */ atomic_dec(&cpu_buffer->record_disabled); raw_spin_unlock_irq(&cpu_buffer->reader_lock); @@ -2167,7 +2168,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); - + atomic_inc(&buffer->resizing); if (cpu_id == RING_BUFFER_ALL_CPUS) { /* @@ -2322,6 +2323,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, atomic_dec(&buffer->record_disabled); } + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return 0; @@ -2342,6 +2344,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, } } out_err_unlock: + atomic_dec(&buffer->resizing); mutex_unlock(&buffer->mutex); return err; } @@ -3373,7 +3376,6 @@ void ring_buffer_nest_end(struct trace_buffer *buffer) /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to - * @event: The event pointer to commit. * * This commits the data to the ring buffer, and releases any locks held. * @@ -4392,6 +4394,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->cache_reader_page = iter->head_page; iter->cache_read = cpu_buffer->read; + iter->cache_pages_removed = cpu_buffer->pages_removed; if (iter->head) { iter->read_stamp = cpu_buffer->read_stamp; @@ -4846,12 +4849,13 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) buffer = cpu_buffer->buffer; /* - * Check if someone performed a consuming read to - * the buffer. A consuming read invalidates the iterator - * and we need to reset the iterator in this case. + * Check if someone performed a consuming read to the buffer + * or removed some pages from the buffer. In these cases, + * iterator was invalidated and we need to reset it. */ if (unlikely(iter->cache_read != cpu_buffer->read || - iter->cache_reader_page != cpu_buffer->reader_page)) + iter->cache_reader_page != cpu_buffer->reader_page || + iter->cache_pages_removed != cpu_buffer->pages_removed)) rb_iter_reset(iter); again: @@ -5295,6 +5299,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->last_overrun = 0; rb_head_page_activate(cpu_buffer); + cpu_buffer->pages_removed = 0; } /* Must have disabled the cpu buffer then done a synchronize_rcu */ @@ -5353,7 +5358,6 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of - * @cpu: The CPU buffer to be reset */ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer) { @@ -5541,6 +5545,15 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a, if (local_read(&cpu_buffer_b->committing)) goto out_dec; + /* + * When resize is in progress, we cannot swap it because + * it will mess the state of the cpu buffer. + */ + if (atomic_read(&buffer_a->resizing)) + goto out_dec; + if (atomic_read(&buffer_b->resizing)) + goto out_dec; + buffer_a->buffers[cpu] = cpu_buffer_b; buffer_b->buffers[cpu] = cpu_buffer_a; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be847d45d81c..8e64aaad5361 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1928,9 +1928,10 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * place on this CPU. We fail to record, but we reset * the max trace buffer (no one writes directly to it) * and flag that it failed. + * Another reason is resize is in progress. */ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, - "Failed to swap buffers due to commit in progress\n"); + "Failed to swap buffers due to commit or resize in progress\n"); } WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); @@ -4212,8 +4213,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); - if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) { + /* Close iter->trace before switching to the new current tracer */ + if (iter->trace->close) + iter->trace->close(iter); *iter->trace = *tr->current_trace; + /* Reopen the new current tracer */ + if (iter->trace->open) + iter->trace->open(iter); + } mutex_unlock(&trace_types_lock); #ifdef CONFIG_TRACER_MAX_TRACE @@ -5276,11 +5284,17 @@ int tracing_set_cpumask(struct trace_array *tr, !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_disable_cpu(tr->max_buffer.buffer, cpu); +#endif } if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->array_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->array_buffer.buffer, cpu); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_record_enable_cpu(tr->max_buffer.buffer, cpu); +#endif } } arch_spin_unlock(&tr->max_lock); @@ -6704,10 +6718,36 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, #endif +static int open_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + if (cpumask_empty(tr->pipe_cpumask)) { + cpumask_setall(tr->pipe_cpumask); + return 0; + } + } else if (!cpumask_test_cpu(cpu, tr->pipe_cpumask)) { + cpumask_set_cpu(cpu, tr->pipe_cpumask); + return 0; + } + return -EBUSY; +} + +static void close_pipe_on_cpu(struct trace_array *tr, int cpu) +{ + if (cpu == RING_BUFFER_ALL_CPUS) { + WARN_ON(!cpumask_full(tr->pipe_cpumask)); + cpumask_clear(tr->pipe_cpumask); + } else { + WARN_ON(!cpumask_test_cpu(cpu, tr->pipe_cpumask)); + cpumask_clear_cpu(cpu, tr->pipe_cpumask); + } +} + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; struct trace_iterator *iter; + int cpu; int ret; ret = tracing_check_open_get_tr(tr); @@ -6715,13 +6755,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) return ret; mutex_lock(&trace_types_lock); + cpu = tracing_get_cpu(inode); + ret = open_pipe_on_cpu(tr, cpu); + if (ret) + goto fail_pipe_on_cpu; /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { ret = -ENOMEM; - __trace_array_put(tr); - goto out; + goto fail_alloc_iter; } trace_seq_init(&iter->seq); @@ -6744,7 +6787,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->tr = tr; iter->array_buffer = &tr->array_buffer; - iter->cpu_file = tracing_get_cpu(inode); + iter->cpu_file = cpu; mutex_init(&iter->mutex); filp->private_data = iter; @@ -6754,12 +6797,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) nonseekable_open(inode, filp); tr->trace_ref++; -out: + mutex_unlock(&trace_types_lock); return ret; fail: kfree(iter); +fail_alloc_iter: + close_pipe_on_cpu(tr, cpu); +fail_pipe_on_cpu: __trace_array_put(tr); mutex_unlock(&trace_types_lock); return ret; @@ -6776,7 +6822,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) if (iter->trace->pipe_close) iter->trace->pipe_close(iter); - + close_pipe_on_cpu(tr, iter->cpu_file); mutex_unlock(&trace_types_lock); free_cpumask_var(iter->started); @@ -9440,6 +9486,9 @@ static struct trace_array *trace_array_create(const char *name) if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) goto out_free_tr; + if (!alloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL)) + goto out_free_tr; + tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS; cpumask_copy(tr->tracing_cpumask, cpu_all_mask); @@ -9481,6 +9530,7 @@ static struct trace_array *trace_array_create(const char *name) out_free_tr: ftrace_free_ftrace_ops(tr); free_trace_buffers(tr); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -9583,6 +9633,7 @@ static int __remove_instance(struct trace_array *tr) } kfree(tr->topts); + free_cpumask_var(tr->pipe_cpumask); free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -10380,12 +10431,14 @@ __init static int tracer_alloc_buffers(void) if (trace_create_savedcmd() < 0) goto out_free_temp_buffer; + if (!alloc_cpumask_var(&global_trace.pipe_cpumask, GFP_KERNEL)) + goto out_free_savedcmd; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { MEM_FAIL(1, "tracer: failed to allocate ring buffer!\n"); - goto out_free_savedcmd; + goto out_free_pipe_cpumask; } - if (global_trace.buffer_disabled) tracing_off(); @@ -10438,6 +10491,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_pipe_cpumask: + free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: free_saved_cmdlines_buffer(savedcmd); out_free_temp_buffer: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e1edc2197fc8..73eaec158473 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -377,6 +377,8 @@ struct trace_array { struct list_head events; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ + /* one per_cpu trace_pipe can be opened by only one user */ + cpumask_var_t pipe_cpumask; int ref; int trace_ref; #ifdef CONFIG_FUNCTION_TRACER @@ -1295,6 +1297,14 @@ static inline void trace_branch_disable(void) /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); +union trace_synth_field { + u8 as_u8; + u16 as_u16; + u32 as_u32; + u64 as_u64; + struct trace_dynamic_info as_dynamic; +}; + struct ftrace_event_field { struct list_head link; const char *name; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6ae4eae510..578f1f7d49a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -611,7 +611,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, { struct trace_event_call *call = file->event_call; struct trace_array *tr = file->tr; - unsigned long file_flags = file->flags; int ret = 0; int disable; @@ -635,6 +634,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED; clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Disable use of trace_buffered_event */ + trace_buffered_event_disable(); } else disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE); @@ -673,6 +674,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, if (atomic_inc_return(&file->sm_ref) > 1) break; set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags); + /* Enable use of trace_buffered_event */ + trace_buffered_event_enable(); } if (!(file->flags & EVENT_FILE_FL_ENABLED)) { @@ -712,15 +715,6 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file, break; } - /* Enable or disable use of trace_buffered_event */ - if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) != - (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) { - if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) - trace_buffered_event_enable(); - else - trace_buffered_event_disable(); - } - return ret; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index c8c61381eba4..d06938ae0717 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6668,7 +6668,8 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_unreg; if (has_hist_vars(hist_data) || hist_data->n_var_refs) { - if (save_hist_vars(hist_data)) + ret = save_hist_vars(hist_data); + if (ret) goto out_unreg; } diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d6a70aff2410..9897d0bfcab7 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -127,7 +127,7 @@ static bool synth_event_match(const char *system, const char *event, struct synth_trace_event { struct trace_entry ent; - u64 fields[]; + union trace_synth_field fields[]; }; static int synth_event_define_fields(struct trace_event_call *call) @@ -321,19 +321,19 @@ static const char *synth_field_fmt(char *type) static void print_synth_event_num_val(struct trace_seq *s, char *print_fmt, char *name, - int size, u64 val, char *space) + int size, union trace_synth_field *val, char *space) { switch (size) { case 1: - trace_seq_printf(s, print_fmt, name, (u8)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u8, space); break; case 2: - trace_seq_printf(s, print_fmt, name, (u16)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u16, space); break; case 4: - trace_seq_printf(s, print_fmt, name, (u32)val, space); + trace_seq_printf(s, print_fmt, name, val->as_u32, space); break; default: @@ -350,7 +350,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, struct trace_seq *s = &iter->seq; struct synth_trace_event *entry; struct synth_event *se; - unsigned int i, n_u64; + unsigned int i, j, n_u64; char print_fmt[32]; const char *fmt; @@ -374,43 +374,28 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, /* parameter values */ if (se->fields[i]->is_string) { if (se->fields[i]->is_dynamic) { - u32 offset, data_offset; - char *str_field; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - - str_field = (char *)entry + data_offset; + union trace_synth_field *data = &entry->fields[n_u64]; trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - str_field, + (char *)entry + data->as_dynamic.offset, i == se->n_fields - 1 ? "" : " "); n_u64++; } else { trace_seq_printf(s, print_fmt, se->fields[i]->name, STR_VAR_LEN_MAX, - (char *)&entry->fields[n_u64], + (char *)&entry->fields[n_u64].as_u64, i == se->n_fields - 1 ? "" : " "); n_u64 += STR_VAR_LEN_MAX / sizeof(u64); } } else if (se->fields[i]->is_stack) { - u32 offset, data_offset, len; - unsigned long *p, *end; - - offset = (u32)entry->fields[n_u64]; - data_offset = offset & 0xffff; - len = offset >> 16; - - p = (void *)entry + data_offset; - end = (void *)p + len - (sizeof(long) - 1); + union trace_synth_field *data = &entry->fields[n_u64]; + unsigned long *p = (void *)entry + data->as_dynamic.offset; trace_seq_printf(s, "%s=STACK:\n", se->fields[i]->name); - - for (; *p && p < end; p++) - trace_seq_printf(s, "=> %pS\n", (void *)*p); + for (j = 1; j < data->as_dynamic.len / sizeof(long); j++) + trace_seq_printf(s, "=> %pS\n", (void *)p[j]); n_u64++; - } else { struct trace_print_flags __flags[] = { __def_gfpflag_names, {-1, NULL} }; @@ -419,13 +404,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, print_synth_event_num_val(s, print_fmt, se->fields[i]->name, se->fields[i]->size, - entry->fields[n_u64], + &entry->fields[n_u64], space); if (strcmp(se->fields[i]->type, "gfp_t") == 0) { trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", - entry->fields[n_u64], + entry->fields[n_u64].as_u64, __flags); trace_seq_putc(s, ')'); } @@ -454,21 +439,16 @@ static unsigned int trace_string(struct synth_trace_event *entry, int ret; if (is_dynamic) { - u32 data_offset; + union trace_synth_field *data = &entry->fields[*n_u64]; - data_offset = struct_size(entry, fields, event->n_u64); - data_offset += data_size; - - len = fetch_store_strlen((unsigned long)str_val); - - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + data->as_dynamic.offset = struct_size(entry, fields, event->n_u64) + data_size; + data->as_dynamic.len = fetch_store_strlen((unsigned long)str_val); ret = fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); (*n_u64)++; } else { - str_field = (char *)&entry->fields[*n_u64]; + str_field = (char *)&entry->fields[*n_u64].as_u64; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if ((unsigned long)str_val < TASK_SIZE) @@ -492,6 +472,7 @@ static unsigned int trace_stack(struct synth_trace_event *entry, unsigned int data_size, unsigned int *n_u64) { + union trace_synth_field *data = &entry->fields[*n_u64]; unsigned int len; u32 data_offset; void *data_loc; @@ -504,10 +485,6 @@ static unsigned int trace_stack(struct synth_trace_event *entry, break; } - /* Include the zero'd element if it fits */ - if (len < HIST_STACKTRACE_DEPTH) - len++; - len *= sizeof(long); /* Find the dynamic section to copy the stack into. */ @@ -515,8 +492,9 @@ static unsigned int trace_stack(struct synth_trace_event *entry, memcpy(data_loc, stack, len); /* Fill in the field that holds the offset/len combo */ - data_offset |= len << 16; - *(u32 *)&entry->fields[*n_u64] = data_offset; + + data->as_dynamic.offset = data_offset; + data->as_dynamic.len = len; (*n_u64)++; @@ -550,7 +528,8 @@ static notrace void trace_event_raw_event_synth(void *__data, str_val = (char *)(long)var_ref_vals[val_idx]; if (event->dynamic_fields[i]->is_stack) { - len = *((unsigned long *)str_val); + /* reserve one extra element for size */ + len = *((unsigned long *)str_val) + 1; len *= sizeof(unsigned long); } else { len = fetch_store_strlen((unsigned long)str_val); @@ -592,19 +571,19 @@ static notrace void trace_event_raw_event_synth(void *__data, switch (field->size) { case 1: - *(u8 *)&entry->fields[n_u64] = (u8)val; + entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&entry->fields[n_u64] = (u16)val; + entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&entry->fields[n_u64] = (u32)val; + entry->fields[n_u64].as_u32 = (u32)val; break; default: - entry->fields[n_u64] = val; + entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1230,6 +1209,7 @@ EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start); * synth_event_gen_cmd_array_start - Start synthetic event command from an array * @cmd: A pointer to the dynevent_cmd struct representing the new event * @name: The name of the synthetic event + * @mod: The module creating the event, NULL if not created from a module * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @@ -1790,19 +1770,19 @@ int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...) switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -1883,19 +1863,19 @@ int synth_event_trace_array(struct trace_event_file *file, u64 *vals, switch (field->size) { case 1: - *(u8 *)&state.entry->fields[n_u64] = (u8)val; + state.entry->fields[n_u64].as_u8 = (u8)val; break; case 2: - *(u16 *)&state.entry->fields[n_u64] = (u16)val; + state.entry->fields[n_u64].as_u16 = (u16)val; break; case 4: - *(u32 *)&state.entry->fields[n_u64] = (u32)val; + state.entry->fields[n_u64].as_u32 = (u32)val; break; default: - state.entry->fields[n_u64] = val; + state.entry->fields[n_u64].as_u64 = val; break; } n_u64++; @@ -2030,19 +2010,19 @@ static int __synth_event_add_val(const char *field_name, u64 val, } else { switch (field->size) { case 1: - *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val; + trace_state->entry->fields[field->offset].as_u8 = (u8)val; break; case 2: - *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val; + trace_state->entry->fields[field->offset].as_u16 = (u16)val; break; case 4: - *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val; + trace_state->entry->fields[field->offset].as_u32 = (u32)val; break; default: - trace_state->entry->fields[field->offset] = val; + trace_state->entry->fields[field->offset].as_u64 = val; break; } } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index e535959939d3..46439e3bcec4 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -31,7 +31,9 @@ void trigger_data_free(struct event_trigger_data *data) /** * event_triggers_call - Call triggers associated with a trace event * @file: The trace_event_file associated with the event + * @buffer: The ring buffer that the event is being written to * @rec: The trace entry for the event, NULL for unconditional invocation + * @event: The event meta data in the ring buffer * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command. If rec is diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 590b3d51afae..ba37f768e2f2 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -231,7 +231,8 @@ static void irqsoff_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); - + else + iter->private = NULL; } static void irqsoff_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b2b726bea1f9..c68a72707852 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -386,12 +386,12 @@ static const struct btf_type *find_btf_func_proto(const char *funcname) /* Get BTF_KIND_FUNC type */ t = btf_type_by_id(btf, id); - if (!btf_type_is_func(t)) + if (!t || !btf_type_is_func(t)) return ERR_PTR(-ENOENT); /* The type of BTF_KIND_FUNC is BTF_KIND_FUNC_PROTO */ t = btf_type_by_id(btf, t->type); - if (!btf_type_is_func_proto(t)) + if (!t || !btf_type_is_func_proto(t)) return ERR_PTR(-ENOENT); return t; @@ -443,7 +443,7 @@ static int parse_btf_arg(const char *varname, struct fetch_insn *code, if (!ctx->params) { params = find_btf_func_param(ctx->funcname, &ctx->nr_params, ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR(params)) { + if (IS_ERR_OR_NULL(params)) { trace_probe_log_err(ctx->offset, NO_BTF_ENTRY); return PTR_ERR(params); } @@ -1273,7 +1273,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[], params = find_btf_func_param(ctx->funcname, &nr_params, ctx->flags & TPARG_FL_TPOINT); - if (IS_ERR(params)) { + if (IS_ERR_OR_NULL(params)) { if (args_idx != -1) { /* $arg* requires BTF info */ trace_probe_log_err(0, NOSUP_BTFARG); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 330aee1c1a49..0469a04a355f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -168,6 +168,8 @@ static void wakeup_trace_open(struct trace_iterator *iter) { if (is_graph(iter->tr)) graph_trace_open(iter); + else + iter->private = NULL; } static void wakeup_trace_close(struct trace_iterator *iter) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index e5e299260d0c..bac06ee3b98b 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -131,6 +131,7 @@ EXPORT_SYMBOL_GPL(trace_seq_bitmask); * trace_seq_vprintf - sequence printing of trace information * @s: trace sequence descriptor * @fmt: printf format string + * @args: Arguments for the format string * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formatting of a trace diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index 2c765ee2a4d4..99c37eeebc16 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -272,10 +272,6 @@ extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); -extern void tracing_map_set_field_descr(struct tracing_map *map, - unsigned int i, - unsigned int key_offset, - tracing_map_cmp_fn_t cmp_fn); extern int tracing_map_sort_entries(struct tracing_map *map, struct tracing_map_sort_key *sort_keys, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..800b4208dba9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -52,6 +52,7 @@ #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/kvm_para.h> +#include <linux/delay.h> #include "workqueue_internal.h" @@ -338,8 +339,10 @@ static cpumask_var_t *wq_numa_possible_cpumask; * Per-cpu work items which run for longer than the following threshold are * automatically considered CPU intensive and excluded from concurrency * management to prevent them from noticeably delaying other per-cpu work items. + * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter. + * The actual value is initialized in wq_cpu_intensive_thresh_init(). */ -static unsigned long wq_cpu_intensive_thresh_us = 10000; +static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); static bool wq_disable_numa; @@ -6513,6 +6516,42 @@ void __init workqueue_init_early(void) !system_freezable_power_efficient_wq); } +static void __init wq_cpu_intensive_thresh_init(void) +{ + unsigned long thresh; + unsigned long bogo; + + /* if the user set it to a specific value, keep it */ + if (wq_cpu_intensive_thresh_us != ULONG_MAX) + return; + + /* + * The default of 10ms is derived from the fact that most modern (as of + * 2023) processors can do a lot in 10ms and that it's just below what + * most consider human-perceivable. However, the kernel also runs on a + * lot slower CPUs including microcontrollers where the threshold is way + * too low. + * + * Let's scale up the threshold upto 1 second if BogoMips is below 4000. + * This is by no means accurate but it doesn't have to be. The mechanism + * is still useful even when the threshold is fully scaled up. Also, as + * the reports would usually be applicable to everyone, some machines + * operating on longer thresholds won't significantly diminish their + * usefulness. + */ + thresh = 10 * USEC_PER_MSEC; + + /* see init/calibrate.c for lpj -> BogoMIPS calculation */ + bogo = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1); + if (bogo < 4000) + thresh = min_t(unsigned long, thresh * 4000 / bogo, USEC_PER_SEC); + + pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n", + loops_per_jiffy, bogo, thresh); + + wq_cpu_intensive_thresh_us = thresh; +} + /** * workqueue_init - bring workqueue subsystem fully online * @@ -6528,6 +6567,8 @@ void __init workqueue_init(void) struct worker_pool *pool; int cpu, bkt; + wq_cpu_intensive_thresh_init(); + /* * It'd be simpler to initialize NUMA in workqueue_init_early() but * CPU to node mapping may not be available that early on some |